CoCalc -- pcre2_script

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_script_run.c
²¹⁴²¹ views
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4

5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7

8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2021 University of Cambridge
11

12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15

16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18

19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22

23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26

27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40

41

42
/* This module contains the function for checking a script run. */
43

44

45
#include "pcre2_internal.h"
46

47

48

49
/*************************************************
50
*                Check script run                *
51
*************************************************/
52

53
/* A script run is conceptually a sequence of characters all in the same
54
Unicode script. However, it isn't quite that simple. There are special rules
55
for scripts that are commonly used together, and also special rules for digits.
56
This function implements the appropriate checks, which is possible only when
57
PCRE2 is compiled with Unicode support. The function returns TRUE if there is
58
no Unicode support; however, it should never be called in that circumstance
59
because an error is given by pcre2_compile() if a script run is called for in a
60
version of PCRE2 compiled without Unicode support.
61

62
Arguments:
63
  ptr       point to the first character
64
  endptr    point after the last character
65
  utf       TRUE if in UTF mode
66

67
Returns:    TRUE if this is a valid script run
68
*/
69

70
/* These are states in the checking process. */
71

72
/* States of the script-run checker. The state records what has been learned
so far about which script(s) the rest of the string must belong to. */

enum { SCRIPT_UNSET,          /* Requirement as yet unknown */
       SCRIPT_MAP,            /* Bitmap contains acceptable scripts */
       SCRIPT_HANPENDING,     /* Have had only Han characters */
       SCRIPT_HANHIRAKATA,    /* Expect Han, Hiragana, or Katakana */
       SCRIPT_HANBOPOMOFO,    /* Expect Han or Bopomofo */
       SCRIPT_HANHANGUL       /* Expect Han or Hangul */
       };
79

80
/* Sizes, in 32-bit words, of the two kinds of script bitmap used below.
UCD_MAPSIZE is the number of words copied from each entry in the
ucd_script_sets table, which only has bits for scripts numbered below
ucp_Unknown. FULL_MAPSIZE is the size of a bitmap with a bit for every script
up to ucp_Script_Count. */

#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)
82

83
BOOL
84
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
85
{
86
#ifdef SUPPORT_UNICODE
87
uint32_t require_state = SCRIPT_UNSET;
88
uint32_t require_map[FULL_MAPSIZE];
89
uint32_t map[FULL_MAPSIZE];
90
uint32_t require_digitset = 0;
91
uint32_t c;
92

93
#if PCRE2_CODE_UNIT_WIDTH == 32
94
(void)utf;    /* Avoid compiler warning */
95
#endif
96

97
/* Any string containing fewer than 2 characters is a valid script run. */
98

99
if (ptr >= endptr) return TRUE;
100
GETCHARINCTEST(c, ptr);
101
if (ptr >= endptr) return TRUE;
102

103
/* Initialize the require map. This is a full-size bitmap that has a bit for
104
every script, as opposed to the maps in ucd_script_sets, which only have bits
105
for scripts less than ucp_Unknown - those that appear in script extension
106
lists. */
107

108
for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
109

110
/* Scan strings of two or more characters, checking the Unicode characteristics
111
of each code point. There is special code for scripts that can be combined with
112
characters from the Han Chinese script. This may be used in conjunction with
113
four other scripts in these combinations:
114

115
. Han with Hiragana and Katakana is allowed (for Japanese).
116
. Han with Bopomofo is allowed (for Taiwanese Mandarin).
117
. Han with Hangul is allowed (for Korean).
118

119
If the first significant character's script is one of the four, the required
120
script type is immediately known. However, if the first significant
121
character's script is Han, we have to keep checking for a non-Han character.
122
Hence the SCRIPT_HANPENDING state. */
123

124
for (;;)
125
  {
126
  const ucd_record *ucd = GET_UCD(c);
127
  uint32_t script = ucd->script;
128

129
  /* If the script is Unknown, the string is not a valid script run. Such
130
  characters can only form script runs of length one (see test above). */
131

132
  if (script == ucp_Unknown) return FALSE;
133

134
  /* A character without any script extensions whose script is Inherited or
135
  Common is always accepted with any script. If there are extensions, the
136
  following processing happens for all scripts. */
137

138
  if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))
139
    {
140
    BOOL OK;
141

142
    /* Set up a full-sized map for this character that can include bits for all
143
    scripts. Copy the scriptx map for this character (which covers those
144
    scripts that appear in script extension lists), set the remaining values to
145
    zero, and then, except for Common or Inherited, add this script's bit to
146
    the map. */
147

148
    memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
149
    memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
150
    if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
151

152
    /* Handle the different checking states */
153

154
    switch(require_state)
155
      {
156
      /* First significant character - it might follow Common or Inherited
157
      characters that do not have any script extensions. */
158

159
      case SCRIPT_UNSET:
160
      switch(script)
161
        {
162
        case ucp_Han:
163
        require_state = SCRIPT_HANPENDING;
164
        break;
165

166
        case ucp_Hiragana:
167
        case ucp_Katakana:
168
        require_state = SCRIPT_HANHIRAKATA;
169
        break;
170

171
        case ucp_Bopomofo:
172
        require_state = SCRIPT_HANBOPOMOFO;
173
        break;
174

175
        case ucp_Hangul:
176
        require_state = SCRIPT_HANHANGUL;
177
        break;
178

179
        default:
180
        memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
181
        require_state = SCRIPT_MAP;
182
        break;
183
        }
184
      break;
185

186
      /* The first significant character was Han. An inspection of the Unicode
187
      11.0.0 files shows that there are the following types of Script Extension
188
      list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
189
      scripts:
190

191
      . Bopomofo + Han
192
      . Han + Hiragana + Katakana
193
      . Hiragana + Katakana
194
      . Bopopmofo + Hangul + Han + Hiragana + Katakana
195

196
      The following code tries to make sense of this. */
197

198
#define FOUND_BOPOMOFO 1
199
#define FOUND_HIRAGANA 2
200
#define FOUND_KATAKANA 4
201
#define FOUND_HANGUL   8
202

203
      case SCRIPT_HANPENDING:
204
      if (script != ucp_Han)   /* Another Han does nothing */
205
        {
206
        uint32_t chspecial = 0;
207

208
        if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
209
        if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
210
        if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
211
        if (MAPBIT(map, ucp_Hangul) != 0)   chspecial |= FOUND_HANGUL;
212

213
        if (chspecial == 0) return FALSE;   /* Not allowed with Han */
214

215
        if (chspecial == FOUND_BOPOMOFO)
216
          require_state = SCRIPT_HANBOPOMOFO;
217
        else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
218
          require_state = SCRIPT_HANHIRAKATA;
219

220
        /* Otherwise this character must be allowed with all of them, so remain
221
        in the pending state. */
222
        }
223
      break;
224

225
      /* Previously encountered one of the "with Han" scripts. Check that
226
      this character is appropriate. */
227

228
      case SCRIPT_HANHIRAKATA:
229
      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
230
          MAPBIT(map, ucp_Katakana) == 0) return FALSE;
231
      break;
232

233
      case SCRIPT_HANBOPOMOFO:
234
      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
235
      break;
236

237
      case SCRIPT_HANHANGUL:
238
      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
239
      break;
240

241
      /* Previously encountered one or more characters that are allowed with a
242
      list of scripts. */
243

244
      case SCRIPT_MAP:
245
      OK = FALSE;
246

247
      for (int i = 0; i < FULL_MAPSIZE; i++)
248
        {
249
        if ((require_map[i] & map[i]) != 0)
250
          {
251
          OK = TRUE;
252
          break;
253
          }
254
        }
255

256
      if (!OK) return FALSE;
257

258
      /* The rest of the string must be in this script, but we have to
259
      allow for the Han complications. */
260

261
      switch(script)
262
        {
263
        case ucp_Han:
264
        require_state = SCRIPT_HANPENDING;
265
        break;
266

267
        case ucp_Hiragana:
268
        case ucp_Katakana:
269
        require_state = SCRIPT_HANHIRAKATA;
270
        break;
271

272
        case ucp_Bopomofo:
273
        require_state = SCRIPT_HANBOPOMOFO;
274
        break;
275

276
        case ucp_Hangul:
277
        require_state = SCRIPT_HANHANGUL;
278
        break;
279

280
        /* Compute the intersection of the required list of scripts and the
281
        allowed scripts for this character. */
282

283
        default:
284
        for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
285
        break;
286
        }
287

288
      break;
289
      }
290
    }   /* End checking character's script and extensions. */
291

292
  /* The character is in an acceptable script. We must now ensure that all
293
  decimal digits in the string come from the same set. Some scripts (e.g.
294
  Common, Arabic) have more than one set of decimal digits. This code does
295
  not allow mixing sets, even within the same script. The vector called
296
  PRIV(ucd_digit_sets)[] contains, in its first element, the number of
297
  following elements, and then, in ascending order, the code points of the
298
  '9' characters in every set of 10 digits. Each set is identified by the
299
  offset in the vector of its '9' character. An initial check of the first
300
  value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
301

302
  if (ucd->chartype == ucp_Nd)
303
    {
304
    uint32_t digitset;
305

306
    if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
307
      {
308
      int mid;
309
      int bot = 1;
310
      int top = PRIV(ucd_digit_sets)[0];
311
      for (;;)
312
        {
313
        if (top <= bot + 1)    /* <= rather than == is paranoia */
314
          {
315
          digitset = top;
316
          break;
317
          }
318
        mid = (top + bot) / 2;
319
        if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
320
        }
321
      }
322

323
    /* A required value of 0 means "unset". */
324

325
    if (require_digitset == 0) require_digitset = digitset;
326
      else if (digitset != require_digitset) return FALSE;
327
    }   /* End digit handling */
328

329
  /* If we haven't yet got to the end, pick up the next character. */
330

331
  if (ptr >= endptr) return TRUE;
332
  GETCHARINCTEST(c, ptr);
333
  }  /* End checking loop */
334

335
#else   /* NOT SUPPORT_UNICODE */
336
(void)ptr;
337
(void)endptr;
338
(void)utf;
339
return TRUE;
340
#endif  /* SUPPORT_UNICODE */
341
}
342

343
/* End of pcre2_script_run.c */
344

345
Product

Resources

Company