Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_script_run.c
21421 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2016-2021 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
/* This module contains the function for checking a script run. */
43
44
45
#include "pcre2_internal.h"
46
47
48
49
/*************************************************
50
* Check script run *
51
*************************************************/
52
53
/* A script run is conceptually a sequence of characters all in the same
54
Unicode script. However, it isn't quite that simple. There are special rules
55
for scripts that are commonly used together, and also special rules for digits.
56
This function implements the appropriate checks, which is possible only when
57
PCRE2 is compiled with Unicode support. The function returns TRUE if there is
58
no Unicode support; however, it should never be called in that circumstance
59
because an error is given by pcre2_compile() if a script run is called for in a
60
version of PCRE2 compiled without Unicode support.
61
62
Arguments:
63
ptr points to the first character
64
endptr point after the last character
65
utf TRUE if in UTF mode
66
67
Returns: TRUE if this is a valid script run
68
*/
69
70
/* These are states in the checking process. */
71
72
/* Values taken by the checker's "required script" state while a string is
being scanned. */

enum {
  SCRIPT_UNSET       = 0,  /* No requirement has been established yet */
  SCRIPT_MAP         = 1,  /* A bitmap holds the set of acceptable scripts */
  SCRIPT_HANPENDING  = 2,  /* Only Han characters have been seen so far */
  SCRIPT_HANHIRAKATA = 3,  /* Expect Han, Hiragana, or Katakana */
  SCRIPT_HANBOPOMOFO = 4,  /* Expect Han or Bopomofo */
  SCRIPT_HANHANGUL   = 5   /* Expect Han or Hangul */
};
79
80
/* Sizes, in 32-bit words, of the two kinds of script bitmap. UCD_MAPSIZE is
large enough for one bit per script numbered below ucp_Unknown; FULL_MAPSIZE is
large enough for one bit per script up to ucp_Script_Count. */

#define UCD_MAPSIZE   ((ucp_Unknown / 32) + 1)
#define FULL_MAPSIZE  ((ucp_Script_Count / 32) + 1)
82
83
BOOL
84
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
85
{
86
#ifdef SUPPORT_UNICODE
87
uint32_t require_state = SCRIPT_UNSET;
88
uint32_t require_map[FULL_MAPSIZE];
89
uint32_t map[FULL_MAPSIZE];
90
uint32_t require_digitset = 0;
91
uint32_t c;
92
93
#if PCRE2_CODE_UNIT_WIDTH == 32
94
(void)utf; /* Avoid compiler warning */
95
#endif
96
97
/* Any string containing fewer than 2 characters is a valid script run. */
98
99
if (ptr >= endptr) return TRUE;
100
GETCHARINCTEST(c, ptr);
101
if (ptr >= endptr) return TRUE;
102
103
/* Initialize the require map. This is a full-size bitmap that has a bit for
104
every script, as opposed to the maps in ucd_script_sets, which only have bits
105
for scripts less than ucp_Unknown - those that appear in script extension
106
lists. */
107
108
for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
109
110
/* Scan strings of two or more characters, checking the Unicode characteristics
111
of each code point. There is special code for scripts that can be combined with
112
characters from the Han Chinese script. This may be used in conjunction with
113
four other scripts in these combinations:
114
115
. Han with Hiragana and Katakana is allowed (for Japanese).
116
. Han with Bopomofo is allowed (for Taiwanese Mandarin).
117
. Han with Hangul is allowed (for Korean).
118
119
If the first significant character's script is one of the four, the required
120
script type is immediately known. However, if the first significant
121
character's script is Han, we have to keep checking for a non-Han character.
122
Hence the SCRIPT_HANPENDING state. */
123
124
for (;;)
125
{
126
const ucd_record *ucd = GET_UCD(c);
127
uint32_t script = ucd->script;
128
129
/* If the script is Unknown, the string is not a valid script run. Such
130
characters can only form script runs of length one (see test above). */
131
132
if (script == ucp_Unknown) return FALSE;
133
134
/* A character without any script extensions whose script is Inherited or
135
Common is always accepted with any script. If there are extensions, the
136
following processing happens for all scripts. */
137
138
if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))
139
{
140
BOOL OK;
141
142
/* Set up a full-sized map for this character that can include bits for all
143
scripts. Copy the scriptx map for this character (which covers those
144
scripts that appear in script extension lists), set the remaining values to
145
zero, and then, except for Common or Inherited, add this script's bit to
146
the map. */
147
148
memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
149
memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
150
if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
151
152
/* Handle the different checking states */
153
154
switch(require_state)
155
{
156
/* First significant character - it might follow Common or Inherited
157
characters that do not have any script extensions. */
158
159
case SCRIPT_UNSET:
160
switch(script)
161
{
162
case ucp_Han:
163
require_state = SCRIPT_HANPENDING;
164
break;
165
166
case ucp_Hiragana:
167
case ucp_Katakana:
168
require_state = SCRIPT_HANHIRAKATA;
169
break;
170
171
case ucp_Bopomofo:
172
require_state = SCRIPT_HANBOPOMOFO;
173
break;
174
175
case ucp_Hangul:
176
require_state = SCRIPT_HANHANGUL;
177
break;
178
179
default:
180
memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
181
require_state = SCRIPT_MAP;
182
break;
183
}
184
break;
185
186
/* The first significant character was Han. An inspection of the Unicode
187
11.0.0 files shows that there are the following types of Script Extension
188
list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
189
scripts:
190
191
. Bopomofo + Han
192
. Han + Hiragana + Katakana
193
. Hiragana + Katakana
194
. Bopopmofo + Hangul + Han + Hiragana + Katakana
195
196
The following code tries to make sense of this. */
197
198
#define FOUND_BOPOMOFO 1
199
#define FOUND_HIRAGANA 2
200
#define FOUND_KATAKANA 4
201
#define FOUND_HANGUL 8
202
203
case SCRIPT_HANPENDING:
204
if (script != ucp_Han) /* Another Han does nothing */
205
{
206
uint32_t chspecial = 0;
207
208
if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
209
if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
210
if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
211
if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL;
212
213
if (chspecial == 0) return FALSE; /* Not allowed with Han */
214
215
if (chspecial == FOUND_BOPOMOFO)
216
require_state = SCRIPT_HANBOPOMOFO;
217
else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
218
require_state = SCRIPT_HANHIRAKATA;
219
220
/* Otherwise this character must be allowed with all of them, so remain
221
in the pending state. */
222
}
223
break;
224
225
/* Previously encountered one of the "with Han" scripts. Check that
226
this character is appropriate. */
227
228
case SCRIPT_HANHIRAKATA:
229
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
230
MAPBIT(map, ucp_Katakana) == 0) return FALSE;
231
break;
232
233
case SCRIPT_HANBOPOMOFO:
234
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
235
break;
236
237
case SCRIPT_HANHANGUL:
238
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
239
break;
240
241
/* Previously encountered one or more characters that are allowed with a
242
list of scripts. */
243
244
case SCRIPT_MAP:
245
OK = FALSE;
246
247
for (int i = 0; i < FULL_MAPSIZE; i++)
248
{
249
if ((require_map[i] & map[i]) != 0)
250
{
251
OK = TRUE;
252
break;
253
}
254
}
255
256
if (!OK) return FALSE;
257
258
/* The rest of the string must be in this script, but we have to
259
allow for the Han complications. */
260
261
switch(script)
262
{
263
case ucp_Han:
264
require_state = SCRIPT_HANPENDING;
265
break;
266
267
case ucp_Hiragana:
268
case ucp_Katakana:
269
require_state = SCRIPT_HANHIRAKATA;
270
break;
271
272
case ucp_Bopomofo:
273
require_state = SCRIPT_HANBOPOMOFO;
274
break;
275
276
case ucp_Hangul:
277
require_state = SCRIPT_HANHANGUL;
278
break;
279
280
/* Compute the intersection of the required list of scripts and the
281
allowed scripts for this character. */
282
283
default:
284
for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
285
break;
286
}
287
288
break;
289
}
290
} /* End checking character's script and extensions. */
291
292
/* The character is in an acceptable script. We must now ensure that all
293
decimal digits in the string come from the same set. Some scripts (e.g.
294
Common, Arabic) have more than one set of decimal digits. This code does
295
not allow mixing sets, even within the same script. The vector called
296
PRIV(ucd_digit_sets)[] contains, in its first element, the number of
297
following elements, and then, in ascending order, the code points of the
298
'9' characters in every set of 10 digits. Each set is identified by the
299
offset in the vector of its '9' character. An initial check of the first
300
value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
301
302
if (ucd->chartype == ucp_Nd)
303
{
304
uint32_t digitset;
305
306
if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
307
{
308
int mid;
309
int bot = 1;
310
int top = PRIV(ucd_digit_sets)[0];
311
for (;;)
312
{
313
if (top <= bot + 1) /* <= rather than == is paranoia */
314
{
315
digitset = top;
316
break;
317
}
318
mid = (top + bot) / 2;
319
if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
320
}
321
}
322
323
/* A required value of 0 means "unset". */
324
325
if (require_digitset == 0) require_digitset = digitset;
326
else if (digitset != require_digitset) return FALSE;
327
} /* End digit handling */
328
329
/* If we haven't yet got to the end, pick up the next character. */
330
331
if (ptr >= endptr) return TRUE;
332
GETCHARINCTEST(c, ptr);
333
} /* End checking loop */
334
335
#else /* NOT SUPPORT_UNICODE */
336
(void)ptr;
337
(void)endptr;
338
(void)utf;
339
return TRUE;
340
#endif /* SUPPORT_UNICODE */
341
}
342
343
/* End of pcre2_script_run.c */
344
345