Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_xclass.c
21677 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
/* This module contains two internal functions that are used to match
43
OP_XCLASS and OP_ECLASS. It is used by pcre2_auto_possessify() and by both
44
pcre2_match() and pcre2_dfa_match(). */
45
46
47
#include "pcre2_internal.h"
48
49
50
51
/*************************************************
52
* Match character against an XCLASS *
53
*************************************************/
54
55
/* This function is called to match a character against an extended class that
56
might contain codepoints above 255 and/or Unicode properties.
57
58
Arguments:
59
c the character
60
data points to the flag code unit of the XCLASS data
61
utf TRUE if in UTF mode
62
63
Returns: TRUE if character matches, else FALSE
64
*/
65
66
BOOL
67
PRIV(xclass)(uint32_t c, PCRE2_SPTR data, const uint8_t *char_lists_end, BOOL utf)
68
{
69
/* Update PRIV(update_classbits) when this function is changed. */
70
PCRE2_UCHAR t;
71
BOOL not_negated = (*data & XCL_NOT) == 0;
72
uint32_t type, max_index, min_index, value;
73
const uint8_t *next_char;
74
75
#if PCRE2_CODE_UNIT_WIDTH == 8
76
/* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
77
utf = TRUE;
78
#endif
79
80
/* Code points < 256 are matched against a bitmap, if one is present. */
81
82
if ((*data++ & XCL_MAP) != 0)
83
{
84
if (c < 256)
85
return (((const uint8_t *)data)[c/8] & (1u << (c&7))) != 0;
86
/* Skip bitmap. */
87
data += 32 / sizeof(PCRE2_UCHAR);
88
}
89
90
/* Match against the list of Unicode properties. We won't ever
91
encounter XCL_PROP or XCL_NOTPROP when UTF support is not compiled. */
92
#ifdef SUPPORT_UNICODE
93
if (*data == XCL_PROP || *data == XCL_NOTPROP)
94
{
95
/* The UCD record is the same for all properties. */
96
const ucd_record *prop = GET_UCD(c);
97
98
do
99
{
100
int chartype;
101
BOOL isprop = (*data++) == XCL_PROP;
102
BOOL ok;
103
104
switch(*data)
105
{
106
case PT_LAMP:
107
chartype = prop->chartype;
108
if ((chartype == ucp_Lu || chartype == ucp_Ll ||
109
chartype == ucp_Lt) == isprop) return not_negated;
110
break;
111
112
case PT_GC:
113
if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
114
return not_negated;
115
break;
116
117
case PT_PC:
118
if ((data[1] == prop->chartype) == isprop) return not_negated;
119
break;
120
121
case PT_SC:
122
if ((data[1] == prop->script) == isprop) return not_negated;
123
break;
124
125
case PT_SCX:
126
ok = (data[1] == prop->script ||
127
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), data[1]) != 0);
128
if (ok == isprop) return not_negated;
129
break;
130
131
case PT_ALNUM:
132
chartype = prop->chartype;
133
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
134
PRIV(ucp_gentype)[chartype] == ucp_N) == isprop)
135
return not_negated;
136
break;
137
138
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
139
which means that Perl space and POSIX space are now identical. PCRE
140
was changed at release 8.34. */
141
142
case PT_SPACE: /* Perl space */
143
case PT_PXSPACE: /* POSIX space */
144
switch(c)
145
{
146
HSPACE_CASES:
147
VSPACE_CASES:
148
if (isprop) return not_negated;
149
break;
150
151
default:
152
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
153
return not_negated;
154
break;
155
}
156
break;
157
158
case PT_WORD:
159
chartype = prop->chartype;
160
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
161
PRIV(ucp_gentype)[chartype] == ucp_N ||
162
chartype == ucp_Mn || chartype == ucp_Pc) == isprop)
163
return not_negated;
164
break;
165
166
case PT_UCNC:
167
if (c < 0xa0)
168
{
169
if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
170
c == CHAR_GRAVE_ACCENT) == isprop)
171
return not_negated;
172
}
173
else
174
{
175
if ((c < 0xd800 || c > 0xdfff) == isprop)
176
return not_negated;
177
}
178
break;
179
180
case PT_BIDICL:
181
if ((UCD_BIDICLASS_PROP(prop) == data[1]) == isprop)
182
return not_negated;
183
break;
184
185
case PT_BOOL:
186
ok = MAPBIT(PRIV(ucd_boolprop_sets) +
187
UCD_BPROPS_PROP(prop), data[1]) != 0;
188
if (ok == isprop) return not_negated;
189
break;
190
191
/* The following three properties can occur only in an XCLASS, as there
192
is no \p or \P coding for them. */
193
194
/* Graphic character. Implement this as not Z (space or separator) and
195
not C (other), except for Cf (format) with a few exceptions. This seems
196
to be what Perl does. The exceptional characters are:
197
198
U+061C Arabic Letter Mark
199
U+180E Mongolian Vowel Separator
200
U+2066 - U+2069 Various "isolate"s
201
*/
202
203
case PT_PXGRAPH:
204
chartype = prop->chartype;
205
if ((PRIV(ucp_gentype)[chartype] != ucp_Z &&
206
(PRIV(ucp_gentype)[chartype] != ucp_C ||
207
(chartype == ucp_Cf &&
208
c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
209
)) == isprop)
210
return not_negated;
211
break;
212
213
/* Printable character: same as graphic, with the addition of Zs, i.e.
214
not Zl and not Zp, and U+180E. */
215
216
case PT_PXPRINT:
217
chartype = prop->chartype;
218
if ((chartype != ucp_Zl &&
219
chartype != ucp_Zp &&
220
(PRIV(ucp_gentype)[chartype] != ucp_C ||
221
(chartype == ucp_Cf &&
222
c != 0x061c && (c < 0x2066 || c > 0x2069))
223
)) == isprop)
224
return not_negated;
225
break;
226
227
/* Punctuation: all Unicode punctuation, plus ASCII characters that
228
Unicode treats as symbols rather than punctuation, for Perl
229
compatibility (these are $+<=>^`|~). */
230
231
case PT_PXPUNCT:
232
chartype = prop->chartype;
233
if ((PRIV(ucp_gentype)[chartype] == ucp_P ||
234
(c < 128 && PRIV(ucp_gentype)[chartype] == ucp_S)) == isprop)
235
return not_negated;
236
break;
237
238
/* Perl has two sets of hex digits */
239
240
case PT_PXXDIGIT:
241
if (((c >= CHAR_0 && c <= CHAR_9) ||
242
(c >= CHAR_A && c <= CHAR_F) ||
243
(c >= CHAR_a && c <= CHAR_f) ||
244
(c >= 0xff10 && c <= 0xff19) || /* Fullwidth digits */
245
(c >= 0xff21 && c <= 0xff26) || /* Fullwidth letters */
246
(c >= 0xff41 && c <= 0xff46)) == isprop)
247
return not_negated;
248
break;
249
250
/* This should never occur, but compilers may mutter if there is no
251
default. */
252
253
/* LCOV_EXCL_START */
254
default:
255
PCRE2_DEBUG_UNREACHABLE();
256
return FALSE;
257
/* LCOV_EXCL_STOP */
258
}
259
260
data += 2;
261
}
262
while (*data == XCL_PROP || *data == XCL_NOTPROP);
263
}
264
#else
265
(void)utf; /* Avoid compiler warning */
266
#endif /* SUPPORT_UNICODE */
267
268
/* Match against large chars or ranges that end with a large char. */
269
if (*data < XCL_LIST)
270
{
271
while ((t = *data++) != XCL_END)
272
{
273
uint32_t x, y;
274
275
#ifdef SUPPORT_UNICODE
276
if (utf)
277
{
278
GETCHARINC(x, data); /* macro generates multiple statements */
279
}
280
else
281
#endif
282
x = *data++;
283
284
if (t == XCL_SINGLE)
285
{
286
/* Since character ranges follow the properties, and they are
287
sorted, early return is possible for all characters <= x. */
288
if (c <= x) return (c == x) ? not_negated : !not_negated;
289
continue;
290
}
291
292
PCRE2_ASSERT(t == XCL_RANGE);
293
#ifdef SUPPORT_UNICODE
294
if (utf)
295
{
296
GETCHARINC(y, data); /* macro generates multiple statements */
297
}
298
else
299
#endif
300
y = *data++;
301
302
/* Since character ranges follow the properties, and they are
303
sorted, early return is possible for all characters <= y. */
304
if (c <= y) return (c >= x) ? not_negated : !not_negated;
305
}
306
307
return !not_negated; /* char did not match */
308
}
309
310
#if PCRE2_CODE_UNIT_WIDTH == 8
311
type = (uint32_t)(data[0] << 8) | data[1];
312
data += 2;
313
#else
314
type = data[0];
315
data++;
316
#endif /* CODE_UNIT_WIDTH */
317
318
/* Align characters. */
319
next_char = char_lists_end - (GET(data, 0) << 1);
320
type &= XCL_TYPE_MASK;
321
322
/* Alignment check. */
323
PCRE2_ASSERT(((uintptr_t)next_char & 0x1) == 0);
324
325
if (c >= XCL_CHAR_LIST_HIGH_16_START)
326
{
327
max_index = type & XCL_ITEM_COUNT_MASK;
328
if (max_index == XCL_ITEM_COUNT_MASK)
329
{
330
max_index = *(const uint16_t*)next_char;
331
PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
332
next_char += 2;
333
}
334
335
next_char += max_index << 1;
336
type >>= XCL_TYPE_BIT_LEN;
337
}
338
339
if (c < XCL_CHAR_LIST_LOW_32_START)
340
{
341
max_index = type & XCL_ITEM_COUNT_MASK;
342
343
c = (uint16_t)((c << XCL_CHAR_SHIFT) | XCL_CHAR_END);
344
345
if (max_index == XCL_ITEM_COUNT_MASK)
346
{
347
max_index = *(const uint16_t*)next_char;
348
PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
349
next_char += 2;
350
}
351
352
if (max_index == 0 || c < *(const uint16_t*)next_char)
353
return ((type & XCL_BEGIN_WITH_RANGE) != 0) == not_negated;
354
355
min_index = 0;
356
value = ((const uint16_t*)next_char)[--max_index];
357
if (c >= value)
358
return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
359
360
max_index--;
361
362
/* Binary search of a range. */
363
while (TRUE)
364
{
365
uint32_t mid_index = (min_index + max_index) >> 1;
366
value = ((const uint16_t*)next_char)[mid_index];
367
368
if (c < value)
369
max_index = mid_index - 1;
370
else if (((const uint16_t*)next_char)[mid_index + 1] <= c)
371
min_index = mid_index + 1;
372
else
373
return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
374
}
375
}
376
377
/* Skip the 16 bit ranges. */
378
max_index = type & XCL_ITEM_COUNT_MASK;
379
if (max_index == XCL_ITEM_COUNT_MASK)
380
{
381
max_index = *(const uint16_t*)next_char;
382
PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
383
next_char += 2;
384
}
385
386
next_char += (max_index << 1);
387
type >>= XCL_TYPE_BIT_LEN;
388
389
/* Alignment check. */
390
PCRE2_ASSERT(((uintptr_t)next_char & 0x3) == 0);
391
392
max_index = type & XCL_ITEM_COUNT_MASK;
393
394
#if PCRE2_CODE_UNIT_WIDTH == 32
395
if (c >= XCL_CHAR_LIST_HIGH_32_START)
396
{
397
if (max_index == XCL_ITEM_COUNT_MASK)
398
{
399
max_index = *(const uint32_t*)next_char;
400
PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
401
next_char += 4;
402
}
403
404
next_char += max_index << 2;
405
type >>= XCL_TYPE_BIT_LEN;
406
max_index = type & XCL_ITEM_COUNT_MASK;
407
}
408
#endif
409
410
c = (uint32_t)((c << XCL_CHAR_SHIFT) | XCL_CHAR_END);
411
412
if (max_index == XCL_ITEM_COUNT_MASK)
413
{
414
max_index = *(const uint32_t*)next_char;
415
next_char += 4;
416
}
417
418
if (max_index == 0 || c < *(const uint32_t*)next_char)
419
return ((type & XCL_BEGIN_WITH_RANGE) != 0) == not_negated;
420
421
min_index = 0;
422
value = ((const uint32_t*)next_char)[--max_index];
423
if (c >= value)
424
return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
425
426
max_index--;
427
428
/* Binary search of a range. */
429
while (TRUE)
430
{
431
uint32_t mid_index = (min_index + max_index) >> 1;
432
value = ((const uint32_t*)next_char)[mid_index];
433
434
if (c < value)
435
max_index = mid_index - 1;
436
else if (((const uint32_t*)next_char)[mid_index + 1] <= c)
437
min_index = mid_index + 1;
438
else
439
return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
440
}
441
}
442
443
444
445
/*************************************************
446
* Match character against an ECLASS *
447
*************************************************/
448
449
/* This function is called to match a character against an extended class
450
used for describing characters using boolean operations on sets.
451
452
Arguments:
453
c the character
454
data_start points to the start of the ECLASS data
455
data_end points one-past-the-last of the ECLASS data
456
utf TRUE if in UTF mode
457
458
Returns: TRUE if character matches, else FALSE
459
*/
460
461
BOOL
462
PRIV(eclass)(uint32_t c, PCRE2_SPTR data_start, PCRE2_SPTR data_end,
463
const uint8_t *char_lists_end, BOOL utf)
464
{
465
PCRE2_SPTR ptr = data_start;
466
PCRE2_UCHAR flags;
467
uint32_t stack = 0;
468
int stack_depth = 0;
469
470
PCRE2_ASSERT(data_start < data_end);
471
flags = *ptr++;
472
PCRE2_ASSERT((flags & ECL_MAP) == 0 ||
473
(data_end - ptr) >= 32 / (int)sizeof(PCRE2_UCHAR));
474
475
/* Code points < 256 are matched against a bitmap, if one is present.
476
Otherwise all codepoints are checked later. */
477
478
if ((flags & ECL_MAP) != 0)
479
{
480
if (c < 256)
481
return (((const uint8_t *)ptr)[c/8] & (1u << (c&7))) != 0;
482
483
/* Skip the bitmap. */
484
ptr += 32 / sizeof(PCRE2_UCHAR);
485
}
486
487
/* Do a little loop, until we reach the end of the ECLASS. */
488
while (ptr < data_end)
489
{
490
switch (*ptr)
491
{
492
case ECL_AND:
493
++ptr;
494
stack = (stack >> 1) & (stack | ~(uint32_t)1u);
495
PCRE2_ASSERT(stack_depth >= 2);
496
--stack_depth;
497
break;
498
499
case ECL_OR:
500
++ptr;
501
stack = (stack >> 1) | (stack & (uint32_t)1u);
502
PCRE2_ASSERT(stack_depth >= 2);
503
--stack_depth;
504
break;
505
506
case ECL_XOR:
507
++ptr;
508
stack = (stack >> 1) ^ (stack & (uint32_t)1u);
509
PCRE2_ASSERT(stack_depth >= 2);
510
--stack_depth;
511
break;
512
513
case ECL_NOT:
514
++ptr;
515
stack ^= (uint32_t)1u;
516
PCRE2_ASSERT(stack_depth >= 1);
517
break;
518
519
case ECL_XCLASS:
520
{
521
uint32_t matched = PRIV(xclass)(c, ptr + 1 + LINK_SIZE, char_lists_end, utf);
522
523
ptr += GET(ptr, 1);
524
stack = (stack << 1) | matched;
525
++stack_depth;
526
break;
527
}
528
529
/* This should never occur, but compilers may mutter if there is no
530
default. */
531
532
/* LCOV_EXCL_START */
533
default:
534
PCRE2_DEBUG_UNREACHABLE();
535
return FALSE;
536
/* LCOV_EXCL_STOP */
537
}
538
}
539
540
PCRE2_ASSERT(stack_depth == 1);
541
(void)stack_depth; /* Ignore unused variable, if assertions are disabled. */
542
543
/* The final bit left on the stack now holds the match result. */
544
return (stack & 1u) != 0;
545
}
546
547
/* End of pcre2_xclass.c */
548
549