Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_xclass.c
9898 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
/* This module contains two internal functions that are used to match
42
OP_XCLASS and OP_ECLASS. It is used by pcre2_auto_possessify() and by both
43
pcre2_match() and pcre2_dfa_match(). */
44
45
46
#ifdef HAVE_CONFIG_H
47
#include "config.h"
48
#endif
49
50
51
#include "pcre2_internal.h"
52
53
/*************************************************
54
* Match character against an XCLASS *
55
*************************************************/
56
57
/* This function is called to match a character against an extended class that
58
might contain codepoints above 255 and/or Unicode properties.
59
60
Arguments:
61
c the character
62
data points to the flag code unit of the XCLASS data
63
utf TRUE if in UTF mode
64
65
Returns: TRUE if character matches, else FALSE
66
*/
67
68
BOOL
69
PRIV(xclass)(uint32_t c, PCRE2_SPTR data, const uint8_t *char_lists_end, BOOL utf)
70
{
71
/* Update PRIV(update_classbits) when this function is changed. */
72
PCRE2_UCHAR t;
73
BOOL not_negated = (*data & XCL_NOT) == 0;
74
uint32_t type, max_index, min_index, value;
75
const uint8_t *next_char;
76
77
#if PCRE2_CODE_UNIT_WIDTH == 8
78
/* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
79
utf = TRUE;
80
#endif
81
82
/* Code points < 256 are matched against a bitmap, if one is present. */
83
84
if ((*data++ & XCL_MAP) != 0)
85
{
86
if (c < 256)
87
return (((const uint8_t *)data)[c/8] & (1u << (c&7))) != 0;
88
/* Skip bitmap. */
89
data += 32 / sizeof(PCRE2_UCHAR);
90
}
91
92
/* Match against the list of Unicode properties. We won't ever
93
encounter XCL_PROP or XCL_NOTPROP when UTF support is not compiled. */
94
#ifdef SUPPORT_UNICODE
95
if (*data == XCL_PROP || *data == XCL_NOTPROP)
96
{
97
/* The UCD record is the same for all properties. */
98
const ucd_record *prop = GET_UCD(c);
99
100
do
101
{
102
int chartype;
103
BOOL isprop = (*data++) == XCL_PROP;
104
BOOL ok;
105
106
switch(*data)
107
{
108
case PT_LAMP:
109
chartype = prop->chartype;
110
if ((chartype == ucp_Lu || chartype == ucp_Ll ||
111
chartype == ucp_Lt) == isprop) return not_negated;
112
break;
113
114
case PT_GC:
115
if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
116
return not_negated;
117
break;
118
119
case PT_PC:
120
if ((data[1] == prop->chartype) == isprop) return not_negated;
121
break;
122
123
case PT_SC:
124
if ((data[1] == prop->script) == isprop) return not_negated;
125
break;
126
127
case PT_SCX:
128
ok = (data[1] == prop->script ||
129
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), data[1]) != 0);
130
if (ok == isprop) return not_negated;
131
break;
132
133
case PT_ALNUM:
134
chartype = prop->chartype;
135
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
136
PRIV(ucp_gentype)[chartype] == ucp_N) == isprop)
137
return not_negated;
138
break;
139
140
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
141
which means that Perl space and POSIX space are now identical. PCRE
142
was changed at release 8.34. */
143
144
case PT_SPACE: /* Perl space */
145
case PT_PXSPACE: /* POSIX space */
146
switch(c)
147
{
148
HSPACE_CASES:
149
VSPACE_CASES:
150
if (isprop) return not_negated;
151
break;
152
153
default:
154
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
155
return not_negated;
156
break;
157
}
158
break;
159
160
case PT_WORD:
161
chartype = prop->chartype;
162
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
163
PRIV(ucp_gentype)[chartype] == ucp_N ||
164
chartype == ucp_Mn || chartype == ucp_Pc) == isprop)
165
return not_negated;
166
break;
167
168
case PT_UCNC:
169
if (c < 0xa0)
170
{
171
if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
172
c == CHAR_GRAVE_ACCENT) == isprop)
173
return not_negated;
174
}
175
else
176
{
177
if ((c < 0xd800 || c > 0xdfff) == isprop)
178
return not_negated;
179
}
180
break;
181
182
case PT_BIDICL:
183
if ((UCD_BIDICLASS_PROP(prop) == data[1]) == isprop)
184
return not_negated;
185
break;
186
187
case PT_BOOL:
188
ok = MAPBIT(PRIV(ucd_boolprop_sets) +
189
UCD_BPROPS_PROP(prop), data[1]) != 0;
190
if (ok == isprop) return not_negated;
191
break;
192
193
/* The following three properties can occur only in an XCLASS, as there
194
is no \p or \P coding for them. */
195
196
/* Graphic character. Implement this as not Z (space or separator) and
197
not C (other), except for Cf (format) with a few exceptions. This seems
198
to be what Perl does. The exceptional characters are:
199
200
U+061C Arabic Letter Mark
201
U+180E Mongolian Vowel Separator
202
U+2066 - U+2069 Various "isolate"s
203
*/
204
205
case PT_PXGRAPH:
206
chartype = prop->chartype;
207
if ((PRIV(ucp_gentype)[chartype] != ucp_Z &&
208
(PRIV(ucp_gentype)[chartype] != ucp_C ||
209
(chartype == ucp_Cf &&
210
c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
211
)) == isprop)
212
return not_negated;
213
break;
214
215
/* Printable character: same as graphic, with the addition of Zs, i.e.
216
not Zl and not Zp, and U+180E. */
217
218
case PT_PXPRINT:
219
chartype = prop->chartype;
220
if ((chartype != ucp_Zl &&
221
chartype != ucp_Zp &&
222
(PRIV(ucp_gentype)[chartype] != ucp_C ||
223
(chartype == ucp_Cf &&
224
c != 0x061c && (c < 0x2066 || c > 0x2069))
225
)) == isprop)
226
return not_negated;
227
break;
228
229
/* Punctuation: all Unicode punctuation, plus ASCII characters that
230
Unicode treats as symbols rather than punctuation, for Perl
231
compatibility (these are $+<=>^`|~). */
232
233
case PT_PXPUNCT:
234
chartype = prop->chartype;
235
if ((PRIV(ucp_gentype)[chartype] == ucp_P ||
236
(c < 128 && PRIV(ucp_gentype)[chartype] == ucp_S)) == isprop)
237
return not_negated;
238
break;
239
240
/* Perl has two sets of hex digits */
241
242
case PT_PXXDIGIT:
243
if (((c >= CHAR_0 && c <= CHAR_9) ||
244
(c >= CHAR_A && c <= CHAR_F) ||
245
(c >= CHAR_a && c <= CHAR_f) ||
246
(c >= 0xff10 && c <= 0xff19) || /* Fullwidth digits */
247
(c >= 0xff21 && c <= 0xff26) || /* Fullwidth letters */
248
(c >= 0xff41 && c <= 0xff46)) == isprop)
249
return not_negated;
250
break;
251
252
/* This should never occur, but compilers may mutter if there is no
253
default. */
254
255
default:
256
PCRE2_DEBUG_UNREACHABLE();
257
return FALSE;
258
}
259
260
data += 2;
261
}
262
while (*data == XCL_PROP || *data == XCL_NOTPROP);
263
}
264
#else
265
(void)utf; /* Avoid compiler warning */
266
#endif /* SUPPORT_UNICODE */
267
268
/* Match against large chars or ranges that end with a large char. */
269
if (*data < XCL_LIST)
270
{
271
while ((t = *data++) != XCL_END)
272
{
273
uint32_t x, y;
274
275
#ifdef SUPPORT_UNICODE
276
if (utf)
277
{
278
GETCHARINC(x, data); /* macro generates multiple statements */
279
}
280
else
281
#endif
282
x = *data++;
283
284
if (t == XCL_SINGLE)
285
{
286
/* Since character ranges follow the properties, and they are
287
sorted, early return is possible for all characters <= x. */
288
if (c <= x) return (c == x) ? not_negated : !not_negated;
289
continue;
290
}
291
292
PCRE2_ASSERT(t == XCL_RANGE);
293
#ifdef SUPPORT_UNICODE
294
if (utf)
295
{
296
GETCHARINC(y, data); /* macro generates multiple statements */
297
}
298
else
299
#endif
300
y = *data++;
301
302
/* Since character ranges follow the properties, and they are
303
sorted, early return is possible for all characters <= y. */
304
if (c <= y) return (c >= x) ? not_negated : !not_negated;
305
}
306
307
return !not_negated; /* char did not match */
308
}
309
310
#if PCRE2_CODE_UNIT_WIDTH == 8
311
type = (uint32_t)(data[0] << 8) | data[1];
312
data += 2;
313
#else
314
type = data[0];
315
data++;
316
#endif /* CODE_UNIT_WIDTH */
317
318
/* Align characters. */
319
next_char = char_lists_end - (GET(data, 0) << 1);
320
type &= XCL_TYPE_MASK;
321
322
/* Alignment check. */
323
PCRE2_ASSERT(((uintptr_t)next_char & 0x1) == 0);
324
325
if (c >= XCL_CHAR_LIST_HIGH_16_START)
326
{
327
max_index = type & XCL_ITEM_COUNT_MASK;
328
if (max_index == XCL_ITEM_COUNT_MASK)
329
{
330
max_index = *(const uint16_t*)next_char;
331
PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
332
next_char += 2;
333
}
334
335
next_char += max_index << 1;
336
type >>= XCL_TYPE_BIT_LEN;
337
}
338
339
if (c < XCL_CHAR_LIST_LOW_32_START)
340
{
341
max_index = type & XCL_ITEM_COUNT_MASK;
342
343
c = (uint16_t)((c << XCL_CHAR_SHIFT) | XCL_CHAR_END);
344
345
if (max_index == XCL_ITEM_COUNT_MASK)
346
{
347
max_index = *(const uint16_t*)next_char;
348
PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
349
next_char += 2;
350
}
351
352
if (max_index == 0 || c < *(const uint16_t*)next_char)
353
return ((type & XCL_BEGIN_WITH_RANGE) != 0) == not_negated;
354
355
min_index = 0;
356
value = ((const uint16_t*)next_char)[--max_index];
357
if (c >= value)
358
return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
359
360
max_index--;
361
362
/* Binary search of a range. */
363
while (TRUE)
364
{
365
uint32_t mid_index = (min_index + max_index) >> 1;
366
value = ((const uint16_t*)next_char)[mid_index];
367
368
if (c < value)
369
max_index = mid_index - 1;
370
else if (((const uint16_t*)next_char)[mid_index + 1] <= c)
371
min_index = mid_index + 1;
372
else
373
return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
374
}
375
}
376
377
/* Skip the 16 bit ranges. */
378
max_index = type & XCL_ITEM_COUNT_MASK;
379
if (max_index == XCL_ITEM_COUNT_MASK)
380
{
381
max_index = *(const uint16_t*)next_char;
382
PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
383
next_char += 2;
384
}
385
386
next_char += (max_index << 1);
387
type >>= XCL_TYPE_BIT_LEN;
388
389
/* Alignment check. */
390
PCRE2_ASSERT(((uintptr_t)next_char & 0x3) == 0);
391
392
max_index = type & XCL_ITEM_COUNT_MASK;
393
394
#if PCRE2_CODE_UNIT_WIDTH == 32
395
if (c >= XCL_CHAR_LIST_HIGH_32_START)
396
{
397
if (max_index == XCL_ITEM_COUNT_MASK)
398
{
399
max_index = *(const uint32_t*)next_char;
400
PCRE2_ASSERT(max_index >= XCL_ITEM_COUNT_MASK);
401
next_char += 4;
402
}
403
404
next_char += max_index << 2;
405
type >>= XCL_TYPE_BIT_LEN;
406
max_index = type & XCL_ITEM_COUNT_MASK;
407
}
408
#endif
409
410
c = (uint32_t)((c << XCL_CHAR_SHIFT) | XCL_CHAR_END);
411
412
if (max_index == XCL_ITEM_COUNT_MASK)
413
{
414
max_index = *(const uint32_t*)next_char;
415
next_char += 4;
416
}
417
418
if (max_index == 0 || c < *(const uint32_t*)next_char)
419
return ((type & XCL_BEGIN_WITH_RANGE) != 0) == not_negated;
420
421
min_index = 0;
422
value = ((const uint32_t*)next_char)[--max_index];
423
if (c >= value)
424
return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
425
426
max_index--;
427
428
/* Binary search of a range. */
429
while (TRUE)
430
{
431
uint32_t mid_index = (min_index + max_index) >> 1;
432
value = ((const uint32_t*)next_char)[mid_index];
433
434
if (c < value)
435
max_index = mid_index - 1;
436
else if (((const uint32_t*)next_char)[mid_index + 1] <= c)
437
min_index = mid_index + 1;
438
else
439
return (value == c || (value & XCL_CHAR_END) == 0) == not_negated;
440
}
441
}
442
443
444
445
/*************************************************
446
* Match character against an ECLASS *
447
*************************************************/
448
449
/* This function is called to match a character against an extended class
450
used for describing characters using boolean operations on sets.
451
452
Arguments:
453
c the character
454
data_start points to the start of the ECLASS data
455
data_end points one-past-the-last of the ECLASS data
456
utf TRUE if in UTF mode
457
458
Returns: TRUE if character matches, else FALSE
459
*/
460
461
BOOL
462
PRIV(eclass)(uint32_t c, PCRE2_SPTR data_start, PCRE2_SPTR data_end,
463
const uint8_t *char_lists_end, BOOL utf)
464
{
465
PCRE2_SPTR ptr = data_start;
466
PCRE2_UCHAR flags;
467
uint32_t stack = 0;
468
int stack_depth = 0;
469
470
PCRE2_ASSERT(data_start < data_end);
471
flags = *ptr++;
472
PCRE2_ASSERT((flags & ECL_MAP) == 0 ||
473
(data_end - ptr) >= 32 / (int)sizeof(PCRE2_UCHAR));
474
475
/* Code points < 256 are matched against a bitmap, if one is present.
476
Otherwise all codepoints are checked later. */
477
478
if ((flags & ECL_MAP) != 0)
479
{
480
if (c < 256)
481
return (((const uint8_t *)ptr)[c/8] & (1u << (c&7))) != 0;
482
483
/* Skip the bitmap. */
484
ptr += 32 / sizeof(PCRE2_UCHAR);
485
}
486
487
/* Do a little loop, until we reach the end of the ECLASS. */
488
while (ptr < data_end)
489
{
490
switch (*ptr)
491
{
492
case ECL_AND:
493
++ptr;
494
stack = (stack >> 1) & (stack | ~(uint32_t)1u);
495
PCRE2_ASSERT(stack_depth >= 2);
496
--stack_depth;
497
break;
498
499
case ECL_OR:
500
++ptr;
501
stack = (stack >> 1) | (stack & (uint32_t)1u);
502
PCRE2_ASSERT(stack_depth >= 2);
503
--stack_depth;
504
break;
505
506
case ECL_XOR:
507
++ptr;
508
stack = (stack >> 1) ^ (stack & (uint32_t)1u);
509
PCRE2_ASSERT(stack_depth >= 2);
510
--stack_depth;
511
break;
512
513
case ECL_NOT:
514
++ptr;
515
stack ^= (uint32_t)1u;
516
PCRE2_ASSERT(stack_depth >= 1);
517
break;
518
519
case ECL_XCLASS:
520
{
521
uint32_t matched = PRIV(xclass)(c, ptr + 1 + LINK_SIZE, char_lists_end, utf);
522
523
ptr += GET(ptr, 1);
524
stack = (stack << 1) | matched;
525
++stack_depth;
526
break;
527
}
528
529
/* This should never occur, but compilers may mutter if there is no
530
default. */
531
532
default:
533
PCRE2_DEBUG_UNREACHABLE();
534
return FALSE;
535
}
536
}
537
538
PCRE2_ASSERT(stack_depth == 1);
539
(void)stack_depth; /* Ignore unused variable, if assertions are disabled. */
540
541
/* The final bit left on the stack now holds the match result. */
542
return (stack & 1u) != 0;
543
}
544
545
/* End of pcre2_xclass.c */
546
547