Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_jit_char_inc.h
9898 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
This module by Zoltan Herczeg
10
Original API code Copyright (c) 1997-2012 University of Cambridge
11
New API code Copyright (c) 2016-2024 University of Cambridge
12
13
-----------------------------------------------------------------------------
14
Redistribution and use in source and binary forms, with or without
15
modification, are permitted provided that the following conditions are met:
16
17
* Redistributions of source code must retain the above copyright notice,
18
this list of conditions and the following disclaimer.
19
20
* Redistributions in binary form must reproduce the above copyright
21
notice, this list of conditions and the following disclaimer in the
22
documentation and/or other materials provided with the distribution.
23
24
* Neither the name of the University of Cambridge nor the names of its
25
contributors may be used to endorse or promote products derived from
26
this software without specific prior written permission.
27
28
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
-----------------------------------------------------------------------------
40
*/
41
42
/* XClass matching code. */
43
44
#ifdef SUPPORT_WIDE_CHARS
45
46
/* Register aliases used when a class is compiled as part of an extended
   class (XCLASS_IS_ECLASS). The character is reloaded into TMP1 from
   ECLASS_CHAR_DATA, and the lowest bit of ECLASS_STACK_DATA is set when
   the class matches. */
#define ECLASS_CHAR_DATA STACK_TOP
#define ECLASS_STACK_DATA STACK_LIMIT
48
49
/* Changes the bias of the character stored in TMP1.

   TMP1 is expected to contain (character - charoffset). The macro emits a
   single add or subtract (nothing when the offset is unchanged) so that TMP1
   contains (character - value) afterwards, and records value as the new
   charoffset.

   Requirements: a local variable called charoffset must be in scope together
   with everything OP2 needs. The value argument is evaluated several times,
   so it must be free of side effects. The expansion is a single statement:
   terminate each use with a semicolon. */
#define SET_CHAR_OFFSET(value) \
  do \
    { \
    if ((value) != charoffset) \
      { \
      if ((value) < charoffset) \
        OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(charoffset - (value))); \
      else \
        OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)((value) - charoffset)); \
      } \
    charoffset = (value); \
    } \
  while (0)
58
59
/* Reads the next entry of a character list into destination and advances
   next_char past it.

   Entries of the lists with index 0 and 1 are 16 bit wide, entries of the
   later lists are 32 bit wide. Local variables called list_ind and next_char
   (a byte pointer) must be in scope. The expansion is a single statement:
   terminate each use with a semicolon. */
#define READ_FROM_CHAR_LIST(destination) \
  do \
    { \
    if (list_ind <= 1) \
      { \
      destination = *(const uint16_t*)next_char; \
      next_char += 2; \
      } \
    else \
      { \
      destination = *(const uint32_t*)next_char; \
      next_char += 4; \
      } \
    } \
  while (0)
70
71
/* Sizes of the buffers embedded in xclass_ranges: the local range buffer
   holds XCLASS_LOCAL_RANGES_SIZE (start, end) pairs, and the local stack has
   XCLASS_LOCAL_RANGES_LOG2_SIZE entries, which is log2 of the former. Keep
   the two values in sync. */
#define XCLASS_LOCAL_RANGES_SIZE 32
#define XCLASS_LOCAL_RANGES_LOG2_SIZE 5
73
74
typedef struct xclass_stack_item {
75
sljit_u32 first_item;
76
sljit_u32 last_item;
77
struct sljit_jump *jump;
78
} xclass_stack_item;
79
80
typedef struct xclass_ranges {
81
size_t range_count;
82
/* Pointer to ranges. A stack area is provided when a small buffer is enough. */
83
uint32_t *ranges;
84
uint32_t local_ranges[XCLASS_LOCAL_RANGES_SIZE * 2];
85
/* Stack size must be log2(ranges / 2). */
86
xclass_stack_item *stack;
87
xclass_stack_item local_stack[XCLASS_LOCAL_RANGES_LOG2_SIZE];
88
} xclass_ranges;
89
90
/* Collects the characters and ranges of an extended class into ranges->ranges.

   The output is a flat array of (first, last) pairs: ranges->ranges[2 * i] is
   the first and ranges->ranges[2 * i + 1] is the last code point of the i-th
   range (a single character is stored with first == last). The number of
   array elements written, i.e. twice the number of ranges, is stored in
   ranges->range_count.

   cc must point to the first XCL_SINGLE / XCL_RANGE item, or to the header
   of a character list (a value >= XCL_LIST). The caller is expected to set
   ranges->ranges and ranges->stack to the local buffers of the structure
   before the call. When a character list may need more than
   XCLASS_LOCAL_RANGES_SIZE ranges, a single SLJIT_MALLOC block is allocated:
   ranges->stack points to its start and ranges->ranges follows the stack
   items inside the same block. This function never releases that block.

   When the allocation fails, the memory error of the compiler is set,
   ranges->stack and ranges->ranges are both NULL, and ranges->range_count
   is not updated. */
static void xclass_compute_ranges(compiler_common *common, PCRE2_SPTR cc, xclass_ranges *ranges)
{
DEFINE_COMPILER;
size_t range_count = 0, est_range_count;
size_t est_stack_size, tmp;
uint32_t type, list_ind;
uint32_t est_type;
uint32_t char_list_add, range_start, range_end;
const uint8_t *next_char;
const uint8_t *est_next_char;
#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
/* Not referenced directly below: presumably read by GETCHARINCTEST. */
BOOL utf = common->utf;
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */

if (*cc == XCL_SINGLE || *cc == XCL_RANGE)
  {
  /* Only a few ranges are present. */
  do
    {
    type = *cc++;
    SLJIT_ASSERT(type == XCL_SINGLE || type == XCL_RANGE);
    GETCHARINCTEST(range_end, cc);
    ranges->ranges[range_count] = range_end;

    /* A single character is stored as a range with equal ends. */
    if (type == XCL_RANGE)
      {
      GETCHARINCTEST(range_end, cc);
      }

    ranges->ranges[range_count + 1] = range_end;
    range_count += 2;
    }
  while (*cc != XCL_END);

  SLJIT_ASSERT(range_count <= XCLASS_LOCAL_RANGES_SIZE);
  ranges->range_count = range_count;
  return;
  }

/* The list header is two code units long in 8 bit mode, one otherwise. */
SLJIT_ASSERT(cc[0] >= XCL_LIST);
#if PCRE2_CODE_UNIT_WIDTH == 8
type = (uint32_t)(cc[0] << 8) | cc[1];
cc += 2;
#else
type = cc[0];
cc++;
#endif /* CODE_UNIT_WIDTH */

/* Align characters. */
next_char = (const uint8_t*)common->start - (GET(cc, 0) << 1);
type &= XCL_TYPE_MASK;

/* Estimate size. The list data is walked once without decoding the items
to get an upper bound for the number of ranges. */
est_next_char = next_char;
est_type = type;
est_range_count = 0;
list_ind = 0;

while (est_type > 0)
  {
  uint32_t item_count = est_type & XCL_ITEM_COUNT_MASK;

  /* The maximum value of the count field means that the real item count
  is stored as the first entry of the list. */
  if (item_count == XCL_ITEM_COUNT_MASK)
    {
    if (list_ind <= 1)
      {
      item_count = *(const uint16_t*)est_next_char;
      est_next_char += 2;
      }
    else
      {
      item_count = *(const uint32_t*)est_next_char;
      est_next_char += 4;
      }
    }

  est_type >>= XCL_TYPE_BIT_LEN;
  est_next_char += (size_t)item_count << (list_ind <= 1 ? 1 : 2);
  list_ind++;
  /* Each item stores at most one range, and at most one more range is
  stored when the end of the list is reached. */
  est_range_count += item_count + 1;
  }

if (est_range_count > XCLASS_LOCAL_RANGES_SIZE)
  {
  est_stack_size = 0;
  tmp = est_range_count - 1;

  /* Compute log2(est_range_count) */
  while (tmp > 0)
    {
    est_stack_size++;
    tmp >>= 1;
    }

  /* A single block contains the stack items followed by the ranges. */
  ranges->stack = (xclass_stack_item*)SLJIT_MALLOC((sizeof(xclass_stack_item) * est_stack_size)
    + ((sizeof(uint32_t) << 1) * (size_t)est_range_count), compiler->allocator_data);

  if (ranges->stack == NULL)
    {
    sljit_set_compiler_memory_error(compiler);
    ranges->ranges = NULL;
    return;
    }

  ranges->ranges = (uint32_t*)(ranges->stack + est_stack_size);
  }

/* Decode the lists. The value ~0 in range_start means that no range is
open at the moment. */
char_list_add = XCL_CHAR_LIST_LOW_16_ADD;
range_start = ~(uint32_t)0;
list_ind = 0;

if ((type & XCL_BEGIN_WITH_RANGE) != 0)
  range_start = XCL_CHAR_LIST_LOW_16_START;

while (type > 0)
  {
  uint32_t item_count = type & XCL_ITEM_COUNT_MASK;

  if (item_count == XCL_ITEM_COUNT_MASK)
    {
    READ_FROM_CHAR_LIST(item_count);
    SLJIT_ASSERT(item_count >= XCL_ITEM_COUNT_MASK);
    }

  while (item_count > 0)
    {
    READ_FROM_CHAR_LIST(range_end);

    if ((range_end & XCL_CHAR_END) != 0)
      {
      /* This item closes a range. Without an open range it is a single
      character. */
      range_end = char_list_add + (range_end >> XCL_CHAR_SHIFT);

      if (range_start == ~(uint32_t)0)
        range_start = range_end;

      ranges->ranges[range_count] = range_start;
      ranges->ranges[range_count + 1] = range_end;
      range_count += 2;
      range_start = ~(uint32_t)0;
      }
    else
      /* This item opens a range. */
      range_start = char_list_add + (range_end >> XCL_CHAR_SHIFT);

    item_count--;
    }

  /* From this point list_ind and type describe the next list. */
  list_ind++;
  type >>= XCL_TYPE_BIT_LEN;

  if (range_start == ~(uint32_t)0)
    {
    /* No open range: the next list may start with one. */
    if ((type & XCL_BEGIN_WITH_RANGE) != 0)
      {
      if (list_ind == 1) range_start = XCL_CHAR_LIST_HIGH_16_START;
#if PCRE2_CODE_UNIT_WIDTH == 32
      else if (list_ind == 2) range_start = XCL_CHAR_LIST_LOW_32_START;
      else range_start = XCL_CHAR_LIST_HIGH_32_START;
#else
      else range_start = XCL_CHAR_LIST_LOW_32_START;
#endif
      }
    }
  else if ((type & XCL_BEGIN_WITH_RANGE) == 0)
    {
    /* The open range is not continued by the next list, so it is closed
    at the last character of the list which has just been processed. */
    if (list_ind == 1) range_end = XCL_CHAR_LIST_LOW_16_END;
    else if (list_ind == 2) range_end = XCL_CHAR_LIST_HIGH_16_END;
#if PCRE2_CODE_UNIT_WIDTH == 32
    else if (list_ind == 3) range_end = XCL_CHAR_LIST_LOW_32_END;
    else range_end = XCL_CHAR_LIST_HIGH_32_END;
#else
    else range_end = XCL_CHAR_LIST_LOW_32_END;
#endif

    ranges->ranges[range_count] = range_start;
    ranges->ranges[range_count + 1] = range_end;
    range_count += 2;
    range_start = ~(uint32_t)0;
    }

  /* Base value added to the items of the next list. */
  if (list_ind == 1) char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;
#if PCRE2_CODE_UNIT_WIDTH == 32
  else if (list_ind == 2) char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
  else char_list_add = XCL_CHAR_LIST_HIGH_32_ADD;
#else
  else char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
#endif
  }

SLJIT_ASSERT(range_count > 0 && range_count <= (est_range_count << 1));
SLJIT_ASSERT(next_char <= (const uint8_t*)common->start);
ranges->range_count = range_count;
}
282
283
/* Emits the code which tests the character in TMP1 against the bitmap of a
   class (one bit for each character in the 0..255 range).

   Characters above 255 are not covered by the bitmap: they skip the emitted
   test and continue with the code generated after this call. For the other
   characters a jump is added to found when the bit of the character is set,
   and an unconditional jump is added to backtracks otherwise.

   optimize_class() is tried first (presumably it adds the jumps to found
   itself when it succeeds - confirm at its definition). The generic bit
   test is emitted only when it returns FALSE; that code overwrites both
   TMP1 and TMP2. */
static void xclass_check_bitset(compiler_common *common, const sljit_u8 *bitset, jump_list **found, jump_list **backtracks)
{
DEFINE_COMPILER;
struct sljit_jump *jump;

jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);
/* The third argument is the bit of character 255. */
if (!optimize_class(common, bitset, (bitset[31] & 0x80) != 0, TRUE, found))
  {
  /* TMP2 = 1 << (chr & 0x7), TMP1 = bitset[chr >> 3]. */
  OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
  OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
  OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)bitset);
  OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0);
  OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, TMP2, 0);
  add_jump(compiler, found, JUMP(SLJIT_NOT_ZERO));
  }

add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
JUMPHERE(jump);
}
302
303
#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
304
305
/* Extends the [*min_ptr, *max_ptr] interval so that it covers every
   character described by the XCL_SINGLE / XCL_RANGE items or by the
   character list at cc.

   *min_ptr is only ever lowered and *max_ptr is only ever raised, so the
   caller chooses the starting interval. Only the lowest and the highest
   character is located; the items between them are skipped without being
   decoded. The function is only compiled for the 8 and 16 bit libraries with
   Unicode support, and it asserts that common->utf is set. */
static void xclass_update_min_max(compiler_common *common, PCRE2_SPTR cc, sljit_u32 *min_ptr, sljit_u32 *max_ptr)
{
uint32_t type, list_ind, c;
sljit_u32 min = *min_ptr;
sljit_u32 max = *max_ptr;
uint32_t char_list_add;
const uint8_t *next_char;
/* Not referenced directly below: presumably read by GETCHARINCTEST. */
BOOL utf = TRUE;

/* This function is pointless without utf 8/16. */
SLJIT_ASSERT(common->utf);
if (*cc == XCL_SINGLE || *cc == XCL_RANGE)
  {
  /* Only a few ranges are present. */
  do
    {
    type = *cc++;
    SLJIT_ASSERT(type == XCL_SINGLE || type == XCL_RANGE);
    GETCHARINCTEST(c, cc);

    if (c < min)
      min = c;

    /* The end of a range is the candidate for the maximum. */
    if (type == XCL_RANGE)
      {
      GETCHARINCTEST(c, cc);
      }

    if (c > max)
      max = c;
    }
  while (*cc != XCL_END);

  SLJIT_ASSERT(min <= MAX_UTF_CODE_POINT && max <= MAX_UTF_CODE_POINT && min <= max);
  *min_ptr = min;
  *max_ptr = max;
  return;
  }

/* The list header is two code units long in 8 bit mode, one otherwise. */
SLJIT_ASSERT(cc[0] >= XCL_LIST);
#if PCRE2_CODE_UNIT_WIDTH == 8
type = (uint32_t)(cc[0] << 8) | cc[1];
cc += 2;
#else
type = cc[0];
cc++;
#endif /* CODE_UNIT_WIDTH */

/* Align characters. */
next_char = (const uint8_t*)common->start - (GET(cc, 0) << 1);
type &= XCL_TYPE_MASK;

SLJIT_ASSERT(type != 0);

/* Detect minimum. */

/* Skip unused ranges. These lists have no items and do not start with a
range, so next_char does not need to be updated. */
list_ind = 0;
while ((type & (XCL_BEGIN_WITH_RANGE | XCL_ITEM_COUNT_MASK)) == 0)
  {
  type >>= XCL_TYPE_BIT_LEN;
  list_ind++;
  }

SLJIT_ASSERT(list_ind <= 2);
switch (list_ind)
  {
  case 0:
  char_list_add = XCL_CHAR_LIST_LOW_16_ADD;
  c = XCL_CHAR_LIST_LOW_16_START;
  break;

  case 1:
  char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;
  c = XCL_CHAR_LIST_HIGH_16_START;
  break;

  default:
  char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
  c = XCL_CHAR_LIST_LOW_32_START;
  break;
  }

if ((type & XCL_BEGIN_WITH_RANGE) != 0)
  {
  /* The list starts with a range, so its first character is the minimum. */
  if (c < min)
    min = c;
  }
else
  {
  /* The first item of the list is the minimum. When the item count is
  stored in the list, the count entry before the item must be skipped. */
  if ((type & XCL_ITEM_COUNT_MASK) == XCL_ITEM_COUNT_MASK)
    {
    if (list_ind <= 1)
      c = *(const uint16_t*)(next_char + 2);
    else
      c = *(const uint32_t*)(next_char + 4);
    }
  else
    {
    if (list_ind <= 1)
      c = *(const uint16_t*)next_char;
    else
      c = *(const uint32_t*)next_char;
    }

  c = char_list_add + (c >> XCL_CHAR_SHIFT);
  if (c < min)
    min = c;
  }

/* Detect maximum. */

/* Skip intermediate ranges. After the loop next_char points to the end of
the last list, and type / list_ind describe that list. */
while (TRUE)
  {
  if ((type & XCL_ITEM_COUNT_MASK) == XCL_ITEM_COUNT_MASK)
    {
    /* Skip the count entry and the items counted by it. */
    if (list_ind <= 1)
      {
      c = *(const uint16_t*)next_char;
      next_char += (c + 1) << 1;
      }
    else
      {
      c = *(const uint32_t*)next_char;
      next_char += (c + 1) << 2;
      }
    }
  else
    next_char += (type & XCL_ITEM_COUNT_MASK) << (list_ind <= 1 ? 1 : 2);

  if ((type >> XCL_TYPE_BIT_LEN) == 0)
    break;

  list_ind++;
  type >>= XCL_TYPE_BIT_LEN;
  }

SLJIT_ASSERT(list_ind <= 2 && type != 0);
switch (list_ind)
  {
  case 0:
  char_list_add = XCL_CHAR_LIST_LOW_16_ADD;
  c = XCL_CHAR_LIST_LOW_16_END;
  break;

  case 1:
  char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;
  c = XCL_CHAR_LIST_HIGH_16_END;
  break;

  default:
  char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
  c = XCL_CHAR_LIST_LOW_32_END;
  break;
  }

/* The maximum is the last character of the list (set above), unless the
last item of the list closes a range or is a single character. */
if ((type & XCL_ITEM_COUNT_MASK) != 0)
  {
  /* Type is reused as temporary. */
  if (list_ind <= 1)
    type = *(const uint16_t*)(next_char - 2);
  else
    type = *(const uint32_t*)(next_char - 4);

  if (type & XCL_CHAR_END)
    c = char_list_add + (type >> XCL_CHAR_SHIFT);
  }

if (c > max)
  max = c;

SLJIT_ASSERT(min <= MAX_UTF_CODE_POINT && max <= MAX_UTF_CODE_POINT && min <= max);
*min_ptr = min;
*max_ptr = max;
}
481
482
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */
483
484
/* Status bits of compile_xclass_matchingpath. XCLASS_IS_ECLASS is tested on
   the status argument supplied by the caller; the other bits are set by the
   function itself while it scans the class. */

/* The class is compiled as part of an extended class: the character is not
   read by the generated code, and a match sets the lowest bit of
   ECLASS_STACK_DATA. */
#define XCLASS_IS_ECLASS 0x001
#ifdef SUPPORT_UNICODE
/* The character value is needed after the property checks, so TMP1 is
   reloaded with it once they are done. */
#define XCLASS_SAVE_CHAR 0x002
/* The character type (general category) is loaded from the UCD record. */
#define XCLASS_HAS_TYPE 0x004
/* A script (PT_SC) or a not negated script extension (PT_SCX) is present. */
#define XCLASS_HAS_SCRIPT 0x008
/* A script extension (PT_SCX) property is present. */
#define XCLASS_HAS_SCRIPT_EXTENSION 0x010
/* A boolean (PT_BOOL) property is present. */
#define XCLASS_HAS_BOOL 0x020
/* A bidi class (PT_BIDICL) property is present. */
#define XCLASS_HAS_BIDICL 0x040
/* Any of these bits requires the lookup of the UCD record. */
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BOOL | XCLASS_HAS_BIDICL)
/* A negated script extension (XCL_NOTPROP with PT_SCX) is present. */
#define XCLASS_SCRIPT_EXTENSION_NOTPROP 0x080
/* TMP2 was saved in RETURN_ADDR / LOCAL0 before the script extension checks,
   and it must be restored from there after them. */
#define XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR 0x100
#define XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0 0x200
#endif /* SUPPORT_UNICODE */
497
498
static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr);
499
500
/* TMP3 must be preserved because it is used by compile_iterator_matchingpath. */
501
static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks, sljit_u32 status)
502
{
503
DEFINE_COMPILER;
504
jump_list *found = NULL;
505
jump_list *check_result = NULL;
506
jump_list **list = (cc[0] & XCL_NOT) == 0 ? &found : backtracks;
507
sljit_uw c, charoffset;
508
sljit_u32 max = READ_CHAR_MAX, min = 0;
509
struct sljit_jump *jump = NULL;
510
PCRE2_UCHAR flags;
511
PCRE2_SPTR ccbegin;
512
sljit_u32 compares, invertcmp, depth;
513
sljit_u32 first_item, last_item, mid_item;
514
sljit_u32 range_start, range_end;
515
xclass_ranges ranges;
516
BOOL has_cmov, last_range_set;
517
518
#ifdef SUPPORT_UNICODE
519
sljit_u32 category_list = 0;
520
sljit_u32 items;
521
int typereg = TMP1;
522
#endif /* SUPPORT_UNICODE */
523
524
SLJIT_ASSERT(common->locals_size >= SSIZE_OF(sw));
525
/* Scanning the necessary info. */
526
flags = *cc++;
527
ccbegin = cc;
528
compares = 0;
529
530
if (flags & XCL_MAP)
531
cc += 32 / sizeof(PCRE2_UCHAR);
532
533
#ifdef SUPPORT_UNICODE
534
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
535
{
536
compares++;
537
cc++;
538
539
items = 0;
540
541
switch(*cc)
542
{
543
case PT_LAMP:
544
items = UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt);
545
break;
546
547
case PT_GC:
548
items = UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1] * 2], PRIV(ucp_typerange)[(int)cc[1] * 2 + 1]);
549
break;
550
551
case PT_PC:
552
items = UCPCAT(cc[1]);
553
break;
554
555
case PT_WORD:
556
items = UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N;
557
break;
558
559
case PT_ALNUM:
560
items = UCPCAT_L | UCPCAT_N;
561
break;
562
563
case PT_SCX:
564
status |= XCLASS_HAS_SCRIPT_EXTENSION;
565
if (cc[-1] == XCL_NOTPROP)
566
{
567
status |= XCLASS_SCRIPT_EXTENSION_NOTPROP;
568
break;
569
}
570
compares++;
571
/* Fall through */
572
573
case PT_SC:
574
status |= XCLASS_HAS_SCRIPT;
575
break;
576
577
case PT_SPACE:
578
case PT_PXSPACE:
579
case PT_PXGRAPH:
580
case PT_PXPRINT:
581
case PT_PXPUNCT:
582
status |= XCLASS_SAVE_CHAR | XCLASS_HAS_TYPE;
583
break;
584
585
case PT_UCNC:
586
case PT_PXXDIGIT:
587
status |= XCLASS_SAVE_CHAR;
588
break;
589
590
case PT_BOOL:
591
status |= XCLASS_HAS_BOOL;
592
break;
593
594
case PT_BIDICL:
595
status |= XCLASS_HAS_BIDICL;
596
break;
597
598
default:
599
SLJIT_UNREACHABLE();
600
break;
601
}
602
603
if (items > 0)
604
{
605
if (cc[-1] == XCL_NOTPROP)
606
items ^= UCPCAT_ALL;
607
category_list |= items;
608
status |= XCLASS_HAS_TYPE;
609
compares--;
610
}
611
612
cc += 2;
613
}
614
615
if (category_list == UCPCAT_ALL)
616
{
617
/* All or no characters are accepted, same as dotall. */
618
if (status & XCLASS_IS_ECLASS)
619
{
620
if (list != backtracks)
621
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
622
return;
623
}
624
625
compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE);
626
if (list == backtracks)
627
add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
628
return;
629
}
630
631
if (category_list != 0)
632
compares++;
633
#endif
634
635
if (*cc != XCL_END)
636
{
637
#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
638
if (common->utf && compares == 0 && !(status & XCLASS_IS_ECLASS))
639
{
640
SLJIT_ASSERT(category_list == 0);
641
max = 0;
642
min = (flags & XCL_MAP) != 0 ? 0 : READ_CHAR_MAX;
643
xclass_update_min_max(common, cc, &min, &max);
644
}
645
#endif
646
compares++;
647
#ifdef SUPPORT_UNICODE
648
status |= XCLASS_SAVE_CHAR;
649
#endif /* SUPPORT_UNICODE */
650
}
651
652
#ifdef SUPPORT_UNICODE
653
SLJIT_ASSERT(compares > 0 || category_list != 0);
654
#else /* !SUPPORT_UNICODE */
655
SLJIT_ASSERT(compares > 0);
656
#endif /* SUPPORT_UNICODE */
657
658
/* We are not necessarily in UTF mode, even in 8-bit mode. */
659
cc = ccbegin;
660
if (!(status & XCLASS_IS_ECLASS))
661
{
662
if ((flags & XCL_NOT) != 0)
663
read_char(common, min, max, backtracks, READ_CHAR_UPDATE_STR_PTR);
664
else
665
{
666
#ifdef SUPPORT_UNICODE
667
read_char(common, min, max, (status & XCLASS_NEEDS_UCD) ? backtracks : NULL, 0);
668
#else /* !SUPPORT_UNICODE */
669
read_char(common, min, max, NULL, 0);
670
#endif /* SUPPORT_UNICODE */
671
}
672
}
673
674
if ((flags & XCL_MAP) != 0)
675
{
676
SLJIT_ASSERT(!(status & XCLASS_IS_ECLASS));
677
xclass_check_bitset(common, (const sljit_u8 *)cc, &found, backtracks);
678
cc += 32 / sizeof(PCRE2_UCHAR);
679
}
680
681
#ifdef SUPPORT_UNICODE
682
if (status & XCLASS_NEEDS_UCD)
683
{
684
if ((status & (XCLASS_SAVE_CHAR | XCLASS_IS_ECLASS)) == XCLASS_SAVE_CHAR)
685
OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP1, 0);
686
687
#if PCRE2_CODE_UNIT_WIDTH == 32
688
if (!common->utf)
689
{
690
OP2U(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1);
691
SELECT(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, UNASSIGNED_UTF_CHAR, TMP1);
692
}
693
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
694
695
OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
696
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
697
OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1));
698
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK);
699
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
700
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
701
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2));
702
OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1);
703
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 3);
704
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);
705
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
706
707
ccbegin = cc;
708
709
if (status & XCLASS_HAS_BIDICL)
710
{
711
OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx_bidiclass));
712
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BIDICLASS_SHIFT);
713
714
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
715
{
716
cc++;
717
718
if (*cc == PT_BIDICL)
719
{
720
compares--;
721
invertcmp = (compares == 0 && list != backtracks);
722
if (cc[-1] == XCL_NOTPROP)
723
invertcmp ^= 0x1;
724
jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]);
725
add_jump(compiler, compares > 0 ? list : backtracks, jump);
726
}
727
cc += 2;
728
}
729
730
cc = ccbegin;
731
}
732
733
if (status & XCLASS_HAS_BOOL)
734
{
735
OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, bprops));
736
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BPROPS_MASK);
737
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
738
739
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
740
{
741
cc++;
742
if (*cc == PT_BOOL)
743
{
744
compares--;
745
invertcmp = (compares == 0 && list != backtracks);
746
if (cc[-1] == XCL_NOTPROP)
747
invertcmp ^= 0x1;
748
749
OP2U(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_boolprop_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)(1u << (cc[1] & 0x1f)));
750
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));
751
}
752
cc += 2;
753
}
754
755
cc = ccbegin;
756
}
757
758
if (status & XCLASS_HAS_SCRIPT)
759
{
760
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
761
762
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
763
{
764
cc++;
765
766
switch (*cc)
767
{
768
case PT_SCX:
769
if (cc[-1] == XCL_NOTPROP)
770
break;
771
/* Fall through */
772
773
case PT_SC:
774
compares--;
775
invertcmp = (compares == 0 && list != backtracks);
776
if (cc[-1] == XCL_NOTPROP)
777
invertcmp ^= 0x1;
778
779
add_jump(compiler, compares > 0 ? list : backtracks, CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]));
780
}
781
cc += 2;
782
}
783
784
cc = ccbegin;
785
}
786
787
if (status & XCLASS_HAS_SCRIPT_EXTENSION)
788
{
789
OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx_bidiclass));
790
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_SCRIPTX_MASK);
791
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
792
793
if (status & XCLASS_SCRIPT_EXTENSION_NOTPROP)
794
{
795
if (status & XCLASS_HAS_TYPE)
796
{
797
if ((status & (XCLASS_SAVE_CHAR | XCLASS_IS_ECLASS)) == XCLASS_SAVE_CHAR)
798
{
799
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL0, TMP2, 0);
800
status |= XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0;
801
}
802
else
803
{
804
OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP2, 0);
805
status |= XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR;
806
}
807
}
808
OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
809
}
810
811
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
812
{
813
cc++;
814
815
if (*cc == PT_SCX)
816
{
817
compares--;
818
invertcmp = (compares == 0 && list != backtracks);
819
820
jump = NULL;
821
if (cc[-1] == XCL_NOTPROP)
822
{
823
jump = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, (int)cc[1]);
824
if (invertcmp)
825
{
826
add_jump(compiler, backtracks, jump);
827
jump = NULL;
828
}
829
invertcmp ^= 0x1;
830
}
831
832
OP2U(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)(1u << (cc[1] & 0x1f)));
833
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));
834
835
if (jump != NULL)
836
JUMPHERE(jump);
837
}
838
cc += 2;
839
}
840
841
if (status & XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0)
842
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), LOCAL0);
843
else if (status & XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR)
844
OP1(SLJIT_MOV, TMP2, 0, RETURN_ADDR, 0);
845
cc = ccbegin;
846
}
847
848
if (status & XCLASS_SAVE_CHAR)
849
OP1(SLJIT_MOV, TMP1, 0, (status & XCLASS_IS_ECLASS) ? ECLASS_CHAR_DATA : RETURN_ADDR, 0);
850
851
if (status & XCLASS_HAS_TYPE)
852
{
853
if (status & XCLASS_SAVE_CHAR)
854
typereg = RETURN_ADDR;
855
856
OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
857
OP2(SLJIT_SHL, typereg, 0, SLJIT_IMM, 1, TMP2, 0);
858
859
if (category_list > 0)
860
{
861
compares--;
862
invertcmp = (compares == 0 && list != backtracks);
863
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, category_list);
864
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));
865
}
866
}
867
}
868
#endif /* SUPPORT_UNICODE */
869
870
/* Generating code. */
871
charoffset = 0;
872
873
#ifdef SUPPORT_UNICODE
874
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
875
{
876
compares--;
877
invertcmp = (compares == 0 && list != backtracks);
878
jump = NULL;
879
880
if (*cc == XCL_NOTPROP)
881
invertcmp ^= 0x1;
882
cc++;
883
switch(*cc)
884
{
885
case PT_LAMP:
886
case PT_GC:
887
case PT_PC:
888
case PT_SC:
889
case PT_SCX:
890
case PT_BOOL:
891
case PT_BIDICL:
892
case PT_WORD:
893
case PT_ALNUM:
894
compares++;
895
/* Already handled. */
896
break;
897
898
case PT_SPACE:
899
case PT_PXSPACE:
900
SET_CHAR_OFFSET(9);
901
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xd - 0x9);
902
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
903
904
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x85 - 0x9);
905
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
906
907
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x180e - 0x9);
908
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
909
910
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Zl, ucp_Zs));
911
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_NOT_ZERO);
912
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
913
break;
914
915
case PT_UCNC:
916
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_DOLLAR_SIGN - charoffset));
917
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
918
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_COMMERCIAL_AT - charoffset));
919
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
920
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_GRAVE_ACCENT - charoffset));
921
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
922
923
SET_CHAR_OFFSET(0xa0);
924
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(0xd7ff - charoffset));
925
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
926
SET_CHAR_OFFSET(0);
927
OP2U(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xe000 - 0);
928
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_GREATER_EQUAL);
929
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
930
break;
931
932
case PT_PXGRAPH:
933
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Cc, ucp_Cs) | UCPCAT_RANGE(ucp_Zl, ucp_Zs));
934
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
935
936
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(ucp_Cf));
937
jump = JUMP(SLJIT_ZERO);
938
939
c = charoffset;
940
/* In case of ucp_Cf, we overwrite the result. */
941
SET_CHAR_OFFSET(0x2066);
942
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x2069 - 0x2066);
943
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
944
945
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x061c - 0x2066);
946
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
947
948
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x180e - 0x2066);
949
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
950
951
/* Restore charoffset. */
952
SET_CHAR_OFFSET(c);
953
954
JUMPHERE(jump);
955
jump = CMP(SLJIT_ZERO ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);
956
break;
957
958
case PT_PXPRINT:
959
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Cc, ucp_Cs) | UCPCAT2(ucp_Zl, ucp_Zp));
960
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
961
962
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(ucp_Cf));
963
jump = JUMP(SLJIT_ZERO);
964
965
c = charoffset;
966
/* In case of ucp_Cf, we overwrite the result. */
967
SET_CHAR_OFFSET(0x2066);
968
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x2069 - 0x2066);
969
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
970
971
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x061c - 0x2066);
972
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
973
974
/* Restore charoffset. */
975
SET_CHAR_OFFSET(c);
976
977
JUMPHERE(jump);
978
jump = CMP(SLJIT_ZERO ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);
979
break;
980
981
case PT_PXPUNCT:
982
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Sc, ucp_So));
983
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
984
985
SET_CHAR_OFFSET(0);
986
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x7f);
987
OP_FLAGS(SLJIT_AND, TMP2, 0, SLJIT_LESS_EQUAL);
988
989
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Pc, ucp_Ps));
990
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_NOT_ZERO);
991
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
992
break;
993
994
case PT_PXXDIGIT:
995
SET_CHAR_OFFSET(CHAR_A);
996
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, ~0x20);
997
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP2, 0, SLJIT_IMM, CHAR_F - CHAR_A);
998
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
999
1000
SET_CHAR_OFFSET(CHAR_0);
1001
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_9 - CHAR_0);
1002
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
1003
1004
SET_CHAR_OFFSET(0xff10);
1005
jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 0xff46 - 0xff10);
1006
1007
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff19 - 0xff10);
1008
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
1009
1010
SET_CHAR_OFFSET(0xff21);
1011
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff26 - 0xff21);
1012
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
1013
1014
SET_CHAR_OFFSET(0xff41);
1015
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff46 - 0xff41);
1016
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
1017
1018
SET_CHAR_OFFSET(0xff10);
1019
1020
JUMPHERE(jump);
1021
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, 0);
1022
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
1023
break;
1024
1025
default:
1026
SLJIT_UNREACHABLE();
1027
break;
1028
}
1029
1030
cc += 2;
1031
1032
if (jump != NULL)
1033
add_jump(compiler, compares > 0 ? list : backtracks, jump);
1034
}
1035
1036
if (compares == 0)
1037
{
1038
if (found != NULL)
1039
set_jumps(found, LABEL());
1040
1041
if (status & XCLASS_IS_ECLASS)
1042
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
1043
return;
1044
}
1045
#endif /* SUPPORT_UNICODE */
1046
1047
SLJIT_ASSERT(compares == 1);
1048
ranges.range_count = 0;
1049
ranges.ranges = ranges.local_ranges;
1050
ranges.stack = ranges.local_stack;
1051
1052
xclass_compute_ranges(common, cc, &ranges);
1053
1054
/* Memory error is set for the compiler. */
1055
if (ranges.stack == NULL)
1056
return;
1057
1058
#if (defined SLJIT_DEBUG && SLJIT_DEBUG) && \
1059
defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
1060
if (common->utf)
1061
{
1062
min = READ_CHAR_MAX;
1063
max = 0;
1064
xclass_update_min_max(common, cc, &min, &max);
1065
SLJIT_ASSERT(ranges.ranges[0] == min && ranges.ranges[ranges.range_count - 1] == max);
1066
}
1067
#endif /* SLJIT_DEBUG && SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */
1068
1069
invertcmp = (list != backtracks);
1070
1071
if (ranges.range_count == 2)
1072
{
1073
range_start = ranges.ranges[0];
1074
range_end = ranges.ranges[1];
1075
1076
if (range_start < range_end)
1077
{
1078
SET_CHAR_OFFSET(range_start);
1079
jump = CMP(SLJIT_LESS_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));
1080
}
1081
else
1082
jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));
1083
1084
add_jump(compiler, backtracks, jump);
1085
1086
SLJIT_ASSERT(ranges.stack == ranges.local_stack);
1087
if (found != NULL)
1088
set_jumps(found, LABEL());
1089
1090
if (status & XCLASS_IS_ECLASS)
1091
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
1092
return;
1093
}
1094
1095
range_start = ranges.ranges[0];
1096
SET_CHAR_OFFSET(range_start);
1097
if (ranges.range_count >= 6)
1098
{
1099
/* Early fail. */
1100
range_end = ranges.ranges[ranges.range_count - 1];
1101
add_jump(compiler, (flags & XCL_NOT) == 0 ? backtracks : &found,
1102
CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start)));
1103
}
1104
1105
depth = 0;
1106
first_item = 0;
1107
last_item = ranges.range_count - 2;
1108
has_cmov = sljit_has_cpu_feature(SLJIT_HAS_CMOV) != 0;
1109
1110
while (TRUE)
1111
{
1112
/* At least two items are present. */
1113
SLJIT_ASSERT(first_item < last_item && charoffset == ranges.ranges[0]);
1114
last_range_set = FALSE;
1115
1116
if (first_item + 6 <= last_item)
1117
{
1118
mid_item = ((first_item + last_item) >> 1) & ~(sljit_u32)1;
1119
SLJIT_ASSERT(last_item >= mid_item + 4);
1120
1121
range_end = ranges.ranges[mid_item + 1];
1122
if (first_item + 6 > mid_item && ranges.ranges[mid_item] == range_end)
1123
{
1124
OP2U(SLJIT_SUB | SLJIT_SET_GREATER | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - charoffset));
1125
ranges.stack[depth].jump = JUMP(SLJIT_GREATER);
1126
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
1127
last_range_set = TRUE;
1128
}
1129
else
1130
ranges.stack[depth].jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - charoffset));
1131
1132
ranges.stack[depth].first_item = (sljit_u32)(mid_item + 2);
1133
ranges.stack[depth].last_item = (sljit_u32)last_item;
1134
1135
depth++;
1136
SLJIT_ASSERT(ranges.stack == ranges.local_stack ?
1137
depth <= XCLASS_LOCAL_RANGES_LOG2_SIZE : (ranges.stack + depth) <= (xclass_stack_item*)ranges.ranges);
1138
1139
last_item = mid_item;
1140
if (!last_range_set)
1141
continue;
1142
1143
last_item -= 2;
1144
}
1145
1146
if (!last_range_set)
1147
{
1148
range_start = ranges.ranges[first_item];
1149
range_end = ranges.ranges[first_item + 1];
1150
1151
if (range_start < range_end)
1152
{
1153
SET_CHAR_OFFSET(range_start);
1154
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));
1155
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
1156
}
1157
else
1158
{
1159
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));
1160
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
1161
}
1162
first_item += 2;
1163
}
1164
1165
SLJIT_ASSERT(first_item <= last_item);
1166
1167
do
1168
{
1169
range_start = ranges.ranges[first_item];
1170
range_end = ranges.ranges[first_item + 1];
1171
1172
if (range_start < range_end)
1173
{
1174
SET_CHAR_OFFSET(range_start);
1175
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));
1176
1177
if (has_cmov)
1178
SELECT(SLJIT_LESS_EQUAL, TMP2, STR_END, 0, TMP2);
1179
else
1180
OP_FLAGS(SLJIT_OR | ((first_item == last_item) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_LESS_EQUAL);
1181
}
1182
else
1183
{
1184
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));
1185
1186
if (has_cmov)
1187
SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2);
1188
else
1189
OP_FLAGS(SLJIT_OR | ((first_item == last_item) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);
1190
}
1191
1192
first_item += 2;
1193
}
1194
while (first_item <= last_item);
1195
1196
if (depth == 0) break;
1197
1198
add_jump(compiler, &check_result, JUMP(SLJIT_JUMP));
1199
1200
/* The charoffset resets after the end of a branch is reached. */
1201
charoffset = ranges.ranges[0];
1202
depth--;
1203
first_item = ranges.stack[depth].first_item;
1204
last_item = ranges.stack[depth].last_item;
1205
JUMPHERE(ranges.stack[depth].jump);
1206
}
1207
1208
if (check_result != NULL)
1209
set_jumps(check_result, LABEL());
1210
1211
if (has_cmov)
1212
jump = CMP(SLJIT_NOT_EQUAL ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);
1213
else
1214
{
1215
sljit_set_current_flags(compiler, SLJIT_SET_Z);
1216
jump = JUMP(SLJIT_NOT_EQUAL ^ invertcmp);
1217
}
1218
1219
add_jump(compiler, backtracks, jump);
1220
1221
if (found != NULL)
1222
set_jumps(found, LABEL());
1223
1224
if (status & XCLASS_IS_ECLASS)
1225
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
1226
1227
if (ranges.stack != ranges.local_stack)
1228
SLJIT_FREE(ranges.stack, compiler->allocator_data);
1229
}
1230
1231
/* Emits the matching path for an extended class (eclass).

Arguments:
  common      compiler state
  cc          points at the eclass data; GET(cc, 0) is used to locate its end
  backtracks  list receiving the jumps taken when the class does not match

Returns the computed end pointer of the eclass data.

The generated code evaluates the postfix sequence of xclass operands and
ECL_AND / ECL_OR / ECL_XOR / ECL_NOT operators using ECLASS_STACK_DATA as a
stack of one-bit results whose top is bit 0. Each operand after the first
shifts the register left by one, and compile_xclass_matchingpath() called with
XCLASS_IS_ECLASS ORs 1 into the register on its matching path. */

static PCRE2_SPTR compile_eclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
{
DEFINE_COMPILER;
PCRE2_SPTR end = cc + GET(cc, 0) - 1;
PCRE2_SPTR begin;
jump_list *not_found;
jump_list *found = NULL;

cc += LINK_SIZE;

/* Should be optimized later. */
read_char(common, 0, READ_CHAR_MAX, backtracks, 0);

/* The optional bitmap is checked before the two registers are saved below,
so jumps collected in "found" may skip both the save and the restore. */
if (((*cc++) & ECL_MAP) != 0)
  {
  xclass_check_bitset(common, (const sljit_u8 *)cc, &found, backtracks);
  cc += 32 / sizeof(PCRE2_UCHAR);
  }

begin = cc;

/* Save the previous contents of the two working registers, clear the result
stack, and keep a copy of the character (TMP1) for the later operands. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL0, ECLASS_CHAR_DATA, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL1, ECLASS_STACK_DATA, 0);
OP1(SLJIT_MOV, ECLASS_STACK_DATA, 0, SLJIT_IMM, 0);
OP1(SLJIT_MOV, ECLASS_CHAR_DATA, 0, TMP1, 0);

/* All eclass must start with an xclass. */
SLJIT_ASSERT(*cc == ECL_XCLASS);

while (cc < end)
  {
  switch (*cc)
    {
    case ECL_AND:
    ++cc;
    /* TMP2 keeps bit 0 of the stack and has every other bit set, so after
    the shift the AND combines the two topmost results and leaves the
    remaining bits untouched. */
    OP2(SLJIT_OR, TMP2, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, ~(sljit_sw)1);
    OP2(SLJIT_LSHR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    OP2(SLJIT_AND, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, TMP2, 0);
    break;

    case ECL_OR:
    ++cc;
    /* TMP2 holds only the top result; it is merged into the new top. */
    OP2(SLJIT_AND, TMP2, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    OP2(SLJIT_LSHR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, TMP2, 0);
    break;

    case ECL_XOR:
    ++cc;
    OP2(SLJIT_AND, TMP2, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    OP2(SLJIT_LSHR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    OP2(SLJIT_XOR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, TMP2, 0);
    break;

    case ECL_NOT:
    ++cc;
    /* Invert the top result only. */
    OP2(SLJIT_XOR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    break;

    default:
    SLJIT_ASSERT(*cc == ECL_XCLASS);
    if (cc != begin)
      {
      /* Later operands reload the character and push a new zero bit. The
      first operand uses TMP1 as read above and the freshly cleared stack. */
      OP1(SLJIT_MOV, TMP1, 0, ECLASS_CHAR_DATA, 0);
      OP2(SLJIT_SHL, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
      }

    /* A failing operand must not backtrack: its failure jumps land right
    after its own code, leaving the top bit as zero. */
    not_found = NULL;
    compile_xclass_matchingpath(common, cc + 1 + LINK_SIZE, &not_found, XCLASS_IS_ECLASS);
    set_jumps(not_found, LABEL());

    cc += GET(cc, 1);
    break;
    }
  }

/* The zero flag is computed first and consumed by the JUMP below, after the
two restoring moves have been emitted. */
OP2U(SLJIT_SUB | SLJIT_SET_Z, ECLASS_STACK_DATA, 0, SLJIT_IMM, 0);
OP1(SLJIT_MOV, ECLASS_CHAR_DATA, 0, SLJIT_MEM1(SLJIT_SP), LOCAL0);
OP1(SLJIT_MOV, ECLASS_STACK_DATA, 0, SLJIT_MEM1(SLJIT_SP), LOCAL1);
add_jump(compiler, backtracks, JUMP(SLJIT_EQUAL));
set_jumps(found, LABEL());
return end;
}
1314
1315
/* Generic character matching code. */
1316
1317
#undef SET_CHAR_OFFSET
1318
#undef READ_FROM_CHAR_LIST
1319
#undef XCLASS_LOCAL_RANGES_SIZE
1320
#undef XCLASS_LOCAL_RANGES_LOG2_SIZE
1321
1322
#endif /* SUPPORT_WIDE_CHARS */
1323
1324
static PCRE2_SPTR byte_sequence_compare(compiler_common *common, BOOL caseless, PCRE2_SPTR cc,
1325
compare_context *context, jump_list **backtracks)
1326
{
1327
DEFINE_COMPILER;
1328
unsigned int othercasebit = 0;
1329
PCRE2_SPTR othercasechar = NULL;
1330
#ifdef SUPPORT_UNICODE
1331
int utflength;
1332
#endif
1333
1334
if (caseless && char_has_othercase(common, cc))
1335
{
1336
othercasebit = char_get_othercase_bit(common, cc);
1337
SLJIT_ASSERT(othercasebit);
1338
/* Extracting bit difference info. */
1339
#if PCRE2_CODE_UNIT_WIDTH == 8
1340
othercasechar = cc + (othercasebit >> 8);
1341
othercasebit &= 0xff;
1342
#elif PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
1343
/* Note that this code only handles characters in the BMP. If there
1344
ever are characters outside the BMP whose othercase differs in only one
1345
bit from itself (there currently are none), this code will need to be
1346
revised for PCRE2_CODE_UNIT_WIDTH == 32. */
1347
othercasechar = cc + (othercasebit >> 9);
1348
if ((othercasebit & 0x100) != 0)
1349
othercasebit = (othercasebit & 0xff) << 8;
1350
else
1351
othercasebit &= 0xff;
1352
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
1353
}
1354
1355
if (context->sourcereg == -1)
1356
{
1357
#if PCRE2_CODE_UNIT_WIDTH == 8
1358
#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED
1359
if (context->length >= 4)
1360
OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
1361
else if (context->length >= 2)
1362
OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
1363
else
1364
#endif
1365
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
1366
#elif PCRE2_CODE_UNIT_WIDTH == 16
1367
#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED
1368
if (context->length >= 4)
1369
OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
1370
else
1371
#endif
1372
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
1373
#elif PCRE2_CODE_UNIT_WIDTH == 32
1374
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
1375
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
1376
context->sourcereg = TMP2;
1377
}
1378
1379
#ifdef SUPPORT_UNICODE
1380
utflength = 1;
1381
if (common->utf && HAS_EXTRALEN(*cc))
1382
utflength += GET_EXTRALEN(*cc);
1383
1384
do
1385
{
1386
#endif
1387
1388
context->length -= IN_UCHARS(1);
1389
#if (defined SLJIT_UNALIGNED && SLJIT_UNALIGNED) && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
1390
1391
/* Unaligned read is supported. */
1392
if (othercasebit != 0 && othercasechar == cc)
1393
{
1394
context->c.asuchars[context->ucharptr] = *cc | othercasebit;
1395
context->oc.asuchars[context->ucharptr] = othercasebit;
1396
}
1397
else
1398
{
1399
context->c.asuchars[context->ucharptr] = *cc;
1400
context->oc.asuchars[context->ucharptr] = 0;
1401
}
1402
context->ucharptr++;
1403
1404
#if PCRE2_CODE_UNIT_WIDTH == 8
1405
if (context->ucharptr >= 4 || context->length == 0 || (context->ucharptr == 2 && context->length == 1))
1406
#else
1407
if (context->ucharptr >= 2 || context->length == 0)
1408
#endif
1409
{
1410
if (context->length >= 4)
1411
OP1(SLJIT_MOV_S32, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);
1412
else if (context->length >= 2)
1413
OP1(SLJIT_MOV_U16, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);
1414
#if PCRE2_CODE_UNIT_WIDTH == 8
1415
else if (context->length >= 1)
1416
OP1(SLJIT_MOV_U8, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);
1417
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
1418
context->sourcereg = context->sourcereg == TMP1 ? TMP2 : TMP1;
1419
1420
switch(context->ucharptr)
1421
{
1422
case 4 / sizeof(PCRE2_UCHAR):
1423
if (context->oc.asint != 0)
1424
OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asint);
1425
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asint | context->oc.asint));
1426
break;
1427
1428
case 2 / sizeof(PCRE2_UCHAR):
1429
if (context->oc.asushort != 0)
1430
OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asushort);
1431
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asushort | context->oc.asushort));
1432
break;
1433
1434
#if PCRE2_CODE_UNIT_WIDTH == 8
1435
case 1:
1436
if (context->oc.asbyte != 0)
1437
OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asbyte);
1438
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asbyte | context->oc.asbyte));
1439
break;
1440
#endif
1441
1442
default:
1443
SLJIT_UNREACHABLE();
1444
break;
1445
}
1446
context->ucharptr = 0;
1447
}
1448
1449
#else
1450
1451
/* Unaligned read is unsupported or in 32 bit mode. */
1452
if (context->length >= 1)
1453
OP1(MOV_UCHAR, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);
1454
1455
context->sourcereg = context->sourcereg == TMP1 ? TMP2 : TMP1;
1456
1457
if (othercasebit != 0 && othercasechar == cc)
1458
{
1459
OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, othercasebit);
1460
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, *cc | othercasebit));
1461
}
1462
else
1463
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, *cc));
1464
1465
#endif
1466
1467
cc++;
1468
#ifdef SUPPORT_UNICODE
1469
utflength--;
1470
}
1471
while (utflength > 0);
1472
#endif
1473
1474
return cc;
1475
}
1476
1477
#ifdef SUPPORT_UNICODE
1478
1479
#if PCRE2_CODE_UNIT_WIDTH != 32
1480
1481
/* The code in this function copies the logic of the interpreter function that
1482
is defined in the pcre2_extuni.c source. If that code is updated, this
1483
function, and those below it, must be kept in step (note by PH, June 2024). */
1484
1485
static PCRE2_SPTR SLJIT_FUNC do_extuni_utf(jit_arguments *args, PCRE2_SPTR cc)
1486
{
1487
PCRE2_SPTR start_subject = args->begin;
1488
PCRE2_SPTR end_subject = args->end;
1489
int lgb, rgb, ricount;
1490
PCRE2_SPTR prevcc, endcc, bptr;
1491
BOOL first = TRUE;
1492
BOOL was_ep_ZWJ = FALSE;
1493
uint32_t c;
1494
1495
prevcc = cc;
1496
endcc = NULL;
1497
do
1498
{
1499
GETCHARINC(c, cc);
1500
rgb = UCD_GRAPHBREAK(c);
1501
1502
if (first)
1503
{
1504
lgb = rgb;
1505
endcc = cc;
1506
first = FALSE;
1507
continue;
1508
}
1509
1510
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0)
1511
break;
1512
1513
/* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was
1514
preceded by Extended Pictographic. */
1515
1516
if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ)
1517
break;
1518
1519
/* Not breaking between Regional Indicators is allowed only if there
1520
are an even number of preceding RIs. */
1521
1522
if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator)
1523
{
1524
ricount = 0;
1525
bptr = prevcc;
1526
1527
/* bptr is pointing to the left-hand character */
1528
while (bptr > start_subject)
1529
{
1530
bptr--;
1531
BACKCHAR(bptr);
1532
GETCHAR(c, bptr);
1533
1534
if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator)
1535
break;
1536
1537
ricount++;
1538
}
1539
1540
if ((ricount & 1) != 0) break; /* Grapheme break required */
1541
}
1542
1543
/* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in
1544
between; see next statement). */
1545
1546
was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ);
1547
1548
/* If Extend follows Extended_Pictographic, do not update lgb; this allows
1549
any number of them before a following ZWJ. */
1550
1551
if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic)
1552
lgb = rgb;
1553
1554
prevcc = endcc;
1555
endcc = cc;
1556
}
1557
while (cc < end_subject);
1558
1559
return endcc;
1560
}
1561
1562
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1563
1564
/* The code in this function copies the logic of the interpreter function that
1565
is defined in the pcre2_extuni.c source. If that code is updated, this
1566
function, and the one below it, must be kept in step (note by PH, June 2024). */
1567
1568
static PCRE2_SPTR SLJIT_FUNC do_extuni_utf_invalid(jit_arguments *args, PCRE2_SPTR cc)
1569
{
1570
PCRE2_SPTR start_subject = args->begin;
1571
PCRE2_SPTR end_subject = args->end;
1572
int lgb, rgb, ricount;
1573
PCRE2_SPTR prevcc, endcc, bptr;
1574
BOOL first = TRUE;
1575
BOOL was_ep_ZWJ = FALSE;
1576
uint32_t c;
1577
1578
prevcc = cc;
1579
endcc = NULL;
1580
do
1581
{
1582
GETCHARINC_INVALID(c, cc, end_subject, break);
1583
rgb = UCD_GRAPHBREAK(c);
1584
1585
if (first)
1586
{
1587
lgb = rgb;
1588
endcc = cc;
1589
first = FALSE;
1590
continue;
1591
}
1592
1593
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0)
1594
break;
1595
1596
/* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was
1597
preceded by Extended Pictographic. */
1598
1599
if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ)
1600
break;
1601
1602
/* Not breaking between Regional Indicators is allowed only if there
1603
are an even number of preceding RIs. */
1604
1605
if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator)
1606
{
1607
ricount = 0;
1608
bptr = prevcc;
1609
1610
/* bptr is pointing to the left-hand character */
1611
while (bptr > start_subject)
1612
{
1613
GETCHARBACK_INVALID(c, bptr, start_subject, break);
1614
1615
if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator)
1616
break;
1617
1618
ricount++;
1619
}
1620
1621
if ((ricount & 1) != 0)
1622
break; /* Grapheme break required */
1623
}
1624
1625
/* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in
1626
between; see next statement). */
1627
1628
was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ);
1629
1630
/* If Extend follows Extended_Pictographic, do not update lgb; this allows
1631
any number of them before a following ZWJ. */
1632
1633
if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic)
1634
lgb = rgb;
1635
1636
prevcc = endcc;
1637
endcc = cc;
1638
}
1639
while (cc < end_subject);
1640
1641
return endcc;
1642
}
1643
1644
/* The code in this function copies the logic of the interpreter function that
1645
is defined in the pcre2_extuni.c source. If that code is updated, this
1646
function must be kept in step (note by PH, June 2024). */
1647
1648
static PCRE2_SPTR SLJIT_FUNC do_extuni_no_utf(jit_arguments *args, PCRE2_SPTR cc)
1649
{
1650
PCRE2_SPTR start_subject = args->begin;
1651
PCRE2_SPTR end_subject = args->end;
1652
int lgb, rgb, ricount;
1653
PCRE2_SPTR bptr;
1654
uint32_t c;
1655
BOOL was_ep_ZWJ = FALSE;
1656
1657
/* Patch by PH */
1658
/* GETCHARINC(c, cc); */
1659
c = *cc++;
1660
1661
#if PCRE2_CODE_UNIT_WIDTH == 32
1662
if (c >= 0x110000)
1663
return cc;
1664
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
1665
lgb = UCD_GRAPHBREAK(c);
1666
1667
while (cc < end_subject)
1668
{
1669
c = *cc;
1670
#if PCRE2_CODE_UNIT_WIDTH == 32
1671
if (c >= 0x110000)
1672
break;
1673
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
1674
rgb = UCD_GRAPHBREAK(c);
1675
1676
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0)
1677
break;
1678
1679
/* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was
1680
preceded by Extended Pictographic. */
1681
1682
if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ)
1683
break;
1684
1685
/* Not breaking between Regional Indicators is allowed only if there
1686
are an even number of preceding RIs. */
1687
1688
if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator)
1689
{
1690
ricount = 0;
1691
bptr = cc - 1;
1692
1693
/* bptr is pointing to the left-hand character */
1694
while (bptr > start_subject)
1695
{
1696
bptr--;
1697
c = *bptr;
1698
#if PCRE2_CODE_UNIT_WIDTH == 32
1699
if (c >= 0x110000)
1700
break;
1701
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
1702
1703
if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break;
1704
1705
ricount++;
1706
}
1707
1708
if ((ricount & 1) != 0)
1709
break; /* Grapheme break required */
1710
}
1711
1712
/* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in
1713
between; see next statement). */
1714
1715
was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ);
1716
1717
/* If Extend follows Extended_Pictographic, do not update lgb; this allows
1718
any number of them before a following ZWJ. */
1719
1720
if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic)
1721
lgb = rgb;
1722
1723
cc++;
1724
}
1725
1726
return cc;
1727
}
1728
1729
/* Emits code that matches one character against a caseless set (a property
item whose type is PT_CLIST).

Arguments:
  common      compiler state
  cc          points at the opcode; cc[1] must be PT_CLIST and cc[2] is the
              offset of the set inside PRIV(ucd_caseless_sets)
  backtracks  list receiving the jump taken when the item does not match

The set is a list of code points in strictly increasing order, terminated by
NOTACHAR. The generated code leaves a non-zero value in TMP2 when the
character in TMP1 equals a member of the set. When cc[0] is OP_PROP the final
jump backtracks if no member matched; for any other opcode it backtracks if
a member matched. */

static void compile_clist(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
{
DEFINE_COMPILER;
const sljit_u32 *other_cases;
struct sljit_jump *jump;
sljit_u32 min = 0, max = READ_CHAR_MAX;
BOOL has_cmov = sljit_has_cpu_feature(SLJIT_HAS_CMOV) != 0;

SLJIT_ASSERT(cc[1] == PT_CLIST);

/* Only the positive form narrows the range passed to read_char(); otherwise
the full range 0..READ_CHAR_MAX is kept. */
if (cc[0] == OP_PROP)
  {
  other_cases = PRIV(ucd_caseless_sets) + cc[2];

  min = *other_cases++;
  max = min;

  while (*other_cases != NOTACHAR)
    {
    if (*other_cases > max) max = *other_cases;
    if (*other_cases < min) min = *other_cases;
    other_cases++;
    }
  }

other_cases = PRIV(ucd_caseless_sets) + cc[2];
SLJIT_ASSERT(other_cases[0] != NOTACHAR && other_cases[1] != NOTACHAR);
/* The NOTACHAR is higher than any character. */
SLJIT_ASSERT(other_cases[0] < other_cases[1] && other_cases[1] < other_cases[2]);

read_char(common, min, max, backtracks, READ_CHAR_UPDATE_STR_PTR);

/* At least two characters are required.
   Otherwise this case would be handled by the normal code path. */
/* NOTACHAR is the unsigned maximum. */

/* Optimizing character pairs, if their difference is power of 2. */
if (is_powerof2(other_cases[1] ^ other_cases[0]))
  {
  /* Setting the differing bit maps both members to other_cases[1], so a
  single comparison covers the pair. */
  /* NOTE(review): without cmov the final JUMP tests the zero flag, which
  gets its intended meaning from an OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, ...)
  emitted for the last list entry. This branch does not request
  SLJIT_SET_Z, so it appears to rely on at least one more entry following
  the pair - confirm that caseless sets always have three or more members. */
  OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(other_cases[1] ^ other_cases[0]));
  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, other_cases[1]);
  OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
  other_cases += 2;
  }
else if (is_powerof2(other_cases[2] ^ other_cases[1]))
  {
  SLJIT_ASSERT(other_cases[2] != NOTACHAR);

  /* The second and third members form the pair; the first member is
  checked separately and merged into TMP2. */
  OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(other_cases[2] ^ other_cases[1]));
  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, other_cases[2]);
  OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);

  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)other_cases[0]);

  if (has_cmov)
    SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2);
  else
    OP_FLAGS(SLJIT_OR | ((other_cases[3] == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);

  other_cases += 3;
  }
else
  {
  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(*other_cases++));
  OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
  }

/* Remaining members are compared one by one. With cmov, STR_END is selected
into TMP2 on equality (presumably because it is a readily available value
that is never zero - confirm); otherwise the equality flag is ORed into TMP2,
and the zero flag is requested on the last entry for the final JUMP. */
while (*other_cases != NOTACHAR)
  {
  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(*other_cases++));

  if (has_cmov)
    SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2);
  else
    OP_FLAGS(SLJIT_OR | ((*other_cases == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);
  }

if (has_cmov)
  jump = CMP(cc[0] == OP_PROP ? SLJIT_EQUAL : SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0);
else
  jump = JUMP(cc[0] == OP_PROP ? SLJIT_ZERO : SLJIT_NOT_ZERO);

add_jump(compiler, backtracks, jump);
}
1813
1814
#endif /* SUPPORT_UNICODE */
1815
1816
static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr)
1817
{
1818
DEFINE_COMPILER;
1819
int length;
1820
unsigned int c, oc, bit;
1821
compare_context context;
1822
struct sljit_jump *jump[3];
1823
jump_list *end_list;
1824
#ifdef SUPPORT_UNICODE
1825
PCRE2_UCHAR propdata[5];
1826
#endif /* SUPPORT_UNICODE */
1827
1828
switch(type)
1829
{
1830
case OP_NOT_DIGIT:
1831
case OP_DIGIT:
1832
/* Digits are usually 0-9, so it is worth to optimize them. */
1833
if (check_str_ptr)
1834
detect_partial_match(common, backtracks);
1835
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1836
if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_digit, FALSE))
1837
read_char7_type(common, backtracks, type == OP_NOT_DIGIT);
1838
else
1839
#endif
1840
read_char8_type(common, backtracks, type == OP_NOT_DIGIT);
1841
/* Flip the starting bit in the negative case. */
1842
OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_digit);
1843
add_jump(compiler, backtracks, JUMP(type == OP_DIGIT ? SLJIT_ZERO : SLJIT_NOT_ZERO));
1844
return cc;
1845
1846
case OP_NOT_WHITESPACE:
1847
case OP_WHITESPACE:
1848
if (check_str_ptr)
1849
detect_partial_match(common, backtracks);
1850
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1851
if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_space, FALSE))
1852
read_char7_type(common, backtracks, type == OP_NOT_WHITESPACE);
1853
else
1854
#endif
1855
read_char8_type(common, backtracks, type == OP_NOT_WHITESPACE);
1856
OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_space);
1857
add_jump(compiler, backtracks, JUMP(type == OP_WHITESPACE ? SLJIT_ZERO : SLJIT_NOT_ZERO));
1858
return cc;
1859
1860
case OP_NOT_WORDCHAR:
1861
case OP_WORDCHAR:
1862
if (check_str_ptr)
1863
detect_partial_match(common, backtracks);
1864
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1865
if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_word, FALSE))
1866
read_char7_type(common, backtracks, type == OP_NOT_WORDCHAR);
1867
else
1868
#endif
1869
read_char8_type(common, backtracks, type == OP_NOT_WORDCHAR);
1870
OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_word);
1871
add_jump(compiler, backtracks, JUMP(type == OP_WORDCHAR ? SLJIT_ZERO : SLJIT_NOT_ZERO));
1872
return cc;
1873
1874
case OP_ANY:
1875
if (check_str_ptr)
1876
detect_partial_match(common, backtracks);
1877
read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR);
1878
if (common->nltype == NLTYPE_FIXED && common->newline > 255)
1879
{
1880
jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);
1881
end_list = NULL;
1882
if (common->mode != PCRE2_JIT_PARTIAL_HARD)
1883
add_jump(compiler, &end_list, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1884
else
1885
check_str_end(common, &end_list);
1886
1887
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
1888
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline & 0xff));
1889
set_jumps(end_list, LABEL());
1890
JUMPHERE(jump[0]);
1891
}
1892
else
1893
check_newlinechar(common, common->nltype, backtracks, TRUE);
1894
return cc;
1895
1896
case OP_ALLANY:
1897
if (check_str_ptr)
1898
detect_partial_match(common, backtracks);
1899
#ifdef SUPPORT_UNICODE
1900
if (common->utf && common->invalid_utf)
1901
{
1902
read_char(common, 0, READ_CHAR_MAX, backtracks, READ_CHAR_UPDATE_STR_PTR);
1903
return cc;
1904
}
1905
#endif /* SUPPORT_UNICODE */
1906
1907
skip_valid_char(common);
1908
return cc;
1909
1910
case OP_ANYBYTE:
1911
if (check_str_ptr)
1912
detect_partial_match(common, backtracks);
1913
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1914
return cc;
1915
1916
#ifdef SUPPORT_UNICODE
1917
case OP_NOTPROP:
1918
case OP_PROP:
1919
if (check_str_ptr)
1920
detect_partial_match(common, backtracks);
1921
if (cc[0] == PT_CLIST)
1922
{
1923
compile_clist(common, cc - 1, backtracks);
1924
return cc + 2;
1925
}
1926
1927
propdata[0] = 0;
1928
propdata[1] = type == OP_NOTPROP ? XCL_NOTPROP : XCL_PROP;
1929
propdata[2] = cc[0];
1930
propdata[3] = cc[1];
1931
propdata[4] = XCL_END;
1932
compile_xclass_matchingpath(common, propdata, backtracks, 0);
1933
return cc + 2;
1934
#endif
1935
1936
case OP_ANYNL:
1937
if (check_str_ptr)
1938
detect_partial_match(common, backtracks);
1939
read_char(common, common->bsr_nlmin, common->bsr_nlmax, NULL, 0);
1940
jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
1941
/* We don't need to handle soft partial matching case. */
1942
end_list = NULL;
1943
if (common->mode != PCRE2_JIT_PARTIAL_HARD)
1944
add_jump(compiler, &end_list, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1945
else
1946
check_str_end(common, &end_list);
1947
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
1948
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, CHAR_NL);
1949
OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
1950
#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
1951
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT);
1952
#endif
1953
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1954
jump[1] = JUMP(SLJIT_JUMP);
1955
JUMPHERE(jump[0]);
1956
check_newlinechar(common, common->bsr_nltype, backtracks, FALSE);
1957
set_jumps(end_list, LABEL());
1958
JUMPHERE(jump[1]);
1959
return cc;
1960
1961
case OP_NOT_HSPACE:
1962
case OP_HSPACE:
1963
if (check_str_ptr)
1964
detect_partial_match(common, backtracks);
1965
1966
if (type == OP_NOT_HSPACE)
1967
read_char(common, 0x9, 0x3000, backtracks, READ_CHAR_UPDATE_STR_PTR);
1968
else
1969
read_char(common, 0x9, 0x3000, NULL, 0);
1970
1971
add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL));
1972
sljit_set_current_flags(compiler, SLJIT_SET_Z);
1973
add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));
1974
return cc;
1975
1976
case OP_NOT_VSPACE:
1977
case OP_VSPACE:
1978
if (check_str_ptr)
1979
detect_partial_match(common, backtracks);
1980
1981
if (type == OP_NOT_VSPACE)
1982
read_char(common, 0xa, 0x2029, backtracks, READ_CHAR_UPDATE_STR_PTR);
1983
else
1984
read_char(common, 0xa, 0x2029, NULL, 0);
1985
1986
add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL));
1987
sljit_set_current_flags(compiler, SLJIT_SET_Z);
1988
add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));
1989
return cc;
1990
1991
#ifdef SUPPORT_UNICODE
1992
case OP_EXTUNI:
1993
if (check_str_ptr)
1994
detect_partial_match(common, backtracks);
1995
1996
SLJIT_ASSERT(TMP1 == SLJIT_R0 && STR_PTR == SLJIT_R1);
1997
OP1(SLJIT_MOV, SLJIT_R0, 0, ARGUMENTS, 0);
1998
1999
#if PCRE2_CODE_UNIT_WIDTH != 32
2000
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2(W, W, W), SLJIT_IMM,
2001
common->utf ? (common->invalid_utf ? SLJIT_FUNC_ADDR(do_extuni_utf_invalid) : SLJIT_FUNC_ADDR(do_extuni_utf)) : SLJIT_FUNC_ADDR(do_extuni_no_utf));
2002
if (common->invalid_utf)
2003
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
2004
#else
2005
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2(W, W, W), SLJIT_IMM,
2006
common->invalid_utf ? SLJIT_FUNC_ADDR(do_extuni_utf_invalid) : SLJIT_FUNC_ADDR(do_extuni_no_utf));
2007
if (common->invalid_utf)
2008
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
2009
#endif
2010
2011
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
2012
2013
if (common->mode == PCRE2_JIT_PARTIAL_HARD)
2014
{
2015
jump[0] = CMP(SLJIT_LESS, SLJIT_RETURN_REG, 0, STR_END, 0);
2016
/* Since we successfully read a char above, partial matching must occur. */
2017
check_partial(common, TRUE);
2018
JUMPHERE(jump[0]);
2019
}
2020
return cc;
2021
#endif
2022
2023
case OP_CHAR:
2024
case OP_CHARI:
2025
length = 1;
2026
#ifdef SUPPORT_UNICODE
2027
if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc);
2028
#endif
2029
2030
if (check_str_ptr && common->mode != PCRE2_JIT_COMPLETE)
2031
detect_partial_match(common, backtracks);
2032
2033
if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0)
2034
{
2035
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(length));
2036
if (length > 1 || (check_str_ptr && common->mode == PCRE2_JIT_COMPLETE))
2037
add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0));
2038
2039
context.length = IN_UCHARS(length);
2040
context.sourcereg = -1;
2041
#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED
2042
context.ucharptr = 0;
2043
#endif
2044
return byte_sequence_compare(common, type == OP_CHARI, cc, &context, backtracks);
2045
}
2046
2047
#ifdef SUPPORT_UNICODE
2048
if (common->utf)
2049
{
2050
GETCHAR(c, cc);
2051
}
2052
else
2053
#endif
2054
c = *cc;
2055
2056
SLJIT_ASSERT(type == OP_CHARI && char_has_othercase(common, cc));
2057
2058
if (check_str_ptr && common->mode == PCRE2_JIT_COMPLETE)
2059
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
2060
2061
oc = char_othercase(common, c);
2062
read_char(common, c < oc ? c : oc, c > oc ? c : oc, NULL, 0);
2063
2064
SLJIT_ASSERT(!is_powerof2(c ^ oc));
2065
2066
if (sljit_has_cpu_feature(SLJIT_HAS_CMOV))
2067
{
2068
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, oc);
2069
SELECT(SLJIT_EQUAL, TMP1, SLJIT_IMM, c, TMP1);
2070
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c));
2071
}
2072
else
2073
{
2074
jump[0] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c);
2075
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc));
2076
JUMPHERE(jump[0]);
2077
}
2078
return cc + length;
2079
2080
case OP_NOT:
2081
case OP_NOTI:
2082
if (check_str_ptr)
2083
detect_partial_match(common, backtracks);
2084
2085
length = 1;
2086
#ifdef SUPPORT_UNICODE
2087
if (common->utf)
2088
{
2089
#if PCRE2_CODE_UNIT_WIDTH == 8
2090
c = *cc;
2091
if (c < 128 && !common->invalid_utf)
2092
{
2093
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
2094
if (type == OP_NOT || !char_has_othercase(common, cc))
2095
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));
2096
else
2097
{
2098
/* Since UTF8 code page is fixed, we know that c is in [a-z] or [A-Z] range. */
2099
OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x20);
2100
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, c | 0x20));
2101
}
2102
/* Skip the variable-length character. */
2103
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
2104
jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
2105
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
2106
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
2107
JUMPHERE(jump[0]);
2108
return cc + 1;
2109
}
2110
else
2111
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
2112
{
2113
GETCHARLEN(c, cc, length);
2114
}
2115
}
2116
else
2117
#endif /* SUPPORT_UNICODE */
2118
c = *cc;
2119
2120
if (type == OP_NOT || !char_has_othercase(common, cc))
2121
{
2122
read_char(common, c, c, backtracks, READ_CHAR_UPDATE_STR_PTR);
2123
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));
2124
}
2125
else
2126
{
2127
oc = char_othercase(common, c);
2128
read_char(common, c < oc ? c : oc, c > oc ? c : oc, backtracks, READ_CHAR_UPDATE_STR_PTR);
2129
bit = c ^ oc;
2130
if (is_powerof2(bit))
2131
{
2132
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, bit);
2133
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit));
2134
}
2135
else
2136
{
2137
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));
2138
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, oc));
2139
}
2140
}
2141
return cc + length;
2142
2143
case OP_CLASS:
2144
case OP_NCLASS:
2145
if (check_str_ptr)
2146
detect_partial_match(common, backtracks);
2147
2148
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
2149
bit = (common->utf && is_char7_bitset((const sljit_u8 *)cc, type == OP_NCLASS)) ? 127 : 255;
2150
if (type == OP_NCLASS)
2151
read_char(common, 0, bit, backtracks, READ_CHAR_UPDATE_STR_PTR);
2152
else
2153
read_char(common, 0, bit, NULL, 0);
2154
#else
2155
if (type == OP_NCLASS)
2156
read_char(common, 0, 255, backtracks, READ_CHAR_UPDATE_STR_PTR);
2157
else
2158
read_char(common, 0, 255, NULL, 0);
2159
#endif
2160
2161
if (optimize_class(common, (const sljit_u8 *)cc, type == OP_NCLASS, FALSE, backtracks))
2162
return cc + 32 / sizeof(PCRE2_UCHAR);
2163
2164
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
2165
jump[0] = NULL;
2166
if (common->utf)
2167
{
2168
jump[0] = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, bit);
2169
if (type == OP_CLASS)
2170
{
2171
add_jump(compiler, backtracks, jump[0]);
2172
jump[0] = NULL;
2173
}
2174
}
2175
#elif PCRE2_CODE_UNIT_WIDTH != 8
2176
jump[0] = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);
2177
if (type == OP_CLASS)
2178
{
2179
add_jump(compiler, backtracks, jump[0]);
2180
jump[0] = NULL;
2181
}
2182
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
2183
2184
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
2185
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
2186
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)cc);
2187
OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0);
2188
OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, TMP2, 0);
2189
add_jump(compiler, backtracks, JUMP(SLJIT_ZERO));
2190
2191
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
2192
if (jump[0] != NULL)
2193
JUMPHERE(jump[0]);
2194
#endif
2195
return cc + 32 / sizeof(PCRE2_UCHAR);
2196
2197
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
2198
case OP_XCLASS:
2199
if (check_str_ptr)
2200
detect_partial_match(common, backtracks);
2201
compile_xclass_matchingpath(common, cc + LINK_SIZE, backtracks, 0);
2202
return cc + GET(cc, 0) - 1;
2203
2204
case OP_ECLASS:
2205
if (check_str_ptr)
2206
detect_partial_match(common, backtracks);
2207
return compile_eclass_matchingpath(common, cc, backtracks);
2208
#endif
2209
}
2210
SLJIT_UNREACHABLE();
2211
return cc;
2212
}
2213
2214
static SLJIT_INLINE PCRE2_SPTR compile_charn_matchingpath(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend, jump_list **backtracks)
{
/* Emits the matching path for the item at cc; at least one input character
is consumed. Adjacent fixed-length OP_CHAR / OP_CHARI items are measured
first so that one STR_PTR advance and one end-of-subject comparison can
cover the whole run instead of one check per character. */
DEFINE_COMPILER;
PCRE2_SPTR scan = cc;
compare_context context;
int units;

context.length = 0;

/* Pass 1: walk forward from cc (never beyond ccend) and accumulate the
length of the fixed-length run in context.length. */
while (scan < ccend)
  {
  units = 0;

  if (*scan == OP_CHAR || *scan == OP_CHARI)
    {
    /* A caseless character that has an other case but no single
    distinguishing bit (char_get_othercase_bit() == 0) cannot be part of
    the fixed-length run, so it contributes zero units and ends the scan. */
    if (*scan == OP_CHARI
        && char_has_othercase(common, scan + 1)
        && char_get_othercase_bit(common, scan + 1) == 0)
      {
      units = 0;
      }
    else
      {
      units = 1;
#ifdef SUPPORT_UNICODE
      /* In UTF mode a character may occupy additional code units. */
      if (common->utf && HAS_EXTRALEN(scan[1]))
        units += GET_EXTRALEN(scan[1]);
#endif
      }
    }

  scan += 1 + units;
  context.length += IN_UCHARS(units);

  /* Stop at the first item that is not fixed-length, or once the
  accumulated length has gone past 128. */
  if (units <= 0 || context.length > 128)
    break;
  }

/* Nothing fixed-length at cc: hand the single item over to
compile_char1_matchingpath(). */
if (context.length == 0)
  return compile_char1_matchingpath(common, *cc, cc + 1, backtracks, TRUE);

/* Pass 2: advance STR_PTR over the whole run, backtrack if that moved it
past STR_END, then emit the comparisons. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, context.length);
add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0));

context.sourcereg = -1;
#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED
context.ucharptr = 0;
#endif

/* byte_sequence_compare() returns the position of the next item; it is
called again for as long as context.length stays positive (presumably it
decreases context.length by what it has handled -- confirm in its
definition). */
do
  {
  cc = byte_sequence_compare(common, *cc == OP_CHARI, cc + 1, &context, backtracks);
  }
while (context.length > 0);

return cc;
}
2279
2280
2281
2282