Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_compile_class.c
21733 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#include "pcre2_compile.h"
43
44
45
46
typedef struct {
47
/* Option bits for eclass. */
48
uint32_t options;
49
uint32_t xoptions;
50
/* Rarely used members. */
51
int *errorcodeptr;
52
compile_block *cb;
53
/* Bitmap is needed. */
54
BOOL needs_bitmap;
55
} eclass_context;
56
57
/* Checks the allowed tokens at the end of a class structure in debug mode.
When a new token is not processed by all loops, and the token is equal to
  a) one of the cases here:
     the compiler will complain about a duplicated case value.
  b) none of the cases here:
     the loop without the handler will stop with an assertion failure.

In non-debug builds the macro expands to a plain "default:" label. The macro
must be used inside a switch statement, in place of the default label. */

#ifdef PCRE2_DEBUG
#define CLASS_END_CASES(meta) \
  default: \
  PCRE2_ASSERT((meta) <= META_END); \
  PCRE2_FALLTHROUGH /* Fall through */ \
  case META_CLASS: \
  case META_CLASS_NOT: \
  case META_CLASS_EMPTY: \
  case META_CLASS_EMPTY_NOT: \
  case META_CLASS_END: \
  case META_ECLASS_AND: \
  case META_ECLASS_OR: \
  case META_ECLASS_SUB: \
  case META_ECLASS_XOR: \
  case META_ECLASS_NOT:
#else
#define CLASS_END_CASES(meta) \
  default:
#endif
83
84
#ifdef SUPPORT_WIDE_CHARS
85
86
/* Heapsort algorithm.

The buffer holds items that are pairs of uint32_t values (two consecutive
slots per item); only the first value of each pair is used as the sort key.
All offsets (size, i) are expressed in uint32_t slots, so they are always
even. The children of the item at offset i are at offsets 2*i + 2 and
2*i + 4.

This function sifts the item at offset i down until neither child has a
larger key, which restores the max-heap property for the subtree rooted at i.

Arguments:
  buffer   array of value pairs
  size     number of uint32_t slots that belong to the heap
  i        offset of the item to sift down
*/

static void do_heapify(uint32_t *buffer, size_t size, size_t i)
{
size_t max;
size_t left;
size_t right;
uint32_t tmp1, tmp2;

for (;;)
  {
  max = i;
  left = (i << 1) + 2;
  right = left + 2;

  if (left < size && buffer[left] > buffer[max]) max = left;
  if (right < size && buffer[right] > buffer[max]) max = right;

  /* The item is not smaller than its children: the heap is valid. */
  if (i == max) return;

  /* Swap items (both values of the pair move together). */
  tmp1 = buffer[i];
  tmp2 = buffer[i + 1];
  buffer[i] = buffer[max];
  buffer[i + 1] = buffer[max + 1];
  buffer[max] = tmp1;
  buffer[max + 1] = tmp2;
  i = max;
  }
}
115
116
#ifdef SUPPORT_UNICODE
117
118
/* Option bits passed to parse_class() and its helpers. They are derived
from the compile options in compile_optimize_class(). */

#define PARSE_CLASS_UTF               0x1
#define PARSE_CLASS_CASELESS_UTF      0x2
#define PARSE_CLASS_RESTRICTED_UTF    0x4
#define PARSE_CLASS_TURKISH_UTF       0x8
122
123
/* Get the range of nocase characters which includes the
124
'c' character passed as argument, or directly follows 'c'. */
125
126
static const uint32_t*
127
get_nocase_range(uint32_t c)
128
{
129
uint32_t left = 0;
130
uint32_t right = PRIV(ucd_nocase_ranges_size);
131
uint32_t middle;
132
133
if (c > MAX_UTF_CODE_POINT) return PRIV(ucd_nocase_ranges) + right;
134
135
while (TRUE)
136
{
137
/* Range end of the middle element. */
138
middle = ((left + right) >> 1) | 0x1;
139
140
if (PRIV(ucd_nocase_ranges)[middle] <= c)
141
left = middle + 1;
142
else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c)
143
right = middle - 1;
144
else
145
return PRIV(ucd_nocase_ranges) + (middle - 1);
146
}
147
}
148
149
/* Get the list of othercase characters, which belongs to the passed range.
150
Create ranges from these characters, and append them to the buffer argument. */
151
152
static size_t
153
utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options,
154
uint32_t *buffer)
155
{
156
uint32_t new_start = start;
157
uint32_t new_end = end;
158
uint32_t c = start;
159
const uint32_t *list;
160
uint32_t tmp[3];
161
size_t result = 2;
162
const uint32_t *skip_range = get_nocase_range(c);
163
uint32_t skip_start = skip_range[0];
164
165
#if PCRE2_CODE_UNIT_WIDTH == 8
166
PCRE2_ASSERT(options & PARSE_CLASS_UTF);
167
#endif
168
169
#if PCRE2_CODE_UNIT_WIDTH == 32
170
if (end > MAX_UTF_CODE_POINT) end = MAX_UTF_CODE_POINT;
171
#endif
172
173
while (c <= end)
174
{
175
uint32_t co;
176
177
if (c > skip_start)
178
{
179
c = skip_range[1];
180
skip_range += 2;
181
skip_start = skip_range[0];
182
continue;
183
}
184
185
/* Compute caseless set. */
186
187
if ((options & (PARSE_CLASS_TURKISH_UTF|PARSE_CLASS_RESTRICTED_UTF)) ==
188
PARSE_CLASS_TURKISH_UTF &&
189
UCD_ANY_I(c))
190
{
191
co = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
192
}
193
else if ((co = UCD_CASESET(c)) != 0 &&
194
(options & PARSE_CLASS_RESTRICTED_UTF) != 0 &&
195
PRIV(ucd_caseless_sets)[co] < 128)
196
{
197
co = 0; /* Ignore the caseless set if it's restricted. */
198
}
199
200
if (co != 0)
201
list = PRIV(ucd_caseless_sets) + co;
202
else
203
{
204
co = UCD_OTHERCASE(c);
205
list = tmp;
206
tmp[0] = c;
207
tmp[1] = NOTACHAR;
208
209
if (co != c)
210
{
211
tmp[1] = co;
212
tmp[2] = NOTACHAR;
213
}
214
}
215
c++;
216
217
/* Add characters. */
218
do
219
{
220
#if PCRE2_CODE_UNIT_WIDTH == 16
221
if (!(options & PARSE_CLASS_UTF) && *list > 0xffff) continue;
222
#endif
223
224
if (*list < new_start)
225
{
226
if (*list + 1 == new_start)
227
{
228
new_start--;
229
continue;
230
}
231
}
232
else if (*list > new_end)
233
{
234
if (*list - 1 == new_end)
235
{
236
new_end++;
237
continue;
238
}
239
}
240
else continue;
241
242
result += 2;
243
if (buffer != NULL)
244
{
245
buffer[0] = *list;
246
buffer[1] = *list;
247
buffer += 2;
248
}
249
}
250
while (*(++list) != NOTACHAR);
251
}
252
253
if (buffer != NULL)
254
{
255
buffer[0] = new_start;
256
buffer[1] = new_end;
257
buffer += 2;
258
(void)buffer;
259
}
260
return result;
261
}
262
263
#endif
264
265
/* Add a character list to a buffer. */
266
267
static size_t
268
append_char_list(const uint32_t *p, uint32_t *buffer)
269
{
270
const uint32_t *n;
271
size_t result = 0;
272
273
while (*p != NOTACHAR)
274
{
275
n = p;
276
while (n[0] == n[1] - 1) n++;
277
278
PCRE2_ASSERT(*p < 0xffff);
279
280
if (buffer != NULL)
281
{
282
buffer[0] = *p;
283
buffer[1] = *n;
284
buffer += 2;
285
}
286
287
result += 2;
288
p = n + 1;
289
}
290
291
return result;
292
}
293
294
static uint32_t
295
get_highest_char(uint32_t options)
296
{
297
(void)options; /* Avoid compiler warning. */
298
299
#if PCRE2_CODE_UNIT_WIDTH == 8
300
return MAX_UTF_CODE_POINT;
301
#else
302
#ifdef SUPPORT_UNICODE
303
return GET_MAX_CHAR_VALUE((options & PARSE_CLASS_UTF) != 0);
304
#else
305
return MAX_UCHAR_VALUE;
306
#endif
307
#endif
308
}
309
310
/* Add a negated character list to a buffer. */
311
static size_t
312
append_negated_char_list(const uint32_t *p, uint32_t options, uint32_t *buffer)
313
{
314
const uint32_t *n;
315
uint32_t start = 0;
316
size_t result = 2;
317
318
PCRE2_ASSERT(*p > 0);
319
320
while (*p != NOTACHAR)
321
{
322
n = p;
323
while (n[0] == n[1] - 1) n++;
324
325
PCRE2_ASSERT(*p < 0xffff);
326
327
if (buffer != NULL)
328
{
329
buffer[0] = start;
330
buffer[1] = *p - 1;
331
buffer += 2;
332
}
333
334
result += 2;
335
start = *n + 1;
336
p = n + 1;
337
}
338
339
if (buffer != NULL)
340
{
341
buffer[0] = start;
342
buffer[1] = get_highest_char(options);
343
buffer += 2;
344
(void)buffer;
345
}
346
347
return result;
348
}
349
350
/* Append the range 0x100 .. highest character to a buffer.

Arguments:
  options   PARSE_CLASS_* bits, used to obtain the highest character
  buffer    output buffer, or NULL when only sizes are being computed

Returns:    the position after the appended pair, or NULL if buffer is NULL.
            The caller is responsible for counting the two values. */

static uint32_t *
append_non_ascii_range(uint32_t options, uint32_t *buffer)
{
if (buffer == NULL) return NULL;

buffer[0] = 0x100;
buffer[1] = get_highest_char(options);
return buffer + 2;
}
359
360
static size_t
361
parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer)
362
{
363
size_t total_size = 0;
364
size_t size;
365
uint32_t meta_arg;
366
uint32_t start_char;
367
368
while (TRUE)
369
{
370
switch (META_CODE(*ptr))
371
{
372
case META_ESCAPE:
373
meta_arg = META_DATA(*ptr);
374
switch (meta_arg)
375
{
376
case ESC_D:
377
case ESC_W:
378
case ESC_S:
379
buffer = append_non_ascii_range(options, buffer);
380
total_size += 2;
381
break;
382
383
case ESC_h:
384
size = append_char_list(PRIV(hspace_list), buffer);
385
total_size += size;
386
if (buffer != NULL) buffer += size;
387
break;
388
389
case ESC_H:
390
size = append_negated_char_list(PRIV(hspace_list), options, buffer);
391
total_size += size;
392
if (buffer != NULL) buffer += size;
393
break;
394
395
case ESC_v:
396
size = append_char_list(PRIV(vspace_list), buffer);
397
total_size += size;
398
if (buffer != NULL) buffer += size;
399
break;
400
401
case ESC_V:
402
size = append_negated_char_list(PRIV(vspace_list), options, buffer);
403
total_size += size;
404
if (buffer != NULL) buffer += size;
405
break;
406
407
case ESC_p:
408
case ESC_P:
409
ptr++;
410
if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY)
411
{
412
if (buffer != NULL)
413
{
414
buffer[0] = 0;
415
buffer[1] = get_highest_char(options);
416
buffer += 2;
417
}
418
total_size += 2;
419
}
420
break;
421
}
422
ptr++;
423
continue;
424
case META_POSIX_NEG:
425
buffer = append_non_ascii_range(options, buffer);
426
total_size += 2;
427
ptr += 2;
428
continue;
429
case META_POSIX:
430
ptr += 2;
431
continue;
432
case META_BIGVALUE:
433
/* Character literal */
434
ptr++;
435
break;
436
CLASS_END_CASES(*ptr)
437
if (*ptr >= META_END) return total_size;
438
break;
439
}
440
441
start_char = *ptr;
442
443
if (ptr[1] == META_RANGE_LITERAL || ptr[1] == META_RANGE_ESCAPED)
444
{
445
ptr += 2;
446
PCRE2_ASSERT(*ptr < META_END || *ptr == META_BIGVALUE);
447
448
if (*ptr == META_BIGVALUE) ptr++;
449
450
#ifdef EBCDIC
451
#error "Missing EBCDIC support"
452
#endif
453
}
454
455
#ifdef SUPPORT_UNICODE
456
if (options & PARSE_CLASS_CASELESS_UTF)
457
{
458
size = utf_caseless_extend(start_char, *ptr++, options, buffer);
459
if (buffer != NULL) buffer += size;
460
total_size += size;
461
continue;
462
}
463
#endif
464
465
if (buffer != NULL)
466
{
467
buffer[0] = start_char;
468
buffer[1] = *ptr;
469
buffer += 2;
470
}
471
472
ptr++;
473
total_size += 2;
474
}
475
476
return total_size;
477
}
478
479
/* Extra uint32_t values for storing the lengths of range lists in
480
the worst case. Two uint32_t lengths and a range end for a range
481
starting before 255 */
482
#define CHAR_LIST_EXTRA_SIZE 3
483
484
/* Starting character values for each character list. */
485
486
static const uint32_t char_list_starts[] = {
487
#if PCRE2_CODE_UNIT_WIDTH == 32
488
XCL_CHAR_LIST_HIGH_32_START,
489
#endif
490
#if PCRE2_CODE_UNIT_WIDTH == 32 || defined SUPPORT_UNICODE
491
XCL_CHAR_LIST_LOW_32_START,
492
#endif
493
XCL_CHAR_LIST_HIGH_16_START,
494
/* Must be terminated by XCL_CHAR_LIST_LOW_16_START,
495
which also represents the end of the bitset. */
496
XCL_CHAR_LIST_LOW_16_START,
497
};
498
499
static class_ranges *
500
compile_optimize_class(uint32_t *start_ptr, uint32_t options,
501
uint32_t xoptions, compile_block *cb)
502
{
503
class_ranges* cranges;
504
uint32_t *ptr;
505
uint32_t *buffer;
506
uint32_t *dst;
507
uint32_t class_options = 0;
508
size_t range_list_size = 0, total_size, i;
509
uint32_t tmp1, tmp2;
510
const uint32_t *char_list_next;
511
uint16_t *next_char;
512
uint32_t char_list_start, char_list_end;
513
uint32_t range_start, range_end;
514
515
#ifdef SUPPORT_UNICODE
516
if (options & PCRE2_UTF)
517
class_options |= PARSE_CLASS_UTF;
518
519
if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF|PCRE2_UCP)))
520
class_options |= PARSE_CLASS_CASELESS_UTF;
521
522
if (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT)
523
class_options |= PARSE_CLASS_RESTRICTED_UTF;
524
525
if (xoptions & PCRE2_EXTRA_TURKISH_CASING)
526
class_options |= PARSE_CLASS_TURKISH_UTF;
527
#else
528
(void)options; /* Avoid compiler warning. */
529
(void)xoptions; /* Avoid compiler warning. */
530
#endif
531
532
/* Compute required space for the range. */
533
534
range_list_size = parse_class(start_ptr, class_options, NULL);
535
PCRE2_ASSERT((range_list_size & 0x1) == 0);
536
537
/* Allocate buffer. The total_size also represents the end of the buffer. */
538
539
total_size = range_list_size +
540
((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0);
541
542
cranges = cb->cx->memctl.malloc(
543
sizeof(class_ranges) + total_size * sizeof(uint32_t),
544
cb->cx->memctl.memory_data);
545
546
if (cranges == NULL) return NULL;
547
548
cranges->header.next = NULL;
549
#ifdef PCRE2_DEBUG
550
cranges->header.type = CDATA_CRANGE;
551
#endif
552
cranges->range_list_size = (uint16_t)range_list_size;
553
cranges->char_lists_types = 0;
554
cranges->char_lists_size = 0;
555
cranges->char_lists_start = 0;
556
557
if (range_list_size == 0) return cranges;
558
559
buffer = (uint32_t*)(cranges + 1);
560
parse_class(start_ptr, class_options, buffer);
561
562
/* Using <= instead of == to help static analysis. */
563
if (range_list_size <= 2) return cranges;
564
565
/* In-place sorting of ranges. */
566
567
i = (((range_list_size >> 2) - 1) << 1);
568
while (TRUE)
569
{
570
do_heapify(buffer, range_list_size, i);
571
if (i == 0) break;
572
i -= 2;
573
}
574
575
i = range_list_size - 2;
576
while (TRUE)
577
{
578
tmp1 = buffer[i];
579
tmp2 = buffer[i + 1];
580
buffer[i] = buffer[0];
581
buffer[i + 1] = buffer[1];
582
buffer[0] = tmp1;
583
buffer[1] = tmp2;
584
585
do_heapify(buffer, i, 0);
586
if (i == 0) break;
587
i -= 2;
588
}
589
590
/* Merge ranges whenever possible. */
591
dst = buffer;
592
ptr = buffer + 2;
593
range_list_size -= 2;
594
595
/* The second condition is a very rare corner case, where the end of the last
596
range is the maximum character. This range cannot be extended further. */
597
598
while (range_list_size > 0 && dst[1] != ~(uint32_t)0)
599
{
600
if (dst[1] + 1 < ptr[0])
601
{
602
dst += 2;
603
dst[0] = ptr[0];
604
dst[1] = ptr[1];
605
}
606
else if (dst[1] < ptr[1]) dst[1] = ptr[1];
607
608
ptr += 2;
609
range_list_size -= 2;
610
}
611
612
PCRE2_ASSERT(dst[1] <= get_highest_char(class_options));
613
614
/* When the number of ranges are less than six,
615
they are not converted to range lists. */
616
617
ptr = buffer;
618
while (ptr < dst && ptr[1] < 0x100) ptr += 2;
619
if (dst - ptr < (2 * (6 - 1)))
620
{
621
cranges->range_list_size = (uint16_t)(dst + 2 - buffer);
622
return cranges;
623
}
624
625
/* Compute character lists structures. */
626
627
char_list_next = char_list_starts;
628
char_list_start = *char_list_next++;
629
#if PCRE2_CODE_UNIT_WIDTH == 32
630
char_list_end = XCL_CHAR_LIST_HIGH_32_END;
631
#elif defined SUPPORT_UNICODE
632
char_list_end = XCL_CHAR_LIST_LOW_32_END;
633
#else
634
char_list_end = XCL_CHAR_LIST_HIGH_16_END;
635
#endif
636
next_char = (uint16_t*)(buffer + total_size);
637
638
tmp1 = 0;
639
tmp2 = ((sizeof(char_list_starts) / sizeof(uint32_t)) - 1) * XCL_TYPE_BIT_LEN;
640
PCRE2_ASSERT(tmp2 <= 3 * XCL_TYPE_BIT_LEN && tmp2 >= XCL_TYPE_BIT_LEN);
641
range_start = dst[0];
642
range_end = dst[1];
643
644
while (TRUE)
645
{
646
if (range_start >= char_list_start)
647
{
648
if (range_start == range_end || range_end < char_list_end)
649
{
650
tmp1++;
651
next_char--;
652
653
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
654
*next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
655
else
656
*(uint32_t*)(--next_char) =
657
(range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
658
}
659
660
if (range_start < range_end)
661
{
662
if (range_start > char_list_start)
663
{
664
tmp1++;
665
next_char--;
666
667
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
668
*next_char = (uint16_t)(range_start << XCL_CHAR_SHIFT);
669
else
670
*(uint32_t*)(--next_char) = (range_start << XCL_CHAR_SHIFT);
671
}
672
else
673
cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
674
}
675
676
PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
677
678
if (dst > buffer)
679
{
680
dst -= 2;
681
range_start = dst[0];
682
range_end = dst[1];
683
continue;
684
}
685
686
range_start = 0;
687
range_end = 0;
688
}
689
690
if (range_end >= char_list_start)
691
{
692
PCRE2_ASSERT(range_start < char_list_start);
693
694
if (range_end < char_list_end)
695
{
696
tmp1++;
697
next_char--;
698
699
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
700
*next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
701
else
702
*(uint32_t*)(--next_char) =
703
(range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
704
705
PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
706
}
707
708
cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
709
}
710
711
if (tmp1 >= XCL_ITEM_COUNT_MASK)
712
{
713
cranges->char_lists_types |= XCL_ITEM_COUNT_MASK << tmp2;
714
next_char--;
715
716
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
717
*next_char = (uint16_t)tmp1;
718
else
719
*(uint32_t*)(--next_char) = tmp1;
720
}
721
else
722
cranges->char_lists_types |= tmp1 << tmp2;
723
724
if (range_start < XCL_CHAR_LIST_LOW_16_START) break;
725
726
PCRE2_ASSERT(tmp2 >= XCL_TYPE_BIT_LEN);
727
char_list_end = char_list_start - 1;
728
char_list_start = *char_list_next++;
729
tmp1 = 0;
730
tmp2 -= XCL_TYPE_BIT_LEN;
731
}
732
733
if (dst[0] < XCL_CHAR_LIST_LOW_16_START) dst += 2;
734
PCRE2_ASSERT((uint16_t*)dst <= next_char);
735
736
cranges->char_lists_size =
737
(size_t)((uint8_t*)(buffer + total_size) - (uint8_t*)next_char);
738
cranges->char_lists_start = (size_t)((uint8_t*)next_char - (uint8_t*)buffer);
739
cranges->range_list_size = (uint16_t)(dst - buffer);
740
return cranges;
741
}
742
743
#endif /* SUPPORT_WIDE_CHARS */
744
745
#ifdef SUPPORT_UNICODE
746
747
/* Add the characters below 256 that match a Unicode property to a class
bitmap.

Arguments:
  ptype       the property type (PT_xxx)
  pdata       the property data which belongs to the type
  negated     TRUE if the characters NOT having the property are added
  classbits   the 256-bit (32-byte) bitmap to update

Bits are only ever set, never cleared. For PT_ANY the whole bitmap is set
when not negated, and nothing is changed when negated. */

void PRIV(update_classbits)(uint32_t ptype, uint32_t pdata, BOOL negated,
  uint8_t *classbits)
{
/* Update PRIV(xclass) when this function is changed. */
int c, chartype;
const ucd_record *prop;
uint32_t gentype;
BOOL set_bit;

if (ptype == PT_ANY)
  {
  if (!negated) memset(classbits, 0xff, 32);
  return;
  }

for (c = 0; c < 256; c++)
  {
  prop = GET_UCD(c);
  set_bit = FALSE;
  (void)set_bit;

  switch (ptype)
    {
    case PT_LAMP:
    chartype = prop->chartype;
    set_bit = (chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt);
    break;

    case PT_GC:
    set_bit = (PRIV(ucp_gentype)[prop->chartype] == pdata);
    break;

    case PT_PC:
    set_bit = (prop->chartype == pdata);
    break;

    case PT_SC:
    set_bit = (prop->script == pdata);
    break;

    case PT_SCX:
    set_bit = (prop->script == pdata ||
      MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
    break;

    case PT_ALNUM:
    gentype = PRIV(ucp_gentype)[prop->chartype];
    set_bit = (gentype == ucp_L || gentype == ucp_N);
    break;

    case PT_SPACE:    /* Perl space */
    case PT_PXSPACE:  /* POSIX space */
    switch(c)
      {
      HSPACE_BYTE_CASES:
      VSPACE_BYTE_CASES:
      set_bit = TRUE;
      break;

      default:
      set_bit = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z);
      break;
      }
    break;

    case PT_WORD:
    chartype = prop->chartype;
    gentype = PRIV(ucp_gentype)[chartype];
    set_bit = (gentype == ucp_L || gentype == ucp_N ||
               chartype == ucp_Mn || chartype == ucp_Pc);
    break;

    case PT_UCNC:
    set_bit = (c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
               c == CHAR_GRAVE_ACCENT || c >= 0xa0);
    break;

    case PT_BIDICL:
    set_bit = (UCD_BIDICLASS_PROP(prop) == pdata);
    break;

    case PT_BOOL:
    set_bit = MAPBIT(PRIV(ucd_boolprop_sets) +
                     UCD_BPROPS_PROP(prop), pdata) != 0;
    break;

    case PT_PXGRAPH:
    chartype = prop->chartype;
    gentype = PRIV(ucp_gentype)[chartype];
    set_bit = (gentype != ucp_Z && (gentype != ucp_C || chartype == ucp_Cf));
    break;

    case PT_PXPRINT:
    chartype = prop->chartype;
    set_bit = (chartype != ucp_Zl && chartype != ucp_Zp &&
       (PRIV(ucp_gentype)[chartype] != ucp_C || chartype == ucp_Cf));
    break;

    case PT_PXPUNCT:
    gentype = PRIV(ucp_gentype)[prop->chartype];
    set_bit = (gentype == ucp_P || (c < 128 && gentype == ucp_S));
    break;

    default:
    PCRE2_ASSERT(ptype == PT_PXXDIGIT);
    set_bit = (c >= CHAR_0 && c <= CHAR_9) ||
              (c >= CHAR_A && c <= CHAR_F) ||
              (c >= CHAR_a && c <= CHAR_f);
    break;
    }

  /* Bit (c & 7) of the current byte belongs to character c; the byte
  pointer is advanced after every eighth character. */
  if (negated) set_bit = !set_bit;
  if (set_bit) *classbits |= (uint8_t)(1 << (c & 0x7));
  if ((c & 0x7) == 0x7) classbits++;
  }
}
863
864
#endif /* SUPPORT_UNICODE */
865
866
867
868
#ifdef SUPPORT_WIDE_CHARS
869
870
/*************************************************
*           XClass related properties            *
*************************************************/

/* These bits are combined in the xclass_props variable of
PRIV(compile_class_not_nested)(). */

/* XClass needs to be generated. */
#define XCLASS_REQUIRED 0x1
/* XClass has 8 bit character. */
#define XCLASS_HAS_8BIT_CHARS 0x2
/* XClass has properties. */
#define XCLASS_HAS_PROPS 0x4
/* XClass has character lists. */
#define XCLASS_HAS_CHAR_LISTS 0x8
/* XClass matches to all >= 256 characters. */
#define XCLASS_HIGH_ANY 0x10
884
885
#endif
886
887
888
/*************************************************
889
* Internal entry point for add range to class *
890
*************************************************/
891
892
/* This function sets the overall range for characters < 256.
893
It also handles non-utf case folding.
894
895
Arguments:
896
options the options bits
897
xoptions the extra options bits
898
cb compile data
899
start start of range character
900
end end of range character
901
902
Returns: cb->classbits is updated
903
*/
904
905
static void
906
add_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
907
uint32_t start, uint32_t end)
908
{
909
uint8_t *classbits = cb->classbits.classbits;
910
uint32_t c, byte_start, byte_end;
911
uint32_t classbits_end = (end <= 0xff ? end : 0xff);
912
913
#ifndef SUPPORT_UNICODE
914
(void)xoptions; /* Avoid compiler warning. */
915
#endif
916
917
/* If caseless matching is required, scan the range and process alternate
918
cases. In Unicode, there are 8-bit characters that have alternate cases that
919
are greater than 255 and vice-versa (though these may be ignored if caseless
920
restriction is in force). Sometimes we can just extend the original range. */
921
922
if ((options & PCRE2_CASELESS) != 0)
923
{
924
#ifdef SUPPORT_UNICODE
925
/* UTF mode. This branch is taken if we don't support wide characters (e.g.
926
8-bit library, without UTF), but we do treat those characters as Unicode
927
(if UCP flag is set). In this case, we only need to expand the character class
928
set to include the case pairs which are in the 0-255 codepoint range. */
929
if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
930
{
931
BOOL turkish_i = (xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
932
PCRE2_EXTRA_TURKISH_CASING;
933
if (start < 128)
934
{
935
uint32_t lo_end = (classbits_end < 127 ? classbits_end : 127);
936
for (c = start; c <= lo_end; c++)
937
{
938
if (turkish_i && UCD_ANY_I(c)) continue;
939
SETBIT(classbits, cb->fcc[c]);
940
}
941
}
942
if (classbits_end >= 128)
943
{
944
uint32_t hi_start = (start > 128 ? start : 128);
945
for (c = hi_start; c <= classbits_end; c++)
946
{
947
uint32_t co = UCD_OTHERCASE(c);
948
if (co <= 0xff) SETBIT(classbits, co);
949
}
950
}
951
}
952
953
else
954
#endif /* SUPPORT_UNICODE */
955
956
/* Not UTF mode */
957
{
958
for (c = start; c <= classbits_end; c++)
959
SETBIT(classbits, cb->fcc[c]);
960
}
961
}
962
963
/* Use the bitmap for characters < 256. Otherwise use extra data. */
964
965
byte_start = (start + 7) >> 3;
966
byte_end = (classbits_end + 1) >> 3;
967
968
if (byte_start >= byte_end)
969
{
970
for (c = start; c <= classbits_end; c++)
971
/* Regardless of start, c will always be <= 255. */
972
SETBIT(classbits, c);
973
return;
974
}
975
976
for (c = byte_start; c < byte_end; c++)
977
classbits[c] = 0xff;
978
979
byte_start <<= 3;
980
byte_end <<= 3;
981
982
for (c = start; c < byte_start; c++)
983
SETBIT(classbits, c);
984
985
for (c = byte_end; c <= classbits_end; c++)
986
SETBIT(classbits, c);
987
}
988
989
990
#if PCRE2_CODE_UNIT_WIDTH == 8
991
/*************************************************
992
* Internal entry point for add list to class *
993
*************************************************/
994
995
/* This function is used for adding a list of horizontal or vertical whitespace
996
characters to a class. The list must be in order so that ranges of characters
997
can be detected and handled appropriately. This function sets the overall range
998
so that the internal functions can try to avoid duplication when handling
999
case-independence.
1000
1001
Arguments:
1002
options the options bits
1003
xoptions the extra options bits
1004
cb contains pointers to tables etc.
1005
p points to row of 32-bit values, terminated by NOTACHAR
1006
1007
Returns: cb->classbits is updated
1008
*/
1009
1010
static void
1011
add_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
1012
const uint32_t *p)
1013
{
1014
while (p[0] < 256)
1015
{
1016
unsigned int n = 0;
1017
1018
while(p[n+1] == p[0] + n + 1) n++;
1019
add_to_class(options, xoptions, cb, p[0], p[n]);
1020
1021
p += n + 1;
1022
}
1023
}
1024
1025
1026
1027
/*************************************************
1028
* Add characters not in a list to a class *
1029
*************************************************/
1030
1031
/* This function is used for adding the complement of a list of horizontal or
1032
vertical whitespace to a class. The list must be in order.
1033
1034
Arguments:
1035
options the options bits
1036
xoptions the extra options bits
1037
cb contains pointers to tables etc.
1038
p points to row of 32-bit values, terminated by NOTACHAR
1039
1040
Returns: cb->classbits is updated
1041
*/
1042
1043
static void
1044
add_not_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
1045
const uint32_t *p)
1046
{
1047
if (p[0] > 0)
1048
add_to_class(options, xoptions, cb, 0, p[0] - 1);
1049
while (p[0] < 256)
1050
{
1051
while (p[1] == p[0] + 1) p++;
1052
add_to_class(options, xoptions, cb, p[0] + 1, (p[1] > 255) ? 255 : p[1] - 1);
1053
p++;
1054
}
1055
}
1056
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
1057
1058
1059
1060
/*************************************************
1061
* Main entry-point to compile a character class *
1062
*************************************************/
1063
1064
/* This function consumes a "leaf", which is a set of characters that will
1065
become a single OP_CLASS OP_NCLASS, OP_XCLASS, or OP_ALLANY. */
1066
1067
uint32_t *
1068
PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions,
1069
uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL* has_bitmap,
1070
int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr)
1071
{
1072
uint32_t *pptr = start_ptr;
1073
PCRE2_UCHAR *code = *pcode;
1074
BOOL should_flip_negation;
1075
const uint8_t *cbits = cb->cbits;
1076
/* Some functions such as add_to_class() or eclass processing
1077
expects that the bitset is stored in cb->classbits.classbits. */
1078
uint8_t *const classbits = cb->classbits.classbits;
1079
1080
#ifdef SUPPORT_UNICODE
1081
BOOL utf = (options & PCRE2_UTF) != 0;
1082
#else /* No Unicode support */
1083
BOOL utf = FALSE;
1084
#endif
1085
1086
/* Helper variables for OP_XCLASS opcode (for characters > 255). */
1087
1088
#ifdef SUPPORT_WIDE_CHARS
1089
uint32_t xclass_props;
1090
PCRE2_UCHAR *class_uchardata;
1091
class_ranges* cranges;
1092
#else
1093
(void)has_bitmap; /* Avoid compiler warning. */
1094
(void)errorcodeptr; /* Avoid compiler warning. */
1095
(void)lengthptr; /* Avoid compiler warning. */
1096
#endif
1097
1098
/* If an XClass contains a negative special such as \S, we need to flip the
1099
negation flag at the end, so that support for characters > 255 works correctly
1100
(they are all included in the class). An XClass may need to insert specific
1101
matching or non-matching code for wide characters.
1102
*/
1103
1104
should_flip_negation = FALSE;
1105
1106
/* XClass will be used when characters > 255 might match. */
1107
1108
#ifdef SUPPORT_WIDE_CHARS
1109
xclass_props = 0;
1110
1111
#if PCRE2_CODE_UNIT_WIDTH == 8
1112
cranges = NULL;
1113
1114
if (utf)
1115
#endif
1116
{
1117
if (lengthptr != NULL)
1118
{
1119
cranges = compile_optimize_class(pptr, options, xoptions, cb);
1120
1121
if (cranges == NULL)
1122
{
1123
*errorcodeptr = ERR21;
1124
return NULL;
1125
}
1126
1127
/* Caching the pre-processed character ranges. */
1128
if (cb->last_data != NULL)
1129
cb->last_data->next = &cranges->header;
1130
else
1131
cb->first_data = &cranges->header;
1132
1133
cb->last_data = &cranges->header;
1134
}
1135
else
1136
{
1137
/* Reuse the pre-processed character ranges. */
1138
cranges = (class_ranges*)cb->first_data;
1139
PCRE2_ASSERT(cranges != NULL && cranges->header.type == CDATA_CRANGE);
1140
cb->first_data = cranges->header.next;
1141
}
1142
1143
if (cranges->range_list_size > 0)
1144
{
1145
const uint32_t *ranges = (const uint32_t*)(cranges + 1);
1146
1147
if (ranges[0] <= 255)
1148
xclass_props |= XCLASS_HAS_8BIT_CHARS;
1149
1150
if (ranges[cranges->range_list_size - 1] == GET_MAX_CHAR_VALUE(utf) &&
1151
ranges[cranges->range_list_size - 2] <= 256)
1152
xclass_props |= XCLASS_HIGH_ANY;
1153
}
1154
}
1155
1156
class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
1157
#endif /* SUPPORT_WIDE_CHARS */
1158
1159
/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
1160
in a temporary bit of memory, in case the class contains fewer than two
1161
8-bit characters because in that case the compiled code doesn't use the bit
1162
map. */
1163
1164
memset(classbits, 0, 32);
1165
1166
/* Process items until end_ptr is reached. */
1167
1168
while (TRUE)
1169
{
1170
uint32_t meta = *(pptr++);
1171
BOOL local_negate;
1172
int posix_class;
1173
int taboffset, tabopt;
1174
class_bits_storage pbits;
1175
uint32_t escape, c;
1176
1177
/* Handle POSIX classes such as [:alpha:] etc. */
1178
switch (META_CODE(meta))
1179
{
1180
case META_POSIX:
1181
case META_POSIX_NEG:
1182
1183
local_negate = (meta == META_POSIX_NEG);
1184
posix_class = *(pptr++);
1185
1186
if (local_negate) should_flip_negation = TRUE; /* Note negative special */
1187
1188
/* If matching is caseless, upper and lower are converted to alpha.
1189
This relies on the fact that the class table starts with alpha,
1190
lower, upper as the first 3 entries. */
1191
1192
if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
1193
posix_class = 0;
1194
1195
/* When PCRE2_UCP is set, some of the POSIX classes are converted to
1196
different escape sequences that use Unicode properties \p or \P.
1197
Others that are not available via \p or \P have to generate
1198
XCL_PROP/XCL_NOTPROP directly, which is done here. */
1199
1200
#ifdef SUPPORT_UNICODE
1201
/* TODO This entire block of code here appears to be unreachable!? I simply
1202
can't see how it can be hit, given that the frontend parser doesn't emit
1203
META_POSIX for GRAPH/PRINT/PUNCT when UCP is set. */
1204
if ((options & PCRE2_UCP) != 0 &&
1205
(xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
1206
{
1207
uint32_t ptype;
1208
1209
switch(posix_class)
1210
{
1211
case PC_GRAPH:
1212
case PC_PRINT:
1213
case PC_PUNCT:
1214
ptype = (posix_class == PC_GRAPH)? PT_PXGRAPH :
1215
(posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT;
1216
1217
PRIV(update_classbits)(ptype, 0, local_negate, classbits);
1218
1219
if ((xclass_props & XCLASS_HIGH_ANY) == 0)
1220
{
1221
if (lengthptr != NULL)
1222
*lengthptr += 3;
1223
else
1224
{
1225
*class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
1226
*class_uchardata++ = (PCRE2_UCHAR)ptype;
1227
*class_uchardata++ = 0;
1228
}
1229
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
1230
}
1231
continue;
1232
1233
/* For the other POSIX classes (ex: ascii) we are going to
1234
fall through to the non-UCP case and build a bit map for
1235
characters with code points less than 256. However, if we are in
1236
a negated POSIX class, characters with code points greater than
1237
255 must either all match or all not match, depending on whether
1238
the whole class is not or is negated. For example, for
1239
[[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
1240
they must not.
1241
1242
In the special case where there are no xclass items, this is
1243
automatically handled by the use of OP_CLASS or OP_NCLASS, but an
1244
explicit range is needed for OP_XCLASS. Setting a flag here
1245
causes the range to be generated later when it is known that
1246
OP_XCLASS is required. In the 8-bit library this is relevant only in
1247
utf mode, since no wide characters can exist otherwise. */
1248
1249
default:
1250
break;
1251
}
1252
}
1253
#endif /* SUPPORT_UNICODE */
1254
1255
/* In the non-UCP case, or when UCP makes no difference, we build the
1256
bit map for the POSIX class in a chunk of local store because we may
1257
be adding and subtracting from it, and we don't want to subtract bits
1258
that may be in the main map already. At the end we or the result into
1259
the bit map that is being built. */
1260
1261
posix_class *= 3;
1262
1263
/* Copy in the first table (always present) */
1264
1265
memcpy(pbits.classbits, cbits + PRIV(posix_class_maps)[posix_class], 32);
1266
1267
/* If there is a second table, add or remove it as required. */
1268
1269
taboffset = PRIV(posix_class_maps)[posix_class + 1];
1270
tabopt = PRIV(posix_class_maps)[posix_class + 2];
1271
1272
if (taboffset >= 0)
1273
{
1274
if (tabopt >= 0)
1275
for (int i = 0; i < 32; i++)
1276
pbits.classbits[i] |= cbits[i + taboffset];
1277
else
1278
for (int i = 0; i < 32; i++)
1279
pbits.classbits[i] &= (uint8_t)(~cbits[i + taboffset]);
1280
}
1281
1282
/* Now see if we need to remove any special characters. An option
1283
value of 1 removes vertical space and 2 removes underscore. */
1284
1285
if (tabopt < 0) tabopt = -tabopt;
1286
#ifdef EBCDIC
1287
{
1288
uint8_t posix_vertical[4] = { CHAR_LF, CHAR_VT, CHAR_FF, CHAR_CR };
1289
uint8_t posix_underscore = CHAR_UNDERSCORE;
1290
uint8_t *chars = NULL;
1291
int n = 0;
1292
1293
if (tabopt == 1) { chars = posix_vertical; n = 4; }
1294
else if (tabopt == 2) { chars = &posix_underscore; n = 1; }
1295
1296
for (; n > 0; ++chars, --n)
1297
pbits.classbits[*chars/8] &= ~(1u << (*chars&7));
1298
}
1299
#else
1300
if (tabopt == 1) pbits.classbits[1] &= ~0x3c;
1301
else if (tabopt == 2) pbits.classbits[11] &= 0x7f;
1302
#endif
1303
1304
/* Add the POSIX table or its complement into the main table that is
1305
being built and we are done. */
1306
1307
{
1308
uint32_t *classwords = cb->classbits.classwords;
1309
1310
if (local_negate)
1311
for (int i = 0; i < 8; i++)
1312
classwords[i] |= (uint32_t)(~pbits.classwords[i]);
1313
else
1314
for (int i = 0; i < 8; i++)
1315
classwords[i] |= pbits.classwords[i];
1316
}
1317
1318
#ifdef SUPPORT_WIDE_CHARS
1319
/* Every class contains at least one < 256 character. */
1320
xclass_props |= XCLASS_HAS_8BIT_CHARS;
1321
#endif
1322
continue; /* End of POSIX handling */
1323
1324
/* Other than POSIX classes, the only items we should encounter are
1325
\d-type escapes and literal characters (possibly as ranges). */
1326
case META_BIGVALUE:
1327
meta = *(pptr++);
1328
break;
1329
1330
case META_ESCAPE:
1331
escape = META_DATA(meta);
1332
1333
switch(escape)
1334
{
1335
case ESC_d:
1336
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
1337
break;
1338
1339
case ESC_D:
1340
should_flip_negation = TRUE;
1341
for (int i = 0; i < 32; i++)
1342
classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
1343
break;
1344
1345
case ESC_w:
1346
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
1347
break;
1348
1349
case ESC_W:
1350
should_flip_negation = TRUE;
1351
for (int i = 0; i < 32; i++)
1352
classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
1353
break;
1354
1355
/* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
1356
5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
1357
previously set by something earlier in the character class.
1358
Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
1359
we could just adjust the appropriate bit. From PCRE 8.34 we no
1360
longer treat \s and \S specially. */
1361
1362
case ESC_s:
1363
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
1364
break;
1365
1366
case ESC_S:
1367
should_flip_negation = TRUE;
1368
for (int i = 0; i < 32; i++)
1369
classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
1370
break;
1371
1372
/* When adding the horizontal or vertical space lists to a class, or
1373
their complements, disable PCRE2_CASELESS, because it just wastes
1374
time, and in the "not-x" UTF cases can create unwanted duplicates in
1375
the XCLASS list (provoked by characters that have more than one other
1376
case and by both cases being in the same "not-x" sublist). */
1377
1378
case ESC_h:
1379
#if PCRE2_CODE_UNIT_WIDTH == 8
1380
#ifdef SUPPORT_UNICODE
1381
if (cranges != NULL) break;
1382
#endif
1383
add_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1384
cb, PRIV(hspace_list));
1385
#else
1386
PCRE2_ASSERT(cranges != NULL);
1387
#endif
1388
break;
1389
1390
case ESC_H:
1391
#if PCRE2_CODE_UNIT_WIDTH == 8
1392
#ifdef SUPPORT_UNICODE
1393
if (cranges != NULL) break;
1394
#endif
1395
add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1396
cb, PRIV(hspace_list));
1397
#else
1398
PCRE2_ASSERT(cranges != NULL);
1399
#endif
1400
break;
1401
1402
case ESC_v:
1403
#if PCRE2_CODE_UNIT_WIDTH == 8
1404
#ifdef SUPPORT_UNICODE
1405
if (cranges != NULL) break;
1406
#endif
1407
add_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1408
cb, PRIV(vspace_list));
1409
#else
1410
PCRE2_ASSERT(cranges != NULL);
1411
#endif
1412
break;
1413
1414
case ESC_V:
1415
#if PCRE2_CODE_UNIT_WIDTH == 8
1416
#ifdef SUPPORT_UNICODE
1417
if (cranges != NULL) break;
1418
#endif
1419
add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1420
cb, PRIV(vspace_list));
1421
#else
1422
PCRE2_ASSERT(cranges != NULL);
1423
#endif
1424
break;
1425
1426
/* If Unicode is not supported, \P and \p are not allowed and are
1427
faulted at parse time, so will never appear here. */
1428
1429
#ifdef SUPPORT_UNICODE
1430
case ESC_p:
1431
case ESC_P:
1432
{
1433
uint32_t ptype = *pptr >> 16;
1434
uint32_t pdata = *(pptr++) & 0xffff;
1435
1436
/* The "Any" is processed by PRIV(update_classbits)(). */
1437
if (ptype == PT_ANY)
1438
{
1439
#if PCRE2_CODE_UNIT_WIDTH == 8
1440
if (!utf && escape == ESC_p) memset(classbits, 0xff, 32);
1441
#endif
1442
continue;
1443
}
1444
1445
PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits);
1446
1447
if ((xclass_props & XCLASS_HIGH_ANY) == 0)
1448
{
1449
if (lengthptr != NULL)
1450
*lengthptr += 3;
1451
else
1452
{
1453
*class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
1454
*class_uchardata++ = ptype;
1455
*class_uchardata++ = pdata;
1456
}
1457
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
1458
}
1459
}
1460
continue;
1461
#endif
1462
}
1463
1464
#ifdef SUPPORT_WIDE_CHARS
1465
/* Every non-property class contains at least one < 256 character. */
1466
xclass_props |= XCLASS_HAS_8BIT_CHARS;
1467
#endif
1468
/* End handling \d-type escapes */
1469
continue;
1470
1471
CLASS_END_CASES(meta)
1472
/* Literals. */
1473
if (meta < META_END) break;
1474
/* Non-literals: end of class contents. */
1475
goto END_PROCESSING;
1476
}
1477
1478
/* A literal character may be followed by a range meta. At parse time
1479
there are checks for out-of-order characters, for ranges where the two
1480
characters are equal, and for hyphens that cannot indicate a range. At
1481
this point, therefore, no checking is needed. */
1482
1483
c = meta;
1484
1485
/* Remember if \r or \n were explicitly used */
1486
1487
if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
1488
1489
/* Process a character range */
1490
1491
if (*pptr == META_RANGE_LITERAL || *pptr == META_RANGE_ESCAPED)
1492
{
1493
uint32_t d;
1494
1495
#ifdef EBCDIC
1496
BOOL range_is_literal = (*pptr == META_RANGE_LITERAL);
1497
#endif
1498
++pptr;
1499
d = *(pptr++);
1500
if (d == META_BIGVALUE) d = *(pptr++);
1501
1502
/* Remember an explicit \r or \n, and add the range to the class. */
1503
1504
if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
1505
1506
#if PCRE2_CODE_UNIT_WIDTH == 8
1507
#ifdef SUPPORT_UNICODE
1508
if (cranges != NULL) continue;
1509
xclass_props |= XCLASS_HAS_8BIT_CHARS;
1510
#endif
1511
1512
/* In an EBCDIC environment, Perl treats alphabetic ranges specially
1513
because there are holes in the encoding, and simply using the range
1514
A-Z (for example) would include the characters in the holes. This
1515
applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
1516
1517
#ifdef EBCDIC
1518
if (range_is_literal &&
1519
(cb->ctypes[c] & ctype_letter) != 0 &&
1520
(cb->ctypes[d] & ctype_letter) != 0 &&
1521
(c <= CHAR_z) == (d <= CHAR_z))
1522
{
1523
uint32_t uc = (d <= CHAR_z)? 0 : 64;
1524
uint32_t C = c - uc;
1525
uint32_t D = d - uc;
1526
1527
if (C <= CHAR_i)
1528
{
1529
add_to_class(options, xoptions, cb, C + uc,
1530
((D < CHAR_i)? D : CHAR_i) + uc);
1531
C = CHAR_j;
1532
}
1533
1534
if (C <= D && C <= CHAR_r)
1535
{
1536
add_to_class(options, xoptions, cb, C + uc,
1537
((D < CHAR_r)? D : CHAR_r) + uc);
1538
C = CHAR_s;
1539
}
1540
1541
if (C <= D)
1542
add_to_class(options, xoptions, cb, C + uc, D + uc);
1543
}
1544
else
1545
#endif
1546
/* Not an EBCDIC special range */
1547
1548
add_to_class(options, xoptions, cb, c, d);
1549
#else
1550
PCRE2_ASSERT(cranges != NULL);
1551
#endif
1552
continue;
1553
} /* End of range handling */
1554
1555
/* Character ranges are ignored when class_ranges is present. */
1556
#if PCRE2_CODE_UNIT_WIDTH == 8
1557
#ifdef SUPPORT_UNICODE
1558
if (cranges != NULL) continue;
1559
xclass_props |= XCLASS_HAS_8BIT_CHARS;
1560
#endif
1561
/* Handle a single character. */
1562
1563
add_to_class(options, xoptions, cb, meta, meta);
1564
#else
1565
PCRE2_ASSERT(cranges != NULL);
1566
#endif
1567
} /* End of main class-processing loop */
1568
1569
END_PROCESSING:
1570
1571
#ifdef SUPPORT_WIDE_CHARS
1572
PCRE2_ASSERT((xclass_props & XCLASS_HAS_PROPS) == 0 ||
1573
(xclass_props & XCLASS_HIGH_ANY) == 0);
1574
1575
if (cranges != NULL)
1576
{
1577
uint32_t *range = (uint32_t*)(cranges + 1);
1578
uint32_t *end = range + cranges->range_list_size;
1579
1580
while (range < end && range[0] < 256)
1581
{
1582
PCRE2_ASSERT((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0);
1583
/* Add range to bitset. If we are in UTF or UCP mode, then clear the
1584
caseless bit, because the cranges handle caselessness (only) in this
1585
condition; see the condition for PARSE_CLASS_CASELESS_UTF in
1586
compile_optimize_class(). */
1587
add_to_class(((options & (PCRE2_UTF|PCRE2_UCP)) != 0)?
1588
(options & ~PCRE2_CASELESS) : options, xoptions, cb, range[0], range[1]);
1589
1590
if (range[1] > 255) break;
1591
range += 2;
1592
}
1593
1594
if (cranges->char_lists_size > 0)
1595
{
1596
/* The cranges structure is still used and freed later. */
1597
PCRE2_ASSERT((xclass_props & XCLASS_HIGH_ANY) == 0);
1598
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_CHAR_LISTS;
1599
}
1600
else
1601
{
1602
if ((xclass_props & XCLASS_HIGH_ANY) != 0)
1603
{
1604
PCRE2_ASSERT(range + 2 == end && range[0] <= 256 &&
1605
range[1] >= GET_MAX_CHAR_VALUE(utf));
1606
should_flip_negation = TRUE;
1607
range = end;
1608
}
1609
1610
while (range < end)
1611
{
1612
uint32_t range_start = range[0];
1613
uint32_t range_end = range[1];
1614
1615
range += 2;
1616
xclass_props |= XCLASS_REQUIRED;
1617
1618
if (range_start < 256) range_start = 256;
1619
1620
if (lengthptr != NULL)
1621
{
1622
#ifdef SUPPORT_UNICODE
1623
if (utf)
1624
{
1625
*lengthptr += 1;
1626
1627
if (range_start < range_end)
1628
*lengthptr += PRIV(ord2utf)(range_start, class_uchardata);
1629
1630
*lengthptr += PRIV(ord2utf)(range_end, class_uchardata);
1631
continue;
1632
}
1633
#endif /* SUPPORT_UNICODE */
1634
1635
*lengthptr += range_start < range_end ? 3 : 2;
1636
continue;
1637
}
1638
1639
#ifdef SUPPORT_UNICODE
1640
if (utf)
1641
{
1642
if (range_start < range_end)
1643
{
1644
*class_uchardata++ = XCL_RANGE;
1645
class_uchardata += PRIV(ord2utf)(range_start, class_uchardata);
1646
}
1647
else
1648
*class_uchardata++ = XCL_SINGLE;
1649
1650
class_uchardata += PRIV(ord2utf)(range_end, class_uchardata);
1651
continue;
1652
}
1653
#endif /* SUPPORT_UNICODE */
1654
1655
/* Without UTF support, character values are constrained
1656
by the bit length, and can only be > 256 for 16-bit and
1657
32-bit libraries. */
1658
#if PCRE2_CODE_UNIT_WIDTH != 8
1659
if (range_start < range_end)
1660
{
1661
*class_uchardata++ = XCL_RANGE;
1662
*class_uchardata++ = range_start;
1663
}
1664
else
1665
*class_uchardata++ = XCL_SINGLE;
1666
1667
*class_uchardata++ = range_end;
1668
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
1669
}
1670
1671
if (lengthptr == NULL)
1672
cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
1673
}
1674
}
1675
#endif /* SUPPORT_WIDE_CHARS */
1676
1677
/* If there are characters with values > 255, or Unicode property settings
1678
(\p or \P), we have to compile an extended class, with its own opcode,
1679
unless there were no property settings and there was a negated special such
1680
as \S in the class, and PCRE2_UCP is not set, because in that case all
1681
characters > 255 are in or not in the class, so any that were explicitly
1682
given as well can be ignored.
1683
1684
In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
1685
present in a class, we either have to match or not match all wide
1686
characters (depending on whether the whole class is or is not negated).
1687
This requirement is indicated by match_all_or_no_wide_chars being true.
1688
We do this by including an explicit range, which works in both cases.
1689
This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
1690
cannot be any wide characters in 8-bit non-UTF mode.
1691
1692
When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
1693
class where \S etc is present without PCRE2_UCP, causing an extended class
1694
to be compiled, we make sure that all characters > 255 are included by
1695
forcing match_all_or_no_wide_chars to be true.
1696
1697
If, when generating an xclass, there are no characters < 256, we can omit
1698
the bitmap in the actual compiled code. */
1699
1700
#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
1701
if ((xclass_props & XCLASS_REQUIRED) != 0)
1702
{
1703
PCRE2_UCHAR *previous = code;
1704
1705
if ((xclass_props & XCLASS_HAS_CHAR_LISTS) == 0)
1706
*class_uchardata++ = XCL_END; /* Marks the end of extra data */
1707
*code++ = OP_XCLASS;
1708
code += LINK_SIZE;
1709
*code = negate_class? XCL_NOT:0;
1710
if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP;
1711
1712
/* If the map is required, move up the extra data to make room for it;
1713
otherwise just move the code pointer to the end of the extra data. */
1714
1715
if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0 || has_bitmap != NULL)
1716
{
1717
if (negate_class)
1718
{
1719
uint32_t *classwords = cb->classbits.classwords;
1720
for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];
1721
}
1722
1723
if (has_bitmap == NULL)
1724
{
1725
*code++ |= XCL_MAP;
1726
(void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
1727
CU2BYTES(class_uchardata - code));
1728
memcpy(code, classbits, 32);
1729
code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
1730
}
1731
else
1732
{
1733
code = class_uchardata;
1734
if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0)
1735
*has_bitmap = TRUE;
1736
}
1737
}
1738
else code = class_uchardata;
1739
1740
if ((xclass_props & XCLASS_HAS_CHAR_LISTS) != 0)
1741
{
1742
/* Char lists size is an even number, because all items are 16 or 32
1743
bit values. The character list data is always aligned to 32 bits. */
1744
size_t char_lists_size = cranges->char_lists_size;
1745
PCRE2_ASSERT((char_lists_size & 0x1) == 0 &&
1746
(cb->char_lists_size & 0x3) == 0);
1747
1748
if (lengthptr != NULL)
1749
{
1750
char_lists_size = CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));
1751
1752
#if PCRE2_CODE_UNIT_WIDTH == 8
1753
*lengthptr += 2 + LINK_SIZE;
1754
#else
1755
*lengthptr += 1 + LINK_SIZE;
1756
#endif
1757
1758
cb->char_lists_size += char_lists_size;
1759
1760
char_lists_size /= sizeof(PCRE2_UCHAR);
1761
1762
/* Storage space for character lists is included
1763
in the maximum pattern size. */
1764
if (*lengthptr > MAX_PATTERN_SIZE ||
1765
MAX_PATTERN_SIZE - *lengthptr < char_lists_size)
1766
{
1767
*errorcodeptr = ERR20; /* Pattern is too large */
1768
return NULL;
1769
}
1770
}
1771
else
1772
{
1773
uint8_t *data;
1774
1775
PCRE2_ASSERT(cranges->char_lists_types <= XCL_TYPE_MASK);
1776
#if PCRE2_CODE_UNIT_WIDTH == 8
1777
/* Encode as high / low bytes. */
1778
code[0] = (uint8_t)(XCL_LIST |
1779
(cranges->char_lists_types >> 8));
1780
code[1] = (uint8_t)cranges->char_lists_types;
1781
code += 2;
1782
#else
1783
*code++ = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types);
1784
#endif
1785
1786
/* Character lists are stored in backwards direction from
1787
byte code start. The non-dfa/dfa matchers can access these
1788
lists using the byte code start stored in match blocks.
1789
Each list is aligned to 32 bit with an optional unused
1790
16 bit value at the beginning of the character list. */
1791
1792
cb->char_lists_size += char_lists_size;
1793
data = (uint8_t*)cb->start_code - cb->char_lists_size;
1794
1795
memcpy(data, (uint8_t*)(cranges + 1) + cranges->char_lists_start,
1796
char_lists_size);
1797
1798
/* Since character lists total size is less than MAX_PATTERN_SIZE,
1799
their starting offset fits into a value which size is LINK_SIZE. */
1800
1801
char_lists_size = cb->char_lists_size;
1802
PUT(code, 0, (uint32_t)(char_lists_size >> 1));
1803
code += LINK_SIZE;
1804
1805
#if defined PCRE2_DEBUG || defined SUPPORT_VALGRIND
1806
if ((char_lists_size & 0x2) != 0)
1807
{
1808
/* In debug the unused 16 bit value is set
1809
to a fixed value and marked unused. */
1810
((uint16_t*)data)[-1] = 0x5555;
1811
#ifdef SUPPORT_VALGRIND
1812
VALGRIND_MAKE_MEM_NOACCESS(data - 2, 2);
1813
#endif
1814
}
1815
#endif
1816
1817
cb->char_lists_size =
1818
CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));
1819
1820
cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
1821
}
1822
}
1823
1824
/* Now fill in the complete length of the item */
1825
1826
PUT(previous, 1, (int)(code - previous));
1827
goto DONE; /* End of class handling */
1828
}
1829
#endif /* SUPPORT_WIDE_CHARS */
1830
1831
/* If there are no characters > 255, or they are all to be included or
1832
excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
1833
whole class was negated and whether there were negative specials such as \S
1834
(non-UCP) in the class. Then copy the 32-byte map into the code vector,
1835
negating it if necessary. */
1836
1837
if (negate_class)
1838
{
1839
uint32_t *classwords = cb->classbits.classwords;
1840
1841
for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];
1842
}
1843
1844
if ((SELECT_VALUE8(!utf, 0) || negate_class != should_flip_negation) &&
1845
cb->classbits.classwords[0] == ~(uint32_t)0)
1846
{
1847
const uint32_t *classwords = cb->classbits.classwords;
1848
int i;
1849
1850
for (i = 0; i < 8; i++)
1851
if (classwords[i] != ~(uint32_t)0) break;
1852
1853
if (i == 8)
1854
{
1855
*code++ = OP_ALLANY;
1856
goto DONE; /* End of class handling */
1857
}
1858
}
1859
1860
*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
1861
memcpy(code, classbits, 32);
1862
code += 32 / sizeof(PCRE2_UCHAR);
1863
1864
DONE:
1865
*pcode = code;
1866
return pptr - 1;
1867
}
1868
1869
1870
1871
/* ===================================================================*/
1872
/* Here follows a block of ECLASS-compiling functions. You may well want to
1873
read them from top to bottom; they are ordered from leafmost (at the top) to
1874
outermost parser (at the bottom of the file). */
1875
1876
/* This function folds one operand using the negation operator.
1877
The new, combined chunk of stack code is written out to *pop_info. */
1878
1879
static void
1880
fold_negation(eclass_op_info *pop_info, PCRE2_SIZE *lengthptr,
1881
BOOL preserve_classbits)
1882
{
1883
/* If the chunk of stack code is already composed of multiple ops, we won't
1884
descend in and try and propagate the negation down the tree. (That would lead
1885
to O(n^2) compile-time, which could be exploitable with a malicious regex -
1886
although maybe that's not really too much of a worry in a library that offers
1887
an exponential-time matching function!) */
1888
1889
if (pop_info->op_single_type == 0)
1890
{
1891
if (lengthptr != NULL)
1892
*lengthptr += 1;
1893
else
1894
pop_info->code_start[pop_info->length] = ECL_NOT;
1895
pop_info->length += 1;
1896
}
1897
1898
/* Otherwise, it's a nice single-op item, so we can easily fold in the negation
1899
without needing to produce an ECL_NOT. */
1900
1901
else if (pop_info->op_single_type == ECL_ANY ||
1902
pop_info->op_single_type == ECL_NONE)
1903
{
1904
pop_info->op_single_type = (pop_info->op_single_type == ECL_NONE)?
1905
ECL_ANY : ECL_NONE;
1906
if (lengthptr == NULL)
1907
*(pop_info->code_start) = pop_info->op_single_type;
1908
}
1909
else
1910
{
1911
PCRE2_ASSERT(pop_info->op_single_type == ECL_XCLASS &&
1912
pop_info->length >= 1 + LINK_SIZE + 1);
1913
if (lengthptr == NULL)
1914
pop_info->code_start[1 + LINK_SIZE] ^= XCL_NOT;
1915
}
1916
1917
if (!preserve_classbits)
1918
{
1919
for (int i = 0; i < 8; i++)
1920
pop_info->bits.classwords[i] = ~pop_info->bits.classwords[i];
1921
}
1922
}
1923
1924
1925
1926
/* This function folds together two operands using a binary operator.
1927
The new, combined chunk of stack code is written out to *lhs_op_info. */
1928
1929
static void
1930
fold_binary(int op, eclass_op_info *lhs_op_info, eclass_op_info *rhs_op_info,
1931
PCRE2_SIZE *lengthptr)
1932
{
1933
switch (op)
1934
{
1935
/* ECL_AND truth table:
1936
1937
LHS RHS RESULT
1938
----------------
1939
ANY * RHS
1940
* ANY LHS
1941
NONE * NONE
1942
* NONE NONE
1943
X Y X & Y
1944
*/
1945
1946
case ECL_AND:
1947
if (rhs_op_info->op_single_type == ECL_ANY)
1948
{
1949
/* no-op: drop the RHS */
1950
}
1951
else if (lhs_op_info->op_single_type == ECL_ANY)
1952
{
1953
/* no-op: drop the LHS, and memmove the RHS into its place */
1954
if (lengthptr == NULL)
1955
memmove(lhs_op_info->code_start, rhs_op_info->code_start,
1956
CU2BYTES(rhs_op_info->length));
1957
lhs_op_info->length = rhs_op_info->length;
1958
lhs_op_info->op_single_type = rhs_op_info->op_single_type;
1959
}
1960
else if (rhs_op_info->op_single_type == ECL_NONE)
1961
{
1962
/* the result is ECL_NONE: write into the LHS */
1963
if (lengthptr == NULL)
1964
lhs_op_info->code_start[0] = ECL_NONE;
1965
lhs_op_info->length = 1;
1966
lhs_op_info->op_single_type = ECL_NONE;
1967
}
1968
else if (lhs_op_info->op_single_type == ECL_NONE)
1969
{
1970
/* the result is ECL_NONE: drop the RHS */
1971
}
1972
else
1973
{
1974
/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
1975
if (lengthptr != NULL)
1976
*lengthptr += 1;
1977
else
1978
{
1979
PCRE2_ASSERT(rhs_op_info->code_start ==
1980
lhs_op_info->code_start + lhs_op_info->length);
1981
rhs_op_info->code_start[rhs_op_info->length] = ECL_AND;
1982
}
1983
lhs_op_info->length += rhs_op_info->length + 1;
1984
lhs_op_info->op_single_type = 0;
1985
}
1986
1987
for (int i = 0; i < 8; i++)
1988
lhs_op_info->bits.classwords[i] &= rhs_op_info->bits.classwords[i];
1989
break;
1990
1991
/* ECL_OR truth table:
1992
1993
LHS RHS RESULT
1994
----------------
1995
ANY * ANY
1996
* ANY ANY
1997
NONE * RHS
1998
* NONE LHS
1999
X Y X | Y
2000
*/
2001
2002
case ECL_OR:
2003
if (rhs_op_info->op_single_type == ECL_NONE)
2004
{
2005
/* no-op: drop the RHS */
2006
}
2007
else if (lhs_op_info->op_single_type == ECL_NONE)
2008
{
2009
/* no-op: drop the LHS, and memmove the RHS into its place */
2010
if (lengthptr == NULL)
2011
memmove(lhs_op_info->code_start, rhs_op_info->code_start,
2012
CU2BYTES(rhs_op_info->length));
2013
lhs_op_info->length = rhs_op_info->length;
2014
lhs_op_info->op_single_type = rhs_op_info->op_single_type;
2015
}
2016
else if (rhs_op_info->op_single_type == ECL_ANY)
2017
{
2018
/* the result is ECL_ANY: write into the LHS */
2019
if (lengthptr == NULL)
2020
lhs_op_info->code_start[0] = ECL_ANY;
2021
lhs_op_info->length = 1;
2022
lhs_op_info->op_single_type = ECL_ANY;
2023
}
2024
else if (lhs_op_info->op_single_type == ECL_ANY)
2025
{
2026
/* the result is ECL_ANY: drop the RHS */
2027
}
2028
else
2029
{
2030
/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
2031
if (lengthptr != NULL)
2032
*lengthptr += 1;
2033
else
2034
{
2035
PCRE2_ASSERT(rhs_op_info->code_start ==
2036
lhs_op_info->code_start + lhs_op_info->length);
2037
rhs_op_info->code_start[rhs_op_info->length] = ECL_OR;
2038
}
2039
lhs_op_info->length += rhs_op_info->length + 1;
2040
lhs_op_info->op_single_type = 0;
2041
}
2042
2043
for (int i = 0; i < 8; i++)
2044
lhs_op_info->bits.classwords[i] |= rhs_op_info->bits.classwords[i];
2045
break;
2046
2047
/* ECL_XOR truth table:
2048
2049
LHS RHS RESULT
2050
----------------
2051
ANY * !RHS
2052
* ANY !LHS
2053
NONE * RHS
2054
* NONE LHS
2055
X Y X ^ Y
2056
*/
2057
2058
case ECL_XOR:
2059
if (rhs_op_info->op_single_type == ECL_NONE)
2060
{
2061
/* no-op: drop the RHS */
2062
}
2063
else if (lhs_op_info->op_single_type == ECL_NONE)
2064
{
2065
/* no-op: drop the LHS, and memmove the RHS into its place */
2066
if (lengthptr == NULL)
2067
memmove(lhs_op_info->code_start, rhs_op_info->code_start,
2068
CU2BYTES(rhs_op_info->length));
2069
lhs_op_info->length = rhs_op_info->length;
2070
lhs_op_info->op_single_type = rhs_op_info->op_single_type;
2071
}
2072
else if (rhs_op_info->op_single_type == ECL_ANY)
2073
{
2074
/* the result is !LHS: fold in the negation, and drop the RHS */
2075
/* Preserve the classbits, because we promise to deal with them later. */
2076
fold_negation(lhs_op_info, lengthptr, TRUE);
2077
}
2078
else if (lhs_op_info->op_single_type == ECL_ANY)
2079
{
2080
/* the result is !RHS: drop the LHS, memmove the RHS into its place, and
2081
fold in the negation */
2082
if (lengthptr == NULL)
2083
memmove(lhs_op_info->code_start, rhs_op_info->code_start,
2084
CU2BYTES(rhs_op_info->length));
2085
lhs_op_info->length = rhs_op_info->length;
2086
lhs_op_info->op_single_type = rhs_op_info->op_single_type;
2087
2088
/* Preserve the classbits, because we promise to deal with them later. */
2089
fold_negation(lhs_op_info, lengthptr, TRUE);
2090
}
2091
else
2092
{
2093
/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
2094
if (lengthptr != NULL)
2095
*lengthptr += 1;
2096
else
2097
{
2098
PCRE2_ASSERT(rhs_op_info->code_start ==
2099
lhs_op_info->code_start + lhs_op_info->length);
2100
rhs_op_info->code_start[rhs_op_info->length] = ECL_XOR;
2101
}
2102
lhs_op_info->length += rhs_op_info->length + 1;
2103
lhs_op_info->op_single_type = 0;
2104
}
2105
2106
for (int i = 0; i < 8; i++)
2107
lhs_op_info->bits.classwords[i] ^= rhs_op_info->bits.classwords[i];
2108
break;
2109
2110
/* LCOV_EXCL_START */
2111
default:
2112
PCRE2_DEBUG_UNREACHABLE();
2113
break;
2114
/* LCOV_EXCL_STOP */
2115
}
2116
}
2117
2118
2119
2120
static BOOL
2121
compile_eclass_nested(eclass_context *context, BOOL negated,
2122
uint32_t **pptr, PCRE2_UCHAR **pcode,
2123
eclass_op_info *pop_info, PCRE2_SIZE *lengthptr);
2124
2125
/* This function consumes a group of implicitly-unioned class elements.
2126
These can be characters, ranges, properties, or nested classes, as long
2127
as they are all joined by being placed adjacently. */
2128
2129
static BOOL
2130
compile_class_operand(eclass_context *context, BOOL negated,
2131
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2132
PCRE2_SIZE *lengthptr)
2133
{
2134
uint32_t *ptr = *pptr;
2135
uint32_t *prev_ptr;
2136
PCRE2_UCHAR *code = *pcode;
2137
PCRE2_UCHAR *code_start = code;
2138
PCRE2_SIZE prev_length = (lengthptr != NULL)? *lengthptr : 0;
2139
PCRE2_SIZE extra_length;
2140
uint32_t meta = META_CODE(*ptr);
2141
2142
switch (meta)
2143
{
2144
case META_CLASS_EMPTY_NOT:
2145
case META_CLASS_EMPTY:
2146
++ptr;
2147
pop_info->length = 1;
2148
if ((meta == META_CLASS_EMPTY) == negated)
2149
{
2150
*code++ = pop_info->op_single_type = ECL_ANY;
2151
memset(pop_info->bits.classbits, 0xff, 32);
2152
}
2153
else
2154
{
2155
*code++ = pop_info->op_single_type = ECL_NONE;
2156
memset(pop_info->bits.classbits, 0, 32);
2157
}
2158
break;
2159
2160
case META_CLASS:
2161
case META_CLASS_NOT:
2162
if ((*ptr & CLASS_IS_ECLASS) != 0)
2163
{
2164
if (!compile_eclass_nested(context, negated, &ptr, &code,
2165
pop_info, lengthptr))
2166
return FALSE;
2167
2168
PCRE2_ASSERT(*ptr == META_CLASS_END);
2169
ptr++;
2170
goto DONE;
2171
}
2172
2173
ptr++;
2174
PCRE2_FALLTHROUGH /* Fall through */
2175
2176
default:
2177
/* Scan forward characters, ranges, and properties.
2178
For example: inside [a-z_ -- m] we don't have brackets around "a-z_" but
2179
we still need to collect that fragment up into a "leaf" OP_CLASS. */
2180
2181
prev_ptr = ptr;
2182
ptr = PRIV(compile_class_not_nested)(
2183
context->options, context->xoptions, ptr, &code,
2184
(meta != META_CLASS_NOT) == negated, &context->needs_bitmap,
2185
context->errorcodeptr, context->cb, lengthptr);
2186
if (ptr == NULL) return FALSE;
2187
2188
/* We must have a 100% guarantee that ptr increases when
2189
compile_class_operand() returns, even on Release builds, so that we can
2190
statically prove our loops terminate. */
2191
/* LCOV_EXCL_START */
2192
if (ptr <= prev_ptr)
2193
{
2194
PCRE2_DEBUG_UNREACHABLE();
2195
return FALSE;
2196
}
2197
/* LCOV_EXCL_STOP */
2198
2199
/* If we fell through above, consume the closing ']'. */
2200
if (meta == META_CLASS || meta == META_CLASS_NOT)
2201
{
2202
PCRE2_ASSERT(*ptr == META_CLASS_END);
2203
ptr++;
2204
}
2205
2206
/* Regardless of whether (lengthptr == NULL), some data will still be written
2207
out to *pcode, which we need: we have to peek at it, to transform the opcode
2208
into the ECLASS version (since we need to hoist up the bitmaps). */
2209
PCRE2_ASSERT(code > code_start);
2210
extra_length = (lengthptr != NULL)? *lengthptr - prev_length : 0;
2211
2212
/* Easiest case: convert OP_ALLANY to ECL_ANY */
2213
2214
if (*code_start == OP_ALLANY)
2215
{
2216
PCRE2_ASSERT(code - code_start == 1 && extra_length == 0);
2217
pop_info->length = 1;
2218
*code_start = pop_info->op_single_type = ECL_ANY;
2219
memset(pop_info->bits.classbits, 0xff, 32);
2220
}
2221
2222
/* For OP_CLASS and OP_NCLASS, we hoist out the bitmap and convert to
2223
ECL_NONE / ECL_ANY respectively. */
2224
2225
else if (*code_start == OP_CLASS || *code_start == OP_NCLASS)
2226
{
2227
PCRE2_ASSERT(code - code_start == 1 + 32 / sizeof(PCRE2_UCHAR) &&
2228
extra_length == 0);
2229
pop_info->length = 1;
2230
*code_start = pop_info->op_single_type =
2231
(*code_start == OP_CLASS)? ECL_NONE : ECL_ANY;
2232
memcpy(pop_info->bits.classbits, code_start + 1, 32);
2233
/* Rewind the code pointer, but make sure we adjust *lengthptr, because we
2234
do need to reserve that space (even though we only use it temporarily). */
2235
if (lengthptr != NULL)
2236
*lengthptr += code - (code_start + 1);
2237
code = code_start + 1;
2238
2239
if (!context->needs_bitmap && *code_start == ECL_NONE)
2240
{
2241
uint32_t *classwords = pop_info->bits.classwords;
2242
2243
for (int i = 0; i < 8; i++)
2244
if (classwords[i] != 0)
2245
{
2246
context->needs_bitmap = TRUE;
2247
break;
2248
}
2249
}
2250
else
2251
context->needs_bitmap = TRUE;
2252
}
2253
2254
/* Finally, for OP_XCLASS we hoist out the bitmap (if any), and convert to
2255
ECL_XCLASS. */
2256
2257
else
2258
{
2259
PCRE2_ASSERT(*code_start == OP_XCLASS);
2260
*code_start = pop_info->op_single_type = ECL_XCLASS;
2261
2262
PCRE2_ASSERT(code - code_start >= 1 + LINK_SIZE + 1);
2263
2264
memcpy(pop_info->bits.classbits, context->cb->classbits.classbits, 32);
2265
pop_info->length = (code - code_start) + extra_length;
2266
}
2267
2268
break;
2269
} /* End of switch(meta) */
2270
2271
pop_info->code_start = (lengthptr == NULL)? code_start : NULL;
2272
2273
if (lengthptr != NULL)
2274
{
2275
*lengthptr += code - code_start;
2276
code = code_start;
2277
}
2278
2279
DONE:
2280
PCRE2_ASSERT(lengthptr == NULL || (code == code_start));
2281
2282
*pptr = ptr;
2283
*pcode = code;
2284
return TRUE;
2285
}
2286
2287
2288
2289
/* This function consumes a group of implicitly-unioned class elements.
2290
These can be characters, ranges, properties, or nested classes, as long
2291
as they are all joined by being placed adjacently. */
2292
2293
static BOOL
2294
compile_class_juxtaposition(eclass_context *context, BOOL negated,
2295
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2296
PCRE2_SIZE *lengthptr)
2297
{
2298
uint32_t *ptr = *pptr;
2299
PCRE2_UCHAR *code = *pcode;
2300
#ifdef PCRE2_DEBUG
2301
PCRE2_UCHAR *start_code = *pcode;
2302
#endif
2303
2304
/* See compile_class_binary_loose() for comments on compile-time folding of
2305
the "negated" flag. */
2306
2307
/* Because it's a non-empty class, there must be an operand at the start. */
2308
if (!compile_class_operand(context, negated, &ptr, &code, pop_info, lengthptr))
2309
return FALSE;
2310
2311
while (*ptr != META_CLASS_END &&
2312
!(*ptr >= META_ECLASS_AND && *ptr <= META_ECLASS_NOT))
2313
{
2314
uint32_t op;
2315
BOOL rhs_negated;
2316
eclass_op_info rhs_op_info;
2317
2318
if (negated)
2319
{
2320
/* !(A juxtapose B) -> !A && !B */
2321
op = ECL_AND;
2322
rhs_negated = TRUE;
2323
}
2324
else
2325
{
2326
/* A juxtapose B -> A || B */
2327
op = ECL_OR;
2328
rhs_negated = FALSE;
2329
}
2330
2331
/* An operand must follow the operator. */
2332
if (!compile_class_operand(context, rhs_negated, &ptr, &code,
2333
&rhs_op_info, lengthptr))
2334
return FALSE;
2335
2336
/* Convert infix to postfix (RPN). */
2337
fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2338
if (lengthptr == NULL)
2339
code = pop_info->code_start + pop_info->length;
2340
}
2341
2342
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2343
2344
*pptr = ptr;
2345
*pcode = code;
2346
return TRUE;
2347
}
2348
2349
2350
2351
/* This function consumes unary prefix operators. */
2352
2353
static BOOL
2354
compile_class_unary(eclass_context *context, BOOL negated,
2355
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2356
PCRE2_SIZE *lengthptr)
2357
{
2358
uint32_t *ptr = *pptr;
2359
#ifdef PCRE2_DEBUG
2360
PCRE2_UCHAR *start_code = *pcode;
2361
#endif
2362
2363
while (*ptr == META_ECLASS_NOT)
2364
{
2365
++ptr;
2366
negated = !negated;
2367
}
2368
2369
*pptr = ptr;
2370
/* Because it's a non-empty class, there must be an operand. */
2371
if (!compile_class_juxtaposition(context, negated, pptr, pcode,
2372
pop_info, lengthptr))
2373
return FALSE;
2374
2375
PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);
2376
return TRUE;
2377
}
2378
2379
2380
2381
/* This function consumes tightly-binding binary operators. */
2382
2383
static BOOL
2384
compile_class_binary_tight(eclass_context *context, BOOL negated,
2385
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2386
PCRE2_SIZE *lengthptr)
2387
{
2388
uint32_t *ptr = *pptr;
2389
PCRE2_UCHAR *code = *pcode;
2390
#ifdef PCRE2_DEBUG
2391
PCRE2_UCHAR *start_code = *pcode;
2392
#endif
2393
2394
/* See compile_class_binary_loose() for comments on compile-time folding of
2395
the "negated" flag. */
2396
2397
/* Because it's a non-empty class, there must be an operand at the start. */
2398
if (!compile_class_unary(context, negated, &ptr, &code, pop_info, lengthptr))
2399
return FALSE;
2400
2401
while (*ptr == META_ECLASS_AND)
2402
{
2403
uint32_t op;
2404
BOOL rhs_negated;
2405
eclass_op_info rhs_op_info;
2406
2407
if (negated)
2408
{
2409
/* !(A && B) -> !A || !B */
2410
op = ECL_OR;
2411
rhs_negated = TRUE;
2412
}
2413
else
2414
{
2415
/* A && B -> A && B */
2416
op = ECL_AND;
2417
rhs_negated = FALSE;
2418
}
2419
2420
++ptr;
2421
2422
/* An operand must follow the operator. */
2423
if (!compile_class_unary(context, rhs_negated, &ptr, &code,
2424
&rhs_op_info, lengthptr))
2425
return FALSE;
2426
2427
/* Convert infix to postfix (RPN). */
2428
fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2429
if (lengthptr == NULL)
2430
code = pop_info->code_start + pop_info->length;
2431
}
2432
2433
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2434
2435
*pptr = ptr;
2436
*pcode = code;
2437
return TRUE;
2438
}
2439
2440
2441
2442
/* This function consumes loosely-binding binary operators. */
2443
2444
static BOOL
2445
compile_class_binary_loose(eclass_context *context, BOOL negated,
2446
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2447
PCRE2_SIZE *lengthptr)
2448
{
2449
uint32_t *ptr = *pptr;
2450
PCRE2_UCHAR *code = *pcode;
2451
#ifdef PCRE2_DEBUG
2452
PCRE2_UCHAR *start_code = *pcode;
2453
#endif
2454
2455
/* We really want to fold the negation operator, if at all possible, so that
2456
simple cases can be reduced down. In particular, in 8-bit no-UTF mode, we want
2457
to produce a fully-folded expression, so that we can guarantee not to emit any
2458
OP_ECLASS codes (in the same way that we never emit OP_XCLASS in this mode).
2459
2460
This has the consequence that with a little ingenuity, we can in fact avoid
2461
emitting (nearly...) all cases of the "NOT" operator. Imagine that we have:
2462
!(A ...
2463
We have parsed the preceding "!", and we are about to parse the "A" operand. We
2464
don't know yet whether there will even be a following binary operand! Both of
2465
these are possibilities for what follows:
2466
!(A && B)
2467
!(A)
2468
However, we can still fold the "!" into the "A" operand, because no matter what
2469
the following binary operator will be, we can produce an expression which is
2470
equivalent. */
2471
2472
/* Because it's a non-empty class, there must be an operand at the start. */
2473
if (!compile_class_binary_tight(context, negated, &ptr, &code,
2474
pop_info, lengthptr))
2475
return FALSE;
2476
2477
while (*ptr >= META_ECLASS_OR && *ptr <= META_ECLASS_XOR)
2478
{
2479
uint32_t op;
2480
BOOL op_neg;
2481
BOOL rhs_negated;
2482
eclass_op_info rhs_op_info;
2483
2484
if (negated)
2485
{
2486
/* The whole expression is being negated; we respond by unconditionally
2487
negating the LHS A, before seeing what follows. And hooray! We can recover,
2488
no matter what follows. */
2489
/* !(A || B) -> !A && !B */
2490
/* !(A -- B) -> !(A && !B) -> !A || B */
2491
/* !(A XOR B) -> !(!A XOR !B) -> !A XNOR !B */
2492
op = (*ptr == META_ECLASS_OR )? ECL_AND :
2493
(*ptr == META_ECLASS_SUB)? ECL_OR :
2494
/*ptr == META_ECLASS_XOR*/ ECL_XOR;
2495
op_neg = (*ptr == META_ECLASS_XOR);
2496
rhs_negated = *ptr != META_ECLASS_SUB;
2497
}
2498
else
2499
{
2500
/* A || B -> A || B */
2501
/* A -- B -> A && !B */
2502
/* A XOR B -> A XOR B */
2503
op = (*ptr == META_ECLASS_OR )? ECL_OR :
2504
(*ptr == META_ECLASS_SUB)? ECL_AND :
2505
/*ptr == META_ECLASS_XOR*/ ECL_XOR;
2506
op_neg = FALSE;
2507
rhs_negated = *ptr == META_ECLASS_SUB;
2508
}
2509
2510
++ptr;
2511
2512
/* An operand must follow the operator. */
2513
if (!compile_class_binary_tight(context, rhs_negated, &ptr, &code,
2514
&rhs_op_info, lengthptr))
2515
return FALSE;
2516
2517
/* Convert infix to postfix (RPN). */
2518
fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2519
if (op_neg) fold_negation(pop_info, lengthptr, FALSE);
2520
if (lengthptr == NULL)
2521
code = pop_info->code_start + pop_info->length;
2522
}
2523
2524
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2525
2526
*pptr = ptr;
2527
*pcode = code;
2528
return TRUE;
2529
}
2530
2531
2532
2533
/* This function converts the META codes in pptr into opcodes written to
2534
pcode. The pptr must start at a META_CLASS or META_CLASS_NOT.
2535
2536
The class is compiled as a left-associative sequence of operator
2537
applications.
2538
2539
The pptr will be left pointing at the matching META_CLASS_END. */
2540
2541
static BOOL
2542
compile_eclass_nested(eclass_context *context, BOOL negated,
2543
uint32_t **pptr, PCRE2_UCHAR **pcode,
2544
eclass_op_info *pop_info, PCRE2_SIZE *lengthptr)
2545
{
2546
uint32_t *ptr = *pptr;
2547
#ifdef PCRE2_DEBUG
2548
PCRE2_UCHAR *start_code = *pcode;
2549
#endif
2550
2551
/* The CLASS_IS_ECLASS bit must be set since it is a nested class. */
2552
PCRE2_ASSERT(*ptr == (META_CLASS | CLASS_IS_ECLASS) ||
2553
*ptr == (META_CLASS_NOT | CLASS_IS_ECLASS));
2554
2555
if (*ptr++ == (META_CLASS_NOT | CLASS_IS_ECLASS))
2556
negated = !negated;
2557
2558
(*pptr)++;
2559
2560
/* Because it's a non-empty class, there must be an operand at the start. */
2561
if (!compile_class_binary_loose(context, negated, pptr, pcode,
2562
pop_info, lengthptr))
2563
return FALSE;
2564
2565
PCRE2_ASSERT(**pptr == META_CLASS_END);
2566
PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);
2567
return TRUE;
2568
}
2569
2570
BOOL
2571
PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions,
2572
uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr,
2573
compile_block *cb, PCRE2_SIZE *lengthptr)
2574
{
2575
eclass_context context;
2576
eclass_op_info op_info;
2577
PCRE2_SIZE previous_length = (lengthptr != NULL)? *lengthptr : 0;
2578
PCRE2_UCHAR *code = *pcode;
2579
PCRE2_UCHAR *previous;
2580
BOOL allbitsone = TRUE;
2581
2582
context.needs_bitmap = FALSE;
2583
context.options = options;
2584
context.xoptions = xoptions;
2585
context.errorcodeptr = errorcodeptr;
2586
context.cb = cb;
2587
2588
previous = code;
2589
*code++ = OP_ECLASS;
2590
code += LINK_SIZE;
2591
*code++ = 0; /* Flags, currently zero. */
2592
if (!compile_eclass_nested(&context, FALSE, pptr, &code, &op_info, lengthptr))
2593
return FALSE;
2594
2595
if (lengthptr != NULL)
2596
{
2597
*lengthptr += code - previous;
2598
code = previous;
2599
/* (*lengthptr - previous_length) now holds the amount of buffer that
2600
we require to make the call to compile_class_nested() with
2601
lengthptr = NULL, and including the (1+LINK_SIZE+1) that we write out
2602
before that call. */
2603
}
2604
2605
/* Do some useful counting of what's in the bitmap. */
2606
for (int i = 0; i < 8; i++)
2607
if (op_info.bits.classwords[i] != 0xffffffff)
2608
{
2609
allbitsone = FALSE;
2610
break;
2611
}
2612
2613
/* After constant-folding the extended class syntax, it may turn out to be
2614
a simple class after all. In that case, we can unwrap it from the
2615
OP_ECLASS container - and in fact, we must do so, because in 8-bit
2616
no-Unicode mode the matcher is compiled without support for OP_ECLASS. */
2617
2618
#ifndef SUPPORT_WIDE_CHARS
2619
PCRE2_ASSERT(op_info.op_single_type != 0);
2620
#else
2621
if (op_info.op_single_type != 0)
2622
#endif
2623
{
2624
/* Rewind back over the OP_ECLASS. */
2625
code = previous;
2626
2627
/* If the bits are all ones, and the "high characters" are all matched
2628
too, we use a special-cased encoding of OP_ALLANY. */
2629
2630
if (op_info.op_single_type == ECL_ANY && allbitsone)
2631
{
2632
/* Advancing code means rewinding lengthptr, at this point. */
2633
if (lengthptr != NULL) *lengthptr -= 1;
2634
*code++ = OP_ALLANY;
2635
}
2636
2637
/* If the high bits are all matched / all not-matched, then we emit an
2638
OP_NCLASS/OP_CLASS respectively. */
2639
2640
else if (op_info.op_single_type == ECL_ANY ||
2641
op_info.op_single_type == ECL_NONE)
2642
{
2643
PCRE2_SIZE required_len = 1 + (32 / sizeof(PCRE2_UCHAR));
2644
2645
if (lengthptr != NULL)
2646
{
2647
if (required_len > (*lengthptr - previous_length))
2648
*lengthptr = previous_length + required_len;
2649
}
2650
2651
/* Advancing code means rewinding lengthptr, at this point. */
2652
if (lengthptr != NULL) *lengthptr -= required_len;
2653
*code++ = (op_info.op_single_type == ECL_ANY)? OP_NCLASS : OP_CLASS;
2654
memcpy(code, op_info.bits.classbits, 32);
2655
code += 32 / sizeof(PCRE2_UCHAR);
2656
}
2657
2658
/* Otherwise, we have an ECL_XCLASS, so we have the OP_XCLASS data
2659
there, but, we pulled out its bitmap into op_info, so now we have to
2660
put that back into the OP_XCLASS. */
2661
2662
else
2663
{
2664
#ifndef SUPPORT_WIDE_CHARS
2665
PCRE2_DEBUG_UNREACHABLE();
2666
#else
2667
BOOL need_map = context.needs_bitmap;
2668
PCRE2_SIZE required_len;
2669
2670
PCRE2_ASSERT(op_info.op_single_type == ECL_XCLASS);
2671
required_len = op_info.length + (need_map? 32/sizeof(PCRE2_UCHAR) : 0);
2672
2673
if (lengthptr != NULL)
2674
{
2675
/* Don't unconditionally request all the space we need - we may
2676
already have asked for more during processing of the ECLASS. */
2677
if (required_len > (*lengthptr - previous_length))
2678
*lengthptr = previous_length + required_len;
2679
2680
/* The code we write out here won't be ignored, even during the
2681
(lengthptr != NULL) phase, because if there's a following quantifier
2682
it will peek backwards. So we do have to write out a (truncated)
2683
OP_XCLASS, even on this branch. */
2684
*lengthptr -= 1 + LINK_SIZE + 1;
2685
*code++ = OP_XCLASS;
2686
PUT(code, 0, 1 + LINK_SIZE + 1);
2687
code += LINK_SIZE;
2688
*code++ = 0;
2689
}
2690
else
2691
{
2692
PCRE2_UCHAR *rest;
2693
PCRE2_SIZE rest_len;
2694
PCRE2_UCHAR flags;
2695
2696
/* 1 unit: OP_XCLASS | LINK_SIZE units | 1 unit: flags | ...rest */
2697
PCRE2_ASSERT(op_info.length >= 1 + LINK_SIZE + 1);
2698
rest = op_info.code_start + 1 + LINK_SIZE + 1;
2699
rest_len = (op_info.code_start + op_info.length) - rest;
2700
2701
/* First read any data we use, before memmove splats it. */
2702
flags = op_info.code_start[1 + LINK_SIZE];
2703
PCRE2_ASSERT((flags & XCL_MAP) == 0);
2704
2705
/* Next do the memmove before any writes. */
2706
memmove(code + 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0),
2707
rest, CU2BYTES(rest_len));
2708
2709
/* Finally write the header data. */
2710
*code++ = OP_XCLASS;
2711
PUT(code, 0, (int)required_len);
2712
code += LINK_SIZE;
2713
*code++ = flags | (need_map? XCL_MAP : 0);
2714
if (need_map)
2715
{
2716
memcpy(code, op_info.bits.classbits, 32);
2717
code += 32 / sizeof(PCRE2_UCHAR);
2718
}
2719
code += rest_len;
2720
}
2721
#endif /* SUPPORT_WIDE_CHARS */
2722
}
2723
}
2724
2725
/* Otherwise, we're going to keep the OP_ECLASS. However, again we need
2726
to do some adjustment to insert the bitmap if we have one. */
2727
2728
#ifdef SUPPORT_WIDE_CHARS
2729
else
2730
{
2731
BOOL need_map = context.needs_bitmap;
2732
PCRE2_SIZE required_len =
2733
1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0) + op_info.length;
2734
2735
if (lengthptr != NULL)
2736
{
2737
if (required_len > (*lengthptr - previous_length))
2738
*lengthptr = previous_length + required_len;
2739
2740
/* As for the XCLASS branch above, we do have to write out a dummy
2741
OP_ECLASS, because of the backwards peek by the quantifier code. Write
2742
out a (truncated) OP_ECLASS, even on this branch. */
2743
*lengthptr -= 1 + LINK_SIZE + 1;
2744
*code++ = OP_ECLASS;
2745
PUT(code, 0, 1 + LINK_SIZE + 1);
2746
code += LINK_SIZE;
2747
*code++ = 0;
2748
}
2749
else
2750
{
2751
if (need_map)
2752
{
2753
PCRE2_UCHAR *map_start = previous + 1 + LINK_SIZE + 1;
2754
previous[1 + LINK_SIZE] |= ECL_MAP;
2755
memmove(map_start + 32/sizeof(PCRE2_UCHAR), map_start,
2756
CU2BYTES(code - map_start));
2757
memcpy(map_start, op_info.bits.classbits, 32);
2758
code += 32 / sizeof(PCRE2_UCHAR);
2759
}
2760
PUT(previous, 1, (int)(code - previous));
2761
}
2762
}
2763
#endif /* SUPPORT_WIDE_CHARS */
2764
2765
*pcode = code;
2766
return TRUE;
2767
}
2768
2769
/* End of pcre2_compile_class.c */
2770
2771