Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_compile_class.c
9898 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
#ifdef HAVE_CONFIG_H
42
#include "config.h"
43
#endif
44
45
#include "pcre2_compile.h"
46
47
typedef struct {
48
/* Option bits for eclass. */
49
uint32_t options;
50
uint32_t xoptions;
51
/* Rarely used members. */
52
int *errorcodeptr;
53
compile_block *cb;
54
/* Bitmap is needed. */
55
BOOL needs_bitmap;
56
} eclass_context;
57
58
/* Checks the allowed tokens at the end of a class structure in debug mode.
59
When a new token is not processed by all loops, and the token is equal to
60
a) one of the cases here:
61
the compiler will complain about a duplicated case value.
62
b) none of the cases here:
63
the loop without the handler will stop with an assertion failure. */
64
65
/* CLASS_END_CASES(meta) supplies the terminating label(s) of a switch over
class tokens. In debug builds it lists every token that may end a class
structure explicitly and asserts that anything else reaching "default" is
not above META_END; in non-debug builds it is a plain "default:". */
#ifdef PCRE2_DEBUG
#define CLASS_END_CASES(meta) \
    default: \
    PCRE2_ASSERT((meta) <= META_END); \
    /* Fall through */ \
    case META_CLASS: \
    case META_CLASS_NOT: \
    case META_CLASS_EMPTY: \
    case META_CLASS_EMPTY_NOT: \
    case META_CLASS_END: \
    case META_ECLASS_AND: \
    case META_ECLASS_OR: \
    case META_ECLASS_SUB: \
    case META_ECLASS_XOR: \
    case META_ECLASS_NOT:
#else
#define CLASS_END_CASES(meta) \
    default:
#endif
84
85
#ifdef SUPPORT_WIDE_CHARS
86
87
/* Heapsort algorithm. */
88
89
/* Sift-down step of a max-heap whose items are pairs of uint32_t values.

The heap is keyed on the first value of each pair (the even index); the
second value simply travels with it. The children of the pair at index i
are the pairs at indices 2*i + 2 and 2*i + 4.

Arguments:
  buffer    the array holding the pairs
  size      number of uint32_t values (not pairs) that belong to the heap
  i         index of the first value of the pair to sift down
*/
static void do_heapify(uint32_t *buffer, size_t size, size_t i)
{
    for (;;) {
        const size_t first_child = (i << 1) + 2;
        const size_t second_child = first_child + 2;
        size_t largest = i;
        uint32_t saved_key, saved_value;

        if (first_child < size && buffer[first_child] > buffer[largest])
            largest = first_child;
        if (second_child < size && buffer[second_child] > buffer[largest])
            largest = second_child;

        /* The heap property already holds here: nothing more to do. */
        if (largest == i)
            return;

        /* Exchange the whole pair with the larger child pair. */
        saved_key = buffer[i];
        saved_value = buffer[i + 1];
        buffer[i] = buffer[largest];
        buffer[i + 1] = buffer[largest + 1];
        buffer[largest] = saved_key;
        buffer[largest + 1] = saved_value;

        /* Continue sifting from the child's position. */
        i = largest;
    }
}
116
117
#ifdef SUPPORT_UNICODE
118
119
/* Flag bits passed in the "options" argument of the class parsing helpers.
They are derived from the compile options by compile_optimize_class(). */

/* PCRE2_UTF is set. */
#define PARSE_CLASS_UTF             0x1
/* PCRE2_CASELESS is set together with PCRE2_UTF or PCRE2_UCP. */
#define PARSE_CLASS_CASELESS_UTF    0x2
/* PCRE2_EXTRA_CASELESS_RESTRICT is set. */
#define PARSE_CLASS_RESTRICTED_UTF  0x4
/* PCRE2_EXTRA_TURKISH_CASING is set. */
#define PARSE_CLASS_TURKISH_UTF     0x8
123
124
/* Get the range of nocase characters which includes the
125
'c' character passed as argument, or directly follows 'c'. */
126
127
static const uint32_t*
128
get_nocase_range(uint32_t c)
129
{
130
uint32_t left = 0;
131
uint32_t right = PRIV(ucd_nocase_ranges_size);
132
uint32_t middle;
133
134
if (c > MAX_UTF_CODE_POINT) return PRIV(ucd_nocase_ranges) + right;
135
136
while (TRUE)
137
{
138
/* Range end of the middle element. */
139
middle = ((left + right) >> 1) | 0x1;
140
141
if (PRIV(ucd_nocase_ranges)[middle] <= c)
142
left = middle + 1;
143
else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c)
144
right = middle - 1;
145
else
146
return PRIV(ucd_nocase_ranges) + (middle - 1);
147
}
148
}
149
150
/* Get the list of othercase characters, which belongs to the passed range.
151
Create ranges from these characters, and append them to the buffer argument. */
152
153
static size_t
154
utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options,
155
uint32_t *buffer)
156
{
157
uint32_t new_start = start;
158
uint32_t new_end = end;
159
uint32_t c = start;
160
const uint32_t *list;
161
uint32_t tmp[3];
162
size_t result = 2;
163
const uint32_t *skip_range = get_nocase_range(c);
164
uint32_t skip_start = skip_range[0];
165
166
#if PCRE2_CODE_UNIT_WIDTH == 8
167
PCRE2_ASSERT(options & PARSE_CLASS_UTF);
168
#endif
169
170
#if PCRE2_CODE_UNIT_WIDTH == 32
171
if (end > MAX_UTF_CODE_POINT) end = MAX_UTF_CODE_POINT;
172
#endif
173
174
while (c <= end)
175
{
176
uint32_t co;
177
178
if (c > skip_start)
179
{
180
c = skip_range[1];
181
skip_range += 2;
182
skip_start = skip_range[0];
183
continue;
184
}
185
186
/* Compute caseless set. */
187
188
if ((options & (PARSE_CLASS_TURKISH_UTF|PARSE_CLASS_RESTRICTED_UTF)) ==
189
PARSE_CLASS_TURKISH_UTF &&
190
UCD_ANY_I(c))
191
{
192
co = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
193
}
194
else if ((co = UCD_CASESET(c)) != 0 &&
195
(options & PARSE_CLASS_RESTRICTED_UTF) != 0 &&
196
PRIV(ucd_caseless_sets)[co] < 128)
197
{
198
co = 0; /* Ignore the caseless set if it's restricted. */
199
}
200
201
if (co != 0)
202
list = PRIV(ucd_caseless_sets) + co;
203
else
204
{
205
co = UCD_OTHERCASE(c);
206
list = tmp;
207
tmp[0] = c;
208
tmp[1] = NOTACHAR;
209
210
if (co != c)
211
{
212
tmp[1] = co;
213
tmp[2] = NOTACHAR;
214
}
215
}
216
c++;
217
218
/* Add characters. */
219
do
220
{
221
#if PCRE2_CODE_UNIT_WIDTH == 16
222
if (!(options & PARSE_CLASS_UTF) && *list > 0xffff) continue;
223
#endif
224
225
if (*list < new_start)
226
{
227
if (*list + 1 == new_start)
228
{
229
new_start--;
230
continue;
231
}
232
}
233
else if (*list > new_end)
234
{
235
if (*list - 1 == new_end)
236
{
237
new_end++;
238
continue;
239
}
240
}
241
else continue;
242
243
result += 2;
244
if (buffer != NULL)
245
{
246
buffer[0] = *list;
247
buffer[1] = *list;
248
buffer += 2;
249
}
250
}
251
while (*(++list) != NOTACHAR);
252
}
253
254
if (buffer != NULL)
255
{
256
buffer[0] = new_start;
257
buffer[1] = new_end;
258
buffer += 2;
259
(void)buffer;
260
}
261
return result;
262
}
263
264
#endif
265
266
/* Add a character list to a buffer. */
267
268
static size_t
269
append_char_list(const uint32_t *p, uint32_t *buffer)
270
{
271
const uint32_t *n;
272
size_t result = 0;
273
274
while (*p != NOTACHAR)
275
{
276
n = p;
277
while (n[0] == n[1] - 1) n++;
278
279
PCRE2_ASSERT(*p < 0xffff);
280
281
if (buffer != NULL)
282
{
283
buffer[0] = *p;
284
buffer[1] = *n;
285
buffer += 2;
286
}
287
288
result += 2;
289
p = n + 1;
290
}
291
292
return result;
293
}
294
295
static uint32_t
296
get_highest_char(uint32_t options)
297
{
298
(void)options; /* Avoid compiler warning. */
299
300
#if PCRE2_CODE_UNIT_WIDTH == 8
301
return MAX_UTF_CODE_POINT;
302
#else
303
#ifdef SUPPORT_UNICODE
304
return GET_MAX_CHAR_VALUE((options & PARSE_CLASS_UTF) != 0);
305
#else
306
return MAX_UCHAR_VALUE;
307
#endif
308
#endif
309
}
310
311
/* Add a negated character list to a buffer. */
312
static size_t
313
append_negated_char_list(const uint32_t *p, uint32_t options, uint32_t *buffer)
314
{
315
const uint32_t *n;
316
uint32_t start = 0;
317
size_t result = 2;
318
319
PCRE2_ASSERT(*p > 0);
320
321
while (*p != NOTACHAR)
322
{
323
n = p;
324
while (n[0] == n[1] - 1) n++;
325
326
PCRE2_ASSERT(*p < 0xffff);
327
328
if (buffer != NULL)
329
{
330
buffer[0] = start;
331
buffer[1] = *p - 1;
332
buffer += 2;
333
}
334
335
result += 2;
336
start = *n + 1;
337
p = n + 1;
338
}
339
340
if (buffer != NULL)
341
{
342
buffer[0] = start;
343
buffer[1] = get_highest_char(options);
344
buffer += 2;
345
(void)buffer;
346
}
347
348
return result;
349
}
350
351
/* Store the range 0x100 .. get_highest_char(options) at buffer and return
the position just after it. A NULL buffer (counting mode) is passed through
unchanged; the caller is responsible for adding 2 to its size total. */
static uint32_t *
append_non_ascii_range(uint32_t options, uint32_t *buffer)
{
    if (buffer != NULL) {
        buffer[0] = 0x100;
        buffer[1] = get_highest_char(options);
        buffer += 2;
    }

    return buffer;
}
360
361
static size_t
362
parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer)
363
{
364
size_t total_size = 0;
365
size_t size;
366
uint32_t meta_arg;
367
uint32_t start_char;
368
369
while (TRUE)
370
{
371
switch (META_CODE(*ptr))
372
{
373
case META_ESCAPE:
374
meta_arg = META_DATA(*ptr);
375
switch (meta_arg)
376
{
377
case ESC_D:
378
case ESC_W:
379
case ESC_S:
380
buffer = append_non_ascii_range(options, buffer);
381
total_size += 2;
382
break;
383
384
case ESC_h:
385
size = append_char_list(PRIV(hspace_list), buffer);
386
total_size += size;
387
if (buffer != NULL) buffer += size;
388
break;
389
390
case ESC_H:
391
size = append_negated_char_list(PRIV(hspace_list), options, buffer);
392
total_size += size;
393
if (buffer != NULL) buffer += size;
394
break;
395
396
case ESC_v:
397
size = append_char_list(PRIV(vspace_list), buffer);
398
total_size += size;
399
if (buffer != NULL) buffer += size;
400
break;
401
402
case ESC_V:
403
size = append_negated_char_list(PRIV(vspace_list), options, buffer);
404
total_size += size;
405
if (buffer != NULL) buffer += size;
406
break;
407
408
case ESC_p:
409
case ESC_P:
410
ptr++;
411
if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY)
412
{
413
if (buffer != NULL)
414
{
415
buffer[0] = 0;
416
buffer[1] = get_highest_char(options);
417
buffer += 2;
418
}
419
total_size += 2;
420
}
421
break;
422
}
423
ptr++;
424
continue;
425
case META_POSIX_NEG:
426
buffer = append_non_ascii_range(options, buffer);
427
total_size += 2;
428
ptr += 2;
429
continue;
430
case META_POSIX:
431
ptr += 2;
432
continue;
433
case META_BIGVALUE:
434
/* Character literal */
435
ptr++;
436
break;
437
CLASS_END_CASES(*ptr)
438
if (*ptr >= META_END) return total_size;
439
break;
440
}
441
442
start_char = *ptr;
443
444
if (ptr[1] == META_RANGE_LITERAL || ptr[1] == META_RANGE_ESCAPED)
445
{
446
ptr += 2;
447
PCRE2_ASSERT(*ptr < META_END || *ptr == META_BIGVALUE);
448
449
if (*ptr == META_BIGVALUE) ptr++;
450
451
#ifdef EBCDIC
452
#error "Missing EBCDIC support"
453
#endif
454
}
455
456
#ifdef SUPPORT_UNICODE
457
if (options & PARSE_CLASS_CASELESS_UTF)
458
{
459
size = utf_caseless_extend(start_char, *ptr++, options, buffer);
460
if (buffer != NULL) buffer += size;
461
total_size += size;
462
continue;
463
}
464
#endif
465
466
if (buffer != NULL)
467
{
468
buffer[0] = start_char;
469
buffer[1] = *ptr;
470
buffer += 2;
471
}
472
473
ptr++;
474
total_size += 2;
475
}
476
477
return total_size;
478
}
479
480
/* Extra uint32_t values for storing the lengths of range lists in
481
the worst case. Two uint32_t lengths and a range end for a range
482
starting before 255 */
483
#define CHAR_LIST_EXTRA_SIZE 3
484
485
/* Starting character values for each character list. */
486
487
static const uint32_t char_list_starts[] = {
488
#if PCRE2_CODE_UNIT_WIDTH == 32
489
XCL_CHAR_LIST_HIGH_32_START,
490
#endif
491
#if PCRE2_CODE_UNIT_WIDTH == 32 || defined SUPPORT_UNICODE
492
XCL_CHAR_LIST_LOW_32_START,
493
#endif
494
XCL_CHAR_LIST_HIGH_16_START,
495
/* Must be terminated by XCL_CHAR_LIST_LOW_16_START,
496
which also represents the end of the bitset. */
497
XCL_CHAR_LIST_LOW_16_START,
498
};
499
500
static class_ranges *
501
compile_optimize_class(uint32_t *start_ptr, uint32_t options,
502
uint32_t xoptions, compile_block *cb)
503
{
504
class_ranges* cranges;
505
uint32_t *ptr;
506
uint32_t *buffer;
507
uint32_t *dst;
508
uint32_t class_options = 0;
509
size_t range_list_size = 0, total_size, i;
510
uint32_t tmp1, tmp2;
511
const uint32_t *char_list_next;
512
uint16_t *next_char;
513
uint32_t char_list_start, char_list_end;
514
uint32_t range_start, range_end;
515
516
#ifdef SUPPORT_UNICODE
517
if (options & PCRE2_UTF)
518
class_options |= PARSE_CLASS_UTF;
519
520
if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF|PCRE2_UCP)))
521
class_options |= PARSE_CLASS_CASELESS_UTF;
522
523
if (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT)
524
class_options |= PARSE_CLASS_RESTRICTED_UTF;
525
526
if (xoptions & PCRE2_EXTRA_TURKISH_CASING)
527
class_options |= PARSE_CLASS_TURKISH_UTF;
528
#endif
529
530
/* Compute required space for the range. */
531
532
range_list_size = parse_class(start_ptr, class_options, NULL);
533
PCRE2_ASSERT((range_list_size & 0x1) == 0);
534
535
/* Allocate buffer. The total_size also represents the end of the buffer. */
536
537
total_size = range_list_size +
538
((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0);
539
540
cranges = cb->cx->memctl.malloc(
541
sizeof(class_ranges) + total_size * sizeof(uint32_t),
542
cb->cx->memctl.memory_data);
543
544
if (cranges == NULL) return NULL;
545
546
cranges->next = NULL;
547
cranges->range_list_size = (uint16_t)range_list_size;
548
cranges->char_lists_types = 0;
549
cranges->char_lists_size = 0;
550
cranges->char_lists_start = 0;
551
552
if (range_list_size == 0) return cranges;
553
554
buffer = (uint32_t*)(cranges + 1);
555
parse_class(start_ptr, class_options, buffer);
556
557
/* Using <= instead of == to help static analysis. */
558
if (range_list_size <= 2) return cranges;
559
560
/* In-place sorting of ranges. */
561
562
i = (((range_list_size >> 2) - 1) << 1);
563
while (TRUE)
564
{
565
do_heapify(buffer, range_list_size, i);
566
if (i == 0) break;
567
i -= 2;
568
}
569
570
i = range_list_size - 2;
571
while (TRUE)
572
{
573
tmp1 = buffer[i];
574
tmp2 = buffer[i + 1];
575
buffer[i] = buffer[0];
576
buffer[i + 1] = buffer[1];
577
buffer[0] = tmp1;
578
buffer[1] = tmp2;
579
580
do_heapify(buffer, i, 0);
581
if (i == 0) break;
582
i -= 2;
583
}
584
585
/* Merge ranges whenever possible. */
586
dst = buffer;
587
ptr = buffer + 2;
588
range_list_size -= 2;
589
590
/* The second condition is a very rare corner case, where the end of the last
591
range is the maximum character. This range cannot be extended further. */
592
593
while (range_list_size > 0 && dst[1] != ~(uint32_t)0)
594
{
595
if (dst[1] + 1 < ptr[0])
596
{
597
dst += 2;
598
dst[0] = ptr[0];
599
dst[1] = ptr[1];
600
}
601
else if (dst[1] < ptr[1]) dst[1] = ptr[1];
602
603
ptr += 2;
604
range_list_size -= 2;
605
}
606
607
PCRE2_ASSERT(dst[1] <= get_highest_char(class_options));
608
609
/* When the number of ranges are less than six,
610
they are not converted to range lists. */
611
612
ptr = buffer;
613
while (ptr < dst && ptr[1] < 0x100) ptr += 2;
614
if (dst - ptr < (2 * (6 - 1)))
615
{
616
cranges->range_list_size = (uint16_t)(dst + 2 - buffer);
617
return cranges;
618
}
619
620
/* Compute character lists structures. */
621
622
char_list_next = char_list_starts;
623
char_list_start = *char_list_next++;
624
#if PCRE2_CODE_UNIT_WIDTH == 32
625
char_list_end = XCL_CHAR_LIST_HIGH_32_END;
626
#elif defined SUPPORT_UNICODE
627
char_list_end = XCL_CHAR_LIST_LOW_32_END;
628
#else
629
char_list_end = XCL_CHAR_LIST_HIGH_16_END;
630
#endif
631
next_char = (uint16_t*)(buffer + total_size);
632
633
tmp1 = 0;
634
tmp2 = ((sizeof(char_list_starts) / sizeof(uint32_t)) - 1) * XCL_TYPE_BIT_LEN;
635
PCRE2_ASSERT(tmp2 <= 3 * XCL_TYPE_BIT_LEN && tmp2 >= XCL_TYPE_BIT_LEN);
636
range_start = dst[0];
637
range_end = dst[1];
638
639
while (TRUE)
640
{
641
if (range_start >= char_list_start)
642
{
643
if (range_start == range_end || range_end < char_list_end)
644
{
645
tmp1++;
646
next_char--;
647
648
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
649
*next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
650
else
651
*(uint32_t*)(--next_char) =
652
(range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
653
}
654
655
if (range_start < range_end)
656
{
657
if (range_start > char_list_start)
658
{
659
tmp1++;
660
next_char--;
661
662
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
663
*next_char = (uint16_t)(range_start << XCL_CHAR_SHIFT);
664
else
665
*(uint32_t*)(--next_char) = (range_start << XCL_CHAR_SHIFT);
666
}
667
else
668
cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
669
}
670
671
PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
672
673
if (dst > buffer)
674
{
675
dst -= 2;
676
range_start = dst[0];
677
range_end = dst[1];
678
continue;
679
}
680
681
range_start = 0;
682
range_end = 0;
683
}
684
685
if (range_end >= char_list_start)
686
{
687
PCRE2_ASSERT(range_start < char_list_start);
688
689
if (range_end < char_list_end)
690
{
691
tmp1++;
692
next_char--;
693
694
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
695
*next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
696
else
697
*(uint32_t*)(--next_char) =
698
(range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
699
700
PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
701
}
702
703
cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
704
}
705
706
if (tmp1 >= XCL_ITEM_COUNT_MASK)
707
{
708
cranges->char_lists_types |= XCL_ITEM_COUNT_MASK << tmp2;
709
next_char--;
710
711
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
712
*next_char = (uint16_t)tmp1;
713
else
714
*(uint32_t*)(--next_char) = tmp1;
715
}
716
else
717
cranges->char_lists_types |= tmp1 << tmp2;
718
719
if (range_start < XCL_CHAR_LIST_LOW_16_START) break;
720
721
PCRE2_ASSERT(tmp2 >= XCL_TYPE_BIT_LEN);
722
char_list_end = char_list_start - 1;
723
char_list_start = *char_list_next++;
724
tmp1 = 0;
725
tmp2 -= XCL_TYPE_BIT_LEN;
726
}
727
728
if (dst[0] < XCL_CHAR_LIST_LOW_16_START) dst += 2;
729
PCRE2_ASSERT((uint16_t*)dst <= next_char);
730
731
cranges->char_lists_size =
732
(size_t)((uint8_t*)(buffer + total_size) - (uint8_t*)next_char);
733
cranges->char_lists_start = (size_t)((uint8_t*)next_char - (uint8_t*)buffer);
734
cranges->range_list_size = (uint16_t)(dst - buffer);
735
return cranges;
736
}
737
738
#endif /* SUPPORT_WIDE_CHARS */
739
740
#ifdef SUPPORT_UNICODE
741
742
/* Add the characters 0..255 that have (or, when negated, do not have) a
given Unicode property to a 256-bit class bitmap.

Arguments:
  ptype       the property type (PT_xxx)
  pdata       the property data; its meaning depends on ptype
  negated     TRUE to add the characters that do not match
  classbits   the 32-byte bitmap; bits are only ever set, never cleared

PT_ANY is special: when not negated every bit is set, and when negated the
bitmap is left untouched.

Update PRIV(xclass) when this function is changed. */
void PRIV(update_classbits)(uint32_t ptype, uint32_t pdata, BOOL negated,
    uint8_t *classbits)
{
    int c, chartype;
    const ucd_record *prop;
    uint32_t gentype;
    BOOL set_bit;

    if (ptype == PT_ANY) {
        if (!negated) memset(classbits, 0xff, 32);
        return;
    }

    for (c = 0; c < 256; c++) {
        prop = GET_UCD(c);
        set_bit = FALSE;
        (void)set_bit;

        switch (ptype) {
        case PT_LAMP:
            chartype = prop->chartype;
            set_bit = (chartype == ucp_Lu || chartype == ucp_Ll ||
                chartype == ucp_Lt);
            break;

        case PT_GC:
            set_bit = (PRIV(ucp_gentype)[prop->chartype] == pdata);
            break;

        case PT_PC:
            set_bit = (prop->chartype == pdata);
            break;

        case PT_SC:
            set_bit = (prop->script == pdata);
            break;

        case PT_SCX:
            set_bit = (prop->script == pdata ||
                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
                    pdata) != 0);
            break;

        case PT_ALNUM:
            gentype = PRIV(ucp_gentype)[prop->chartype];
            set_bit = (gentype == ucp_L || gentype == ucp_N);
            break;

        case PT_SPACE:    /* Perl space */
        case PT_PXSPACE:  /* POSIX space */
            switch (c) {
            HSPACE_BYTE_CASES:
            VSPACE_BYTE_CASES:
                set_bit = TRUE;
                break;

            default:
                set_bit = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z);
                break;
            }
            break;

        case PT_WORD:
            chartype = prop->chartype;
            gentype = PRIV(ucp_gentype)[chartype];
            set_bit = (gentype == ucp_L || gentype == ucp_N ||
                chartype == ucp_Mn || chartype == ucp_Pc);
            break;

        case PT_UCNC:
            set_bit = (c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
                c == CHAR_GRAVE_ACCENT || c >= 0xa0);
            break;

        case PT_BIDICL:
            set_bit = (UCD_BIDICLASS_PROP(prop) == pdata);
            break;

        case PT_BOOL:
            set_bit = MAPBIT(PRIV(ucd_boolprop_sets) +
                UCD_BPROPS_PROP(prop), pdata) != 0;
            break;

        case PT_PXGRAPH:
            chartype = prop->chartype;
            gentype = PRIV(ucp_gentype)[chartype];
            set_bit = (gentype != ucp_Z &&
                (gentype != ucp_C || chartype == ucp_Cf));
            break;

        case PT_PXPRINT:
            chartype = prop->chartype;
            set_bit = (chartype != ucp_Zl && chartype != ucp_Zp &&
                (PRIV(ucp_gentype)[chartype] != ucp_C ||
                    chartype == ucp_Cf));
            break;

        case PT_PXPUNCT:
            gentype = PRIV(ucp_gentype)[prop->chartype];
            set_bit = (gentype == ucp_P || (c < 128 && gentype == ucp_S));
            break;

        default:
            PCRE2_ASSERT(ptype == PT_PXXDIGIT);
            set_bit = (c >= CHAR_0 && c <= CHAR_9) ||
                      (c >= CHAR_A && c <= CHAR_F) ||
                      (c >= CHAR_a && c <= CHAR_f);
            break;
        }

        if (negated) set_bit = !set_bit;

        /* Byte c / 8 of the bitmap holds the bit of character c. */
        if (set_bit) classbits[c >> 3] |= (uint8_t)(1 << (c & 0x7));
    }
}
858
859
#endif /* SUPPORT_UNICODE */
860
861
862
863
#ifdef SUPPORT_WIDE_CHARS
864
865
/*************************************************
866
* XClass related properties *
867
*************************************************/
868
869
/* Flag bits describing the XClass that is being built. */

/* An XClass needs to be generated. */
#define XCLASS_REQUIRED        0x1
/* The XClass has 8-bit characters. */
#define XCLASS_HAS_8BIT_CHARS  0x2
/* The XClass has properties. */
#define XCLASS_HAS_PROPS       0x4
/* The XClass has character lists. */
#define XCLASS_HAS_CHAR_LISTS  0x8
/* The XClass matches all characters >= 256. */
#define XCLASS_HIGH_ANY        0x10
879
880
#endif
881
882
883
/*************************************************
884
* Internal entry point for add range to class *
885
*************************************************/
886
887
/* This function sets the overall range for characters < 256.
888
It also handles non-utf case folding.
889
890
Arguments:
891
options the options bits
892
xoptions the extra options bits
893
cb compile data
894
start start of range character
895
end end of range character
896
897
Returns: cb->classbits is updated
898
*/
899
900
static void
901
add_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
902
uint32_t start, uint32_t end)
903
{
904
uint8_t *classbits = cb->classbits.classbits;
905
uint32_t c, byte_start, byte_end;
906
uint32_t classbits_end = (end <= 0xff ? end : 0xff);
907
908
/* If caseless matching is required, scan the range and process alternate
909
cases. In Unicode, there are 8-bit characters that have alternate cases that
910
are greater than 255 and vice-versa (though these may be ignored if caseless
911
restriction is in force). Sometimes we can just extend the original range. */
912
913
if ((options & PCRE2_CASELESS) != 0)
914
{
915
#ifdef SUPPORT_UNICODE
916
/* UTF mode. This branch is taken if we don't support wide characters (e.g.
917
8-bit library, without UTF), but we do treat those characters as Unicode
918
(if UCP flag is set). In this case, we only need to expand the character class
919
set to include the case pairs which are in the 0-255 codepoint range. */
920
if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
921
{
922
BOOL turkish_i = (xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
923
PCRE2_EXTRA_TURKISH_CASING;
924
if (start < 128)
925
{
926
uint32_t lo_end = (classbits_end < 127 ? classbits_end : 127);
927
for (c = start; c <= lo_end; c++)
928
{
929
if (turkish_i && UCD_ANY_I(c)) continue;
930
SETBIT(classbits, cb->fcc[c]);
931
}
932
}
933
if (classbits_end >= 128)
934
{
935
uint32_t hi_start = (start > 128 ? start : 128);
936
for (c = hi_start; c <= classbits_end; c++)
937
{
938
uint32_t co = UCD_OTHERCASE(c);
939
if (co <= 0xff) SETBIT(classbits, co);
940
}
941
}
942
}
943
944
else
945
#endif /* SUPPORT_UNICODE */
946
947
/* Not UTF mode */
948
{
949
for (c = start; c <= classbits_end; c++)
950
SETBIT(classbits, cb->fcc[c]);
951
}
952
}
953
954
/* Use the bitmap for characters < 256. Otherwise use extra data. */
955
956
byte_start = (start + 7) >> 3;
957
byte_end = (classbits_end + 1) >> 3;
958
959
if (byte_start >= byte_end)
960
{
961
for (c = start; c <= classbits_end; c++)
962
/* Regardless of start, c will always be <= 255. */
963
SETBIT(classbits, c);
964
return;
965
}
966
967
for (c = byte_start; c < byte_end; c++)
968
classbits[c] = 0xff;
969
970
byte_start <<= 3;
971
byte_end <<= 3;
972
973
for (c = start; c < byte_start; c++)
974
SETBIT(classbits, c);
975
976
for (c = byte_end; c <= classbits_end; c++)
977
SETBIT(classbits, c);
978
}
979
980
981
#if PCRE2_CODE_UNIT_WIDTH == 8
982
/*************************************************
983
* Internal entry point for add list to class *
984
*************************************************/
985
986
/* This function is used for adding a list of horizontal or vertical whitespace
987
characters to a class. The list must be in order so that ranges of characters
988
can be detected and handled appropriately. This function sets the overall range
989
so that the internal functions can try to avoid duplication when handling
990
case-independence.
991
992
Arguments:
993
options the options bits
994
xoptions the extra options bits
995
cb contains pointers to tables etc.
996
p points to row of 32-bit values, terminated by NOTACHAR
997
998
Returns: cb->classbits is updated
999
*/
1000
1001
static void
1002
add_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
1003
const uint32_t *p)
1004
{
1005
while (p[0] < 256)
1006
{
1007
unsigned int n = 0;
1008
1009
while(p[n+1] == p[0] + n + 1) n++;
1010
add_to_class(options, xoptions, cb, p[0], p[n]);
1011
1012
p += n + 1;
1013
}
1014
}
1015
1016
1017
1018
/*************************************************
1019
* Add characters not in a list to a class *
1020
*************************************************/
1021
1022
/* This function is used for adding the complement of a list of horizontal or
1023
vertical whitespace to a class. The list must be in order.
1024
1025
Arguments:
1026
options the options bits
1027
xoptions the extra options bits
1028
cb contains pointers to tables etc.
1029
p points to row of 32-bit values, terminated by NOTACHAR
1030
1031
Returns: cb->classbits is updated
1032
*/
1033
1034
static void
1035
add_not_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
1036
const uint32_t *p)
1037
{
1038
if (p[0] > 0)
1039
add_to_class(options, xoptions, cb, 0, p[0] - 1);
1040
while (p[0] < 256)
1041
{
1042
while (p[1] == p[0] + 1) p++;
1043
add_to_class(options, xoptions, cb, p[0] + 1, (p[1] > 255) ? 255 : p[1] - 1);
1044
p++;
1045
}
1046
}
1047
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
1048
1049
1050
1051
/*************************************************
1052
* Main entry-point to compile a character class *
1053
*************************************************/
1054
1055
/* This function consumes a "leaf", which is a set of characters that will
1056
become a single OP_CLASS, OP_NCLASS, OP_XCLASS, or OP_ALLANY. */
1057
1058
uint32_t *
1059
PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions,
1060
uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL* has_bitmap,
1061
int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr)
1062
{
1063
uint32_t *pptr = start_ptr;
1064
PCRE2_UCHAR *code = *pcode;
1065
BOOL should_flip_negation;
1066
const uint8_t *cbits = cb->cbits;
1067
/* Some functions such as add_to_class() or eclass processing
1068
expects that the bitset is stored in cb->classbits.classbits. */
1069
uint8_t *const classbits = cb->classbits.classbits;
1070
1071
#ifdef SUPPORT_UNICODE
1072
BOOL utf = (options & PCRE2_UTF) != 0;
1073
#else /* No Unicode support */
1074
BOOL utf = FALSE;
1075
#endif
1076
1077
/* Helper variables for OP_XCLASS opcode (for characters > 255). */
1078
1079
#ifdef SUPPORT_WIDE_CHARS
1080
uint32_t xclass_props;
1081
PCRE2_UCHAR *class_uchardata;
1082
class_ranges* cranges;
1083
#endif
1084
1085
/* If an XClass contains a negative special such as \S, we need to flip the
1086
negation flag at the end, so that support for characters > 255 works correctly
1087
(they are all included in the class). An XClass may need to insert specific
1088
matching or non-matching code for wide characters.
1089
*/
1090
1091
should_flip_negation = FALSE;
1092
1093
/* XClass will be used when characters > 255 might match. */
1094
1095
#ifdef SUPPORT_WIDE_CHARS
1096
xclass_props = 0;
1097
1098
#if PCRE2_CODE_UNIT_WIDTH == 8
1099
cranges = NULL;
1100
1101
if (utf)
1102
#endif
1103
{
1104
if (lengthptr != NULL)
1105
{
1106
cranges = compile_optimize_class(pptr, options, xoptions, cb);
1107
1108
if (cranges == NULL)
1109
{
1110
*errorcodeptr = ERR21;
1111
return NULL;
1112
}
1113
1114
/* Caching the pre-processed character ranges. */
1115
if (cb->next_cranges != NULL)
1116
cb->next_cranges->next = cranges;
1117
else
1118
cb->cranges = cranges;
1119
1120
cb->next_cranges = cranges;
1121
}
1122
else
1123
{
1124
/* Reuse the pre-processed character ranges. */
1125
cranges = cb->cranges;
1126
PCRE2_ASSERT(cranges != NULL);
1127
cb->cranges = cranges->next;
1128
}
1129
1130
if (cranges->range_list_size > 0)
1131
{
1132
const uint32_t *ranges = (const uint32_t*)(cranges + 1);
1133
1134
if (ranges[0] <= 255)
1135
xclass_props |= XCLASS_HAS_8BIT_CHARS;
1136
1137
if (ranges[cranges->range_list_size - 1] == GET_MAX_CHAR_VALUE(utf) &&
1138
ranges[cranges->range_list_size - 2] <= 256)
1139
xclass_props |= XCLASS_HIGH_ANY;
1140
}
1141
}
1142
1143
class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
1144
#endif /* SUPPORT_WIDE_CHARS */
1145
1146
/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
1147
in a temporary bit of memory, in case the class contains fewer than two
1148
8-bit characters because in that case the compiled code doesn't use the bit
1149
map. */
1150
1151
memset(classbits, 0, 32);
1152
1153
/* Process items until end_ptr is reached. */
1154
1155
while (TRUE)
1156
{
1157
uint32_t meta = *(pptr++);
1158
BOOL local_negate;
1159
int posix_class;
1160
int taboffset, tabopt;
1161
class_bits_storage pbits;
1162
uint32_t escape, c;
1163
1164
/* Handle POSIX classes such as [:alpha:] etc. */
1165
switch (META_CODE(meta))
1166
{
1167
case META_POSIX:
1168
case META_POSIX_NEG:
1169
1170
local_negate = (meta == META_POSIX_NEG);
1171
posix_class = *(pptr++);
1172
1173
if (local_negate) should_flip_negation = TRUE; /* Note negative special */
1174
1175
/* If matching is caseless, upper and lower are converted to alpha.
1176
This relies on the fact that the class table starts with alpha,
1177
lower, upper as the first 3 entries. */
1178
1179
if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
1180
posix_class = 0;
1181
1182
/* When PCRE2_UCP is set, some of the POSIX classes are converted to
1183
different escape sequences that use Unicode properties \p or \P.
1184
Others that are not available via \p or \P have to generate
1185
XCL_PROP/XCL_NOTPROP directly, which is done here. */
1186
1187
#ifdef SUPPORT_UNICODE
1188
/* TODO This entire block of code here appears to be unreachable!? I simply
1189
can't see how it can be hit, given that the frontend parser doesn't emit
1190
META_POSIX for GRAPH/PRINT/PUNCT when UCP is set. */
1191
if ((options & PCRE2_UCP) != 0 &&
1192
(xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
1193
{
1194
uint32_t ptype;
1195
1196
switch(posix_class)
1197
{
1198
case PC_GRAPH:
1199
case PC_PRINT:
1200
case PC_PUNCT:
1201
ptype = (posix_class == PC_GRAPH)? PT_PXGRAPH :
1202
(posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT;
1203
1204
PRIV(update_classbits)(ptype, 0, local_negate, classbits);
1205
1206
if ((xclass_props & XCLASS_HIGH_ANY) == 0)
1207
{
1208
if (lengthptr != NULL)
1209
*lengthptr += 3;
1210
else
1211
{
1212
*class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
1213
*class_uchardata++ = (PCRE2_UCHAR)ptype;
1214
*class_uchardata++ = 0;
1215
}
1216
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
1217
}
1218
continue;
1219
1220
/* For the other POSIX classes (ex: ascii) we are going to
1221
fall through to the non-UCP case and build a bit map for
1222
characters with code points less than 256. However, if we are in
1223
a negated POSIX class, characters with code points greater than
1224
255 must either all match or all not match, depending on whether
1225
the whole class is not or is negated. For example, for
1226
[[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
1227
they must not.
1228
1229
In the special case where there are no xclass items, this is
1230
automatically handled by the use of OP_CLASS or OP_NCLASS, but an
1231
explicit range is needed for OP_XCLASS. Setting a flag here
1232
causes the range to be generated later when it is known that
1233
OP_XCLASS is required. In the 8-bit library this is relevant only in
1234
utf mode, since no wide characters can exist otherwise. */
1235
1236
default:
1237
break;
1238
}
1239
}
1240
#endif /* SUPPORT_UNICODE */
1241
1242
/* In the non-UCP case, or when UCP makes no difference, we build the
1243
bit map for the POSIX class in a chunk of local store because we may
1244
be adding and subtracting from it, and we don't want to subtract bits
1245
that may be in the main map already. At the end we or the result into
1246
the bit map that is being built. */
1247
1248
posix_class *= 3;
1249
1250
/* Copy in the first table (always present) */
1251
1252
memcpy(pbits.classbits, cbits + PRIV(posix_class_maps)[posix_class], 32);
1253
1254
/* If there is a second table, add or remove it as required. */
1255
1256
taboffset = PRIV(posix_class_maps)[posix_class + 1];
1257
tabopt = PRIV(posix_class_maps)[posix_class + 2];
1258
1259
if (taboffset >= 0)
1260
{
1261
if (tabopt >= 0)
1262
for (int i = 0; i < 32; i++)
1263
pbits.classbits[i] |= cbits[i + taboffset];
1264
else
1265
for (int i = 0; i < 32; i++)
1266
pbits.classbits[i] &= (uint8_t)(~cbits[i + taboffset]);
1267
}
1268
1269
/* Now see if we need to remove any special characters. An option
1270
value of 1 removes vertical space and 2 removes underscore. */
1271
1272
if (tabopt < 0) tabopt = -tabopt;
1273
if (tabopt == 1) pbits.classbits[1] &= ~0x3c;
1274
else if (tabopt == 2) pbits.classbits[11] &= 0x7f;
1275
1276
/* Add the POSIX table or its complement into the main table that is
1277
being built and we are done. */
1278
1279
{
1280
uint32_t *classwords = cb->classbits.classwords;
1281
1282
if (local_negate)
1283
for (int i = 0; i < 8; i++)
1284
classwords[i] |= (uint32_t)(~pbits.classwords[i]);
1285
else
1286
for (int i = 0; i < 8; i++)
1287
classwords[i] |= pbits.classwords[i];
1288
}
1289
1290
#ifdef SUPPORT_WIDE_CHARS
1291
/* Every class contains at least one < 256 character. */
1292
xclass_props |= XCLASS_HAS_8BIT_CHARS;
1293
#endif
1294
continue; /* End of POSIX handling */
1295
1296
/* Other than POSIX classes, the only items we should encounter are
1297
\d-type escapes and literal characters (possibly as ranges). */
1298
case META_BIGVALUE:
1299
meta = *(pptr++);
1300
break;
1301
1302
case META_ESCAPE:
1303
escape = META_DATA(meta);
1304
1305
switch(escape)
1306
{
1307
case ESC_d:
1308
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
1309
break;
1310
1311
case ESC_D:
1312
should_flip_negation = TRUE;
1313
for (int i = 0; i < 32; i++)
1314
classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
1315
break;
1316
1317
case ESC_w:
1318
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
1319
break;
1320
1321
case ESC_W:
1322
should_flip_negation = TRUE;
1323
for (int i = 0; i < 32; i++)
1324
classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
1325
break;
1326
1327
/* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
1328
5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
1329
previously set by something earlier in the character class.
1330
Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
1331
we could just adjust the appropriate bit. From PCRE 8.34 we no
1332
longer treat \s and \S specially. */
1333
1334
case ESC_s:
1335
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
1336
break;
1337
1338
case ESC_S:
1339
should_flip_negation = TRUE;
1340
for (int i = 0; i < 32; i++)
1341
classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
1342
break;
1343
1344
/* When adding the horizontal or vertical space lists to a class, or
1345
their complements, disable PCRE2_CASELESS, because it just wastes
1346
time, and in the "not-x" UTF cases can create unwanted duplicates in
1347
the XCLASS list (provoked by characters that have more than one other
1348
case and by both cases being in the same "not-x" sublist). */
1349
1350
case ESC_h:
1351
#if PCRE2_CODE_UNIT_WIDTH == 8
1352
#ifdef SUPPORT_UNICODE
1353
if (cranges != NULL) break;
1354
#endif
1355
add_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1356
cb, PRIV(hspace_list));
1357
#else
1358
PCRE2_ASSERT(cranges != NULL);
1359
#endif
1360
break;
1361
1362
case ESC_H:
1363
#if PCRE2_CODE_UNIT_WIDTH == 8
1364
#ifdef SUPPORT_UNICODE
1365
if (cranges != NULL) break;
1366
#endif
1367
add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1368
cb, PRIV(hspace_list));
1369
#else
1370
PCRE2_ASSERT(cranges != NULL);
1371
#endif
1372
break;
1373
1374
case ESC_v:
1375
#if PCRE2_CODE_UNIT_WIDTH == 8
1376
#ifdef SUPPORT_UNICODE
1377
if (cranges != NULL) break;
1378
#endif
1379
add_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1380
cb, PRIV(vspace_list));
1381
#else
1382
PCRE2_ASSERT(cranges != NULL);
1383
#endif
1384
break;
1385
1386
case ESC_V:
1387
#if PCRE2_CODE_UNIT_WIDTH == 8
1388
#ifdef SUPPORT_UNICODE
1389
if (cranges != NULL) break;
1390
#endif
1391
add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,
1392
cb, PRIV(vspace_list));
1393
#else
1394
PCRE2_ASSERT(cranges != NULL);
1395
#endif
1396
break;
1397
1398
/* If Unicode is not supported, \P and \p are not allowed and are
1399
faulted at parse time, so will never appear here. */
1400
1401
#ifdef SUPPORT_UNICODE
1402
case ESC_p:
1403
case ESC_P:
1404
{
1405
uint32_t ptype = *pptr >> 16;
1406
uint32_t pdata = *(pptr++) & 0xffff;
1407
1408
/* The "Any" is processed by PRIV(update_classbits)(). */
1409
if (ptype == PT_ANY)
1410
{
1411
#if PCRE2_CODE_UNIT_WIDTH == 8
1412
if (!utf && escape == ESC_p) memset(classbits, 0xff, 32);
1413
#endif
1414
continue;
1415
}
1416
1417
PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits);
1418
1419
if ((xclass_props & XCLASS_HIGH_ANY) == 0)
1420
{
1421
if (lengthptr != NULL)
1422
*lengthptr += 3;
1423
else
1424
{
1425
*class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
1426
*class_uchardata++ = ptype;
1427
*class_uchardata++ = pdata;
1428
}
1429
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
1430
}
1431
}
1432
continue;
1433
#endif
1434
}
1435
1436
#ifdef SUPPORT_WIDE_CHARS
1437
/* Every non-property class contains at least one < 256 character. */
1438
xclass_props |= XCLASS_HAS_8BIT_CHARS;
1439
#endif
1440
/* End handling \d-type escapes */
1441
continue;
1442
1443
CLASS_END_CASES(meta)
1444
/* Literals. */
1445
if (meta < META_END) break;
1446
/* Non-literals: end of class contents. */
1447
goto END_PROCESSING;
1448
}
1449
1450
/* A literal character may be followed by a range meta. At parse time
1451
there are checks for out-of-order characters, for ranges where the two
1452
characters are equal, and for hyphens that cannot indicate a range. At
1453
this point, therefore, no checking is needed. */
1454
1455
c = meta;
1456
1457
/* Remember if \r or \n were explicitly used */
1458
1459
if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
1460
1461
/* Process a character range */
1462
1463
if (*pptr == META_RANGE_LITERAL || *pptr == META_RANGE_ESCAPED)
1464
{
1465
uint32_t d;
1466
1467
#ifdef EBCDIC
1468
BOOL range_is_literal = (*pptr == META_RANGE_LITERAL);
1469
#endif
1470
++pptr;
1471
d = *(pptr++);
1472
if (d == META_BIGVALUE) d = *(pptr++);
1473
1474
/* Remember an explicit \r or \n, and add the range to the class. */
1475
1476
if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
1477
1478
#if PCRE2_CODE_UNIT_WIDTH == 8
1479
#ifdef SUPPORT_UNICODE
1480
if (cranges != NULL) continue;
1481
xclass_props |= XCLASS_HAS_8BIT_CHARS;
1482
#endif
1483
1484
/* In an EBCDIC environment, Perl treats alphabetic ranges specially
1485
because there are holes in the encoding, and simply using the range
1486
A-Z (for example) would include the characters in the holes. This
1487
applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
1488
1489
#ifdef EBCDIC
1490
if (range_is_literal &&
1491
(cb->ctypes[c] & ctype_letter) != 0 &&
1492
(cb->ctypes[d] & ctype_letter) != 0 &&
1493
(c <= CHAR_z) == (d <= CHAR_z))
1494
{
1495
uint32_t uc = (d <= CHAR_z)? 0 : 64;
1496
uint32_t C = c - uc;
1497
uint32_t D = d - uc;
1498
1499
if (C <= CHAR_i)
1500
{
1501
add_to_class(options, xoptions, cb, C + uc,
1502
((D < CHAR_i)? D : CHAR_i) + uc);
1503
C = CHAR_j;
1504
}
1505
1506
if (C <= D && C <= CHAR_r)
1507
{
1508
add_to_class(options, xoptions, cb, C + uc,
1509
((D < CHAR_r)? D : CHAR_r) + uc);
1510
C = CHAR_s;
1511
}
1512
1513
if (C <= D)
1514
add_to_class(options, xoptions, cb, C + uc, D + uc);
1515
}
1516
else
1517
#endif
1518
/* Not an EBCDIC special range */
1519
1520
add_to_class(options, xoptions, cb, c, d);
1521
#else
1522
PCRE2_ASSERT(cranges != NULL);
1523
#endif
1524
continue;
1525
} /* End of range handling */
1526
1527
/* Character ranges are ignored when class_ranges is present. */
1528
#if PCRE2_CODE_UNIT_WIDTH == 8
1529
#ifdef SUPPORT_UNICODE
1530
if (cranges != NULL) continue;
1531
xclass_props |= XCLASS_HAS_8BIT_CHARS;
1532
#endif
1533
/* Handle a single character. */
1534
1535
add_to_class(options, xoptions, cb, meta, meta);
1536
#else
1537
PCRE2_ASSERT(cranges != NULL);
1538
#endif
1539
} /* End of main class-processing loop */
1540
1541
END_PROCESSING:
1542
1543
#ifdef SUPPORT_WIDE_CHARS
1544
PCRE2_ASSERT((xclass_props & XCLASS_HAS_PROPS) == 0 ||
1545
(xclass_props & XCLASS_HIGH_ANY) == 0);
1546
1547
if (cranges != NULL)
1548
{
1549
uint32_t *range = (uint32_t*)(cranges + 1);
1550
uint32_t *end = range + cranges->range_list_size;
1551
1552
while (range < end && range[0] < 256)
1553
{
1554
PCRE2_ASSERT((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0);
1555
/* Add range to bitset. If we are in UTF or UCP mode, then clear the
1556
caseless bit, because the cranges handle caselessness (only) in this
1557
condition; see the condition for PARSE_CLASS_CASELESS_UTF in
1558
compile_optimize_class(). */
1559
add_to_class(((options & (PCRE2_UTF|PCRE2_UCP)) != 0)?
1560
(options & ~PCRE2_CASELESS) : options, xoptions, cb, range[0], range[1]);
1561
1562
if (range[1] > 255) break;
1563
range += 2;
1564
}
1565
1566
if (cranges->char_lists_size > 0)
1567
{
1568
/* The cranges structure is still used and freed later. */
1569
PCRE2_ASSERT((xclass_props & XCLASS_HIGH_ANY) == 0);
1570
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_CHAR_LISTS;
1571
}
1572
else
1573
{
1574
if ((xclass_props & XCLASS_HIGH_ANY) != 0)
1575
{
1576
PCRE2_ASSERT(range + 2 == end && range[0] <= 256 &&
1577
range[1] >= GET_MAX_CHAR_VALUE(utf));
1578
should_flip_negation = TRUE;
1579
range = end;
1580
}
1581
1582
while (range < end)
1583
{
1584
uint32_t range_start = range[0];
1585
uint32_t range_end = range[1];
1586
1587
range += 2;
1588
xclass_props |= XCLASS_REQUIRED;
1589
1590
if (range_start < 256) range_start = 256;
1591
1592
if (lengthptr != NULL)
1593
{
1594
#ifdef SUPPORT_UNICODE
1595
if (utf)
1596
{
1597
*lengthptr += 1;
1598
1599
if (range_start < range_end)
1600
*lengthptr += PRIV(ord2utf)(range_start, class_uchardata);
1601
1602
*lengthptr += PRIV(ord2utf)(range_end, class_uchardata);
1603
continue;
1604
}
1605
#endif /* SUPPORT_UNICODE */
1606
1607
*lengthptr += range_start < range_end ? 3 : 2;
1608
continue;
1609
}
1610
1611
#ifdef SUPPORT_UNICODE
1612
if (utf)
1613
{
1614
if (range_start < range_end)
1615
{
1616
*class_uchardata++ = XCL_RANGE;
1617
class_uchardata += PRIV(ord2utf)(range_start, class_uchardata);
1618
}
1619
else
1620
*class_uchardata++ = XCL_SINGLE;
1621
1622
class_uchardata += PRIV(ord2utf)(range_end, class_uchardata);
1623
continue;
1624
}
1625
#endif /* SUPPORT_UNICODE */
1626
1627
/* Without UTF support, character values are constrained
1628
by the bit length, and can only be > 256 for 16-bit and
1629
32-bit libraries. */
1630
#if PCRE2_CODE_UNIT_WIDTH != 8
1631
if (range_start < range_end)
1632
{
1633
*class_uchardata++ = XCL_RANGE;
1634
*class_uchardata++ = range_start;
1635
}
1636
else
1637
*class_uchardata++ = XCL_SINGLE;
1638
1639
*class_uchardata++ = range_end;
1640
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
1641
}
1642
1643
if (lengthptr == NULL)
1644
cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
1645
}
1646
}
1647
#endif /* SUPPORT_WIDE_CHARS */
1648
1649
/* If there are characters with values > 255, or Unicode property settings
1650
(\p or \P), we have to compile an extended class, with its own opcode,
1651
unless there were no property settings and there was a negated special such
1652
as \S in the class, and PCRE2_UCP is not set, because in that case all
1653
characters > 255 are in or not in the class, so any that were explicitly
1654
given as well can be ignored.
1655
1656
In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
1657
present in a class, we either have to match or not match all wide
1658
characters (depending on whether the whole class is or is not negated).
1659
This requirement is indicated by match_all_or_no_wide_chars being true.
1660
We do this by including an explicit range, which works in both cases.
1661
This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
1662
cannot be any wide characters in 8-bit non-UTF mode.
1663
1664
When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
1665
class where \S etc is present without PCRE2_UCP, causing an extended class
1666
to be compiled, we make sure that all characters > 255 are included by
1667
forcing match_all_or_no_wide_chars to be true.
1668
1669
If, when generating an xclass, there are no characters < 256, we can omit
1670
the bitmap in the actual compiled code. */
1671
1672
#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
1673
if ((xclass_props & XCLASS_REQUIRED) != 0)
1674
{
1675
PCRE2_UCHAR *previous = code;
1676
1677
if ((xclass_props & XCLASS_HAS_CHAR_LISTS) == 0)
1678
*class_uchardata++ = XCL_END; /* Marks the end of extra data */
1679
*code++ = OP_XCLASS;
1680
code += LINK_SIZE;
1681
*code = negate_class? XCL_NOT:0;
1682
if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP;
1683
1684
/* If the map is required, move up the extra data to make room for it;
1685
otherwise just move the code pointer to the end of the extra data. */
1686
1687
if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0 || has_bitmap != NULL)
1688
{
1689
if (negate_class)
1690
{
1691
uint32_t *classwords = cb->classbits.classwords;
1692
for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];
1693
}
1694
1695
if (has_bitmap == NULL)
1696
{
1697
*code++ |= XCL_MAP;
1698
(void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
1699
CU2BYTES(class_uchardata - code));
1700
memcpy(code, classbits, 32);
1701
code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
1702
}
1703
else
1704
{
1705
code = class_uchardata;
1706
if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0)
1707
*has_bitmap = TRUE;
1708
}
1709
}
1710
else code = class_uchardata;
1711
1712
if ((xclass_props & XCLASS_HAS_CHAR_LISTS) != 0)
1713
{
1714
/* Char lists size is an even number, because all items are 16 or 32
1715
bit values. The character list data is always aligned to 32 bits. */
1716
size_t char_lists_size = cranges->char_lists_size;
1717
PCRE2_ASSERT((char_lists_size & 0x1) == 0 &&
1718
(cb->char_lists_size & 0x3) == 0);
1719
1720
if (lengthptr != NULL)
1721
{
1722
char_lists_size = CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));
1723
1724
#if PCRE2_CODE_UNIT_WIDTH == 8
1725
*lengthptr += 2 + LINK_SIZE;
1726
#else
1727
*lengthptr += 1 + LINK_SIZE;
1728
#endif
1729
1730
cb->char_lists_size += char_lists_size;
1731
1732
char_lists_size /= sizeof(PCRE2_UCHAR);
1733
1734
/* Storage space for character lists is included
1735
in the maximum pattern size. */
1736
if (*lengthptr > MAX_PATTERN_SIZE ||
1737
MAX_PATTERN_SIZE - *lengthptr < char_lists_size)
1738
{
1739
*errorcodeptr = ERR20; /* Pattern is too large */
1740
return NULL;
1741
}
1742
}
1743
else
1744
{
1745
uint8_t *data;
1746
1747
PCRE2_ASSERT(cranges->char_lists_types <= XCL_TYPE_MASK);
1748
#if PCRE2_CODE_UNIT_WIDTH == 8
1749
/* Encode as high / low bytes. */
1750
code[0] = (uint8_t)(XCL_LIST |
1751
(cranges->char_lists_types >> 8));
1752
code[1] = (uint8_t)cranges->char_lists_types;
1753
code += 2;
1754
#else
1755
*code++ = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types);
1756
#endif
1757
1758
/* Character lists are stored in backwards direction from
1759
byte code start. The non-dfa/dfa matchers can access these
1760
lists using the byte code start stored in match blocks.
1761
Each list is aligned to 32 bit with an optional unused
1762
16 bit value at the beginning of the character list. */
1763
1764
cb->char_lists_size += char_lists_size;
1765
data = (uint8_t*)cb->start_code - cb->char_lists_size;
1766
1767
memcpy(data, (uint8_t*)(cranges + 1) + cranges->char_lists_start,
1768
char_lists_size);
1769
1770
/* Since character lists total size is less than MAX_PATTERN_SIZE,
1771
their starting offset fits into a value which size is LINK_SIZE. */
1772
1773
char_lists_size = cb->char_lists_size;
1774
PUT(code, 0, (uint32_t)(char_lists_size >> 1));
1775
code += LINK_SIZE;
1776
1777
#if defined PCRE2_DEBUG || defined SUPPORT_VALGRIND
1778
if ((char_lists_size & 0x2) != 0)
1779
{
1780
/* In debug the unused 16 bit value is set
1781
to a fixed value and marked unused. */
1782
((uint16_t*)data)[-1] = 0x5555;
1783
#ifdef SUPPORT_VALGRIND
1784
VALGRIND_MAKE_MEM_NOACCESS(data - 2, 2);
1785
#endif
1786
}
1787
#endif
1788
1789
cb->char_lists_size =
1790
CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));
1791
1792
cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
1793
}
1794
}
1795
1796
/* Now fill in the complete length of the item */
1797
1798
PUT(previous, 1, (int)(code - previous));
1799
goto DONE; /* End of class handling */
1800
}
1801
#endif /* SUPPORT_WIDE_CHARS */
1802
1803
/* If there are no characters > 255, or they are all to be included or
1804
excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
1805
whole class was negated and whether there were negative specials such as \S
1806
(non-UCP) in the class. Then copy the 32-byte map into the code vector,
1807
negating it if necessary. */
1808
1809
if (negate_class)
1810
{
1811
uint32_t *classwords = cb->classbits.classwords;
1812
1813
for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];
1814
}
1815
1816
if ((SELECT_VALUE8(!utf, 0) || negate_class != should_flip_negation) &&
1817
cb->classbits.classwords[0] == ~(uint32_t)0)
1818
{
1819
const uint32_t *classwords = cb->classbits.classwords;
1820
int i;
1821
1822
for (i = 0; i < 8; i++)
1823
if (classwords[i] != ~(uint32_t)0) break;
1824
1825
if (i == 8)
1826
{
1827
*code++ = OP_ALLANY;
1828
goto DONE; /* End of class handling */
1829
}
1830
}
1831
1832
*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
1833
memcpy(code, classbits, 32);
1834
code += 32 / sizeof(PCRE2_UCHAR);
1835
1836
DONE:
1837
*pcode = code;
1838
return pptr - 1;
1839
}
1840
1841
1842
1843
/* ===================================================================*/
1844
/* Here follows a block of ECLASS-compiling functions. You may well want to
1845
read them from top to bottom; they are ordered from leafmost (at the top) to
1846
outermost parser (at the bottom of the file). */
1847
1848
/* This function folds one operand using the negation operator.
1849
The new, combined chunk of stack code is written out to *pop_info. */
1850
1851
static void
1852
fold_negation(eclass_op_info *pop_info, PCRE2_SIZE *lengthptr,
1853
BOOL preserve_classbits)
1854
{
1855
/* If the chunk of stack code is already composed of multiple ops, we won't
1856
descend in and try and propagate the negation down the tree. (That would lead
1857
to O(n^2) compile-time, which could be exploitable with a malicious regex -
1858
although maybe that's not really too much of a worry in a library that offers
1859
an exponential-time matching function!) */
1860
1861
if (pop_info->op_single_type == 0)
1862
{
1863
if (lengthptr != NULL)
1864
*lengthptr += 1;
1865
else
1866
pop_info->code_start[pop_info->length] = ECL_NOT;
1867
pop_info->length += 1;
1868
}
1869
1870
/* Otherwise, it's a nice single-op item, so we can easily fold in the negation
1871
without needing to produce an ECL_NOT. */
1872
1873
else if (pop_info->op_single_type == ECL_ANY ||
1874
pop_info->op_single_type == ECL_NONE)
1875
{
1876
pop_info->op_single_type = (pop_info->op_single_type == ECL_NONE)?
1877
ECL_ANY : ECL_NONE;
1878
if (lengthptr == NULL)
1879
*(pop_info->code_start) = pop_info->op_single_type;
1880
}
1881
else
1882
{
1883
PCRE2_ASSERT(pop_info->op_single_type == ECL_XCLASS &&
1884
pop_info->length >= 1 + LINK_SIZE + 1);
1885
if (lengthptr == NULL)
1886
pop_info->code_start[1 + LINK_SIZE] ^= XCL_NOT;
1887
}
1888
1889
if (!preserve_classbits)
1890
{
1891
for (int i = 0; i < 8; i++)
1892
pop_info->bits.classwords[i] = ~pop_info->bits.classwords[i];
1893
}
1894
}
1895
1896
1897
1898
/* This function folds together two operands using a binary operator.
1899
The new, combined chunk of stack code is written out to *lhs_op_info. */
1900
1901
static void
1902
fold_binary(int op, eclass_op_info *lhs_op_info, eclass_op_info *rhs_op_info,
1903
PCRE2_SIZE *lengthptr)
1904
{
1905
switch (op)
1906
{
1907
/* ECL_AND truth table:
1908
1909
LHS RHS RESULT
1910
----------------
1911
ANY * RHS
1912
* ANY LHS
1913
NONE * NONE
1914
* NONE NONE
1915
X Y X & Y
1916
*/
1917
1918
case ECL_AND:
1919
if (rhs_op_info->op_single_type == ECL_ANY)
1920
{
1921
/* no-op: drop the RHS */
1922
}
1923
else if (lhs_op_info->op_single_type == ECL_ANY)
1924
{
1925
/* no-op: drop the LHS, and memmove the RHS into its place */
1926
if (lengthptr == NULL)
1927
memmove(lhs_op_info->code_start, rhs_op_info->code_start,
1928
CU2BYTES(rhs_op_info->length));
1929
lhs_op_info->length = rhs_op_info->length;
1930
lhs_op_info->op_single_type = rhs_op_info->op_single_type;
1931
}
1932
else if (rhs_op_info->op_single_type == ECL_NONE)
1933
{
1934
/* the result is ECL_NONE: write into the LHS */
1935
if (lengthptr == NULL)
1936
lhs_op_info->code_start[0] = ECL_NONE;
1937
lhs_op_info->length = 1;
1938
lhs_op_info->op_single_type = ECL_NONE;
1939
}
1940
else if (lhs_op_info->op_single_type == ECL_NONE)
1941
{
1942
/* the result is ECL_NONE: drop the RHS */
1943
}
1944
else
1945
{
1946
/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
1947
if (lengthptr != NULL)
1948
*lengthptr += 1;
1949
else
1950
{
1951
PCRE2_ASSERT(rhs_op_info->code_start ==
1952
lhs_op_info->code_start + lhs_op_info->length);
1953
rhs_op_info->code_start[rhs_op_info->length] = ECL_AND;
1954
}
1955
lhs_op_info->length += rhs_op_info->length + 1;
1956
lhs_op_info->op_single_type = 0;
1957
}
1958
1959
for (int i = 0; i < 8; i++)
1960
lhs_op_info->bits.classwords[i] &= rhs_op_info->bits.classwords[i];
1961
break;
1962
1963
/* ECL_OR truth table:
1964
1965
LHS RHS RESULT
1966
----------------
1967
ANY * ANY
1968
* ANY ANY
1969
NONE * RHS
1970
* NONE LHS
1971
X Y X | Y
1972
*/
1973
1974
case ECL_OR:
1975
if (rhs_op_info->op_single_type == ECL_NONE)
1976
{
1977
/* no-op: drop the RHS */
1978
}
1979
else if (lhs_op_info->op_single_type == ECL_NONE)
1980
{
1981
/* no-op: drop the LHS, and memmove the RHS into its place */
1982
if (lengthptr == NULL)
1983
memmove(lhs_op_info->code_start, rhs_op_info->code_start,
1984
CU2BYTES(rhs_op_info->length));
1985
lhs_op_info->length = rhs_op_info->length;
1986
lhs_op_info->op_single_type = rhs_op_info->op_single_type;
1987
}
1988
else if (rhs_op_info->op_single_type == ECL_ANY)
1989
{
1990
/* the result is ECL_ANY: write into the LHS */
1991
if (lengthptr == NULL)
1992
lhs_op_info->code_start[0] = ECL_ANY;
1993
lhs_op_info->length = 1;
1994
lhs_op_info->op_single_type = ECL_ANY;
1995
}
1996
else if (lhs_op_info->op_single_type == ECL_ANY)
1997
{
1998
/* the result is ECL_ANY: drop the RHS */
1999
}
2000
else
2001
{
2002
/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
2003
if (lengthptr != NULL)
2004
*lengthptr += 1;
2005
else
2006
{
2007
PCRE2_ASSERT(rhs_op_info->code_start ==
2008
lhs_op_info->code_start + lhs_op_info->length);
2009
rhs_op_info->code_start[rhs_op_info->length] = ECL_OR;
2010
}
2011
lhs_op_info->length += rhs_op_info->length + 1;
2012
lhs_op_info->op_single_type = 0;
2013
}
2014
2015
for (int i = 0; i < 8; i++)
2016
lhs_op_info->bits.classwords[i] |= rhs_op_info->bits.classwords[i];
2017
break;
2018
2019
/* ECL_XOR truth table:
2020
2021
LHS RHS RESULT
2022
----------------
2023
ANY * !RHS
2024
* ANY !LHS
2025
NONE * RHS
2026
* NONE LHS
2027
X Y X ^ Y
2028
*/
2029
2030
case ECL_XOR:
2031
if (rhs_op_info->op_single_type == ECL_NONE)
2032
{
2033
/* no-op: drop the RHS */
2034
}
2035
else if (lhs_op_info->op_single_type == ECL_NONE)
2036
{
2037
/* no-op: drop the LHS, and memmove the RHS into its place */
2038
if (lengthptr == NULL)
2039
memmove(lhs_op_info->code_start, rhs_op_info->code_start,
2040
CU2BYTES(rhs_op_info->length));
2041
lhs_op_info->length = rhs_op_info->length;
2042
lhs_op_info->op_single_type = rhs_op_info->op_single_type;
2043
}
2044
else if (rhs_op_info->op_single_type == ECL_ANY)
2045
{
2046
/* the result is !LHS: fold in the negation, and drop the RHS */
2047
/* Preserve the classbits, because we promise to deal with them later. */
2048
fold_negation(lhs_op_info, lengthptr, TRUE);
2049
}
2050
else if (lhs_op_info->op_single_type == ECL_ANY)
2051
{
2052
/* the result is !RHS: drop the LHS, memmove the RHS into its place, and
2053
fold in the negation */
2054
if (lengthptr == NULL)
2055
memmove(lhs_op_info->code_start, rhs_op_info->code_start,
2056
CU2BYTES(rhs_op_info->length));
2057
lhs_op_info->length = rhs_op_info->length;
2058
lhs_op_info->op_single_type = rhs_op_info->op_single_type;
2059
2060
/* Preserve the classbits, because we promise to deal with them later. */
2061
fold_negation(lhs_op_info, lengthptr, TRUE);
2062
}
2063
else
2064
{
2065
/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
2066
if (lengthptr != NULL)
2067
*lengthptr += 1;
2068
else
2069
{
2070
PCRE2_ASSERT(rhs_op_info->code_start ==
2071
lhs_op_info->code_start + lhs_op_info->length);
2072
rhs_op_info->code_start[rhs_op_info->length] = ECL_XOR;
2073
}
2074
lhs_op_info->length += rhs_op_info->length + 1;
2075
lhs_op_info->op_single_type = 0;
2076
}
2077
2078
for (int i = 0; i < 8; i++)
2079
lhs_op_info->bits.classwords[i] ^= rhs_op_info->bits.classwords[i];
2080
break;
2081
2082
default:
2083
PCRE2_DEBUG_UNREACHABLE();
2084
break;
2085
}
2086
}
2087
2088
2089
2090
static BOOL
2091
compile_eclass_nested(eclass_context *context, BOOL negated,
2092
uint32_t **pptr, PCRE2_UCHAR **pcode,
2093
eclass_op_info *pop_info, PCRE2_SIZE *lengthptr);
2094
2095
/* This function consumes a group of implicitly-unioned class elements.
2096
These can be characters, ranges, properties, or nested classes, as long
2097
as they are all joined by being placed adjacently. */
2098
2099
static BOOL
2100
compile_class_operand(eclass_context *context, BOOL negated,
2101
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2102
PCRE2_SIZE *lengthptr)
2103
{
2104
uint32_t *ptr = *pptr;
2105
uint32_t *prev_ptr;
2106
PCRE2_UCHAR *code = *pcode;
2107
PCRE2_UCHAR *code_start = code;
2108
PCRE2_SIZE prev_length = (lengthptr != NULL)? *lengthptr : 0;
2109
PCRE2_SIZE extra_length;
2110
uint32_t meta = META_CODE(*ptr);
2111
2112
switch (meta)
2113
{
2114
case META_CLASS_EMPTY_NOT:
2115
case META_CLASS_EMPTY:
2116
++ptr;
2117
pop_info->length = 1;
2118
if ((meta == META_CLASS_EMPTY) == negated)
2119
{
2120
*code++ = pop_info->op_single_type = ECL_ANY;
2121
memset(pop_info->bits.classbits, 0xff, 32);
2122
}
2123
else
2124
{
2125
*code++ = pop_info->op_single_type = ECL_NONE;
2126
memset(pop_info->bits.classbits, 0, 32);
2127
}
2128
break;
2129
2130
case META_CLASS:
2131
case META_CLASS_NOT:
2132
if ((*ptr & CLASS_IS_ECLASS) != 0)
2133
{
2134
if (!compile_eclass_nested(context, negated, &ptr, &code,
2135
pop_info, lengthptr))
2136
return FALSE;
2137
2138
PCRE2_ASSERT(*ptr == META_CLASS_END);
2139
ptr++;
2140
goto DONE;
2141
}
2142
2143
ptr++;
2144
/* Fall through */
2145
2146
default:
2147
/* Scan forward characters, ranges, and properties.
2148
For example: inside [a-z_ -- m] we don't have brackets around "a-z_" but
2149
we still need to collect that fragment up into a "leaf" OP_CLASS. */
2150
2151
prev_ptr = ptr;
2152
ptr = PRIV(compile_class_not_nested)(
2153
context->options, context->xoptions, ptr, &code,
2154
(meta != META_CLASS_NOT) == negated, &context->needs_bitmap,
2155
context->errorcodeptr, context->cb, lengthptr);
2156
if (ptr == NULL) return FALSE;
2157
2158
/* We must have a 100% guarantee that ptr increases when
2159
compile_class_operand() returns, even on Release builds, so that we can
2160
statically prove our loops terminate. */
2161
if (ptr <= prev_ptr)
2162
{
2163
PCRE2_DEBUG_UNREACHABLE();
2164
return FALSE;
2165
}
2166
2167
/* If we fell through above, consume the closing ']'. */
2168
if (meta == META_CLASS || meta == META_CLASS_NOT)
2169
{
2170
PCRE2_ASSERT(*ptr == META_CLASS_END);
2171
ptr++;
2172
}
2173
2174
/* Regardless of whether (lengthptr == NULL), some data will still be written
2175
out to *pcode, which we need: we have to peek at it, to transform the opcode
2176
into the ECLASS version (since we need to hoist up the bitmaps). */
2177
PCRE2_ASSERT(code > code_start);
2178
extra_length = (lengthptr != NULL)? *lengthptr - prev_length : 0;
2179
2180
/* Easiest case: convert OP_ALLANY to ECL_ANY */
2181
2182
if (*code_start == OP_ALLANY)
2183
{
2184
PCRE2_ASSERT(code - code_start == 1 && extra_length == 0);
2185
pop_info->length = 1;
2186
*code_start = pop_info->op_single_type = ECL_ANY;
2187
memset(pop_info->bits.classbits, 0xff, 32);
2188
}
2189
2190
/* For OP_CLASS and OP_NCLASS, we hoist out the bitmap and convert to
2191
ECL_NONE / ECL_ANY respectively. */
2192
2193
else if (*code_start == OP_CLASS || *code_start == OP_NCLASS)
2194
{
2195
PCRE2_ASSERT(code - code_start == 1 + 32 / sizeof(PCRE2_UCHAR) &&
2196
extra_length == 0);
2197
pop_info->length = 1;
2198
*code_start = pop_info->op_single_type =
2199
(*code_start == OP_CLASS)? ECL_NONE : ECL_ANY;
2200
memcpy(pop_info->bits.classbits, code_start + 1, 32);
2201
/* Rewind the code pointer, but make sure we adjust *lengthptr, because we
2202
do need to reserve that space (even though we only use it temporarily). */
2203
if (lengthptr != NULL)
2204
*lengthptr += code - (code_start + 1);
2205
code = code_start + 1;
2206
2207
if (!context->needs_bitmap && *code_start == ECL_NONE)
2208
{
2209
uint32_t *classwords = pop_info->bits.classwords;
2210
2211
for (int i = 0; i < 8; i++)
2212
if (classwords[i] != 0)
2213
{
2214
context->needs_bitmap = TRUE;
2215
break;
2216
}
2217
}
2218
else
2219
context->needs_bitmap = TRUE;
2220
}
2221
2222
/* Finally, for OP_XCLASS we hoist out the bitmap (if any), and convert to
2223
ECL_XCLASS. */
2224
2225
else
2226
{
2227
PCRE2_ASSERT(*code_start == OP_XCLASS);
2228
*code_start = pop_info->op_single_type = ECL_XCLASS;
2229
2230
PCRE2_ASSERT(code - code_start >= 1 + LINK_SIZE + 1);
2231
2232
memcpy(pop_info->bits.classbits, context->cb->classbits.classbits, 32);
2233
pop_info->length = (code - code_start) + extra_length;
2234
}
2235
2236
break;
2237
} /* End of switch(meta) */
2238
2239
pop_info->code_start = (lengthptr == NULL)? code_start : NULL;
2240
2241
if (lengthptr != NULL)
2242
{
2243
*lengthptr += code - code_start;
2244
code = code_start;
2245
}
2246
2247
DONE:
2248
PCRE2_ASSERT(lengthptr == NULL || (code == code_start));
2249
2250
*pptr = ptr;
2251
*pcode = code;
2252
return TRUE;
2253
}
2254
2255
2256
2257
/* This function consumes a group of implicitly-unioned class elements.
2258
These can be characters, ranges, properties, or nested classes, as long
2259
as they are all joined by being placed adjacently. */
2260
2261
static BOOL
2262
compile_class_juxtaposition(eclass_context *context, BOOL negated,
2263
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2264
PCRE2_SIZE *lengthptr)
2265
{
2266
uint32_t *ptr = *pptr;
2267
PCRE2_UCHAR *code = *pcode;
2268
#ifdef PCRE2_DEBUG
2269
PCRE2_UCHAR *start_code = *pcode;
2270
#endif
2271
2272
/* See compile_class_binary_loose() for comments on compile-time folding of
2273
the "negated" flag. */
2274
2275
/* Because it's a non-empty class, there must be an operand at the start. */
2276
if (!compile_class_operand(context, negated, &ptr, &code, pop_info, lengthptr))
2277
return FALSE;
2278
2279
while (*ptr != META_CLASS_END &&
2280
!(*ptr >= META_ECLASS_AND && *ptr <= META_ECLASS_NOT))
2281
{
2282
uint32_t op;
2283
BOOL rhs_negated;
2284
eclass_op_info rhs_op_info;
2285
2286
if (negated)
2287
{
2288
/* !(A juxtapose B) -> !A && !B */
2289
op = ECL_AND;
2290
rhs_negated = TRUE;
2291
}
2292
else
2293
{
2294
/* A juxtapose B -> A || B */
2295
op = ECL_OR;
2296
rhs_negated = FALSE;
2297
}
2298
2299
/* An operand must follow the operator. */
2300
if (!compile_class_operand(context, rhs_negated, &ptr, &code,
2301
&rhs_op_info, lengthptr))
2302
return FALSE;
2303
2304
/* Convert infix to postfix (RPN). */
2305
fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2306
if (lengthptr == NULL)
2307
code = pop_info->code_start + pop_info->length;
2308
}
2309
2310
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2311
2312
*pptr = ptr;
2313
*pcode = code;
2314
return TRUE;
2315
}
2316
2317
2318
2319
/* This function consumes unary prefix operators. */
2320
2321
static BOOL
2322
compile_class_unary(eclass_context *context, BOOL negated,
2323
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2324
PCRE2_SIZE *lengthptr)
2325
{
2326
uint32_t *ptr = *pptr;
2327
#ifdef PCRE2_DEBUG
2328
PCRE2_UCHAR *start_code = *pcode;
2329
#endif
2330
2331
while (*ptr == META_ECLASS_NOT)
2332
{
2333
++ptr;
2334
negated = !negated;
2335
}
2336
2337
*pptr = ptr;
2338
/* Because it's a non-empty class, there must be an operand. */
2339
if (!compile_class_juxtaposition(context, negated, pptr, pcode,
2340
pop_info, lengthptr))
2341
return FALSE;
2342
2343
PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);
2344
return TRUE;
2345
}
2346
2347
2348
2349
/* This function consumes tightly-binding binary operators. */
2350
2351
static BOOL
2352
compile_class_binary_tight(eclass_context *context, BOOL negated,
2353
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2354
PCRE2_SIZE *lengthptr)
2355
{
2356
uint32_t *ptr = *pptr;
2357
PCRE2_UCHAR *code = *pcode;
2358
#ifdef PCRE2_DEBUG
2359
PCRE2_UCHAR *start_code = *pcode;
2360
#endif
2361
2362
/* See compile_class_binary_loose() for comments on compile-time folding of
2363
the "negated" flag. */
2364
2365
/* Because it's a non-empty class, there must be an operand at the start. */
2366
if (!compile_class_unary(context, negated, &ptr, &code, pop_info, lengthptr))
2367
return FALSE;
2368
2369
while (*ptr == META_ECLASS_AND)
2370
{
2371
uint32_t op;
2372
BOOL rhs_negated;
2373
eclass_op_info rhs_op_info;
2374
2375
if (negated)
2376
{
2377
/* !(A && B) -> !A || !B */
2378
op = ECL_OR;
2379
rhs_negated = TRUE;
2380
}
2381
else
2382
{
2383
/* A && B -> A && B */
2384
op = ECL_AND;
2385
rhs_negated = FALSE;
2386
}
2387
2388
++ptr;
2389
2390
/* An operand must follow the operator. */
2391
if (!compile_class_unary(context, rhs_negated, &ptr, &code,
2392
&rhs_op_info, lengthptr))
2393
return FALSE;
2394
2395
/* Convert infix to postfix (RPN). */
2396
fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2397
if (lengthptr == NULL)
2398
code = pop_info->code_start + pop_info->length;
2399
}
2400
2401
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2402
2403
*pptr = ptr;
2404
*pcode = code;
2405
return TRUE;
2406
}
2407
2408
2409
2410
/* This function consumes loosely-binding binary operators. */
2411
2412
static BOOL
2413
compile_class_binary_loose(eclass_context *context, BOOL negated,
2414
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
2415
PCRE2_SIZE *lengthptr)
2416
{
2417
uint32_t *ptr = *pptr;
2418
PCRE2_UCHAR *code = *pcode;
2419
#ifdef PCRE2_DEBUG
2420
PCRE2_UCHAR *start_code = *pcode;
2421
#endif
2422
2423
/* We really want to fold the negation operator, if at all possible, so that
2424
simple cases can be reduced down. In particular, in 8-bit no-UTF mode, we want
2425
to produce a fully-folded expression, so that we can guarantee not to emit any
2426
OP_ECLASS codes (in the same way that we never emit OP_XCLASS in this mode).
2427
2428
This has the consequence that with a little ingenuity, we can in fact avoid
2429
emitting (nearly...) all cases of the "NOT" operator. Imagine that we have:
2430
!(A ...
2431
We have parsed the preceding "!", and we are about to parse the "A" operand. We
2432
don't know yet whether there will even be a following binary operand! Both of
2433
these are possibilities for what follows:
2434
!(A && B)
2435
!(A)
2436
However, we can still fold the "!" into the "A" operand, because no matter what
2437
the following binary operator will be, we can produce an expression which is
2438
equivalent. */
2439
2440
/* Because it's a non-empty class, there must be an operand at the start. */
2441
if (!compile_class_binary_tight(context, negated, &ptr, &code,
2442
pop_info, lengthptr))
2443
return FALSE;
2444
2445
while (*ptr >= META_ECLASS_OR && *ptr <= META_ECLASS_XOR)
2446
{
2447
uint32_t op;
2448
BOOL op_neg;
2449
BOOL rhs_negated;
2450
eclass_op_info rhs_op_info;
2451
2452
if (negated)
2453
{
2454
/* The whole expression is being negated; we respond by unconditionally
2455
negating the LHS A, before seeing what follows. And hooray! We can recover,
2456
no matter what follows. */
2457
/* !(A || B) -> !A && !B */
2458
/* !(A -- B) -> !(A && !B) -> !A || B */
2459
/* !(A XOR B) -> !(!A XOR !B) -> !A XNOR !B */
2460
op = (*ptr == META_ECLASS_OR )? ECL_AND :
2461
(*ptr == META_ECLASS_SUB)? ECL_OR :
2462
/*ptr == META_ECLASS_XOR*/ ECL_XOR;
2463
op_neg = (*ptr == META_ECLASS_XOR);
2464
rhs_negated = *ptr != META_ECLASS_SUB;
2465
}
2466
else
2467
{
2468
/* A || B -> A || B */
2469
/* A -- B -> A && !B */
2470
/* A XOR B -> A XOR B */
2471
op = (*ptr == META_ECLASS_OR )? ECL_OR :
2472
(*ptr == META_ECLASS_SUB)? ECL_AND :
2473
/*ptr == META_ECLASS_XOR*/ ECL_XOR;
2474
op_neg = FALSE;
2475
rhs_negated = *ptr == META_ECLASS_SUB;
2476
}
2477
2478
++ptr;
2479
2480
/* An operand must follow the operator. */
2481
if (!compile_class_binary_tight(context, rhs_negated, &ptr, &code,
2482
&rhs_op_info, lengthptr))
2483
return FALSE;
2484
2485
/* Convert infix to postfix (RPN). */
2486
fold_binary(op, pop_info, &rhs_op_info, lengthptr);
2487
if (op_neg) fold_negation(pop_info, lengthptr, FALSE);
2488
if (lengthptr == NULL)
2489
code = pop_info->code_start + pop_info->length;
2490
}
2491
2492
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
2493
2494
*pptr = ptr;
2495
*pcode = code;
2496
return TRUE;
2497
}
2498
2499
2500
2501
/* This function converts the META codes in pptr into opcodes written to
2502
pcode. The pptr must start at a META_CLASS or META_CLASS_NOT.
2503
2504
The class is compiled as a left-associative sequence of operator
2505
applications.
2506
2507
The pptr will be left pointing at the matching META_CLASS_END. */
2508
2509
static BOOL
2510
compile_eclass_nested(eclass_context *context, BOOL negated,
2511
uint32_t **pptr, PCRE2_UCHAR **pcode,
2512
eclass_op_info *pop_info, PCRE2_SIZE *lengthptr)
2513
{
2514
uint32_t *ptr = *pptr;
2515
#ifdef PCRE2_DEBUG
2516
PCRE2_UCHAR *start_code = *pcode;
2517
#endif
2518
2519
/* The CLASS_IS_ECLASS bit must be set since it is a nested class. */
2520
PCRE2_ASSERT(*ptr == (META_CLASS | CLASS_IS_ECLASS) ||
2521
*ptr == (META_CLASS_NOT | CLASS_IS_ECLASS));
2522
2523
if (*ptr++ == (META_CLASS_NOT | CLASS_IS_ECLASS))
2524
negated = !negated;
2525
2526
(*pptr)++;
2527
2528
/* Because it's a non-empty class, there must be an operand at the start. */
2529
if (!compile_class_binary_loose(context, negated, pptr, pcode,
2530
pop_info, lengthptr))
2531
return FALSE;
2532
2533
PCRE2_ASSERT(**pptr == META_CLASS_END);
2534
PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);
2535
return TRUE;
2536
}
2537
2538
BOOL
2539
PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions,
2540
uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr,
2541
compile_block *cb, PCRE2_SIZE *lengthptr)
2542
{
2543
eclass_context context;
2544
eclass_op_info op_info;
2545
PCRE2_SIZE previous_length = (lengthptr != NULL)? *lengthptr : 0;
2546
PCRE2_UCHAR *code = *pcode;
2547
PCRE2_UCHAR *previous;
2548
BOOL allbitsone = TRUE;
2549
2550
context.needs_bitmap = FALSE;
2551
context.options = options;
2552
context.xoptions = xoptions;
2553
context.errorcodeptr = errorcodeptr;
2554
context.cb = cb;
2555
2556
previous = code;
2557
*code++ = OP_ECLASS;
2558
code += LINK_SIZE;
2559
*code++ = 0; /* Flags, currently zero. */
2560
if (!compile_eclass_nested(&context, FALSE, pptr, &code, &op_info, lengthptr))
2561
return FALSE;
2562
2563
if (lengthptr != NULL)
2564
{
2565
*lengthptr += code - previous;
2566
code = previous;
2567
/* (*lengthptr - previous_length) now holds the amount of buffer that
2568
we require to make the call to compile_class_nested() with
2569
lengthptr = NULL, and including the (1+LINK_SIZE+1) that we write out
2570
before that call. */
2571
}
2572
2573
/* Do some useful counting of what's in the bitmap. */
2574
for (int i = 0; i < 8; i++)
2575
if (op_info.bits.classwords[i] != 0xffffffff)
2576
{
2577
allbitsone = FALSE;
2578
break;
2579
}
2580
2581
/* After constant-folding the extended class syntax, it may turn out to be
2582
a simple class after all. In that case, we can unwrap it from the
2583
OP_ECLASS container - and in fact, we must do so, because in 8-bit
2584
no-Unicode mode the matcher is compiled without support for OP_ECLASS. */
2585
2586
#ifndef SUPPORT_WIDE_CHARS
2587
PCRE2_ASSERT(op_info.op_single_type != 0);
2588
#else
2589
if (op_info.op_single_type != 0)
2590
#endif
2591
{
2592
/* Rewind back over the OP_ECLASS. */
2593
code = previous;
2594
2595
/* If the bits are all ones, and the "high characters" are all matched
2596
too, we use a special-cased encoding of OP_ALLANY. */
2597
2598
if (op_info.op_single_type == ECL_ANY && allbitsone)
2599
{
2600
/* Advancing code means rewinding lengthptr, at this point. */
2601
if (lengthptr != NULL) *lengthptr -= 1;
2602
*code++ = OP_ALLANY;
2603
}
2604
2605
/* If the high bits are all matched / all not-matched, then we emit an
2606
OP_NCLASS/OP_CLASS respectively. */
2607
2608
else if (op_info.op_single_type == ECL_ANY ||
2609
op_info.op_single_type == ECL_NONE)
2610
{
2611
PCRE2_SIZE required_len = 1 + (32 / sizeof(PCRE2_UCHAR));
2612
2613
if (lengthptr != NULL)
2614
{
2615
if (required_len > (*lengthptr - previous_length))
2616
*lengthptr = previous_length + required_len;
2617
}
2618
2619
/* Advancing code means rewinding lengthptr, at this point. */
2620
if (lengthptr != NULL) *lengthptr -= required_len;
2621
*code++ = (op_info.op_single_type == ECL_ANY)? OP_NCLASS : OP_CLASS;
2622
memcpy(code, op_info.bits.classbits, 32);
2623
code += 32 / sizeof(PCRE2_UCHAR);
2624
}
2625
2626
/* Otherwise, we have an ECL_XCLASS, so we have the OP_XCLASS data
2627
there, but, we pulled out its bitmap into op_info, so now we have to
2628
put that back into the OP_XCLASS. */
2629
2630
else
2631
{
2632
#ifndef SUPPORT_WIDE_CHARS
2633
PCRE2_DEBUG_UNREACHABLE();
2634
#else
2635
BOOL need_map = context.needs_bitmap;
2636
PCRE2_SIZE required_len;
2637
2638
PCRE2_ASSERT(op_info.op_single_type == ECL_XCLASS);
2639
required_len = op_info.length + (need_map? 32/sizeof(PCRE2_UCHAR) : 0);
2640
2641
if (lengthptr != NULL)
2642
{
2643
/* Don't unconditionally request all the space we need - we may
2644
already have asked for more during processing of the ECLASS. */
2645
if (required_len > (*lengthptr - previous_length))
2646
*lengthptr = previous_length + required_len;
2647
2648
/* The code we write out here won't be ignored, even during the
2649
(lengthptr != NULL) phase, because if there's a following quantifier
2650
it will peek backwards. So we do have to write out a (truncated)
2651
OP_XCLASS, even on this branch. */
2652
*lengthptr -= 1 + LINK_SIZE + 1;
2653
*code++ = OP_XCLASS;
2654
PUT(code, 0, 1 + LINK_SIZE + 1);
2655
code += LINK_SIZE;
2656
*code++ = 0;
2657
}
2658
else
2659
{
2660
PCRE2_UCHAR *rest;
2661
PCRE2_SIZE rest_len;
2662
PCRE2_UCHAR flags;
2663
2664
/* 1 unit: OP_XCLASS | LINK_SIZE units | 1 unit: flags | ...rest */
2665
PCRE2_ASSERT(op_info.length >= 1 + LINK_SIZE + 1);
2666
rest = op_info.code_start + 1 + LINK_SIZE + 1;
2667
rest_len = (op_info.code_start + op_info.length) - rest;
2668
2669
/* First read any data we use, before memmove splats it. */
2670
flags = op_info.code_start[1 + LINK_SIZE];
2671
PCRE2_ASSERT((flags & XCL_MAP) == 0);
2672
2673
/* Next do the memmove before any writes. */
2674
memmove(code + 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0),
2675
rest, CU2BYTES(rest_len));
2676
2677
/* Finally write the header data. */
2678
*code++ = OP_XCLASS;
2679
PUT(code, 0, (int)required_len);
2680
code += LINK_SIZE;
2681
*code++ = flags | (need_map? XCL_MAP : 0);
2682
if (need_map)
2683
{
2684
memcpy(code, op_info.bits.classbits, 32);
2685
code += 32 / sizeof(PCRE2_UCHAR);
2686
}
2687
code += rest_len;
2688
}
2689
#endif /* SUPPORT_WIDE_CHARS */
2690
}
2691
}
2692
2693
/* Otherwise, we're going to keep the OP_ECLASS. However, again we need
2694
to do some adjustment to insert the bitmap if we have one. */
2695
2696
#ifdef SUPPORT_WIDE_CHARS
2697
else
2698
{
2699
BOOL need_map = context.needs_bitmap;
2700
PCRE2_SIZE required_len =
2701
1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0) + op_info.length;
2702
2703
if (lengthptr != NULL)
2704
{
2705
if (required_len > (*lengthptr - previous_length))
2706
*lengthptr = previous_length + required_len;
2707
2708
/* As for the XCLASS branch above, we do have to write out a dummy
2709
OP_ECLASS, because of the backwards peek by the quantifier code. Write
2710
out a (truncated) OP_ECLASS, even on this branch. */
2711
*lengthptr -= 1 + LINK_SIZE + 1;
2712
*code++ = OP_ECLASS;
2713
PUT(code, 0, 1 + LINK_SIZE + 1);
2714
code += LINK_SIZE;
2715
*code++ = 0;
2716
}
2717
else
2718
{
2719
if (need_map)
2720
{
2721
PCRE2_UCHAR *map_start = previous + 1 + LINK_SIZE + 1;
2722
previous[1 + LINK_SIZE] |= ECL_MAP;
2723
memmove(map_start + 32/sizeof(PCRE2_UCHAR), map_start,
2724
CU2BYTES(code - map_start));
2725
memcpy(map_start, op_info.bits.classbits, 32);
2726
code += 32 / sizeof(PCRE2_UCHAR);
2727
}
2728
PUT(previous, 1, (int)(code - previous));
2729
}
2730
}
2731
#endif /* SUPPORT_WIDE_CHARS */
2732
2733
*pcode = code;
2734
return TRUE;
2735
}
2736
2737
/* End of pcre2_compile_class.c */
2738
2739