Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_study.c
21409 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
/* This module contains functions for scanning a compiled pattern and
43
collecting data (e.g. minimum matching length). */
44
45
46
#include "pcre2_internal.h"
47
48
49
50
/* The maximum remembered capturing brackets minimum. */
51
52
#define MAX_CACHE_BACKREF 128
53
54
/* Set a bit in the starting code unit bit map. */
55
56
#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
57
58
/* Returns from set_start_bits() */
59
60
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
61
62
63
/*************************************************
64
* Find the minimum subject length for a group *
65
*************************************************/
66
67
/* Scan a parenthesized group and compute the minimum length of subject that
68
is needed to match it. This is a lower bound; it does not mean there is a
69
string of that length that matches. In UTF mode, the result is in characters
70
rather than code units. The field in a compiled pattern for storing the minimum
71
length is 16-bits long (on the grounds that anything longer than that is
72
pathological), so we give up when we reach that amount. This also means that
73
integer overflow for really crazy patterns cannot happen.
74
75
Backreference minimum lengths are cached to speed up multiple references. This
76
function is called only when the highest back reference in the pattern is less
77
than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
78
caching vector. The zeroth element contains the number of the highest set
79
value.
80
81
Arguments:
82
re compiled pattern block
83
code pointer to start of group (the bracket)
84
startcode pointer to start of the whole pattern's code
85
utf UTF flag
86
recurses chain of recurse_check to catch mutual recursion
87
countptr pointer to call count (to catch over complexity)
88
backref_cache vector for caching back references.
89
90
This function is no longer called when the pattern contains (*ACCEPT); however,
91
the old code for returning -1 is retained, just in case.
92
93
Returns: the minimum length
94
-1 \C in UTF-8 mode
95
or (*ACCEPT)
96
or pattern too complicated
97
-2 internal error (missing capturing bracket)
98
-3 internal error (opcode not listed)
99
*/
100
101
static int
102
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
103
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
104
int *backref_cache)
105
{
106
int length = -1;
107
int branchlength = 0;
108
int prev_cap_recno = -1;
109
int prev_cap_d = 0;
110
int prev_recurse_recno = -1;
111
int prev_recurse_d = 0;
112
uint32_t once_fudge = 0;
113
BOOL had_recurse = FALSE;
114
BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
115
PCRE2_SPTR nextbranch = code + GET(code, 1);
116
PCRE2_SPTR cc = code + 1 + LINK_SIZE;
117
recurse_check this_recurse;
118
119
/* If this is a "could be empty" group, its minimum length is 0. */
120
121
if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
122
123
/* Skip over capturing bracket number */
124
125
if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
126
127
/* A large and/or complex regex can take too long to process. */
128
129
if ((*countptr)++ > 1000) return -1;
130
131
/* Scan along the opcodes for this branch. If we get to the end of the branch,
132
check the length against that of the other branches. If the accumulated length
133
passes 16-bits, reset to that value and skip the rest of the branch. */
134
135
for (;;)
136
{
137
int d, min, recno;
138
PCRE2_UCHAR op;
139
PCRE2_SPTR cs, ce;
140
141
if (branchlength >= (int)UINT16_MAX)
142
{
143
branchlength = UINT16_MAX;
144
cc = nextbranch;
145
}
146
147
op = *cc;
148
switch (op)
149
{
150
case OP_COND:
151
case OP_SCOND:
152
153
/* If there is only one branch in a condition, the implied branch has zero
154
length, so we don't add anything. This covers the DEFINE "condition"
155
automatically. If there are two branches we can treat it the same as any
156
other non-capturing subpattern. */
157
158
cs = cc + GET(cc, 1);
159
if (*cs != OP_ALT)
160
{
161
cc = cs + 1 + LINK_SIZE;
162
break;
163
}
164
goto PROCESS_NON_CAPTURE;
165
166
case OP_BRA:
167
/* There's a special case of OP_BRA, when it is wrapped round a repeated
168
OP_RECURSE. We'd like to process the latter at this level so that
169
remembering the value works for repeated cases. So we do nothing, but
170
set a fudge value to skip over the OP_KET after the recurse. */
171
172
if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
173
{
174
once_fudge = 1 + LINK_SIZE;
175
cc += 1 + LINK_SIZE;
176
break;
177
}
178
PCRE2_FALLTHROUGH /* Fall through */
179
180
case OP_ONCE:
181
case OP_SCRIPT_RUN:
182
case OP_SBRA:
183
case OP_BRAPOS:
184
case OP_SBRAPOS:
185
PROCESS_NON_CAPTURE:
186
d = find_minlength(re, cc, startcode, utf, recurses, countptr,
187
backref_cache);
188
if (d < 0) return d;
189
branchlength += d;
190
do cc += GET(cc, 1); while (*cc == OP_ALT);
191
cc += 1 + LINK_SIZE;
192
break;
193
194
/* To save time for repeated capturing subpatterns, we remember the
195
length of the previous one. Unfortunately we can't do the same for
196
the unnumbered ones above. Nor can we do this if (?| is present in the
197
pattern because captures with the same number are not then identical. */
198
199
case OP_CBRA:
200
case OP_SCBRA:
201
case OP_CBRAPOS:
202
case OP_SCBRAPOS:
203
recno = (int)GET2(cc, 1+LINK_SIZE);
204
if (dupcapused || recno != prev_cap_recno)
205
{
206
prev_cap_recno = recno;
207
prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
208
backref_cache);
209
if (prev_cap_d < 0) return prev_cap_d;
210
}
211
branchlength += prev_cap_d;
212
do cc += GET(cc, 1); while (*cc == OP_ALT);
213
cc += 1 + LINK_SIZE;
214
break;
215
216
/* ACCEPT makes things far too complicated; we have to give up. In fact,
217
from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
218
used. However, leave the code in place, just in case. */
219
220
case OP_ACCEPT:
221
case OP_ASSERT_ACCEPT:
222
return -1;
223
224
/* Reached end of a branch; if it's a ket it is the end of a nested
225
call. If it's ALT it is an alternation in a nested call. If it is END it's
226
the end of the outer call. All can be handled by the same code. If the
227
length of any branch is zero, there is no need to scan any subsequent
228
branches. */
229
230
case OP_ALT:
231
case OP_KET:
232
case OP_KETRMAX:
233
case OP_KETRMIN:
234
case OP_KETRPOS:
235
case OP_END:
236
if (length < 0 || (!had_recurse && branchlength < length))
237
length = branchlength;
238
if (op != OP_ALT || length == 0) return length;
239
nextbranch = cc + GET(cc, 1);
240
cc += 1 + LINK_SIZE;
241
branchlength = 0;
242
had_recurse = FALSE;
243
break;
244
245
/* Skip over assertive subpatterns */
246
247
case OP_ASSERT:
248
case OP_ASSERT_NOT:
249
case OP_ASSERTBACK:
250
case OP_ASSERTBACK_NOT:
251
case OP_ASSERT_NA:
252
case OP_ASSERT_SCS:
253
case OP_ASSERTBACK_NA:
254
do cc += GET(cc, 1); while (*cc == OP_ALT);
255
PCRE2_FALLTHROUGH /* Fall through */
256
257
/* Skip over things that don't match chars */
258
259
case OP_REVERSE:
260
case OP_VREVERSE:
261
case OP_CREF:
262
case OP_DNCREF:
263
case OP_RREF:
264
case OP_DNRREF:
265
case OP_FALSE:
266
case OP_TRUE:
267
case OP_CALLOUT:
268
case OP_SOD:
269
case OP_SOM:
270
case OP_EOD:
271
case OP_EODN:
272
case OP_CIRC:
273
case OP_CIRCM:
274
case OP_DOLL:
275
case OP_DOLLM:
276
case OP_NOT_WORD_BOUNDARY:
277
case OP_WORD_BOUNDARY:
278
case OP_NOT_UCP_WORD_BOUNDARY:
279
case OP_UCP_WORD_BOUNDARY:
280
cc += PRIV(OP_lengths)[*cc];
281
break;
282
283
case OP_CALLOUT_STR:
284
cc += GET(cc, 1 + 2*LINK_SIZE);
285
break;
286
287
/* Skip over a subpattern that has a {0} or {0,x} quantifier */
288
289
case OP_BRAZERO:
290
case OP_BRAMINZERO:
291
case OP_BRAPOSZERO:
292
case OP_SKIPZERO:
293
cc += PRIV(OP_lengths)[*cc];
294
do cc += GET(cc, 1); while (*cc == OP_ALT);
295
cc += 1 + LINK_SIZE;
296
break;
297
298
/* Handle literal characters and + repetitions */
299
300
case OP_CHAR:
301
case OP_CHARI:
302
case OP_NOT:
303
case OP_NOTI:
304
case OP_PLUS:
305
case OP_PLUSI:
306
case OP_MINPLUS:
307
case OP_MINPLUSI:
308
case OP_POSPLUS:
309
case OP_POSPLUSI:
310
case OP_NOTPLUS:
311
case OP_NOTPLUSI:
312
case OP_NOTMINPLUS:
313
case OP_NOTMINPLUSI:
314
case OP_NOTPOSPLUS:
315
case OP_NOTPOSPLUSI:
316
branchlength++;
317
cc += 2;
318
#ifdef SUPPORT_UNICODE
319
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
320
#endif
321
break;
322
323
case OP_TYPEPLUS:
324
case OP_TYPEMINPLUS:
325
case OP_TYPEPOSPLUS:
326
branchlength++;
327
cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
328
break;
329
330
/* Handle exact repetitions. The count is already in characters, but we
331
may need to skip over a multibyte character in UTF mode. */
332
333
case OP_EXACT:
334
case OP_EXACTI:
335
case OP_NOTEXACT:
336
case OP_NOTEXACTI:
337
branchlength += GET2(cc,1);
338
cc += 2 + IMM2_SIZE;
339
#ifdef SUPPORT_UNICODE
340
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
341
#endif
342
break;
343
344
case OP_TYPEEXACT:
345
branchlength += GET2(cc,1);
346
cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
347
|| cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
348
break;
349
350
/* Handle single-char non-literal matchers */
351
352
case OP_PROP:
353
case OP_NOTPROP:
354
cc += 2;
355
PCRE2_FALLTHROUGH /* Fall through */
356
357
case OP_NOT_DIGIT:
358
case OP_DIGIT:
359
case OP_NOT_WHITESPACE:
360
case OP_WHITESPACE:
361
case OP_NOT_WORDCHAR:
362
case OP_WORDCHAR:
363
case OP_ANY:
364
case OP_ALLANY:
365
case OP_EXTUNI:
366
case OP_HSPACE:
367
case OP_NOT_HSPACE:
368
case OP_VSPACE:
369
case OP_NOT_VSPACE:
370
branchlength++;
371
cc++;
372
break;
373
374
/* "Any newline" might match two characters, but it also might match just
375
one. */
376
377
case OP_ANYNL:
378
branchlength += 1;
379
cc++;
380
break;
381
382
/* The single-byte matcher means we can't proceed in UTF mode. (In
383
non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
384
appear, but leave the code, just in case.) */
385
386
case OP_ANYBYTE:
387
#ifdef SUPPORT_UNICODE
388
if (utf) return -1;
389
#endif
390
branchlength++;
391
cc++;
392
break;
393
394
/* For repeated character types, we have to test for \p and \P, which have
395
an extra two bytes of parameters. */
396
397
case OP_TYPESTAR:
398
case OP_TYPEMINSTAR:
399
case OP_TYPEQUERY:
400
case OP_TYPEMINQUERY:
401
case OP_TYPEPOSSTAR:
402
case OP_TYPEPOSQUERY:
403
if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
404
cc += PRIV(OP_lengths)[op];
405
break;
406
407
case OP_TYPEUPTO:
408
case OP_TYPEMINUPTO:
409
case OP_TYPEPOSUPTO:
410
if (cc[1 + IMM2_SIZE] == OP_PROP
411
|| cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
412
cc += PRIV(OP_lengths)[op];
413
break;
414
415
/* Check a class for variable quantification */
416
417
case OP_CLASS:
418
case OP_NCLASS:
419
#ifdef SUPPORT_WIDE_CHARS
420
case OP_XCLASS:
421
case OP_ECLASS:
422
/* The original code caused an unsigned overflow in 64 bit systems,
423
so now we use a conditional statement. */
424
if (op == OP_XCLASS || op == OP_ECLASS)
425
cc += GET(cc, 1);
426
else
427
#endif
428
cc += PRIV(OP_lengths)[OP_CLASS];
429
430
switch (*cc)
431
{
432
case OP_CRPLUS:
433
case OP_CRMINPLUS:
434
case OP_CRPOSPLUS:
435
branchlength++;
436
PCRE2_FALLTHROUGH /* Fall through */
437
438
case OP_CRSTAR:
439
case OP_CRMINSTAR:
440
case OP_CRQUERY:
441
case OP_CRMINQUERY:
442
case OP_CRPOSSTAR:
443
case OP_CRPOSQUERY:
444
cc++;
445
break;
446
447
case OP_CRRANGE:
448
case OP_CRMINRANGE:
449
case OP_CRPOSRANGE:
450
branchlength += GET2(cc,1);
451
cc += 1 + 2 * IMM2_SIZE;
452
break;
453
454
default:
455
branchlength++;
456
break;
457
}
458
break;
459
460
/* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
461
way: we find the minimum length for the subpattern. A recursion
462
(backreference or subroutine) causes an a flag to be set that causes the
463
length of this branch to be ignored. The logic is that a recursion can only
464
make sense if there is another alternative that stops the recursing. That
465
will provide the minimum length (when no recursion happens).
466
467
If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
468
matches an empty string (by default it causes a matching failure), so in
469
that case we must set the minimum length to zero.
470
471
For backreferenes, if duplicate numbers are present in the pattern we check
472
for a reference to a duplicate. If it is, we don't know which version will
473
be referenced, so we have to set the minimum length to zero. */
474
475
/* Duplicate named pattern back reference. */
476
477
case OP_DNREF:
478
case OP_DNREFI:
479
if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
480
{
481
int count = GET2(cc, 1+IMM2_SIZE);
482
PCRE2_SPTR slot =
483
(PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code)) +
484
GET2(cc, 1) * re->name_entry_size;
485
486
d = INT_MAX;
487
488
/* Scan all groups with the same name; find the shortest. */
489
490
while (count-- > 0)
491
{
492
int dd, i;
493
recno = GET2(slot, 0);
494
495
if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
496
dd = backref_cache[recno];
497
else
498
{
499
ce = cs = PRIV(find_bracket)(startcode, utf, recno);
500
if (cs == NULL) return -2;
501
do ce += GET(ce, 1); while (*ce == OP_ALT);
502
503
dd = 0;
504
if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)
505
{
506
if (cc > cs && cc < ce) /* Simple recursion */
507
{
508
had_recurse = TRUE;
509
}
510
else
511
{
512
recurse_check *r = recurses;
513
for (r = recurses; r != NULL; r = r->prev)
514
if (r->group == cs) break;
515
if (r != NULL) /* Mutual recursion */
516
{
517
had_recurse = TRUE;
518
}
519
else
520
{
521
this_recurse.prev = recurses; /* No recursion */
522
this_recurse.group = cs;
523
dd = find_minlength(re, cs, startcode, utf, &this_recurse,
524
countptr, backref_cache);
525
if (dd < 0) return dd;
526
}
527
}
528
}
529
530
backref_cache[recno] = dd;
531
for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
532
backref_cache[0] = recno;
533
}
534
535
if (dd < d) d = dd;
536
if (d <= 0) break; /* No point looking at any more */
537
slot += re->name_entry_size;
538
}
539
}
540
else d = 0;
541
cc += PRIV(OP_lengths)[*cc];
542
goto REPEAT_BACK_REFERENCE;
543
544
/* Single back reference by number. References by name are converted to by
545
number when there is no duplication. */
546
547
case OP_REF:
548
case OP_REFI:
549
recno = GET2(cc, 1);
550
if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
551
d = backref_cache[recno];
552
else
553
{
554
int i;
555
d = 0;
556
557
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
558
{
559
ce = cs = PRIV(find_bracket)(startcode, utf, recno);
560
if (cs == NULL) return -2;
561
do ce += GET(ce, 1); while (*ce == OP_ALT);
562
563
if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)
564
{
565
if (cc > cs && cc < ce) /* Simple recursion */
566
{
567
had_recurse = TRUE;
568
}
569
else
570
{
571
recurse_check *r = recurses;
572
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
573
if (r != NULL) /* Mutual recursion */
574
{
575
had_recurse = TRUE;
576
}
577
else /* No recursion */
578
{
579
this_recurse.prev = recurses;
580
this_recurse.group = cs;
581
d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
582
backref_cache);
583
if (d < 0) return d;
584
}
585
}
586
}
587
}
588
589
backref_cache[recno] = d;
590
for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
591
backref_cache[0] = recno;
592
}
593
594
cc += PRIV(OP_lengths)[*cc];
595
596
/* Handle repeated back references */
597
598
REPEAT_BACK_REFERENCE:
599
switch (*cc)
600
{
601
case OP_CRSTAR:
602
case OP_CRMINSTAR:
603
case OP_CRQUERY:
604
case OP_CRMINQUERY:
605
case OP_CRPOSSTAR:
606
case OP_CRPOSQUERY:
607
min = 0;
608
cc++;
609
break;
610
611
case OP_CRPLUS:
612
case OP_CRMINPLUS:
613
case OP_CRPOSPLUS:
614
min = 1;
615
cc++;
616
break;
617
618
case OP_CRRANGE:
619
case OP_CRMINRANGE:
620
case OP_CRPOSRANGE:
621
min = GET2(cc, 1);
622
cc += 1 + 2 * IMM2_SIZE;
623
break;
624
625
default:
626
min = 1;
627
break;
628
}
629
630
/* Take care not to overflow: (1) min and d are ints, so check that their
631
product is not greater than INT_MAX. (2) branchlength is limited to
632
UINT16_MAX (checked at the top of the loop). */
633
634
if ((d > 0 && (INT_MAX/d) < min) || (int)UINT16_MAX - branchlength < min*d)
635
branchlength = UINT16_MAX;
636
else branchlength += min * d;
637
break;
638
639
/* Recursion always refers to the first occurrence of a subpattern with a
640
given number. Therefore, we can always make use of caching, even when the
641
pattern contains multiple subpatterns with the same number. */
642
643
case OP_RECURSE:
644
cs = ce = startcode + GET(cc, 1);
645
recno = GET2(cs, 1+LINK_SIZE);
646
if (recno == prev_recurse_recno)
647
{
648
branchlength += prev_recurse_d;
649
}
650
else
651
{
652
do ce += GET(ce, 1); while (*ce == OP_ALT);
653
if (cc > cs && cc < ce) /* Simple recursion */
654
had_recurse = TRUE;
655
else
656
{
657
recurse_check *r = recurses;
658
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
659
if (r != NULL) /* Mutual recursion */
660
had_recurse = TRUE;
661
else
662
{
663
this_recurse.prev = recurses;
664
this_recurse.group = cs;
665
prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
666
countptr, backref_cache);
667
if (prev_recurse_d < 0) return prev_recurse_d;
668
prev_recurse_recno = recno;
669
branchlength += prev_recurse_d;
670
}
671
}
672
}
673
cc += 1 + LINK_SIZE + once_fudge;
674
once_fudge = 0;
675
break;
676
677
/* Anything else does not or need not match a character. We can get the
678
item's length from the table, but for those that can match zero occurrences
679
of a character, we must take special action for UTF-8 characters. As it
680
happens, the "NOT" versions of these opcodes are used at present only for
681
ASCII characters, so they could be omitted from this list. However, in
682
future that may change, so we include them here so as not to leave a
683
gotcha for a future maintainer. */
684
685
case OP_UPTO:
686
case OP_UPTOI:
687
case OP_NOTUPTO:
688
case OP_NOTUPTOI:
689
case OP_MINUPTO:
690
case OP_MINUPTOI:
691
case OP_NOTMINUPTO:
692
case OP_NOTMINUPTOI:
693
case OP_POSUPTO:
694
case OP_POSUPTOI:
695
case OP_NOTPOSUPTO:
696
case OP_NOTPOSUPTOI:
697
698
case OP_STAR:
699
case OP_STARI:
700
case OP_NOTSTAR:
701
case OP_NOTSTARI:
702
case OP_MINSTAR:
703
case OP_MINSTARI:
704
case OP_NOTMINSTAR:
705
case OP_NOTMINSTARI:
706
case OP_POSSTAR:
707
case OP_POSSTARI:
708
case OP_NOTPOSSTAR:
709
case OP_NOTPOSSTARI:
710
711
case OP_QUERY:
712
case OP_QUERYI:
713
case OP_NOTQUERY:
714
case OP_NOTQUERYI:
715
case OP_MINQUERY:
716
case OP_MINQUERYI:
717
case OP_NOTMINQUERY:
718
case OP_NOTMINQUERYI:
719
case OP_POSQUERY:
720
case OP_POSQUERYI:
721
case OP_NOTPOSQUERY:
722
case OP_NOTPOSQUERYI:
723
724
cc += PRIV(OP_lengths)[op];
725
#ifdef SUPPORT_UNICODE
726
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
727
#endif
728
break;
729
730
/* Skip these, but we need to add in the name length. */
731
732
case OP_MARK:
733
case OP_COMMIT_ARG:
734
case OP_PRUNE_ARG:
735
case OP_SKIP_ARG:
736
case OP_THEN_ARG:
737
cc += PRIV(OP_lengths)[op] + cc[1];
738
break;
739
740
/* The remaining opcodes are just skipped over. */
741
742
case OP_CLOSE:
743
case OP_COMMIT:
744
case OP_FAIL:
745
case OP_PRUNE:
746
case OP_SET_SOM:
747
case OP_SKIP:
748
case OP_THEN:
749
cc += PRIV(OP_lengths)[op];
750
break;
751
752
/* This should not occur: we list all opcodes explicitly so that when
753
new ones get added they are properly considered. */
754
755
/* LCOV_EXCL_START */
756
default:
757
PCRE2_DEBUG_UNREACHABLE();
758
return -3;
759
/* LCOV_EXCL_STOP */
760
}
761
}
762
763
/* LCOV_EXCL_START */
764
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
765
return -3; /* Avoid compiler warnings */
766
/* LCOV_EXCL_STOP */
767
}
768
769
770
771
/*************************************************
772
* Set a bit and maybe its alternate case *
773
*************************************************/
774
775
/* Given a character, set its first code unit's bit in the table, and also the
776
corresponding bit for the other version of a letter if we are caseless.
777
778
Arguments:
779
re points to the regex block
780
p points to the first code unit of the character
781
caseless TRUE if caseless
782
utf TRUE for UTF mode
783
ucp TRUE for UCP mode
784
785
Returns: pointer after the character
786
*/
787
788
static PCRE2_SPTR
789
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
790
BOOL ucp)
791
{
792
uint32_t c = *p++; /* First code unit */
793
794
(void)utf; /* Stop compiler warnings when UTF not supported */
795
(void)ucp;
796
797
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
798
0xff. */
799
800
#if PCRE2_CODE_UNIT_WIDTH != 8
801
if (c > 0xff) SET_BIT(0xff); else
802
#endif
803
804
SET_BIT(c);
805
806
/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
807
the end of the character, even when caseless. */
808
809
#ifdef SUPPORT_UNICODE
810
if (utf)
811
{
812
#if PCRE2_CODE_UNIT_WIDTH == 8
813
if (c >= 0xc0) GETUTF8INC(c, p);
814
#elif PCRE2_CODE_UNIT_WIDTH == 16
815
if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
816
#endif
817
}
818
#endif /* SUPPORT_UNICODE */
819
820
/* If caseless, handle the other case of the character. */
821
822
if (caseless)
823
{
824
#ifdef SUPPORT_UNICODE
825
if (utf || ucp)
826
{
827
c = UCD_OTHERCASE(c);
828
#if PCRE2_CODE_UNIT_WIDTH == 8
829
if (utf)
830
{
831
PCRE2_UCHAR buff[6];
832
(void)PRIV(ord2utf)(c, buff);
833
SET_BIT(buff[0]);
834
}
835
else if (c < 256) SET_BIT(c);
836
#else /* 16-bit or 32-bit mode */
837
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
838
#endif
839
}
840
841
else
842
#endif /* SUPPORT_UNICODE */
843
844
/* Not UTF or UCP */
845
846
if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
847
}
848
849
return p;
850
}
851
852
853
854
/*************************************************
855
* Set bits for a positive character type *
856
*************************************************/
857
858
/* This function sets starting bits for a character type. In UTF-8 mode, we can
859
only do a direct setting for bytes less than 128, as otherwise there can be
860
confusion with bytes in the middle of UTF-8 characters. In a "traditional"
861
environment, the tables will only recognize ASCII characters anyway, but in at
862
least one Windows environment, some higher bytes bits were set in the tables.
863
So we deal with that case by considering the UTF-8 encoding.
864
865
Arguments:
866
re the regex block
867
cbit type the type of character wanted
868
table_limit 32 for non-UTF-8; 16 for UTF-8
869
870
Returns: nothing
871
*/
872
873
static void
874
set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
875
{
876
uint32_t c;
877
for (c = 0; c < table_limit; c++)
878
re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
879
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
880
if (table_limit == 32) return;
881
for (c = 128; c < 256; c++)
882
{
883
if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
884
{
885
PCRE2_UCHAR buff[6];
886
(void)PRIV(ord2utf)(c, buff);
887
SET_BIT(buff[0]);
888
}
889
}
890
#endif /* UTF-8 */
891
}
892
893
894
/*************************************************
895
* Set bits for a negative character type *
896
*************************************************/
897
898
/* This function sets starting bits for a negative character type such as \D.
899
In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
900
otherwise there can be confusion with bytes in the middle of UTF-8 characters.
901
Unlike in the positive case, where we can set appropriate starting bits for
902
specific high-valued UTF-8 characters, in this case we have to set the bits for
903
all high-valued characters. The lowest is 0xc2, but we overkill by starting at
904
0xc0 (192) for simplicity.
905
906
Arguments:
907
re the regex block
908
cbit type the type of character wanted
909
table_limit 32 for non-UTF-8; 16 for UTF-8
910
911
Returns: nothing
912
*/
913
914
static void
915
set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
916
{
917
uint32_t c;
918
for (c = 0; c < table_limit; c++)
919
re->start_bitmap[c] |= (uint8_t)(~(re->tables[c+cbits_offset+cbit_type]));
920
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
921
if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
922
#endif
923
}
924
925
926
927
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
928
/*************************************************
929
* Set starting bits for a character list. *
930
*************************************************/
931
932
/* This function sets starting bits for a character list. It enumerates
933
all characters and character ranges in the character list, and sets
934
the starting bits accordingly.
935
936
Arguments:
937
code pointer to the code
938
start_bitmap pointer to the starting bitmap
939
940
Returns: nothing
941
*/
942
static void
943
study_char_list(PCRE2_SPTR code, uint8_t *start_bitmap,
944
const uint8_t *char_lists_end)
945
{
946
uint32_t type, list_ind;
947
uint32_t char_list_add = XCL_CHAR_LIST_LOW_16_ADD;
948
uint32_t range_start = ~(uint32_t)0, range_end = 0;
949
const uint8_t *next_char;
950
PCRE2_UCHAR start_buffer[6], end_buffer[6];
951
PCRE2_UCHAR start, end;
952
953
/* Only needed in 8-bit mode at the moment. */
954
type = (uint32_t)(code[0] << 8) | code[1];
955
code += 2;
956
957
/* Align characters. */
958
next_char = char_lists_end - (GET(code, 0) << 1);
959
type &= XCL_TYPE_MASK;
960
list_ind = 0;
961
962
if ((type & XCL_BEGIN_WITH_RANGE) != 0)
963
range_start = XCL_CHAR_LIST_LOW_16_START;
964
965
while (type > 0)
966
{
967
uint32_t item_count = type & XCL_ITEM_COUNT_MASK;
968
969
if (item_count == XCL_ITEM_COUNT_MASK)
970
{
971
if (list_ind <= 1)
972
{
973
item_count = *(const uint16_t*)next_char;
974
next_char += 2;
975
}
976
else
977
{
978
item_count = *(const uint32_t*)next_char;
979
next_char += 4;
980
}
981
}
982
983
while (item_count > 0)
984
{
985
if (list_ind <= 1)
986
{
987
range_end = *(const uint16_t*)next_char;
988
next_char += 2;
989
}
990
else
991
{
992
range_end = *(const uint32_t*)next_char;
993
next_char += 4;
994
}
995
996
if ((range_end & XCL_CHAR_END) != 0)
997
{
998
range_end = char_list_add + (range_end >> XCL_CHAR_SHIFT);
999
1000
PRIV(ord2utf)(range_end, end_buffer);
1001
end = end_buffer[0];
1002
1003
if (range_start < range_end)
1004
{
1005
PRIV(ord2utf)(range_start, start_buffer);
1006
for (start = start_buffer[0]; start <= end; start++)
1007
start_bitmap[start / 8] |= (1u << (start & 7));
1008
}
1009
else
1010
start_bitmap[end / 8] |= (1u << (end & 7));
1011
1012
range_start = ~(uint32_t)0;
1013
}
1014
else
1015
range_start = char_list_add + (range_end >> XCL_CHAR_SHIFT);
1016
1017
item_count--;
1018
}
1019
1020
list_ind++;
1021
type >>= XCL_TYPE_BIT_LEN;
1022
1023
if (range_start == ~(uint32_t)0)
1024
{
1025
if ((type & XCL_BEGIN_WITH_RANGE) != 0)
1026
{
1027
/* In 8 bit mode XCL_CHAR_LIST_HIGH_32_START is not possible. */
1028
if (list_ind == 1) range_start = XCL_CHAR_LIST_HIGH_16_START;
1029
else range_start = XCL_CHAR_LIST_LOW_32_START;
1030
}
1031
}
1032
else if ((type & XCL_BEGIN_WITH_RANGE) == 0)
1033
{
1034
PRIV(ord2utf)(range_start, start_buffer);
1035
1036
/* In 8 bit mode XCL_CHAR_LIST_LOW_32_END and
1037
XCL_CHAR_LIST_HIGH_32_END are not possible. */
1038
if (list_ind == 1) range_end = XCL_CHAR_LIST_LOW_16_END;
1039
else range_end = XCL_CHAR_LIST_HIGH_16_END;
1040
1041
PRIV(ord2utf)(range_end, end_buffer);
1042
end = end_buffer[0];
1043
1044
for (start = start_buffer[0]; start <= end; start++)
1045
start_bitmap[start / 8] |= (1u << (start & 7));
1046
1047
range_start = ~(uint32_t)0;
1048
}
1049
1050
/* In 8 bit mode XCL_CHAR_LIST_HIGH_32_ADD is not possible. */
1051
if (list_ind == 1) char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;
1052
else char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
1053
}
1054
}
1055
#endif
1056
1057
1058
1059
/*************************************************
1060
* Create bitmap of starting code units *
1061
*************************************************/
1062
1063
/* This function scans a compiled unanchored expression recursively and
1064
attempts to build a bitmap of the set of possible starting code units whose
1065
values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
1066
the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
1067
we pass a value of 16 rather than 32 as the final argument. (See comments in
1068
those functions for the reason.)
1069
1070
The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
1071
(a*)b where the group provides some optional starting code units but scanning
1072
must continue at the outer level to find at least one mandatory code unit. At
1073
the outermost level, this function fails unless the result is SSB_DONE.
1074
1075
We restrict recursion (for nested groups) to 1000 to avoid stack overflow
1076
issues.
1077
1078
Arguments:
1079
re points to the compiled regex block
1080
code points to an expression
1081
utf TRUE if in UTF mode
1082
ucp TRUE if in UCP mode
1083
depthptr pointer to recurse depth
1084
1085
Returns: SSB_FAIL => Failed to find any starting code units
1086
SSB_DONE => Found mandatory starting code units
1087
SSB_CONTINUE => Found optional starting code units
1088
SSB_UNKNOWN => Hit an unrecognized opcode
1089
SSB_TOODEEP => Recursion is too deep
1090
*/
1091
1092
static int
1093
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
1094
int *depthptr)
1095
{
1096
uint32_t c;
1097
int yield = SSB_DONE;
1098
1099
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1100
int table_limit = utf? 16:32;
1101
#else
1102
int table_limit = 32;
1103
#endif
1104
1105
*depthptr += 1;
1106
if (*depthptr > 1000) return SSB_TOODEEP;
1107
1108
do
1109
{
1110
BOOL try_next = TRUE;
1111
PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
1112
1113
if (*code == OP_CBRA || *code == OP_SCBRA ||
1114
*code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
1115
1116
while (try_next) /* Loop for items in this branch */
1117
{
1118
int rc;
1119
PCRE2_SPTR ncode;
1120
const uint8_t *classmap = NULL;
1121
#ifdef SUPPORT_WIDE_CHARS
1122
PCRE2_UCHAR xclassflags;
1123
#endif
1124
1125
switch(*tcode)
1126
{
1127
/* If we reach something we don't understand, it means a new opcode has
1128
been created that hasn't been added to this function. Hopefully this
1129
problem will be discovered during testing. */
1130
1131
default:
1132
return SSB_UNKNOWN;
1133
1134
/* Fail for a valid opcode that implies no starting bits. */
1135
1136
case OP_ACCEPT:
1137
case OP_ASSERT_ACCEPT:
1138
case OP_ALLANY:
1139
case OP_ANY:
1140
case OP_ANYBYTE:
1141
case OP_CIRCM:
1142
case OP_CLOSE:
1143
case OP_COMMIT:
1144
case OP_COMMIT_ARG:
1145
case OP_COND:
1146
case OP_CREF:
1147
case OP_FALSE:
1148
case OP_TRUE:
1149
case OP_DNCREF:
1150
case OP_DNREF:
1151
case OP_DNREFI:
1152
case OP_DNRREF:
1153
case OP_DOLL:
1154
case OP_DOLLM:
1155
case OP_END:
1156
case OP_EOD:
1157
case OP_EODN:
1158
case OP_EXTUNI:
1159
case OP_FAIL:
1160
case OP_MARK:
1161
case OP_NOT:
1162
case OP_NOTEXACT:
1163
case OP_NOTEXACTI:
1164
case OP_NOTI:
1165
case OP_NOTMINPLUS:
1166
case OP_NOTMINPLUSI:
1167
case OP_NOTMINQUERY:
1168
case OP_NOTMINQUERYI:
1169
case OP_NOTMINSTAR:
1170
case OP_NOTMINSTARI:
1171
case OP_NOTMINUPTO:
1172
case OP_NOTMINUPTOI:
1173
case OP_NOTPLUS:
1174
case OP_NOTPLUSI:
1175
case OP_NOTPOSPLUS:
1176
case OP_NOTPOSPLUSI:
1177
case OP_NOTPOSQUERY:
1178
case OP_NOTPOSQUERYI:
1179
case OP_NOTPOSSTAR:
1180
case OP_NOTPOSSTARI:
1181
case OP_NOTPOSUPTO:
1182
case OP_NOTPOSUPTOI:
1183
case OP_NOTPROP:
1184
case OP_NOTQUERY:
1185
case OP_NOTQUERYI:
1186
case OP_NOTSTAR:
1187
case OP_NOTSTARI:
1188
case OP_NOTUPTO:
1189
case OP_NOTUPTOI:
1190
case OP_NOT_HSPACE:
1191
case OP_NOT_VSPACE:
1192
case OP_PRUNE:
1193
case OP_PRUNE_ARG:
1194
case OP_RECURSE:
1195
case OP_REF:
1196
case OP_REFI:
1197
case OP_REVERSE:
1198
case OP_VREVERSE:
1199
case OP_RREF:
1200
case OP_SCOND:
1201
case OP_SET_SOM:
1202
case OP_SKIP:
1203
case OP_SKIP_ARG:
1204
case OP_SOD:
1205
case OP_SOM:
1206
case OP_THEN:
1207
case OP_THEN_ARG:
1208
return SSB_FAIL;
1209
1210
/* OP_CIRC happens only at the start of an anchored branch (multiline ^
1211
uses OP_CIRCM). Skip over it. */
1212
1213
case OP_CIRC:
1214
tcode += PRIV(OP_lengths)[OP_CIRC];
1215
break;
1216
1217
/* A "real" property test implies no starting bits, but the fake property
1218
PT_CLIST identifies a list of characters. These lists are short, as they
1219
are used for characters with more than one "other case", so there is no
1220
point in recognizing them for OP_NOTPROP. */
1221
1222
case OP_PROP:
1223
if (tcode[1] != PT_CLIST) return SSB_FAIL;
1224
{
1225
const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1226
while ((c = *p++) < NOTACHAR)
1227
{
1228
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1229
if (utf)
1230
{
1231
PCRE2_UCHAR buff[6];
1232
(void)PRIV(ord2utf)(c, buff);
1233
c = buff[0];
1234
}
1235
#endif
1236
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1237
}
1238
}
1239
try_next = FALSE;
1240
break;
1241
1242
/* We can ignore word boundary tests. */
1243
1244
case OP_WORD_BOUNDARY:
1245
case OP_NOT_WORD_BOUNDARY:
1246
case OP_UCP_WORD_BOUNDARY:
1247
case OP_NOT_UCP_WORD_BOUNDARY:
1248
tcode++;
1249
break;
1250
1251
/* For a positive lookahead assertion, inspect what immediately follows,
1252
ignoring intermediate assertions and callouts. If the next item is one
1253
that sets a mandatory character, skip this assertion. Otherwise, treat it
1254
the same as other bracket groups. */
1255
1256
case OP_ASSERT:
1257
case OP_ASSERT_NA:
1258
ncode = tcode + GET(tcode, 1);
1259
while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1260
ncode += 1 + LINK_SIZE;
1261
1262
/* Skip irrelevant items */
1263
1264
for (BOOL done = FALSE; !done;)
1265
{
1266
switch (*ncode)
1267
{
1268
case OP_ASSERT:
1269
case OP_ASSERT_NOT:
1270
case OP_ASSERTBACK:
1271
case OP_ASSERTBACK_NOT:
1272
case OP_ASSERT_NA:
1273
case OP_ASSERTBACK_NA:
1274
case OP_ASSERT_SCS:
1275
ncode += GET(ncode, 1);
1276
while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1277
ncode += 1 + LINK_SIZE;
1278
break;
1279
1280
case OP_WORD_BOUNDARY:
1281
case OP_NOT_WORD_BOUNDARY:
1282
case OP_UCP_WORD_BOUNDARY:
1283
case OP_NOT_UCP_WORD_BOUNDARY:
1284
ncode++;
1285
break;
1286
1287
case OP_CALLOUT:
1288
ncode += PRIV(OP_lengths)[OP_CALLOUT];
1289
break;
1290
1291
case OP_CALLOUT_STR:
1292
ncode += GET(ncode, 1 + 2*LINK_SIZE);
1293
break;
1294
1295
default:
1296
done = TRUE;
1297
break;
1298
}
1299
}
1300
1301
/* Now check the next significant item. */
1302
1303
switch(*ncode)
1304
{
1305
default:
1306
break;
1307
1308
case OP_PROP:
1309
if (ncode[1] != PT_CLIST) break;
1310
PCRE2_FALLTHROUGH /* Fall through */
1311
case OP_ANYNL:
1312
case OP_CHAR:
1313
case OP_CHARI:
1314
case OP_EXACT:
1315
case OP_EXACTI:
1316
case OP_HSPACE:
1317
case OP_MINPLUS:
1318
case OP_MINPLUSI:
1319
case OP_PLUS:
1320
case OP_PLUSI:
1321
case OP_POSPLUS:
1322
case OP_POSPLUSI:
1323
case OP_VSPACE:
1324
/* Note that these types will only be present in non-UCP mode. */
1325
case OP_DIGIT:
1326
case OP_NOT_DIGIT:
1327
case OP_WORDCHAR:
1328
case OP_NOT_WORDCHAR:
1329
case OP_WHITESPACE:
1330
case OP_NOT_WHITESPACE:
1331
tcode = ncode;
1332
continue; /* With the following significant opcode */
1333
}
1334
PCRE2_FALLTHROUGH /* Fall through */
1335
1336
/* For a group bracket or a positive assertion without an immediately
1337
following mandatory setting, recurse to set bits from within the
1338
subpattern. If it can't find anything, we have to give up. If it finds
1339
some mandatory character(s), we are done for this branch. Otherwise,
1340
carry on scanning after the subpattern. */
1341
1342
case OP_BRA:
1343
case OP_SBRA:
1344
case OP_CBRA:
1345
case OP_SCBRA:
1346
case OP_BRAPOS:
1347
case OP_SBRAPOS:
1348
case OP_CBRAPOS:
1349
case OP_SCBRAPOS:
1350
case OP_ONCE:
1351
case OP_SCRIPT_RUN:
1352
rc = set_start_bits(re, tcode, utf, ucp, depthptr);
1353
if (rc == SSB_DONE)
1354
{
1355
try_next = FALSE;
1356
}
1357
else if (rc == SSB_CONTINUE)
1358
{
1359
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1360
tcode += 1 + LINK_SIZE;
1361
}
1362
else return rc; /* FAIL, UNKNOWN, or TOODEEP */
1363
break;
1364
1365
/* If we hit ALT or KET, it means we haven't found anything mandatory in
1366
this branch, though we might have found something optional. For ALT, we
1367
continue with the next alternative, but we have to arrange that the final
1368
result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1369
return SSB_CONTINUE: if this is the top level, that indicates failure,
1370
but after a nested subpattern, it causes scanning to continue. */
1371
1372
case OP_ALT:
1373
yield = SSB_CONTINUE;
1374
try_next = FALSE;
1375
break;
1376
1377
case OP_KET:
1378
case OP_KETRMAX:
1379
case OP_KETRMIN:
1380
case OP_KETRPOS:
1381
return SSB_CONTINUE;
1382
1383
/* Skip over callout */
1384
1385
case OP_CALLOUT:
1386
tcode += PRIV(OP_lengths)[OP_CALLOUT];
1387
break;
1388
1389
case OP_CALLOUT_STR:
1390
tcode += GET(tcode, 1 + 2*LINK_SIZE);
1391
break;
1392
1393
/* Skip over lookbehind, negative lookahead, and scan substring
1394
assertions */
1395
1396
case OP_ASSERT_NOT:
1397
case OP_ASSERTBACK:
1398
case OP_ASSERTBACK_NOT:
1399
case OP_ASSERTBACK_NA:
1400
case OP_ASSERT_SCS:
1401
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1402
tcode += 1 + LINK_SIZE;
1403
break;
1404
1405
/* BRAZERO does the bracket, but carries on. */
1406
1407
case OP_BRAZERO:
1408
case OP_BRAMINZERO:
1409
case OP_BRAPOSZERO:
1410
rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
1411
if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
1412
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1413
tcode += 1 + LINK_SIZE;
1414
break;
1415
1416
/* SKIPZERO skips the bracket. */
1417
1418
case OP_SKIPZERO:
1419
tcode++;
1420
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1421
tcode += 1 + LINK_SIZE;
1422
break;
1423
1424
/* Single-char * or ? sets the bit and tries the next item */
1425
1426
case OP_STAR:
1427
case OP_MINSTAR:
1428
case OP_POSSTAR:
1429
case OP_QUERY:
1430
case OP_MINQUERY:
1431
case OP_POSQUERY:
1432
tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1433
break;
1434
1435
case OP_STARI:
1436
case OP_MINSTARI:
1437
case OP_POSSTARI:
1438
case OP_QUERYI:
1439
case OP_MINQUERYI:
1440
case OP_POSQUERYI:
1441
tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1442
break;
1443
1444
/* Single-char upto sets the bit and tries the next */
1445
1446
case OP_UPTO:
1447
case OP_MINUPTO:
1448
case OP_POSUPTO:
1449
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
1450
break;
1451
1452
case OP_UPTOI:
1453
case OP_MINUPTOI:
1454
case OP_POSUPTOI:
1455
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
1456
break;
1457
1458
/* At least one single char sets the bit and stops */
1459
1460
case OP_EXACT:
1461
tcode += IMM2_SIZE;
1462
PCRE2_FALLTHROUGH /* Fall through */
1463
case OP_CHAR:
1464
case OP_PLUS:
1465
case OP_MINPLUS:
1466
case OP_POSPLUS:
1467
(void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1468
try_next = FALSE;
1469
break;
1470
1471
case OP_EXACTI:
1472
tcode += IMM2_SIZE;
1473
PCRE2_FALLTHROUGH /* Fall through */
1474
case OP_CHARI:
1475
case OP_PLUSI:
1476
case OP_MINPLUSI:
1477
case OP_POSPLUSI:
1478
(void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1479
try_next = FALSE;
1480
break;
1481
1482
/* Special spacing and line-terminating items. These recognize specific
1483
lists of characters. The difference between VSPACE and ANYNL is that the
1484
latter can match the two-character CRLF sequence, but that is not
1485
relevant for finding the first character, so their code here is
1486
identical. */
1487
1488
case OP_HSPACE:
1489
SET_BIT(CHAR_HT);
1490
SET_BIT(CHAR_SPACE);
1491
1492
/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1493
the bits for NBSP and for code units >= 255, independently of UTF. */
1494
1495
#if PCRE2_CODE_UNIT_WIDTH != 8
1496
SET_BIT(CHAR_NBSP);
1497
SET_BIT(0xFF);
1498
#else
1499
/* For the 8-bit library in UTF-8 mode, set the bits for the first code
1500
units of horizontal space characters. */
1501
1502
#ifdef SUPPORT_UNICODE
1503
if (utf)
1504
{
1505
SET_BIT(0xC2); /* For U+00A0 */
1506
SET_BIT(0xE1); /* For U+1680, U+180E */
1507
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1508
SET_BIT(0xE3); /* For U+3000 */
1509
}
1510
else
1511
#endif
1512
/* For the 8-bit library not in UTF-8 mode, set the bit for NBSP. */
1513
{
1514
SET_BIT(CHAR_NBSP);
1515
}
1516
#endif /* 8-bit support */
1517
1518
try_next = FALSE;
1519
break;
1520
1521
case OP_ANYNL:
1522
case OP_VSPACE:
1523
SET_BIT(CHAR_LF);
1524
SET_BIT(CHAR_VT);
1525
SET_BIT(CHAR_FF);
1526
SET_BIT(CHAR_CR);
1527
1528
/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1529
the bits for NEL and for code units >= 255, independently of UTF. */
1530
1531
#if PCRE2_CODE_UNIT_WIDTH != 8
1532
SET_BIT(CHAR_NEL);
1533
SET_BIT(0xFF);
1534
#else
1535
/* For the 8-bit library in UTF-8 mode, set the bits for the first code
1536
units of vertical space characters. */
1537
1538
#ifdef SUPPORT_UNICODE
1539
if (utf)
1540
{
1541
SET_BIT(0xC2); /* For U+0085 (NEL) */
1542
SET_BIT(0xE2); /* For U+2028, U+2029 */
1543
}
1544
else
1545
#endif
1546
/* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1547
{
1548
SET_BIT(CHAR_NEL);
1549
}
1550
#endif /* 8-bit support */
1551
1552
try_next = FALSE;
1553
break;
1554
1555
/* Single character types set the bits and stop. Note that if PCRE2_UCP
1556
is set, we do not see these opcodes because \d etc are converted to
1557
properties. Therefore, these apply in the case when only characters less
1558
than 256 are recognized to match the types. */
1559
1560
case OP_NOT_DIGIT:
1561
set_nottype_bits(re, cbit_digit, table_limit);
1562
try_next = FALSE;
1563
break;
1564
1565
case OP_DIGIT:
1566
set_type_bits(re, cbit_digit, table_limit);
1567
try_next = FALSE;
1568
break;
1569
1570
case OP_NOT_WHITESPACE:
1571
set_nottype_bits(re, cbit_space, table_limit);
1572
try_next = FALSE;
1573
break;
1574
1575
case OP_WHITESPACE:
1576
set_type_bits(re, cbit_space, table_limit);
1577
try_next = FALSE;
1578
break;
1579
1580
case OP_NOT_WORDCHAR:
1581
set_nottype_bits(re, cbit_word, table_limit);
1582
try_next = FALSE;
1583
break;
1584
1585
case OP_WORDCHAR:
1586
set_type_bits(re, cbit_word, table_limit);
1587
try_next = FALSE;
1588
break;
1589
1590
/* One or more character type fudges the pointer and restarts, knowing
1591
it will hit a single character type and stop there. */
1592
1593
case OP_TYPEPLUS:
1594
case OP_TYPEMINPLUS:
1595
case OP_TYPEPOSPLUS:
1596
tcode++;
1597
break;
1598
1599
case OP_TYPEEXACT:
1600
tcode += 1 + IMM2_SIZE;
1601
break;
1602
1603
/* Zero or more repeats of character types set the bits and then
1604
try again. */
1605
1606
case OP_TYPEUPTO:
1607
case OP_TYPEMINUPTO:
1608
case OP_TYPEPOSUPTO:
1609
tcode += IMM2_SIZE;
1610
PCRE2_FALLTHROUGH /* Fall through */
1611
1612
case OP_TYPESTAR:
1613
case OP_TYPEMINSTAR:
1614
case OP_TYPEPOSSTAR:
1615
case OP_TYPEQUERY:
1616
case OP_TYPEMINQUERY:
1617
case OP_TYPEPOSQUERY:
1618
switch(tcode[1])
1619
{
1620
default:
1621
case OP_ANY:
1622
case OP_ALLANY:
1623
return SSB_FAIL;
1624
1625
case OP_HSPACE:
1626
SET_BIT(CHAR_HT);
1627
SET_BIT(CHAR_SPACE);
1628
1629
/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1630
the bits for NBSP and for code units >= 255, independently of UTF. */
1631
1632
#if PCRE2_CODE_UNIT_WIDTH != 8
1633
SET_BIT(CHAR_NBSP);
1634
SET_BIT(0xFF);
1635
#else
1636
/* For the 8-bit library in UTF-8 mode, set the bits for the first code
1637
units of horizontal space characters. */
1638
1639
#ifdef SUPPORT_UNICODE
1640
if (utf)
1641
{
1642
SET_BIT(0xC2); /* For U+00A0 */
1643
SET_BIT(0xE1); /* For U+1680, U+180E */
1644
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1645
SET_BIT(0xE3); /* For U+3000 */
1646
}
1647
else
1648
#endif
1649
/* For the 8-bit library not in UTF-8 mode, set the bit for NBSP. */
1650
{
1651
SET_BIT(CHAR_NBSP);
1652
}
1653
#endif /* 8-bit support */
1654
break;
1655
1656
case OP_ANYNL:
1657
case OP_VSPACE:
1658
SET_BIT(CHAR_LF);
1659
SET_BIT(CHAR_VT);
1660
SET_BIT(CHAR_FF);
1661
SET_BIT(CHAR_CR);
1662
1663
/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1664
the bits for NEL and for code units >= 255, independently of UTF. */
1665
1666
#if PCRE2_CODE_UNIT_WIDTH != 8
1667
SET_BIT(CHAR_NEL);
1668
SET_BIT(0xFF);
1669
#else
1670
/* For the 8-bit library in UTF-8 mode, set the bits for the first code
1671
units of vertical space characters. */
1672
1673
#ifdef SUPPORT_UNICODE
1674
if (utf)
1675
{
1676
SET_BIT(0xC2); /* For U+0085 (NEL) */
1677
SET_BIT(0xE2); /* For U+2028, U+2029 */
1678
}
1679
else
1680
#endif
1681
/* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1682
{
1683
SET_BIT(CHAR_NEL);
1684
}
1685
#endif /* 8-bit support */
1686
break;
1687
1688
case OP_NOT_DIGIT:
1689
set_nottype_bits(re, cbit_digit, table_limit);
1690
break;
1691
1692
case OP_DIGIT:
1693
set_type_bits(re, cbit_digit, table_limit);
1694
break;
1695
1696
case OP_NOT_WHITESPACE:
1697
set_nottype_bits(re, cbit_space, table_limit);
1698
break;
1699
1700
case OP_WHITESPACE:
1701
set_type_bits(re, cbit_space, table_limit);
1702
break;
1703
1704
case OP_NOT_WORDCHAR:
1705
set_nottype_bits(re, cbit_word, table_limit);
1706
break;
1707
1708
case OP_WORDCHAR:
1709
set_type_bits(re, cbit_word, table_limit);
1710
break;
1711
}
1712
1713
tcode += 2;
1714
break;
1715
1716
/* Set-based ECLASS: treat it the same as a "complex" XCLASS; give up. */
1717
1718
#ifdef SUPPORT_WIDE_CHARS
1719
case OP_ECLASS:
1720
return SSB_FAIL;
1721
#endif
1722
1723
/* Extended class: if there are any property checks, or if this is a
1724
negative XCLASS without a map, give up. If there are no property checks,
1725
there must be wide characters on the XCLASS list, because otherwise an
1726
XCLASS would not have been created. This means that code points >= 255
1727
are potential starters. In the UTF-8 case we can scan them and set bits
1728
for the relevant leading bytes. */
1729
1730
#ifdef SUPPORT_WIDE_CHARS
1731
case OP_XCLASS:
1732
xclassflags = tcode[1 + LINK_SIZE];
1733
if ((xclassflags & XCL_HASPROP) != 0 ||
1734
(xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1735
return SSB_FAIL;
1736
1737
/* We have a positive XCLASS or a negative one without a map. Set up the
1738
map pointer if there is one, and fall through. */
1739
1740
classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
1741
(const uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1742
1743
/* In UTF-8 mode, scan the character list and set bits for leading bytes,
1744
then jump to handle the map. */
1745
1746
#if PCRE2_CODE_UNIT_WIDTH == 8
1747
if (utf && (xclassflags & XCL_NOT) == 0)
1748
{
1749
PCRE2_UCHAR b, e;
1750
PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
1751
tcode += GET(tcode, 1);
1752
1753
if (*p >= XCL_LIST)
1754
{
1755
study_char_list(p, re->start_bitmap,
1756
((const uint8_t *)re + re->code_start));
1757
goto HANDLE_CLASSMAP;
1758
}
1759
1760
for (;;) switch (*p++)
1761
{
1762
case XCL_SINGLE:
1763
b = *p++;
1764
while ((*p & 0xc0) == 0x80) p++;
1765
re->start_bitmap[b/8] |= (1u << (b&7));
1766
break;
1767
1768
case XCL_RANGE:
1769
b = *p++;
1770
while ((*p & 0xc0) == 0x80) p++;
1771
e = *p++;
1772
while ((*p & 0xc0) == 0x80) p++;
1773
for (; b <= e; b++)
1774
re->start_bitmap[b/8] |= (1u << (b&7));
1775
break;
1776
1777
case XCL_END:
1778
goto HANDLE_CLASSMAP;
1779
1780
/* LCOV_EXCL_START */
1781
default:
1782
PCRE2_DEBUG_UNREACHABLE();
1783
return SSB_UNKNOWN; /* Internal error, should not occur */
1784
/* LCOV_EXCL_STOP */
1785
}
1786
}
1787
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
1788
#endif /* SUPPORT_WIDE_CHARS */
1789
1790
/* It seems that the fall through comment must be outside the #ifdef if
1791
it is to avoid the gcc compiler warning. */
1792
1793
PCRE2_FALLTHROUGH /* Fall through */
1794
1795
/* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1796
in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1797
because it starts a character with a value > 255. In 8-bit non-UTF mode,
1798
there is no difference between CLASS and NCLASS. In all other wide
1799
character modes, set the 0xFF bit to indicate code units >= 255. */
1800
1801
case OP_NCLASS:
1802
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1803
if (utf)
1804
{
1805
re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1806
memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1807
}
1808
PCRE2_FALLTHROUGH /* Fall through */
1809
#elif PCRE2_CODE_UNIT_WIDTH != 8
1810
SET_BIT(0xFF); /* For characters >= 255 */
1811
PCRE2_FALLTHROUGH /* Fall through */
1812
#endif
1813
1814
/* Enter here for a positive non-XCLASS. If we have fallen through from
1815
an XCLASS, classmap will already be set; just advance the code pointer.
1816
Otherwise, set up classmap for a non-XCLASS and advance past it. */
1817
1818
case OP_CLASS:
1819
if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1820
{
1821
classmap = (const uint8_t *)(++tcode);
1822
tcode += 32 / sizeof(PCRE2_UCHAR);
1823
}
1824
1825
/* When wide characters are supported, classmap may be NULL. In UTF-8
1826
(sic) mode, the bits in a class bit map correspond to character values,
1827
not to byte values. However, the bit map we are constructing is for byte
1828
values. So we have to do a conversion for characters whose code point is
1829
greater than 127. In fact, there are only two possible starting bytes for
1830
characters in the range 128 - 255. */
1831
1832
#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
1833
HANDLE_CLASSMAP:
1834
#endif
1835
if (classmap != NULL)
1836
{
1837
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1838
if (utf)
1839
{
1840
for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1841
for (c = 128; c < 256; c++)
1842
{
1843
if ((classmap[c/8] & (1u << (c&7))) != 0)
1844
{
1845
int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1846
re->start_bitmap[d/8] |= (1u << (d&7)); /* and then skip on to the */
1847
c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1848
}
1849
}
1850
}
1851
else
1852
#endif
1853
/* In all modes except UTF-8, the two bit maps are compatible. */
1854
1855
{
1856
for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1857
}
1858
}
1859
1860
/* Act on what follows the class. For a zero minimum repeat, continue;
1861
otherwise stop processing. */
1862
1863
switch (*tcode)
1864
{
1865
case OP_CRSTAR:
1866
case OP_CRMINSTAR:
1867
case OP_CRQUERY:
1868
case OP_CRMINQUERY:
1869
case OP_CRPOSSTAR:
1870
case OP_CRPOSQUERY:
1871
tcode++;
1872
break;
1873
1874
case OP_CRRANGE:
1875
case OP_CRMINRANGE:
1876
case OP_CRPOSRANGE:
1877
if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1878
else try_next = FALSE;
1879
break;
1880
1881
default:
1882
try_next = FALSE;
1883
break;
1884
}
1885
break; /* End of class handling case */
1886
} /* End of switch for opcodes */
1887
} /* End of try_next loop */
1888
1889
code += GET(code, 1); /* Advance to next branch */
1890
}
1891
while (*code == OP_ALT);
1892
1893
return yield;
1894
}
1895
1896
1897
1898
/*************************************************
1899
* Study a compiled expression *
1900
*************************************************/
1901
1902
/* This function is handed a compiled expression that it must study to produce
1903
information that will speed up the matching.
1904
1905
Argument:
1906
re points to the compiled expression
1907
1908
Returns: 0 normally; non-zero should never normally occur
1909
1 unknown opcode in set_start_bits
1910
2 missing capturing bracket
1911
3 unknown opcode in find_minlength
1912
*/
1913
1914
int
1915
PRIV(study)(pcre2_real_code *re)
1916
{
1917
int count = 0;
1918
PCRE2_UCHAR *code;
1919
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1920
BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
1921
1922
/* Find start of compiled code */
1923
1924
code = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
1925
1926
/* For a pattern that has a first code unit, or a multiline pattern that
1927
matches only at "line start", there is no point in seeking a list of starting
1928
code units. */
1929
1930
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1931
{
1932
int depth = 0;
1933
int rc = set_start_bits(re, code, utf, ucp, &depth);
1934
/* LCOV_EXCL_START */
1935
if (rc == SSB_UNKNOWN)
1936
{
1937
PCRE2_DEBUG_UNREACHABLE();
1938
return 1;
1939
}
1940
/* LCOV_EXCL_STOP */
1941
1942
/* If a list of starting code units was set up, scan the list to see if only
1943
one or two were listed. Having only one listed is rare because usually a
1944
single starting code unit will have been recognized and PCRE2_FIRSTSET set.
1945
If two are listed, see if they are caseless versions of the same character;
1946
if so we can replace the list with a caseless first code unit. This gives
1947
better performance and is plausibly worth doing for patterns such as [Ww]ord
1948
or (word|WORD). */
1949
1950
if (rc == SSB_DONE)
1951
{
1952
int i;
1953
int a = -1;
1954
int b = -1;
1955
uint8_t *p = re->start_bitmap;
1956
uint32_t flags = PCRE2_FIRSTMAPSET;
1957
1958
for (i = 0; i < 256; p++, i += 8)
1959
{
1960
uint8_t x = *p;
1961
if (x != 0)
1962
{
1963
int c;
1964
uint8_t y = x & (~x + 1); /* Least significant bit */
1965
if (y != x) goto DONE; /* More than one bit set */
1966
1967
/* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
1968
all wide characters", so we cannot use it here. */
1969
1970
#if PCRE2_CODE_UNIT_WIDTH != 8
1971
if (i == 248 && x == 0x80) goto DONE;
1972
#endif
1973
1974
/* Compute the character value */
1975
1976
c = i;
1977
switch (x)
1978
{
1979
case 1: break;
1980
case 2: c += 1; break; case 4: c += 2; break;
1981
case 8: c += 3; break; case 16: c += 4; break;
1982
case 32: c += 5; break; case 64: c += 6; break;
1983
case 128: c += 7; break;
1984
}
1985
1986
/* c contains the code unit value, in the range 0-255. In 8-bit UTF
1987
mode, only values < 128 can be used. In all the other cases, c is a
1988
character value. */
1989
1990
#if PCRE2_CODE_UNIT_WIDTH == 8
1991
if (utf && c > 127) goto DONE;
1992
#endif
1993
if (a < 0) a = c; /* First one found, save in a */
1994
else if (b < 0) /* Second one found */
1995
{
1996
int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
1997
1998
#ifdef SUPPORT_UNICODE
1999
if (utf || ucp)
2000
{
2001
if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
2002
if (c > 127) d = UCD_OTHERCASE(c);
2003
}
2004
#endif /* SUPPORT_UNICODE */
2005
2006
if (d != a) goto DONE; /* Not the other case of a */
2007
b = c; /* Save second in b */
2008
2009
#ifdef EBCDIC
2010
/* To match ASCII (which puts the uppercase one in a), swap a & b
2011
if needed. This doesn't really matter, but neatens the tests. */
2012
if (TABLE_GET((unsigned int)a, re->tables + lcc_offset, a) == a)
2013
{
2014
b = a;
2015
a = c;
2016
}
2017
#endif
2018
}
2019
else goto DONE; /* More than two characters found */
2020
}
2021
}
2022
2023
/* Replace the start code unit bits with a first code unit. If it is the
2024
same as a required later code unit, then clear the required later code
2025
unit. This is because a search for a required code unit starts after an
2026
explicit first code unit, but at a code unit found from the bitmap.
2027
Patterns such as /a*a/ don't work if both the start unit and required
2028
unit are the same. */
2029
2030
if (a >= 0) {
2031
if ((re->flags & PCRE2_LASTSET) && (re->last_codeunit == (uint32_t)a || (b >= 0 && re->last_codeunit == (uint32_t)b))) {
2032
re->flags &= ~(PCRE2_LASTSET | PCRE2_LASTCASELESS);
2033
re->last_codeunit = 0;
2034
}
2035
re->first_codeunit = a;
2036
flags = PCRE2_FIRSTSET;
2037
if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
2038
}
2039
2040
DONE:
2041
re->flags |= flags;
2042
}
2043
}
2044
2045
/* Find the minimum length of subject string. If the pattern can match an empty
2046
string, the minimum length is already known. If the pattern contains (*ACCEPT)
2047
all bets are off, and we don't even try to find a minimum length. If there are
2048
more back references than the size of the vector we are going to cache them in,
2049
do nothing. A pattern that complicated will probably take a long time to
2050
analyze and may in any case turn out to be too complicated. Note that back
2051
reference minima are held as 16-bit numbers. */
2052
2053
if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&
2054
re->top_backref <= MAX_CACHE_BACKREF)
2055
{
2056
int min;
2057
int backref_cache[MAX_CACHE_BACKREF+1];
2058
backref_cache[0] = 0; /* Highest one that is set */
2059
min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
2060
switch(min)
2061
{
2062
case -1: /* \C in UTF mode or over-complex regex */
2063
break; /* Leave minlength unchanged (will be zero) */
2064
2065
/* LCOV_EXCL_START */
2066
case -2:
2067
PCRE2_DEBUG_UNREACHABLE();
2068
return 2; /* missing capturing bracket */
2069
/* LCOV_EXCL_STOP */
2070
2071
/* LCOV_EXCL_START */
2072
case -3:
2073
PCRE2_DEBUG_UNREACHABLE();
2074
return 3; /* unrecognized opcode */
2075
/* LCOV_EXCL_STOP */
2076
2077
default:
2078
re->minlength = (min > (int)UINT16_MAX)? (int)UINT16_MAX : min;
2079
break;
2080
}
2081
}
2082
2083
return 0;
2084
}
2085
2086
/* End of pcre2_study.c */
2087
2088