Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_study.c
9903 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
/* This module contains functions for scanning a compiled pattern and
42
collecting data (e.g. minimum matching length). */
43
44
45
#ifdef HAVE_CONFIG_H
46
#include "config.h"
47
#endif
48
49
#include "pcre2_internal.h"
50
51
/* The maximum remembered capturing brackets minimum. */
52
53
#define MAX_CACHE_BACKREF 128
54
55
/* Set a bit in the starting code unit bit map. */
56
57
#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
58
59
/* Returns from set_start_bits() */
60
61
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
62
63
64
/*************************************************
65
* Find the minimum subject length for a group *
66
*************************************************/
67
68
/* Scan a parenthesized group and compute the minimum length of subject that
69
is needed to match it. This is a lower bound; it does not mean there is a
70
string of that length that matches. In UTF mode, the result is in characters
71
rather than code units. The field in a compiled pattern for storing the minimum
72
length is 16-bits long (on the grounds that anything longer than that is
73
pathological), so we give up when we reach that amount. This also means that
74
integer overflow for really crazy patterns cannot happen.
75
76
Backreference minimum lengths are cached to speed up multiple references. This
77
function is called only when the highest back reference in the pattern is less
78
than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
79
caching vector. The zeroth element contains the number of the highest set
80
value.
81
82
Arguments:
83
re compiled pattern block
84
code pointer to start of group (the bracket)
85
startcode pointer to start of the whole pattern's code
86
utf UTF flag
87
recurses chain of recurse_check to catch mutual recursion
88
countptr pointer to call count (to catch over complexity)
89
backref_cache vector for caching back references.
90
91
This function is no longer called when the pattern contains (*ACCEPT); however,
92
the old code for returning -1 is retained, just in case.
93
94
Returns: the minimum length
95
-1 \C in UTF-8 mode
96
or (*ACCEPT)
97
or pattern too complicated
98
-2 internal error (missing capturing bracket)
99
-3 internal error (opcode not listed)
100
*/
101
102
static int
103
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
104
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
105
int *backref_cache)
106
{
107
int length = -1;
108
int branchlength = 0;
109
int prev_cap_recno = -1;
110
int prev_cap_d = 0;
111
int prev_recurse_recno = -1;
112
int prev_recurse_d = 0;
113
uint32_t once_fudge = 0;
114
BOOL had_recurse = FALSE;
115
BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
116
PCRE2_SPTR nextbranch = code + GET(code, 1);
117
PCRE2_SPTR cc = code + 1 + LINK_SIZE;
118
recurse_check this_recurse;
119
120
/* If this is a "could be empty" group, its minimum length is 0. */
121
122
if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
123
124
/* Skip over capturing bracket number */
125
126
if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
127
128
/* A large and/or complex regex can take too long to process. */
129
130
if ((*countptr)++ > 1000) return -1;
131
132
/* Scan along the opcodes for this branch. If we get to the end of the branch,
133
check the length against that of the other branches. If the accumulated length
134
passes 16-bits, reset to that value and skip the rest of the branch. */
135
136
for (;;)
137
{
138
int d, min, recno;
139
PCRE2_UCHAR op;
140
PCRE2_SPTR cs, ce;
141
142
if (branchlength >= UINT16_MAX)
143
{
144
branchlength = UINT16_MAX;
145
cc = nextbranch;
146
}
147
148
op = *cc;
149
switch (op)
150
{
151
case OP_COND:
152
case OP_SCOND:
153
154
/* If there is only one branch in a condition, the implied branch has zero
155
length, so we don't add anything. This covers the DEFINE "condition"
156
automatically. If there are two branches we can treat it the same as any
157
other non-capturing subpattern. */
158
159
cs = cc + GET(cc, 1);
160
if (*cs != OP_ALT)
161
{
162
cc = cs + 1 + LINK_SIZE;
163
break;
164
}
165
goto PROCESS_NON_CAPTURE;
166
167
case OP_BRA:
168
/* There's a special case of OP_BRA, when it is wrapped round a repeated
169
OP_RECURSE. We'd like to process the latter at this level so that
170
remembering the value works for repeated cases. So we do nothing, but
171
set a fudge value to skip over the OP_KET after the recurse. */
172
173
if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
174
{
175
once_fudge = 1 + LINK_SIZE;
176
cc += 1 + LINK_SIZE;
177
break;
178
}
179
/* Fall through */
180
181
case OP_ONCE:
182
case OP_SCRIPT_RUN:
183
case OP_SBRA:
184
case OP_BRAPOS:
185
case OP_SBRAPOS:
186
PROCESS_NON_CAPTURE:
187
d = find_minlength(re, cc, startcode, utf, recurses, countptr,
188
backref_cache);
189
if (d < 0) return d;
190
branchlength += d;
191
do cc += GET(cc, 1); while (*cc == OP_ALT);
192
cc += 1 + LINK_SIZE;
193
break;
194
195
/* To save time for repeated capturing subpatterns, we remember the
196
length of the previous one. Unfortunately we can't do the same for
197
the unnumbered ones above. Nor can we do this if (?| is present in the
198
pattern because captures with the same number are not then identical. */
199
200
case OP_CBRA:
201
case OP_SCBRA:
202
case OP_CBRAPOS:
203
case OP_SCBRAPOS:
204
recno = (int)GET2(cc, 1+LINK_SIZE);
205
if (dupcapused || recno != prev_cap_recno)
206
{
207
prev_cap_recno = recno;
208
prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
209
backref_cache);
210
if (prev_cap_d < 0) return prev_cap_d;
211
}
212
branchlength += prev_cap_d;
213
do cc += GET(cc, 1); while (*cc == OP_ALT);
214
cc += 1 + LINK_SIZE;
215
break;
216
217
/* ACCEPT makes things far too complicated; we have to give up. In fact,
218
from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
219
used. However, leave the code in place, just in case. */
220
221
case OP_ACCEPT:
222
case OP_ASSERT_ACCEPT:
223
return -1;
224
225
/* Reached end of a branch; if it's a ket it is the end of a nested
226
call. If it's ALT it is an alternation in a nested call. If it is END it's
227
the end of the outer call. All can be handled by the same code. If the
228
length of any branch is zero, there is no need to scan any subsequent
229
branches. */
230
231
case OP_ALT:
232
case OP_KET:
233
case OP_KETRMAX:
234
case OP_KETRMIN:
235
case OP_KETRPOS:
236
case OP_END:
237
if (length < 0 || (!had_recurse && branchlength < length))
238
length = branchlength;
239
if (op != OP_ALT || length == 0) return length;
240
nextbranch = cc + GET(cc, 1);
241
cc += 1 + LINK_SIZE;
242
branchlength = 0;
243
had_recurse = FALSE;
244
break;
245
246
/* Skip over assertive subpatterns */
247
248
case OP_ASSERT:
249
case OP_ASSERT_NOT:
250
case OP_ASSERTBACK:
251
case OP_ASSERTBACK_NOT:
252
case OP_ASSERT_NA:
253
case OP_ASSERT_SCS:
254
case OP_ASSERTBACK_NA:
255
do cc += GET(cc, 1); while (*cc == OP_ALT);
256
/* Fall through */
257
258
/* Skip over things that don't match chars */
259
260
case OP_REVERSE:
261
case OP_VREVERSE:
262
case OP_CREF:
263
case OP_DNCREF:
264
case OP_RREF:
265
case OP_DNRREF:
266
case OP_FALSE:
267
case OP_TRUE:
268
case OP_CALLOUT:
269
case OP_SOD:
270
case OP_SOM:
271
case OP_EOD:
272
case OP_EODN:
273
case OP_CIRC:
274
case OP_CIRCM:
275
case OP_DOLL:
276
case OP_DOLLM:
277
case OP_NOT_WORD_BOUNDARY:
278
case OP_WORD_BOUNDARY:
279
case OP_NOT_UCP_WORD_BOUNDARY:
280
case OP_UCP_WORD_BOUNDARY:
281
cc += PRIV(OP_lengths)[*cc];
282
break;
283
284
case OP_CALLOUT_STR:
285
cc += GET(cc, 1 + 2*LINK_SIZE);
286
break;
287
288
/* Skip over a subpattern that has a {0} or {0,x} quantifier */
289
290
case OP_BRAZERO:
291
case OP_BRAMINZERO:
292
case OP_BRAPOSZERO:
293
case OP_SKIPZERO:
294
cc += PRIV(OP_lengths)[*cc];
295
do cc += GET(cc, 1); while (*cc == OP_ALT);
296
cc += 1 + LINK_SIZE;
297
break;
298
299
/* Handle literal characters and + repetitions */
300
301
case OP_CHAR:
302
case OP_CHARI:
303
case OP_NOT:
304
case OP_NOTI:
305
case OP_PLUS:
306
case OP_PLUSI:
307
case OP_MINPLUS:
308
case OP_MINPLUSI:
309
case OP_POSPLUS:
310
case OP_POSPLUSI:
311
case OP_NOTPLUS:
312
case OP_NOTPLUSI:
313
case OP_NOTMINPLUS:
314
case OP_NOTMINPLUSI:
315
case OP_NOTPOSPLUS:
316
case OP_NOTPOSPLUSI:
317
branchlength++;
318
cc += 2;
319
#ifdef SUPPORT_UNICODE
320
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
321
#endif
322
break;
323
324
case OP_TYPEPLUS:
325
case OP_TYPEMINPLUS:
326
case OP_TYPEPOSPLUS:
327
branchlength++;
328
cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
329
break;
330
331
/* Handle exact repetitions. The count is already in characters, but we
332
may need to skip over a multibyte character in UTF mode. */
333
334
case OP_EXACT:
335
case OP_EXACTI:
336
case OP_NOTEXACT:
337
case OP_NOTEXACTI:
338
branchlength += GET2(cc,1);
339
cc += 2 + IMM2_SIZE;
340
#ifdef SUPPORT_UNICODE
341
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
342
#endif
343
break;
344
345
case OP_TYPEEXACT:
346
branchlength += GET2(cc,1);
347
cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
348
|| cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
349
break;
350
351
/* Handle single-char non-literal matchers */
352
353
case OP_PROP:
354
case OP_NOTPROP:
355
cc += 2;
356
/* Fall through */
357
358
case OP_NOT_DIGIT:
359
case OP_DIGIT:
360
case OP_NOT_WHITESPACE:
361
case OP_WHITESPACE:
362
case OP_NOT_WORDCHAR:
363
case OP_WORDCHAR:
364
case OP_ANY:
365
case OP_ALLANY:
366
case OP_EXTUNI:
367
case OP_HSPACE:
368
case OP_NOT_HSPACE:
369
case OP_VSPACE:
370
case OP_NOT_VSPACE:
371
branchlength++;
372
cc++;
373
break;
374
375
/* "Any newline" might match two characters, but it also might match just
376
one. */
377
378
case OP_ANYNL:
379
branchlength += 1;
380
cc++;
381
break;
382
383
/* The single-byte matcher means we can't proceed in UTF mode. (In
384
non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
385
appear, but leave the code, just in case.) */
386
387
case OP_ANYBYTE:
388
#ifdef SUPPORT_UNICODE
389
if (utf) return -1;
390
#endif
391
branchlength++;
392
cc++;
393
break;
394
395
/* For repeated character types, we have to test for \p and \P, which have
396
an extra two bytes of parameters. */
397
398
case OP_TYPESTAR:
399
case OP_TYPEMINSTAR:
400
case OP_TYPEQUERY:
401
case OP_TYPEMINQUERY:
402
case OP_TYPEPOSSTAR:
403
case OP_TYPEPOSQUERY:
404
if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
405
cc += PRIV(OP_lengths)[op];
406
break;
407
408
case OP_TYPEUPTO:
409
case OP_TYPEMINUPTO:
410
case OP_TYPEPOSUPTO:
411
if (cc[1 + IMM2_SIZE] == OP_PROP
412
|| cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
413
cc += PRIV(OP_lengths)[op];
414
break;
415
416
/* Check a class for variable quantification */
417
418
case OP_CLASS:
419
case OP_NCLASS:
420
#ifdef SUPPORT_WIDE_CHARS
421
case OP_XCLASS:
422
case OP_ECLASS:
423
/* The original code caused an unsigned overflow in 64 bit systems,
424
so now we use a conditional statement. */
425
if (op == OP_XCLASS || op == OP_ECLASS)
426
cc += GET(cc, 1);
427
else
428
#endif
429
cc += PRIV(OP_lengths)[OP_CLASS];
430
431
switch (*cc)
432
{
433
case OP_CRPLUS:
434
case OP_CRMINPLUS:
435
case OP_CRPOSPLUS:
436
branchlength++;
437
/* Fall through */
438
439
case OP_CRSTAR:
440
case OP_CRMINSTAR:
441
case OP_CRQUERY:
442
case OP_CRMINQUERY:
443
case OP_CRPOSSTAR:
444
case OP_CRPOSQUERY:
445
cc++;
446
break;
447
448
case OP_CRRANGE:
449
case OP_CRMINRANGE:
450
case OP_CRPOSRANGE:
451
branchlength += GET2(cc,1);
452
cc += 1 + 2 * IMM2_SIZE;
453
break;
454
455
default:
456
branchlength++;
457
break;
458
}
459
break;
460
461
/* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
462
way: we find the minimum length for the subpattern. A recursion
463
(backreference or subroutine) causes an a flag to be set that causes the
464
length of this branch to be ignored. The logic is that a recursion can only
465
make sense if there is another alternative that stops the recursing. That
466
will provide the minimum length (when no recursion happens).
467
468
If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
469
matches an empty string (by default it causes a matching failure), so in
470
that case we must set the minimum length to zero.
471
472
For backreferenes, if duplicate numbers are present in the pattern we check
473
for a reference to a duplicate. If it is, we don't know which version will
474
be referenced, so we have to set the minimum length to zero. */
475
476
/* Duplicate named pattern back reference. */
477
478
case OP_DNREF:
479
case OP_DNREFI:
480
if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
481
{
482
int count = GET2(cc, 1+IMM2_SIZE);
483
PCRE2_SPTR slot =
484
(PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code)) +
485
GET2(cc, 1) * re->name_entry_size;
486
487
d = INT_MAX;
488
489
/* Scan all groups with the same name; find the shortest. */
490
491
while (count-- > 0)
492
{
493
int dd, i;
494
recno = GET2(slot, 0);
495
496
if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
497
dd = backref_cache[recno];
498
else
499
{
500
ce = cs = PRIV(find_bracket)(startcode, utf, recno);
501
if (cs == NULL) return -2;
502
do ce += GET(ce, 1); while (*ce == OP_ALT);
503
504
dd = 0;
505
if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)
506
{
507
if (cc > cs && cc < ce) /* Simple recursion */
508
{
509
had_recurse = TRUE;
510
}
511
else
512
{
513
recurse_check *r = recurses;
514
for (r = recurses; r != NULL; r = r->prev)
515
if (r->group == cs) break;
516
if (r != NULL) /* Mutual recursion */
517
{
518
had_recurse = TRUE;
519
}
520
else
521
{
522
this_recurse.prev = recurses; /* No recursion */
523
this_recurse.group = cs;
524
dd = find_minlength(re, cs, startcode, utf, &this_recurse,
525
countptr, backref_cache);
526
if (dd < 0) return dd;
527
}
528
}
529
}
530
531
backref_cache[recno] = dd;
532
for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
533
backref_cache[0] = recno;
534
}
535
536
if (dd < d) d = dd;
537
if (d <= 0) break; /* No point looking at any more */
538
slot += re->name_entry_size;
539
}
540
}
541
else d = 0;
542
cc += PRIV(OP_lengths)[*cc];
543
goto REPEAT_BACK_REFERENCE;
544
545
/* Single back reference by number. References by name are converted to by
546
number when there is no duplication. */
547
548
case OP_REF:
549
case OP_REFI:
550
recno = GET2(cc, 1);
551
if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
552
d = backref_cache[recno];
553
else
554
{
555
int i;
556
d = 0;
557
558
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
559
{
560
ce = cs = PRIV(find_bracket)(startcode, utf, recno);
561
if (cs == NULL) return -2;
562
do ce += GET(ce, 1); while (*ce == OP_ALT);
563
564
if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)
565
{
566
if (cc > cs && cc < ce) /* Simple recursion */
567
{
568
had_recurse = TRUE;
569
}
570
else
571
{
572
recurse_check *r = recurses;
573
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
574
if (r != NULL) /* Mutual recursion */
575
{
576
had_recurse = TRUE;
577
}
578
else /* No recursion */
579
{
580
this_recurse.prev = recurses;
581
this_recurse.group = cs;
582
d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
583
backref_cache);
584
if (d < 0) return d;
585
}
586
}
587
}
588
}
589
590
backref_cache[recno] = d;
591
for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
592
backref_cache[0] = recno;
593
}
594
595
cc += PRIV(OP_lengths)[*cc];
596
597
/* Handle repeated back references */
598
599
REPEAT_BACK_REFERENCE:
600
switch (*cc)
601
{
602
case OP_CRSTAR:
603
case OP_CRMINSTAR:
604
case OP_CRQUERY:
605
case OP_CRMINQUERY:
606
case OP_CRPOSSTAR:
607
case OP_CRPOSQUERY:
608
min = 0;
609
cc++;
610
break;
611
612
case OP_CRPLUS:
613
case OP_CRMINPLUS:
614
case OP_CRPOSPLUS:
615
min = 1;
616
cc++;
617
break;
618
619
case OP_CRRANGE:
620
case OP_CRMINRANGE:
621
case OP_CRPOSRANGE:
622
min = GET2(cc, 1);
623
cc += 1 + 2 * IMM2_SIZE;
624
break;
625
626
default:
627
min = 1;
628
break;
629
}
630
631
/* Take care not to overflow: (1) min and d are ints, so check that their
632
product is not greater than INT_MAX. (2) branchlength is limited to
633
UINT16_MAX (checked at the top of the loop). */
634
635
if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
636
branchlength = UINT16_MAX;
637
else branchlength += min * d;
638
break;
639
640
/* Recursion always refers to the first occurrence of a subpattern with a
641
given number. Therefore, we can always make use of caching, even when the
642
pattern contains multiple subpatterns with the same number. */
643
644
case OP_RECURSE:
645
cs = ce = startcode + GET(cc, 1);
646
recno = GET2(cs, 1+LINK_SIZE);
647
if (recno == prev_recurse_recno)
648
{
649
branchlength += prev_recurse_d;
650
}
651
else
652
{
653
do ce += GET(ce, 1); while (*ce == OP_ALT);
654
if (cc > cs && cc < ce) /* Simple recursion */
655
had_recurse = TRUE;
656
else
657
{
658
recurse_check *r = recurses;
659
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
660
if (r != NULL) /* Mutual recursion */
661
had_recurse = TRUE;
662
else
663
{
664
this_recurse.prev = recurses;
665
this_recurse.group = cs;
666
prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
667
countptr, backref_cache);
668
if (prev_recurse_d < 0) return prev_recurse_d;
669
prev_recurse_recno = recno;
670
branchlength += prev_recurse_d;
671
}
672
}
673
}
674
cc += 1 + LINK_SIZE + once_fudge;
675
once_fudge = 0;
676
break;
677
678
/* Anything else does not or need not match a character. We can get the
679
item's length from the table, but for those that can match zero occurrences
680
of a character, we must take special action for UTF-8 characters. As it
681
happens, the "NOT" versions of these opcodes are used at present only for
682
ASCII characters, so they could be omitted from this list. However, in
683
future that may change, so we include them here so as not to leave a
684
gotcha for a future maintainer. */
685
686
case OP_UPTO:
687
case OP_UPTOI:
688
case OP_NOTUPTO:
689
case OP_NOTUPTOI:
690
case OP_MINUPTO:
691
case OP_MINUPTOI:
692
case OP_NOTMINUPTO:
693
case OP_NOTMINUPTOI:
694
case OP_POSUPTO:
695
case OP_POSUPTOI:
696
case OP_NOTPOSUPTO:
697
case OP_NOTPOSUPTOI:
698
699
case OP_STAR:
700
case OP_STARI:
701
case OP_NOTSTAR:
702
case OP_NOTSTARI:
703
case OP_MINSTAR:
704
case OP_MINSTARI:
705
case OP_NOTMINSTAR:
706
case OP_NOTMINSTARI:
707
case OP_POSSTAR:
708
case OP_POSSTARI:
709
case OP_NOTPOSSTAR:
710
case OP_NOTPOSSTARI:
711
712
case OP_QUERY:
713
case OP_QUERYI:
714
case OP_NOTQUERY:
715
case OP_NOTQUERYI:
716
case OP_MINQUERY:
717
case OP_MINQUERYI:
718
case OP_NOTMINQUERY:
719
case OP_NOTMINQUERYI:
720
case OP_POSQUERY:
721
case OP_POSQUERYI:
722
case OP_NOTPOSQUERY:
723
case OP_NOTPOSQUERYI:
724
725
cc += PRIV(OP_lengths)[op];
726
#ifdef SUPPORT_UNICODE
727
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
728
#endif
729
break;
730
731
/* Skip these, but we need to add in the name length. */
732
733
case OP_MARK:
734
case OP_COMMIT_ARG:
735
case OP_PRUNE_ARG:
736
case OP_SKIP_ARG:
737
case OP_THEN_ARG:
738
cc += PRIV(OP_lengths)[op] + cc[1];
739
break;
740
741
/* The remaining opcodes are just skipped over. */
742
743
case OP_CLOSE:
744
case OP_COMMIT:
745
case OP_FAIL:
746
case OP_PRUNE:
747
case OP_SET_SOM:
748
case OP_SKIP:
749
case OP_THEN:
750
cc += PRIV(OP_lengths)[op];
751
break;
752
753
/* This should not occur: we list all opcodes explicitly so that when
754
new ones get added they are properly considered. */
755
756
default:
757
PCRE2_DEBUG_UNREACHABLE();
758
return -3;
759
}
760
}
761
762
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
763
return -3; /* Avoid compiler warnings */
764
}
765
766
767
768
/*************************************************
769
* Set a bit and maybe its alternate case *
770
*************************************************/
771
772
/* Given a character, set its first code unit's bit in the table, and also the
773
corresponding bit for the other version of a letter if we are caseless.
774
775
Arguments:
776
re points to the regex block
777
p points to the first code unit of the character
778
caseless TRUE if caseless
779
utf TRUE for UTF mode
780
ucp TRUE for UCP mode
781
782
Returns: pointer after the character
783
*/
784
785
static PCRE2_SPTR
786
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
787
BOOL ucp)
788
{
789
uint32_t c = *p++; /* First code unit */
790
791
(void)utf; /* Stop compiler warnings when UTF not supported */
792
(void)ucp;
793
794
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
795
0xff. */
796
797
#if PCRE2_CODE_UNIT_WIDTH != 8
798
if (c > 0xff) SET_BIT(0xff); else
799
#endif
800
801
SET_BIT(c);
802
803
/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
804
the end of the character, even when caseless. */
805
806
#ifdef SUPPORT_UNICODE
807
if (utf)
808
{
809
#if PCRE2_CODE_UNIT_WIDTH == 8
810
if (c >= 0xc0) GETUTF8INC(c, p);
811
#elif PCRE2_CODE_UNIT_WIDTH == 16
812
if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
813
#endif
814
}
815
#endif /* SUPPORT_UNICODE */
816
817
/* If caseless, handle the other case of the character. */
818
819
if (caseless)
820
{
821
#ifdef SUPPORT_UNICODE
822
if (utf || ucp)
823
{
824
c = UCD_OTHERCASE(c);
825
#if PCRE2_CODE_UNIT_WIDTH == 8
826
if (utf)
827
{
828
PCRE2_UCHAR buff[6];
829
(void)PRIV(ord2utf)(c, buff);
830
SET_BIT(buff[0]);
831
}
832
else if (c < 256) SET_BIT(c);
833
#else /* 16-bit or 32-bit mode */
834
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
835
#endif
836
}
837
838
else
839
#endif /* SUPPORT_UNICODE */
840
841
/* Not UTF or UCP */
842
843
if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
844
}
845
846
return p;
847
}
848
849
850
851
/*************************************************
852
* Set bits for a positive character type *
853
*************************************************/
854
855
/* This function sets starting bits for a character type. In UTF-8 mode, we can
856
only do a direct setting for bytes less than 128, as otherwise there can be
857
confusion with bytes in the middle of UTF-8 characters. In a "traditional"
858
environment, the tables will only recognize ASCII characters anyway, but in at
859
least one Windows environment, some higher bytes bits were set in the tables.
860
So we deal with that case by considering the UTF-8 encoding.
861
862
Arguments:
863
re the regex block
864
cbit type the type of character wanted
865
table_limit 32 for non-UTF-8; 16 for UTF-8
866
867
Returns: nothing
868
*/
869
870
static void
871
set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
872
{
873
uint32_t c;
874
for (c = 0; c < table_limit; c++)
875
re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
876
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
877
if (table_limit == 32) return;
878
for (c = 128; c < 256; c++)
879
{
880
if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
881
{
882
PCRE2_UCHAR buff[6];
883
(void)PRIV(ord2utf)(c, buff);
884
SET_BIT(buff[0]);
885
}
886
}
887
#endif /* UTF-8 */
888
}
889
890
891
/*************************************************
892
* Set bits for a negative character type *
893
*************************************************/
894
895
/* This function sets starting bits for a negative character type such as \D.
896
In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
897
otherwise there can be confusion with bytes in the middle of UTF-8 characters.
898
Unlike in the positive case, where we can set appropriate starting bits for
899
specific high-valued UTF-8 characters, in this case we have to set the bits for
900
all high-valued characters. The lowest is 0xc2, but we overkill by starting at
901
0xc0 (192) for simplicity.
902
903
Arguments:
904
re the regex block
905
cbit type the type of character wanted
906
table_limit 32 for non-UTF-8; 16 for UTF-8
907
908
Returns: nothing
909
*/
910
911
static void
912
set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
913
{
914
uint32_t c;
915
for (c = 0; c < table_limit; c++)
916
re->start_bitmap[c] |= (uint8_t)(~(re->tables[c+cbits_offset+cbit_type]));
917
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
918
if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
919
#endif
920
}
921
922
923
924
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
925
/*************************************************
926
* Set starting bits for a character list. *
927
*************************************************/
928
929
/* This function sets starting bits for a character list. It enumerates
930
all characters and character ranges in the character list, and sets
931
the starting bits accordingly.
932
933
Arguments:
934
code pointer to the code
935
start_bitmap pointer to the starting bitmap
936
937
Returns: nothing
938
*/
939
static void
940
study_char_list(PCRE2_SPTR code, uint8_t *start_bitmap,
941
const uint8_t *char_lists_end)
942
{
943
uint32_t type, list_ind;
944
uint32_t char_list_add = XCL_CHAR_LIST_LOW_16_ADD;
945
uint32_t range_start = ~(uint32_t)0, range_end = 0;
946
const uint8_t *next_char;
947
PCRE2_UCHAR start_buffer[6], end_buffer[6];
948
PCRE2_UCHAR start, end;
949
950
/* Only needed in 8-bit mode at the moment. */
951
type = (uint32_t)(code[0] << 8) | code[1];
952
code += 2;
953
954
/* Align characters. */
955
next_char = char_lists_end - (GET(code, 0) << 1);
956
type &= XCL_TYPE_MASK;
957
list_ind = 0;
958
959
if ((type & XCL_BEGIN_WITH_RANGE) != 0)
960
range_start = XCL_CHAR_LIST_LOW_16_START;
961
962
while (type > 0)
963
{
964
uint32_t item_count = type & XCL_ITEM_COUNT_MASK;
965
966
if (item_count == XCL_ITEM_COUNT_MASK)
967
{
968
if (list_ind <= 1)
969
{
970
item_count = *(const uint16_t*)next_char;
971
next_char += 2;
972
}
973
else
974
{
975
item_count = *(const uint32_t*)next_char;
976
next_char += 4;
977
}
978
}
979
980
while (item_count > 0)
981
{
982
if (list_ind <= 1)
983
{
984
range_end = *(const uint16_t*)next_char;
985
next_char += 2;
986
}
987
else
988
{
989
range_end = *(const uint32_t*)next_char;
990
next_char += 4;
991
}
992
993
if ((range_end & XCL_CHAR_END) != 0)
994
{
995
range_end = char_list_add + (range_end >> XCL_CHAR_SHIFT);
996
997
PRIV(ord2utf)(range_end, end_buffer);
998
end = end_buffer[0];
999
1000
if (range_start < range_end)
1001
{
1002
PRIV(ord2utf)(range_start, start_buffer);
1003
for (start = start_buffer[0]; start <= end; start++)
1004
start_bitmap[start / 8] |= (1u << (start & 7));
1005
}
1006
else
1007
start_bitmap[end / 8] |= (1u << (end & 7));
1008
1009
range_start = ~(uint32_t)0;
1010
}
1011
else
1012
range_start = char_list_add + (range_end >> XCL_CHAR_SHIFT);
1013
1014
item_count--;
1015
}
1016
1017
list_ind++;
1018
type >>= XCL_TYPE_BIT_LEN;
1019
1020
if (range_start == ~(uint32_t)0)
1021
{
1022
if ((type & XCL_BEGIN_WITH_RANGE) != 0)
1023
{
1024
/* In 8 bit mode XCL_CHAR_LIST_HIGH_32_START is not possible. */
1025
if (list_ind == 1) range_start = XCL_CHAR_LIST_HIGH_16_START;
1026
else range_start = XCL_CHAR_LIST_LOW_32_START;
1027
}
1028
}
1029
else if ((type & XCL_BEGIN_WITH_RANGE) == 0)
1030
{
1031
PRIV(ord2utf)(range_start, start_buffer);
1032
1033
/* In 8 bit mode XCL_CHAR_LIST_LOW_32_END and
1034
XCL_CHAR_LIST_HIGH_32_END are not possible. */
1035
if (list_ind == 1) range_end = XCL_CHAR_LIST_LOW_16_END;
1036
else range_end = XCL_CHAR_LIST_HIGH_16_END;
1037
1038
PRIV(ord2utf)(range_end, end_buffer);
1039
end = end_buffer[0];
1040
1041
for (start = start_buffer[0]; start <= end; start++)
1042
start_bitmap[start / 8] |= (1u << (start & 7));
1043
1044
range_start = ~(uint32_t)0;
1045
}
1046
1047
/* In 8 bit mode XCL_CHAR_LIST_HIGH_32_ADD is not possible. */
1048
if (list_ind == 1) char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;
1049
else char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
1050
}
1051
}
1052
#endif
1053
1054
1055
1056
/*************************************************
1057
* Create bitmap of starting code units *
1058
*************************************************/
1059
1060
/* This function scans a compiled unanchored expression recursively and
1061
attempts to build a bitmap of the set of possible starting code units whose
1062
values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
1063
the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
1064
we pass a value of 16 rather than 32 as the final argument. (See comments in
1065
those functions for the reason.)
1066
1067
The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
1068
(a*)b where the group provides some optional starting code units but scanning
1069
must continue at the outer level to find at least one mandatory code unit. At
1070
the outermost level, this function fails unless the result is SSB_DONE.
1071
1072
We restrict recursion (for nested groups) to 1000 to avoid stack overflow
1073
issues.
1074
1075
Arguments:
1076
re points to the compiled regex block
1077
code points to an expression
1078
utf TRUE if in UTF mode
1079
ucp TRUE if in UCP mode
1080
depthptr pointer to recurse depth
1081
1082
Returns: SSB_FAIL => Failed to find any starting code units
1083
SSB_DONE => Found mandatory starting code units
1084
SSB_CONTINUE => Found optional starting code units
1085
SSB_UNKNOWN => Hit an unrecognized opcode
1086
SSB_TOODEEP => Recursion is too deep
1087
*/
1088
1089
static int
1090
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
1091
int *depthptr)
1092
{
1093
uint32_t c;
1094
int yield = SSB_DONE;
1095
1096
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1097
int table_limit = utf? 16:32;
1098
#else
1099
int table_limit = 32;
1100
#endif
1101
1102
*depthptr += 1;
1103
if (*depthptr > 1000) return SSB_TOODEEP;
1104
1105
do
1106
{
1107
BOOL try_next = TRUE;
1108
PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
1109
1110
if (*code == OP_CBRA || *code == OP_SCBRA ||
1111
*code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
1112
1113
while (try_next) /* Loop for items in this branch */
1114
{
1115
int rc;
1116
PCRE2_SPTR ncode;
1117
const uint8_t *classmap = NULL;
1118
#ifdef SUPPORT_WIDE_CHARS
1119
PCRE2_UCHAR xclassflags;
1120
#endif
1121
1122
switch(*tcode)
1123
{
1124
/* If we reach something we don't understand, it means a new opcode has
1125
been created that hasn't been added to this function. Hopefully this
1126
problem will be discovered during testing. */
1127
1128
default:
1129
return SSB_UNKNOWN;
1130
1131
/* Fail for a valid opcode that implies no starting bits. */
1132
1133
case OP_ACCEPT:
1134
case OP_ASSERT_ACCEPT:
1135
case OP_ALLANY:
1136
case OP_ANY:
1137
case OP_ANYBYTE:
1138
case OP_CIRCM:
1139
case OP_CLOSE:
1140
case OP_COMMIT:
1141
case OP_COMMIT_ARG:
1142
case OP_COND:
1143
case OP_CREF:
1144
case OP_FALSE:
1145
case OP_TRUE:
1146
case OP_DNCREF:
1147
case OP_DNREF:
1148
case OP_DNREFI:
1149
case OP_DNRREF:
1150
case OP_DOLL:
1151
case OP_DOLLM:
1152
case OP_END:
1153
case OP_EOD:
1154
case OP_EODN:
1155
case OP_EXTUNI:
1156
case OP_FAIL:
1157
case OP_MARK:
1158
case OP_NOT:
1159
case OP_NOTEXACT:
1160
case OP_NOTEXACTI:
1161
case OP_NOTI:
1162
case OP_NOTMINPLUS:
1163
case OP_NOTMINPLUSI:
1164
case OP_NOTMINQUERY:
1165
case OP_NOTMINQUERYI:
1166
case OP_NOTMINSTAR:
1167
case OP_NOTMINSTARI:
1168
case OP_NOTMINUPTO:
1169
case OP_NOTMINUPTOI:
1170
case OP_NOTPLUS:
1171
case OP_NOTPLUSI:
1172
case OP_NOTPOSPLUS:
1173
case OP_NOTPOSPLUSI:
1174
case OP_NOTPOSQUERY:
1175
case OP_NOTPOSQUERYI:
1176
case OP_NOTPOSSTAR:
1177
case OP_NOTPOSSTARI:
1178
case OP_NOTPOSUPTO:
1179
case OP_NOTPOSUPTOI:
1180
case OP_NOTPROP:
1181
case OP_NOTQUERY:
1182
case OP_NOTQUERYI:
1183
case OP_NOTSTAR:
1184
case OP_NOTSTARI:
1185
case OP_NOTUPTO:
1186
case OP_NOTUPTOI:
1187
case OP_NOT_HSPACE:
1188
case OP_NOT_VSPACE:
1189
case OP_PRUNE:
1190
case OP_PRUNE_ARG:
1191
case OP_RECURSE:
1192
case OP_REF:
1193
case OP_REFI:
1194
case OP_REVERSE:
1195
case OP_VREVERSE:
1196
case OP_RREF:
1197
case OP_SCOND:
1198
case OP_SET_SOM:
1199
case OP_SKIP:
1200
case OP_SKIP_ARG:
1201
case OP_SOD:
1202
case OP_SOM:
1203
case OP_THEN:
1204
case OP_THEN_ARG:
1205
return SSB_FAIL;
1206
1207
/* OP_CIRC happens only at the start of an anchored branch (multiline ^
1208
uses OP_CIRCM). Skip over it. */
1209
1210
case OP_CIRC:
1211
tcode += PRIV(OP_lengths)[OP_CIRC];
1212
break;
1213
1214
/* A "real" property test implies no starting bits, but the fake property
1215
PT_CLIST identifies a list of characters. These lists are short, as they
1216
are used for characters with more than one "other case", so there is no
1217
point in recognizing them for OP_NOTPROP. */
1218
1219
case OP_PROP:
1220
if (tcode[1] != PT_CLIST) return SSB_FAIL;
1221
{
1222
const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1223
while ((c = *p++) < NOTACHAR)
1224
{
1225
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1226
if (utf)
1227
{
1228
PCRE2_UCHAR buff[6];
1229
(void)PRIV(ord2utf)(c, buff);
1230
c = buff[0];
1231
}
1232
#endif
1233
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1234
}
1235
}
1236
try_next = FALSE;
1237
break;
1238
1239
/* We can ignore word boundary tests. */
1240
1241
case OP_WORD_BOUNDARY:
1242
case OP_NOT_WORD_BOUNDARY:
1243
case OP_UCP_WORD_BOUNDARY:
1244
case OP_NOT_UCP_WORD_BOUNDARY:
1245
tcode++;
1246
break;
1247
1248
/* For a positive lookahead assertion, inspect what immediately follows,
1249
ignoring intermediate assertions and callouts. If the next item is one
1250
that sets a mandatory character, skip this assertion. Otherwise, treat it
1251
the same as other bracket groups. */
1252
1253
case OP_ASSERT:
1254
case OP_ASSERT_NA:
1255
ncode = tcode + GET(tcode, 1);
1256
while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1257
ncode += 1 + LINK_SIZE;
1258
1259
/* Skip irrelevant items */
1260
1261
for (BOOL done = FALSE; !done;)
1262
{
1263
switch (*ncode)
1264
{
1265
case OP_ASSERT:
1266
case OP_ASSERT_NOT:
1267
case OP_ASSERTBACK:
1268
case OP_ASSERTBACK_NOT:
1269
case OP_ASSERT_NA:
1270
case OP_ASSERTBACK_NA:
1271
case OP_ASSERT_SCS:
1272
ncode += GET(ncode, 1);
1273
while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1274
ncode += 1 + LINK_SIZE;
1275
break;
1276
1277
case OP_WORD_BOUNDARY:
1278
case OP_NOT_WORD_BOUNDARY:
1279
case OP_UCP_WORD_BOUNDARY:
1280
case OP_NOT_UCP_WORD_BOUNDARY:
1281
ncode++;
1282
break;
1283
1284
case OP_CALLOUT:
1285
ncode += PRIV(OP_lengths)[OP_CALLOUT];
1286
break;
1287
1288
case OP_CALLOUT_STR:
1289
ncode += GET(ncode, 1 + 2*LINK_SIZE);
1290
break;
1291
1292
default:
1293
done = TRUE;
1294
break;
1295
}
1296
}
1297
1298
/* Now check the next significant item. */
1299
1300
switch(*ncode)
1301
{
1302
default:
1303
break;
1304
1305
case OP_PROP:
1306
if (ncode[1] != PT_CLIST) break;
1307
/* Fall through */
1308
case OP_ANYNL:
1309
case OP_CHAR:
1310
case OP_CHARI:
1311
case OP_EXACT:
1312
case OP_EXACTI:
1313
case OP_HSPACE:
1314
case OP_MINPLUS:
1315
case OP_MINPLUSI:
1316
case OP_PLUS:
1317
case OP_PLUSI:
1318
case OP_POSPLUS:
1319
case OP_POSPLUSI:
1320
case OP_VSPACE:
1321
/* Note that these types will only be present in non-UCP mode. */
1322
case OP_DIGIT:
1323
case OP_NOT_DIGIT:
1324
case OP_WORDCHAR:
1325
case OP_NOT_WORDCHAR:
1326
case OP_WHITESPACE:
1327
case OP_NOT_WHITESPACE:
1328
tcode = ncode;
1329
continue; /* With the following significant opcode */
1330
}
1331
/* Fall through */
1332
1333
/* For a group bracket or a positive assertion without an immediately
1334
following mandatory setting, recurse to set bits from within the
1335
subpattern. If it can't find anything, we have to give up. If it finds
1336
some mandatory character(s), we are done for this branch. Otherwise,
1337
carry on scanning after the subpattern. */
1338
1339
case OP_BRA:
1340
case OP_SBRA:
1341
case OP_CBRA:
1342
case OP_SCBRA:
1343
case OP_BRAPOS:
1344
case OP_SBRAPOS:
1345
case OP_CBRAPOS:
1346
case OP_SCBRAPOS:
1347
case OP_ONCE:
1348
case OP_SCRIPT_RUN:
1349
rc = set_start_bits(re, tcode, utf, ucp, depthptr);
1350
if (rc == SSB_DONE)
1351
{
1352
try_next = FALSE;
1353
}
1354
else if (rc == SSB_CONTINUE)
1355
{
1356
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1357
tcode += 1 + LINK_SIZE;
1358
}
1359
else return rc; /* FAIL, UNKNOWN, or TOODEEP */
1360
break;
1361
1362
/* If we hit ALT or KET, it means we haven't found anything mandatory in
1363
this branch, though we might have found something optional. For ALT, we
1364
continue with the next alternative, but we have to arrange that the final
1365
result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1366
return SSB_CONTINUE: if this is the top level, that indicates failure,
1367
but after a nested subpattern, it causes scanning to continue. */
1368
1369
case OP_ALT:
1370
yield = SSB_CONTINUE;
1371
try_next = FALSE;
1372
break;
1373
1374
case OP_KET:
1375
case OP_KETRMAX:
1376
case OP_KETRMIN:
1377
case OP_KETRPOS:
1378
return SSB_CONTINUE;
1379
1380
/* Skip over callout */
1381
1382
case OP_CALLOUT:
1383
tcode += PRIV(OP_lengths)[OP_CALLOUT];
1384
break;
1385
1386
case OP_CALLOUT_STR:
1387
tcode += GET(tcode, 1 + 2*LINK_SIZE);
1388
break;
1389
1390
/* Skip over lookbehind, negative lookahead, and scan substring
1391
assertions */
1392
1393
case OP_ASSERT_NOT:
1394
case OP_ASSERTBACK:
1395
case OP_ASSERTBACK_NOT:
1396
case OP_ASSERTBACK_NA:
1397
case OP_ASSERT_SCS:
1398
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1399
tcode += 1 + LINK_SIZE;
1400
break;
1401
1402
/* BRAZERO does the bracket, but carries on. */
1403
1404
case OP_BRAZERO:
1405
case OP_BRAMINZERO:
1406
case OP_BRAPOSZERO:
1407
rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
1408
if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
1409
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1410
tcode += 1 + LINK_SIZE;
1411
break;
1412
1413
/* SKIPZERO skips the bracket. */
1414
1415
case OP_SKIPZERO:
1416
tcode++;
1417
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1418
tcode += 1 + LINK_SIZE;
1419
break;
1420
1421
/* Single-char * or ? sets the bit and tries the next item */
1422
1423
case OP_STAR:
1424
case OP_MINSTAR:
1425
case OP_POSSTAR:
1426
case OP_QUERY:
1427
case OP_MINQUERY:
1428
case OP_POSQUERY:
1429
tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1430
break;
1431
1432
case OP_STARI:
1433
case OP_MINSTARI:
1434
case OP_POSSTARI:
1435
case OP_QUERYI:
1436
case OP_MINQUERYI:
1437
case OP_POSQUERYI:
1438
tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1439
break;
1440
1441
/* Single-char upto sets the bit and tries the next */
1442
1443
case OP_UPTO:
1444
case OP_MINUPTO:
1445
case OP_POSUPTO:
1446
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
1447
break;
1448
1449
case OP_UPTOI:
1450
case OP_MINUPTOI:
1451
case OP_POSUPTOI:
1452
tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
1453
break;
1454
1455
/* At least one single char sets the bit and stops */
1456
1457
case OP_EXACT:
1458
tcode += IMM2_SIZE;
1459
/* Fall through */
1460
case OP_CHAR:
1461
case OP_PLUS:
1462
case OP_MINPLUS:
1463
case OP_POSPLUS:
1464
(void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1465
try_next = FALSE;
1466
break;
1467
1468
case OP_EXACTI:
1469
tcode += IMM2_SIZE;
1470
/* Fall through */
1471
case OP_CHARI:
1472
case OP_PLUSI:
1473
case OP_MINPLUSI:
1474
case OP_POSPLUSI:
1475
(void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1476
try_next = FALSE;
1477
break;
1478
1479
/* Special spacing and line-terminating items. These recognize specific
1480
lists of characters. The difference between VSPACE and ANYNL is that the
1481
latter can match the two-character CRLF sequence, but that is not
1482
relevant for finding the first character, so their code here is
1483
identical. */
1484
1485
case OP_HSPACE:
1486
SET_BIT(CHAR_HT);
1487
SET_BIT(CHAR_SPACE);
1488
1489
/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1490
the bits for 0xA0 and for code units >= 255, independently of UTF. */
1491
1492
#if PCRE2_CODE_UNIT_WIDTH != 8
1493
SET_BIT(0xA0);
1494
SET_BIT(0xFF);
1495
#else
1496
/* For the 8-bit library in UTF-8 mode, set the bits for the first code
1497
units of horizontal space characters. */
1498
1499
#ifdef SUPPORT_UNICODE
1500
if (utf)
1501
{
1502
SET_BIT(0xC2); /* For U+00A0 */
1503
SET_BIT(0xE1); /* For U+1680, U+180E */
1504
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1505
SET_BIT(0xE3); /* For U+3000 */
1506
}
1507
else
1508
#endif
1509
/* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1510
the code is EBCDIC. */
1511
{
1512
#ifndef EBCDIC
1513
SET_BIT(0xA0);
1514
#endif /* Not EBCDIC */
1515
}
1516
#endif /* 8-bit support */
1517
1518
try_next = FALSE;
1519
break;
1520
1521
case OP_ANYNL:
1522
case OP_VSPACE:
1523
SET_BIT(CHAR_LF);
1524
SET_BIT(CHAR_VT);
1525
SET_BIT(CHAR_FF);
1526
SET_BIT(CHAR_CR);
1527
1528
/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1529
the bits for NEL and for code units >= 255, independently of UTF. */
1530
1531
#if PCRE2_CODE_UNIT_WIDTH != 8
1532
SET_BIT(CHAR_NEL);
1533
SET_BIT(0xFF);
1534
#else
1535
/* For the 8-bit library in UTF-8 mode, set the bits for the first code
1536
units of vertical space characters. */
1537
1538
#ifdef SUPPORT_UNICODE
1539
if (utf)
1540
{
1541
SET_BIT(0xC2); /* For U+0085 (NEL) */
1542
SET_BIT(0xE2); /* For U+2028, U+2029 */
1543
}
1544
else
1545
#endif
1546
/* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1547
{
1548
SET_BIT(CHAR_NEL);
1549
}
1550
#endif /* 8-bit support */
1551
1552
try_next = FALSE;
1553
break;
1554
1555
/* Single character types set the bits and stop. Note that if PCRE2_UCP
1556
is set, we do not see these opcodes because \d etc are converted to
1557
properties. Therefore, these apply in the case when only characters less
1558
than 256 are recognized to match the types. */
1559
1560
case OP_NOT_DIGIT:
1561
set_nottype_bits(re, cbit_digit, table_limit);
1562
try_next = FALSE;
1563
break;
1564
1565
case OP_DIGIT:
1566
set_type_bits(re, cbit_digit, table_limit);
1567
try_next = FALSE;
1568
break;
1569
1570
case OP_NOT_WHITESPACE:
1571
set_nottype_bits(re, cbit_space, table_limit);
1572
try_next = FALSE;
1573
break;
1574
1575
case OP_WHITESPACE:
1576
set_type_bits(re, cbit_space, table_limit);
1577
try_next = FALSE;
1578
break;
1579
1580
case OP_NOT_WORDCHAR:
1581
set_nottype_bits(re, cbit_word, table_limit);
1582
try_next = FALSE;
1583
break;
1584
1585
case OP_WORDCHAR:
1586
set_type_bits(re, cbit_word, table_limit);
1587
try_next = FALSE;
1588
break;
1589
1590
/* One or more character type fudges the pointer and restarts, knowing
1591
it will hit a single character type and stop there. */
1592
1593
case OP_TYPEPLUS:
1594
case OP_TYPEMINPLUS:
1595
case OP_TYPEPOSPLUS:
1596
tcode++;
1597
break;
1598
1599
case OP_TYPEEXACT:
1600
tcode += 1 + IMM2_SIZE;
1601
break;
1602
1603
/* Zero or more repeats of character types set the bits and then
1604
try again. */
1605
1606
case OP_TYPEUPTO:
1607
case OP_TYPEMINUPTO:
1608
case OP_TYPEPOSUPTO:
1609
tcode += IMM2_SIZE; /* Fall through */
1610
1611
case OP_TYPESTAR:
1612
case OP_TYPEMINSTAR:
1613
case OP_TYPEPOSSTAR:
1614
case OP_TYPEQUERY:
1615
case OP_TYPEMINQUERY:
1616
case OP_TYPEPOSQUERY:
1617
switch(tcode[1])
1618
{
1619
default:
1620
case OP_ANY:
1621
case OP_ALLANY:
1622
return SSB_FAIL;
1623
1624
case OP_HSPACE:
1625
SET_BIT(CHAR_HT);
1626
SET_BIT(CHAR_SPACE);
1627
1628
/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1629
the bits for 0xA0 and for code units >= 255, independently of UTF. */
1630
1631
#if PCRE2_CODE_UNIT_WIDTH != 8
1632
SET_BIT(0xA0);
1633
SET_BIT(0xFF);
1634
#else
1635
/* For the 8-bit library in UTF-8 mode, set the bits for the first code
1636
units of horizontal space characters. */
1637
1638
#ifdef SUPPORT_UNICODE
1639
if (utf)
1640
{
1641
SET_BIT(0xC2); /* For U+00A0 */
1642
SET_BIT(0xE1); /* For U+1680, U+180E */
1643
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1644
SET_BIT(0xE3); /* For U+3000 */
1645
}
1646
else
1647
#endif
1648
/* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1649
the code is EBCDIC. */
1650
{
1651
#ifndef EBCDIC
1652
SET_BIT(0xA0);
1653
#endif /* Not EBCDIC */
1654
}
1655
#endif /* 8-bit support */
1656
break;
1657
1658
case OP_ANYNL:
1659
case OP_VSPACE:
1660
SET_BIT(CHAR_LF);
1661
SET_BIT(CHAR_VT);
1662
SET_BIT(CHAR_FF);
1663
SET_BIT(CHAR_CR);
1664
1665
/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1666
the bits for NEL and for code units >= 255, independently of UTF. */
1667
1668
#if PCRE2_CODE_UNIT_WIDTH != 8
1669
SET_BIT(CHAR_NEL);
1670
SET_BIT(0xFF);
1671
#else
1672
/* For the 8-bit library in UTF-8 mode, set the bits for the first code
1673
units of vertical space characters. */
1674
1675
#ifdef SUPPORT_UNICODE
1676
if (utf)
1677
{
1678
SET_BIT(0xC2); /* For U+0085 (NEL) */
1679
SET_BIT(0xE2); /* For U+2028, U+2029 */
1680
}
1681
else
1682
#endif
1683
/* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1684
{
1685
SET_BIT(CHAR_NEL);
1686
}
1687
#endif /* 8-bit support */
1688
break;
1689
1690
case OP_NOT_DIGIT:
1691
set_nottype_bits(re, cbit_digit, table_limit);
1692
break;
1693
1694
case OP_DIGIT:
1695
set_type_bits(re, cbit_digit, table_limit);
1696
break;
1697
1698
case OP_NOT_WHITESPACE:
1699
set_nottype_bits(re, cbit_space, table_limit);
1700
break;
1701
1702
case OP_WHITESPACE:
1703
set_type_bits(re, cbit_space, table_limit);
1704
break;
1705
1706
case OP_NOT_WORDCHAR:
1707
set_nottype_bits(re, cbit_word, table_limit);
1708
break;
1709
1710
case OP_WORDCHAR:
1711
set_type_bits(re, cbit_word, table_limit);
1712
break;
1713
}
1714
1715
tcode += 2;
1716
break;
1717
1718
/* Set-based ECLASS: treat it the same as a "complex" XCLASS; give up. */
1719
1720
#ifdef SUPPORT_WIDE_CHARS
1721
case OP_ECLASS:
1722
return SSB_FAIL;
1723
#endif
1724
1725
/* Extended class: if there are any property checks, or if this is a
1726
negative XCLASS without a map, give up. If there are no property checks,
1727
there must be wide characters on the XCLASS list, because otherwise an
1728
XCLASS would not have been created. This means that code points >= 255
1729
are potential starters. In the UTF-8 case we can scan them and set bits
1730
for the relevant leading bytes. */
1731
1732
#ifdef SUPPORT_WIDE_CHARS
1733
case OP_XCLASS:
1734
xclassflags = tcode[1 + LINK_SIZE];
1735
if ((xclassflags & XCL_HASPROP) != 0 ||
1736
(xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1737
return SSB_FAIL;
1738
1739
/* We have a positive XCLASS or a negative one without a map. Set up the
1740
map pointer if there is one, and fall through. */
1741
1742
classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
1743
(const uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1744
1745
/* In UTF-8 mode, scan the character list and set bits for leading bytes,
1746
then jump to handle the map. */
1747
1748
#if PCRE2_CODE_UNIT_WIDTH == 8
1749
if (utf && (xclassflags & XCL_NOT) == 0)
1750
{
1751
PCRE2_UCHAR b, e;
1752
PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
1753
tcode += GET(tcode, 1);
1754
1755
if (*p >= XCL_LIST)
1756
{
1757
study_char_list(p, re->start_bitmap,
1758
((const uint8_t *)re + re->code_start));
1759
goto HANDLE_CLASSMAP;
1760
}
1761
1762
for (;;) switch (*p++)
1763
{
1764
case XCL_SINGLE:
1765
b = *p++;
1766
while ((*p & 0xc0) == 0x80) p++;
1767
re->start_bitmap[b/8] |= (1u << (b&7));
1768
break;
1769
1770
case XCL_RANGE:
1771
b = *p++;
1772
while ((*p & 0xc0) == 0x80) p++;
1773
e = *p++;
1774
while ((*p & 0xc0) == 0x80) p++;
1775
for (; b <= e; b++)
1776
re->start_bitmap[b/8] |= (1u << (b&7));
1777
break;
1778
1779
case XCL_END:
1780
goto HANDLE_CLASSMAP;
1781
1782
default:
1783
PCRE2_DEBUG_UNREACHABLE();
1784
return SSB_UNKNOWN; /* Internal error, should not occur */
1785
}
1786
}
1787
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
1788
#endif /* SUPPORT_WIDE_CHARS */
1789
1790
/* It seems that the fall through comment must be outside the #ifdef if
1791
it is to avoid the gcc compiler warning. */
1792
1793
/* Fall through */
1794
1795
/* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1796
in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1797
because it starts a character with a value > 255. In 8-bit non-UTF mode,
1798
there is no difference between CLASS and NCLASS. In all other wide
1799
character modes, set the 0xFF bit to indicate code units >= 255. */
1800
1801
case OP_NCLASS:
1802
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1803
if (utf)
1804
{
1805
re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1806
memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1807
}
1808
#elif PCRE2_CODE_UNIT_WIDTH != 8
1809
SET_BIT(0xFF); /* For characters >= 255 */
1810
#endif
1811
/* Fall through */
1812
1813
/* Enter here for a positive non-XCLASS. If we have fallen through from
1814
an XCLASS, classmap will already be set; just advance the code pointer.
1815
Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1816
1817
case OP_CLASS:
1818
if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1819
{
1820
classmap = (const uint8_t *)(++tcode);
1821
tcode += 32 / sizeof(PCRE2_UCHAR);
1822
}
1823
1824
/* When wide characters are supported, classmap may be NULL. In UTF-8
1825
(sic) mode, the bits in a class bit map correspond to character values,
1826
not to byte values. However, the bit map we are constructing is for byte
1827
values. So we have to do a conversion for characters whose code point is
1828
greater than 127. In fact, there are only two possible starting bytes for
1829
characters in the range 128 - 255. */
1830
1831
#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
1832
HANDLE_CLASSMAP:
1833
#endif
1834
if (classmap != NULL)
1835
{
1836
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1837
if (utf)
1838
{
1839
for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1840
for (c = 128; c < 256; c++)
1841
{
1842
if ((classmap[c/8] & (1u << (c&7))) != 0)
1843
{
1844
int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1845
re->start_bitmap[d/8] |= (1u << (d&7)); /* and then skip on to the */
1846
c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1847
}
1848
}
1849
}
1850
else
1851
#endif
1852
/* In all modes except UTF-8, the two bit maps are compatible. */
1853
1854
{
1855
for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1856
}
1857
}
1858
1859
/* Act on what follows the class. For a zero minimum repeat, continue;
1860
otherwise stop processing. */
1861
1862
switch (*tcode)
1863
{
1864
case OP_CRSTAR:
1865
case OP_CRMINSTAR:
1866
case OP_CRQUERY:
1867
case OP_CRMINQUERY:
1868
case OP_CRPOSSTAR:
1869
case OP_CRPOSQUERY:
1870
tcode++;
1871
break;
1872
1873
case OP_CRRANGE:
1874
case OP_CRMINRANGE:
1875
case OP_CRPOSRANGE:
1876
if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1877
else try_next = FALSE;
1878
break;
1879
1880
default:
1881
try_next = FALSE;
1882
break;
1883
}
1884
break; /* End of class handling case */
1885
} /* End of switch for opcodes */
1886
} /* End of try_next loop */
1887
1888
code += GET(code, 1); /* Advance to next branch */
1889
}
1890
while (*code == OP_ALT);
1891
1892
return yield;
1893
}
1894
1895
1896
1897
/*************************************************
1898
* Study a compiled expression *
1899
*************************************************/
1900
1901
/* This function is handed a compiled expression that it must study to produce
1902
information that will speed up the matching.
1903
1904
Argument:
1905
re points to the compiled expression
1906
1907
Returns: 0 normally; non-zero should never normally occur
1908
1 unknown opcode in set_start_bits
1909
2 missing capturing bracket
1910
3 unknown opcode in find_minlength
1911
*/
1912
1913
int
1914
PRIV(study)(pcre2_real_code *re)
1915
{
1916
int count = 0;
1917
PCRE2_UCHAR *code;
1918
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1919
BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
1920
1921
/* Find start of compiled code */
1922
1923
code = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
1924
1925
/* For a pattern that has a first code unit, or a multiline pattern that
1926
matches only at "line start", there is no point in seeking a list of starting
1927
code units. */
1928
1929
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1930
{
1931
int depth = 0;
1932
int rc = set_start_bits(re, code, utf, ucp, &depth);
1933
if (rc == SSB_UNKNOWN)
1934
{
1935
PCRE2_DEBUG_UNREACHABLE();
1936
return 1;
1937
}
1938
1939
/* If a list of starting code units was set up, scan the list to see if only
1940
one or two were listed. Having only one listed is rare because usually a
1941
single starting code unit will have been recognized and PCRE2_FIRSTSET set.
1942
If two are listed, see if they are caseless versions of the same character;
1943
if so we can replace the list with a caseless first code unit. This gives
1944
better performance and is plausibly worth doing for patterns such as [Ww]ord
1945
or (word|WORD). */
1946
1947
if (rc == SSB_DONE)
1948
{
1949
int i;
1950
int a = -1;
1951
int b = -1;
1952
uint8_t *p = re->start_bitmap;
1953
uint32_t flags = PCRE2_FIRSTMAPSET;
1954
1955
for (i = 0; i < 256; p++, i += 8)
1956
{
1957
uint8_t x = *p;
1958
if (x != 0)
1959
{
1960
int c;
1961
uint8_t y = x & (~x + 1); /* Least significant bit */
1962
if (y != x) goto DONE; /* More than one bit set */
1963
1964
/* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
1965
all wide characters", so we cannot use it here. */
1966
1967
#if PCRE2_CODE_UNIT_WIDTH != 8
1968
if (i == 248 && x == 0x80) goto DONE;
1969
#endif
1970
1971
/* Compute the character value */
1972
1973
c = i;
1974
switch (x)
1975
{
1976
case 1: break;
1977
case 2: c += 1; break; case 4: c += 2; break;
1978
case 8: c += 3; break; case 16: c += 4; break;
1979
case 32: c += 5; break; case 64: c += 6; break;
1980
case 128: c += 7; break;
1981
}
1982
1983
/* c contains the code unit value, in the range 0-255. In 8-bit UTF
1984
mode, only values < 128 can be used. In all the other cases, c is a
1985
character value. */
1986
1987
#if PCRE2_CODE_UNIT_WIDTH == 8
1988
if (utf && c > 127) goto DONE;
1989
#endif
1990
if (a < 0) a = c; /* First one found, save in a */
1991
else if (b < 0) /* Second one found */
1992
{
1993
int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
1994
1995
#ifdef SUPPORT_UNICODE
1996
if (utf || ucp)
1997
{
1998
if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
1999
if (c > 127) d = UCD_OTHERCASE(c);
2000
}
2001
#endif /* SUPPORT_UNICODE */
2002
2003
if (d != a) goto DONE; /* Not the other case of a */
2004
b = c; /* Save second in b */
2005
}
2006
else goto DONE; /* More than two characters found */
2007
}
2008
}
2009
2010
/* Replace the start code unit bits with a first code unit. If it is the
2011
same as a required later code unit, then clear the required later code
2012
unit. This is because a search for a required code unit starts after an
2013
explicit first code unit, but at a code unit found from the bitmap.
2014
Patterns such as /a*a/ don't work if both the start unit and required
2015
unit are the same. */
2016
2017
if (a >= 0) {
2018
if ((re->flags & PCRE2_LASTSET) && (re->last_codeunit == (uint32_t)a || (b >= 0 && re->last_codeunit == (uint32_t)b))) {
2019
re->flags &= ~(PCRE2_LASTSET | PCRE2_LASTCASELESS);
2020
re->last_codeunit = 0;
2021
}
2022
re->first_codeunit = a;
2023
flags = PCRE2_FIRSTSET;
2024
if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
2025
}
2026
2027
DONE:
2028
re->flags |= flags;
2029
}
2030
}
2031
2032
/* Find the minimum length of subject string. If the pattern can match an empty
2033
string, the minimum length is already known. If the pattern contains (*ACCEPT)
2034
all bets are off, and we don't even try to find a minimum length. If there are
2035
more back references than the size of the vector we are going to cache them in,
2036
do nothing. A pattern that complicated will probably take a long time to
2037
analyze and may in any case turn out to be too complicated. Note that back
2038
reference minima are held as 16-bit numbers. */
2039
2040
if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&
2041
re->top_backref <= MAX_CACHE_BACKREF)
2042
{
2043
int min;
2044
int backref_cache[MAX_CACHE_BACKREF+1];
2045
backref_cache[0] = 0; /* Highest one that is set */
2046
min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
2047
switch(min)
2048
{
2049
case -1: /* \C in UTF mode or over-complex regex */
2050
break; /* Leave minlength unchanged (will be zero) */
2051
2052
case -2:
2053
PCRE2_DEBUG_UNREACHABLE();
2054
return 2; /* missing capturing bracket */
2055
2056
case -3:
2057
PCRE2_DEBUG_UNREACHABLE();
2058
return 3; /* unrecognized opcode */
2059
2060
default:
2061
re->minlength = (min > UINT16_MAX)? UINT16_MAX : min;
2062
break;
2063
}
2064
}
2065
2066
return 0;
2067
}
2068
2069
/* End of pcre2_study.c */
2070
2071