CoCalc -- pcre2

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_study.c
⁹⁹⁰³ views
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4

5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7

8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11

12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15

16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18

19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22

23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26

27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40

41
/* This module contains functions for scanning a compiled pattern and
42
collecting data (e.g. minimum matching length). */
43

44

45
#ifdef HAVE_CONFIG_H
46
#include "config.h"
47
#endif
48

49
#include "pcre2_internal.h"
50

51
/* The maximum remembered capturing brackets minimum. */
52

53
#define MAX_CACHE_BACKREF 128
54

55
/* Set a bit in the starting code unit bit map. */
56

57
#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
58

59
/* Returns from set_start_bits() */
60

61
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
62

63

64
/*************************************************
65
*   Find the minimum subject length for a group  *
66
*************************************************/
67

68
/* Scan a parenthesized group and compute the minimum length of subject that
69
is needed to match it. This is a lower bound; it does not mean there is a
70
string of that length that matches. In UTF mode, the result is in characters
71
rather than code units. The field in a compiled pattern for storing the minimum
72
length is 16-bits long (on the grounds that anything longer than that is
73
pathological), so we give up when we reach that amount. This also means that
74
integer overflow for really crazy patterns cannot happen.
75

76
Backreference minimum lengths are cached to speed up multiple references. This
77
function is called only when the highest back reference in the pattern is less
78
than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
79
caching vector. The zeroth element contains the number of the highest set
80
value.
81

82
Arguments:
83
  re              compiled pattern block
84
  code            pointer to start of group (the bracket)
85
  startcode       pointer to start of the whole pattern's code
86
  utf             UTF flag
87
  recurses        chain of recurse_check to catch mutual recursion
88
  countptr        pointer to call count (to catch over complexity)
89
  backref_cache   vector for caching back references.
90

91
This function is no longer called when the pattern contains (*ACCEPT); however,
92
the old code for returning -1 is retained, just in case.
93

94
Returns:   the minimum length
95
           -1 \C in UTF-8 mode
96
              or (*ACCEPT)
97
              or pattern too complicated
98
           -2 internal error (missing capturing bracket)
99
           -3 internal error (opcode not listed)
100
*/
101

102
static int
103
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
104
  PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
105
  int *backref_cache)
106
{
107
int length = -1;
108
int branchlength = 0;
109
int prev_cap_recno = -1;
110
int prev_cap_d = 0;
111
int prev_recurse_recno = -1;
112
int prev_recurse_d = 0;
113
uint32_t once_fudge = 0;
114
BOOL had_recurse = FALSE;
115
BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
116
PCRE2_SPTR nextbranch = code + GET(code, 1);
117
PCRE2_SPTR cc = code + 1 + LINK_SIZE;
118
recurse_check this_recurse;
119

120
/* If this is a "could be empty" group, its minimum length is 0. */
121

122
if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
123

124
/* Skip over capturing bracket number */
125

126
if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
127

128
/* A large and/or complex regex can take too long to process. */
129

130
if ((*countptr)++ > 1000) return -1;
131

132
/* Scan along the opcodes for this branch. If we get to the end of the branch,
133
check the length against that of the other branches. If the accumulated length
134
passes 16-bits, reset to that value and skip the rest of the branch. */
135

136
for (;;)
137
  {
138
  int d, min, recno;
139
  PCRE2_UCHAR op;
140
  PCRE2_SPTR cs, ce;
141

142
  if (branchlength >= UINT16_MAX)
143
    {
144
    branchlength = UINT16_MAX;
145
    cc = nextbranch;
146
    }
147

148
  op = *cc;
149
  switch (op)
150
    {
151
    case OP_COND:
152
    case OP_SCOND:
153

154
    /* If there is only one branch in a condition, the implied branch has zero
155
    length, so we don't add anything. This covers the DEFINE "condition"
156
    automatically. If there are two branches we can treat it the same as any
157
    other non-capturing subpattern. */
158

159
    cs = cc + GET(cc, 1);
160
    if (*cs != OP_ALT)
161
      {
162
      cc = cs + 1 + LINK_SIZE;
163
      break;
164
      }
165
    goto PROCESS_NON_CAPTURE;
166

167
    case OP_BRA:
168
    /* There's a special case of OP_BRA, when it is wrapped round a repeated
169
    OP_RECURSE. We'd like to process the latter at this level so that
170
    remembering the value works for repeated cases. So we do nothing, but
171
    set a fudge value to skip over the OP_KET after the recurse. */
172

173
    if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
174
      {
175
      once_fudge = 1 + LINK_SIZE;
176
      cc += 1 + LINK_SIZE;
177
      break;
178
      }
179
    /* Fall through */
180

181
    case OP_ONCE:
182
    case OP_SCRIPT_RUN:
183
    case OP_SBRA:
184
    case OP_BRAPOS:
185
    case OP_SBRAPOS:
186
    PROCESS_NON_CAPTURE:
187
    d = find_minlength(re, cc, startcode, utf, recurses, countptr,
188
      backref_cache);
189
    if (d < 0) return d;
190
    branchlength += d;
191
    do cc += GET(cc, 1); while (*cc == OP_ALT);
192
    cc += 1 + LINK_SIZE;
193
    break;
194

195
    /* To save time for repeated capturing subpatterns, we remember the
196
    length of the previous one. Unfortunately we can't do the same for
197
    the unnumbered ones above. Nor can we do this if (?| is present in the
198
    pattern because captures with the same number are not then identical. */
199

200
    case OP_CBRA:
201
    case OP_SCBRA:
202
    case OP_CBRAPOS:
203
    case OP_SCBRAPOS:
204
    recno = (int)GET2(cc, 1+LINK_SIZE);
205
    if (dupcapused || recno != prev_cap_recno)
206
      {
207
      prev_cap_recno = recno;
208
      prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
209
        backref_cache);
210
      if (prev_cap_d < 0) return prev_cap_d;
211
      }
212
    branchlength += prev_cap_d;
213
    do cc += GET(cc, 1); while (*cc == OP_ALT);
214
    cc += 1 + LINK_SIZE;
215
    break;
216

217
    /* ACCEPT makes things far too complicated; we have to give up. In fact,
218
    from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
219
    used. However, leave the code in place, just in case. */
220

221
    case OP_ACCEPT:
222
    case OP_ASSERT_ACCEPT:
223
    return -1;
224

225
    /* Reached end of a branch; if it's a ket it is the end of a nested
226
    call. If it's ALT it is an alternation in a nested call. If it is END it's
227
    the end of the outer call. All can be handled by the same code. If the
228
    length of any branch is zero, there is no need to scan any subsequent
229
    branches. */
230

231
    case OP_ALT:
232
    case OP_KET:
233
    case OP_KETRMAX:
234
    case OP_KETRMIN:
235
    case OP_KETRPOS:
236
    case OP_END:
237
    if (length < 0 || (!had_recurse && branchlength < length))
238
      length = branchlength;
239
    if (op != OP_ALT || length == 0) return length;
240
    nextbranch = cc + GET(cc, 1);
241
    cc += 1 + LINK_SIZE;
242
    branchlength = 0;
243
    had_recurse = FALSE;
244
    break;
245

246
    /* Skip over assertive subpatterns */
247

248
    case OP_ASSERT:
249
    case OP_ASSERT_NOT:
250
    case OP_ASSERTBACK:
251
    case OP_ASSERTBACK_NOT:
252
    case OP_ASSERT_NA:
253
    case OP_ASSERT_SCS:
254
    case OP_ASSERTBACK_NA:
255
    do cc += GET(cc, 1); while (*cc == OP_ALT);
256
    /* Fall through */
257

258
    /* Skip over things that don't match chars */
259

260
    case OP_REVERSE:
261
    case OP_VREVERSE:
262
    case OP_CREF:
263
    case OP_DNCREF:
264
    case OP_RREF:
265
    case OP_DNRREF:
266
    case OP_FALSE:
267
    case OP_TRUE:
268
    case OP_CALLOUT:
269
    case OP_SOD:
270
    case OP_SOM:
271
    case OP_EOD:
272
    case OP_EODN:
273
    case OP_CIRC:
274
    case OP_CIRCM:
275
    case OP_DOLL:
276
    case OP_DOLLM:
277
    case OP_NOT_WORD_BOUNDARY:
278
    case OP_WORD_BOUNDARY:
279
    case OP_NOT_UCP_WORD_BOUNDARY:
280
    case OP_UCP_WORD_BOUNDARY:
281
    cc += PRIV(OP_lengths)[*cc];
282
    break;
283

284
    case OP_CALLOUT_STR:
285
    cc += GET(cc, 1 + 2*LINK_SIZE);
286
    break;
287

288
    /* Skip over a subpattern that has a {0} or {0,x} quantifier */
289

290
    case OP_BRAZERO:
291
    case OP_BRAMINZERO:
292
    case OP_BRAPOSZERO:
293
    case OP_SKIPZERO:
294
    cc += PRIV(OP_lengths)[*cc];
295
    do cc += GET(cc, 1); while (*cc == OP_ALT);
296
    cc += 1 + LINK_SIZE;
297
    break;
298

299
    /* Handle literal characters and + repetitions */
300

301
    case OP_CHAR:
302
    case OP_CHARI:
303
    case OP_NOT:
304
    case OP_NOTI:
305
    case OP_PLUS:
306
    case OP_PLUSI:
307
    case OP_MINPLUS:
308
    case OP_MINPLUSI:
309
    case OP_POSPLUS:
310
    case OP_POSPLUSI:
311
    case OP_NOTPLUS:
312
    case OP_NOTPLUSI:
313
    case OP_NOTMINPLUS:
314
    case OP_NOTMINPLUSI:
315
    case OP_NOTPOSPLUS:
316
    case OP_NOTPOSPLUSI:
317
    branchlength++;
318
    cc += 2;
319
#ifdef SUPPORT_UNICODE
320
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
321
#endif
322
    break;
323

324
    case OP_TYPEPLUS:
325
    case OP_TYPEMINPLUS:
326
    case OP_TYPEPOSPLUS:
327
    branchlength++;
328
    cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
329
    break;
330

331
    /* Handle exact repetitions. The count is already in characters, but we
332
    may need to skip over a multibyte character in UTF mode.  */
333

334
    case OP_EXACT:
335
    case OP_EXACTI:
336
    case OP_NOTEXACT:
337
    case OP_NOTEXACTI:
338
    branchlength += GET2(cc,1);
339
    cc += 2 + IMM2_SIZE;
340
#ifdef SUPPORT_UNICODE
341
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
342
#endif
343
    break;
344

345
    case OP_TYPEEXACT:
346
    branchlength += GET2(cc,1);
347
    cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
348
      || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
349
    break;
350

351
    /* Handle single-char non-literal matchers */
352

353
    case OP_PROP:
354
    case OP_NOTPROP:
355
    cc += 2;
356
    /* Fall through */
357

358
    case OP_NOT_DIGIT:
359
    case OP_DIGIT:
360
    case OP_NOT_WHITESPACE:
361
    case OP_WHITESPACE:
362
    case OP_NOT_WORDCHAR:
363
    case OP_WORDCHAR:
364
    case OP_ANY:
365
    case OP_ALLANY:
366
    case OP_EXTUNI:
367
    case OP_HSPACE:
368
    case OP_NOT_HSPACE:
369
    case OP_VSPACE:
370
    case OP_NOT_VSPACE:
371
    branchlength++;
372
    cc++;
373
    break;
374

375
    /* "Any newline" might match two characters, but it also might match just
376
    one. */
377

378
    case OP_ANYNL:
379
    branchlength += 1;
380
    cc++;
381
    break;
382

383
    /* The single-byte matcher means we can't proceed in UTF mode. (In
384
    non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
385
    appear, but leave the code, just in case.) */
386

387
    case OP_ANYBYTE:
388
#ifdef SUPPORT_UNICODE
389
    if (utf) return -1;
390
#endif
391
    branchlength++;
392
    cc++;
393
    break;
394

395
    /* For repeated character types, we have to test for \p and \P, which have
396
    an extra two bytes of parameters. */
397

398
    case OP_TYPESTAR:
399
    case OP_TYPEMINSTAR:
400
    case OP_TYPEQUERY:
401
    case OP_TYPEMINQUERY:
402
    case OP_TYPEPOSSTAR:
403
    case OP_TYPEPOSQUERY:
404
    if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
405
    cc += PRIV(OP_lengths)[op];
406
    break;
407

408
    case OP_TYPEUPTO:
409
    case OP_TYPEMINUPTO:
410
    case OP_TYPEPOSUPTO:
411
    if (cc[1 + IMM2_SIZE] == OP_PROP
412
      || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
413
    cc += PRIV(OP_lengths)[op];
414
    break;
415

416
    /* Check a class for variable quantification */
417

418
    case OP_CLASS:
419
    case OP_NCLASS:
420
#ifdef SUPPORT_WIDE_CHARS
421
    case OP_XCLASS:
422
    case OP_ECLASS:
423
    /* The original code caused an unsigned overflow in 64 bit systems,
424
    so now we use a conditional statement. */
425
    if (op == OP_XCLASS || op == OP_ECLASS)
426
      cc += GET(cc, 1);
427
    else
428
#endif
429
      cc += PRIV(OP_lengths)[OP_CLASS];
430

431
    switch (*cc)
432
      {
433
      case OP_CRPLUS:
434
      case OP_CRMINPLUS:
435
      case OP_CRPOSPLUS:
436
      branchlength++;
437
      /* Fall through */
438

439
      case OP_CRSTAR:
440
      case OP_CRMINSTAR:
441
      case OP_CRQUERY:
442
      case OP_CRMINQUERY:
443
      case OP_CRPOSSTAR:
444
      case OP_CRPOSQUERY:
445
      cc++;
446
      break;
447

448
      case OP_CRRANGE:
449
      case OP_CRMINRANGE:
450
      case OP_CRPOSRANGE:
451
      branchlength += GET2(cc,1);
452
      cc += 1 + 2 * IMM2_SIZE;
453
      break;
454

455
      default:
456
      branchlength++;
457
      break;
458
      }
459
    break;
460

461
    /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
462
    way: we find the minimum length for the subpattern. A recursion
463
    (backreference or subroutine) causes an a flag to be set that causes the
464
    length of this branch to be ignored. The logic is that a recursion can only
465
    make sense if there is another alternative that stops the recursing. That
466
    will provide the minimum length (when no recursion happens).
467

468
    If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
469
    matches an empty string (by default it causes a matching failure), so in
470
    that case we must set the minimum length to zero.
471

472
    For backreferenes, if duplicate numbers are present in the pattern we check
473
    for a reference to a duplicate. If it is, we don't know which version will
474
    be referenced, so we have to set the minimum length to zero. */
475

476
    /* Duplicate named pattern back reference. */
477

478
    case OP_DNREF:
479
    case OP_DNREFI:
480
    if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
481
      {
482
      int count = GET2(cc, 1+IMM2_SIZE);
483
      PCRE2_SPTR slot =
484
        (PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code)) +
485
          GET2(cc, 1) * re->name_entry_size;
486

487
      d = INT_MAX;
488

489
      /* Scan all groups with the same name; find the shortest. */
490

491
      while (count-- > 0)
492
        {
493
        int dd, i;
494
        recno = GET2(slot, 0);
495

496
        if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
497
          dd = backref_cache[recno];
498
        else
499
          {
500
          ce = cs = PRIV(find_bracket)(startcode, utf, recno);
501
          if (cs == NULL) return -2;
502
          do ce += GET(ce, 1); while (*ce == OP_ALT);
503

504
          dd = 0;
505
          if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)
506
            {
507
            if (cc > cs && cc < ce)    /* Simple recursion */
508
              {
509
              had_recurse = TRUE;
510
              }
511
            else
512
              {
513
              recurse_check *r = recurses;
514
              for (r = recurses; r != NULL; r = r->prev)
515
                if (r->group == cs) break;
516
              if (r != NULL)           /* Mutual recursion */
517
                {
518
                had_recurse = TRUE;
519
                }
520
              else
521
                {
522
                this_recurse.prev = recurses;  /* No recursion */
523
                this_recurse.group = cs;
524
                dd = find_minlength(re, cs, startcode, utf, &this_recurse,
525
                  countptr, backref_cache);
526
                if (dd < 0) return dd;
527
                }
528
              }
529
            }
530

531
          backref_cache[recno] = dd;
532
          for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
533
          backref_cache[0] = recno;
534
          }
535

536
        if (dd < d) d = dd;
537
        if (d <= 0) break;    /* No point looking at any more */
538
        slot += re->name_entry_size;
539
        }
540
      }
541
    else d = 0;
542
    cc += PRIV(OP_lengths)[*cc];
543
    goto REPEAT_BACK_REFERENCE;
544

545
    /* Single back reference by number. References by name are converted to by
546
    number when there is no duplication. */
547

548
    case OP_REF:
549
    case OP_REFI:
550
    recno = GET2(cc, 1);
551
    if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
552
      d = backref_cache[recno];
553
    else
554
      {
555
      int i;
556
      d = 0;
557

558
      if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
559
        {
560
        ce = cs = PRIV(find_bracket)(startcode, utf, recno);
561
        if (cs == NULL) return -2;
562
        do ce += GET(ce, 1); while (*ce == OP_ALT);
563

564
        if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)
565
          {
566
          if (cc > cs && cc < ce)    /* Simple recursion */
567
            {
568
            had_recurse = TRUE;
569
            }
570
          else
571
            {
572
            recurse_check *r = recurses;
573
            for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
574
            if (r != NULL)           /* Mutual recursion */
575
              {
576
              had_recurse = TRUE;
577
              }
578
            else                     /* No recursion */
579
              {
580
              this_recurse.prev = recurses;
581
              this_recurse.group = cs;
582
              d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
583
                backref_cache);
584
              if (d < 0) return d;
585
              }
586
            }
587
          }
588
        }
589

590
      backref_cache[recno] = d;
591
      for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
592
      backref_cache[0] = recno;
593
      }
594

595
    cc += PRIV(OP_lengths)[*cc];
596

597
    /* Handle repeated back references */
598

599
    REPEAT_BACK_REFERENCE:
600
    switch (*cc)
601
      {
602
      case OP_CRSTAR:
603
      case OP_CRMINSTAR:
604
      case OP_CRQUERY:
605
      case OP_CRMINQUERY:
606
      case OP_CRPOSSTAR:
607
      case OP_CRPOSQUERY:
608
      min = 0;
609
      cc++;
610
      break;
611

612
      case OP_CRPLUS:
613
      case OP_CRMINPLUS:
614
      case OP_CRPOSPLUS:
615
      min = 1;
616
      cc++;
617
      break;
618

619
      case OP_CRRANGE:
620
      case OP_CRMINRANGE:
621
      case OP_CRPOSRANGE:
622
      min = GET2(cc, 1);
623
      cc += 1 + 2 * IMM2_SIZE;
624
      break;
625

626
      default:
627
      min = 1;
628
      break;
629
      }
630

631
     /* Take care not to overflow: (1) min and d are ints, so check that their
632
     product is not greater than INT_MAX. (2) branchlength is limited to
633
     UINT16_MAX (checked at the top of the loop). */
634

635
    if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
636
      branchlength = UINT16_MAX;
637
    else branchlength += min * d;
638
    break;
639

640
    /* Recursion always refers to the first occurrence of a subpattern with a
641
    given number. Therefore, we can always make use of caching, even when the
642
    pattern contains multiple subpatterns with the same number. */
643

644
    case OP_RECURSE:
645
    cs = ce = startcode + GET(cc, 1);
646
    recno = GET2(cs, 1+LINK_SIZE);
647
    if (recno == prev_recurse_recno)
648
      {
649
      branchlength += prev_recurse_d;
650
      }
651
    else
652
      {
653
      do ce += GET(ce, 1); while (*ce == OP_ALT);
654
      if (cc > cs && cc < ce)    /* Simple recursion */
655
        had_recurse = TRUE;
656
      else
657
        {
658
        recurse_check *r = recurses;
659
        for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
660
        if (r != NULL)          /* Mutual recursion */
661
          had_recurse = TRUE;
662
        else
663
          {
664
          this_recurse.prev = recurses;
665
          this_recurse.group = cs;
666
          prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
667
            countptr, backref_cache);
668
          if (prev_recurse_d < 0) return prev_recurse_d;
669
          prev_recurse_recno = recno;
670
          branchlength += prev_recurse_d;
671
          }
672
        }
673
      }
674
    cc += 1 + LINK_SIZE + once_fudge;
675
    once_fudge = 0;
676
    break;
677

678
    /* Anything else does not or need not match a character. We can get the
679
    item's length from the table, but for those that can match zero occurrences
680
    of a character, we must take special action for UTF-8 characters. As it
681
    happens, the "NOT" versions of these opcodes are used at present only for
682
    ASCII characters, so they could be omitted from this list. However, in
683
    future that may change, so we include them here so as not to leave a
684
    gotcha for a future maintainer. */
685

686
    case OP_UPTO:
687
    case OP_UPTOI:
688
    case OP_NOTUPTO:
689
    case OP_NOTUPTOI:
690
    case OP_MINUPTO:
691
    case OP_MINUPTOI:
692
    case OP_NOTMINUPTO:
693
    case OP_NOTMINUPTOI:
694
    case OP_POSUPTO:
695
    case OP_POSUPTOI:
696
    case OP_NOTPOSUPTO:
697
    case OP_NOTPOSUPTOI:
698

699
    case OP_STAR:
700
    case OP_STARI:
701
    case OP_NOTSTAR:
702
    case OP_NOTSTARI:
703
    case OP_MINSTAR:
704
    case OP_MINSTARI:
705
    case OP_NOTMINSTAR:
706
    case OP_NOTMINSTARI:
707
    case OP_POSSTAR:
708
    case OP_POSSTARI:
709
    case OP_NOTPOSSTAR:
710
    case OP_NOTPOSSTARI:
711

712
    case OP_QUERY:
713
    case OP_QUERYI:
714
    case OP_NOTQUERY:
715
    case OP_NOTQUERYI:
716
    case OP_MINQUERY:
717
    case OP_MINQUERYI:
718
    case OP_NOTMINQUERY:
719
    case OP_NOTMINQUERYI:
720
    case OP_POSQUERY:
721
    case OP_POSQUERYI:
722
    case OP_NOTPOSQUERY:
723
    case OP_NOTPOSQUERYI:
724

725
    cc += PRIV(OP_lengths)[op];
726
#ifdef SUPPORT_UNICODE
727
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
728
#endif
729
    break;
730

731
    /* Skip these, but we need to add in the name length. */
732

733
    case OP_MARK:
734
    case OP_COMMIT_ARG:
735
    case OP_PRUNE_ARG:
736
    case OP_SKIP_ARG:
737
    case OP_THEN_ARG:
738
    cc += PRIV(OP_lengths)[op] + cc[1];
739
    break;
740

741
    /* The remaining opcodes are just skipped over. */
742

743
    case OP_CLOSE:
744
    case OP_COMMIT:
745
    case OP_FAIL:
746
    case OP_PRUNE:
747
    case OP_SET_SOM:
748
    case OP_SKIP:
749
    case OP_THEN:
750
    cc += PRIV(OP_lengths)[op];
751
    break;
752

753
    /* This should not occur: we list all opcodes explicitly so that when
754
    new ones get added they are properly considered. */
755

756
    default:
757
    PCRE2_DEBUG_UNREACHABLE();
758
    return -3;
759
    }
760
  }
761

762
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
763
return -3;                 /* Avoid compiler warnings */
764
}
765

766

767

768
/*************************************************
769
*      Set a bit and maybe its alternate case    *
770
*************************************************/
771

772
/* Given a character, set its first code unit's bit in the table, and also the
773
corresponding bit for the other version of a letter if we are caseless.
774

775
Arguments:
776
  re            points to the regex block
777
  p             points to the first code unit of the character
778
  caseless      TRUE if caseless
779
  utf           TRUE for UTF mode
780
  ucp           TRUE for UCP mode
781

782
Returns:        pointer after the character
783
*/
784

785
static PCRE2_SPTR
786
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
787
  BOOL ucp)
788
{
789
uint32_t c = *p++;   /* First code unit */
790

791
(void)utf;           /* Stop compiler warnings when UTF not supported */
792
(void)ucp;
793

794
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
795
0xff. */
796

797
#if PCRE2_CODE_UNIT_WIDTH != 8
798
if (c > 0xff) SET_BIT(0xff); else
799
#endif
800

801
SET_BIT(c);
802

803
/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
804
the end of the character, even when caseless. */
805

806
#ifdef SUPPORT_UNICODE
807
if (utf)
808
  {
809
#if PCRE2_CODE_UNIT_WIDTH == 8
810
  if (c >= 0xc0) GETUTF8INC(c, p);
811
#elif PCRE2_CODE_UNIT_WIDTH == 16
812
  if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
813
#endif
814
  }
815
#endif  /* SUPPORT_UNICODE */
816

817
/* If caseless, handle the other case of the character. */
818

819
if (caseless)
820
  {
821
#ifdef SUPPORT_UNICODE
822
  if (utf || ucp)
823
    {
824
    c = UCD_OTHERCASE(c);
825
#if PCRE2_CODE_UNIT_WIDTH == 8
826
    if (utf)
827
      {
828
      PCRE2_UCHAR buff[6];
829
      (void)PRIV(ord2utf)(c, buff);
830
      SET_BIT(buff[0]);
831
      }
832
    else if (c < 256) SET_BIT(c);
833
#else  /* 16-bit or 32-bit mode */
834
    if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
835
#endif
836
    }
837

838
  else
839
#endif  /* SUPPORT_UNICODE */
840

841
  /* Not UTF or UCP */
842

843
  if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
844
  }
845

846
return p;
847
}
848

849

850

851
/*************************************************
852
*     Set bits for a positive character type     *
853
*************************************************/
854

855
/* This function sets starting bits for a character type. In UTF-8 mode, we can
856
only do a direct setting for bytes less than 128, as otherwise there can be
857
confusion with bytes in the middle of UTF-8 characters. In a "traditional"
858
environment, the tables will only recognize ASCII characters anyway, but in at
859
least one Windows environment, some higher bytes bits were set in the tables.
860
So we deal with that case by considering the UTF-8 encoding.
861

862
Arguments:
863
  re             the regex block
864
  cbit type      the type of character wanted
865
  table_limit    32 for non-UTF-8; 16 for UTF-8
866

867
Returns:         nothing
868
*/
869

870
static void
871
set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
872
{
873
uint32_t c;
874
for (c = 0; c < table_limit; c++)
875
  re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
876
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
877
if (table_limit == 32) return;
878
for (c = 128; c < 256; c++)
879
  {
880
  if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
881
    {
882
    PCRE2_UCHAR buff[6];
883
    (void)PRIV(ord2utf)(c, buff);
884
    SET_BIT(buff[0]);
885
    }
886
  }
887
#endif  /* UTF-8 */
888
}
889

890

891
/*************************************************
892
*     Set bits for a negative character type     *
893
*************************************************/
894

895
/* This function sets starting bits for a negative character type such as \D.
896
In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
897
otherwise there can be confusion with bytes in the middle of UTF-8 characters.
898
Unlike in the positive case, where we can set appropriate starting bits for
899
specific high-valued UTF-8 characters, in this case we have to set the bits for
900
all high-valued characters. The lowest is 0xc2, but we overkill by starting at
901
0xc0 (192) for simplicity.
902

903
Arguments:
904
  re             the regex block
905
  cbit type      the type of character wanted
906
  table_limit    32 for non-UTF-8; 16 for UTF-8
907

908
Returns:         nothing
909
*/
910

911
static void
912
set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
913
{
914
uint32_t c;
915
for (c = 0; c < table_limit; c++)
916
  re->start_bitmap[c] |= (uint8_t)(~(re->tables[c+cbits_offset+cbit_type]));
917
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
918
if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
919
#endif
920
}
921

922

923

924
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
925
/*************************************************
926
*     Set starting bits for a character list.    *
927
*************************************************/
928

929
/* This function sets starting bits for a character list. It enumerates
930
all characters and character ranges in the character list, and sets
931
the starting bits accordingly.
932

933
Arguments:
934
  code           pointer to the code
935
  start_bitmap   pointer to the starting bitmap
936

937
Returns:         nothing
938
*/
939
static void
940
study_char_list(PCRE2_SPTR code, uint8_t *start_bitmap,
941
  const uint8_t *char_lists_end)
942
{
943
uint32_t type, list_ind;
944
uint32_t char_list_add = XCL_CHAR_LIST_LOW_16_ADD;
945
uint32_t range_start = ~(uint32_t)0, range_end = 0;
946
const uint8_t *next_char;
947
PCRE2_UCHAR start_buffer[6], end_buffer[6];
948
PCRE2_UCHAR start, end;
949

950
/* Only needed in 8-bit mode at the moment. */
951
type = (uint32_t)(code[0] << 8) | code[1];
952
code += 2;
953

954
/* Align characters. */
955
next_char = char_lists_end - (GET(code, 0) << 1);
956
type &= XCL_TYPE_MASK;
957
list_ind = 0;
958

959
if ((type & XCL_BEGIN_WITH_RANGE) != 0)
960
  range_start = XCL_CHAR_LIST_LOW_16_START;
961

962
while (type > 0)
963
  {
964
  uint32_t item_count = type & XCL_ITEM_COUNT_MASK;
965

966
  if (item_count == XCL_ITEM_COUNT_MASK)
967
    {
968
    if (list_ind <= 1)
969
      {
970
      item_count = *(const uint16_t*)next_char;
971
      next_char += 2;
972
      }
973
    else
974
      {
975
      item_count = *(const uint32_t*)next_char;
976
      next_char += 4;
977
      }
978
    }
979

980
  while (item_count > 0)
981
    {
982
    if (list_ind <= 1)
983
      {
984
      range_end = *(const uint16_t*)next_char;
985
      next_char += 2;
986
      }
987
    else
988
      {
989
      range_end = *(const uint32_t*)next_char;
990
      next_char += 4;
991
      }
992

993
    if ((range_end & XCL_CHAR_END) != 0)
994
      {
995
      range_end = char_list_add + (range_end >> XCL_CHAR_SHIFT);
996

997
      PRIV(ord2utf)(range_end, end_buffer);
998
      end = end_buffer[0];
999

1000
      if (range_start < range_end)
1001
        {
1002
        PRIV(ord2utf)(range_start, start_buffer);
1003
        for (start = start_buffer[0]; start <= end; start++)
1004
          start_bitmap[start / 8] |= (1u << (start & 7));
1005
        }
1006
      else
1007
        start_bitmap[end / 8] |= (1u << (end & 7));
1008

1009
      range_start = ~(uint32_t)0;
1010
      }
1011
    else
1012
      range_start = char_list_add + (range_end >> XCL_CHAR_SHIFT);
1013

1014
    item_count--;
1015
    }
1016

1017
  list_ind++;
1018
  type >>= XCL_TYPE_BIT_LEN;
1019

1020
  if (range_start == ~(uint32_t)0)
1021
    {
1022
    if ((type & XCL_BEGIN_WITH_RANGE) != 0)
1023
      {
1024
      /* In 8 bit mode XCL_CHAR_LIST_HIGH_32_START is not possible. */
1025
      if (list_ind == 1) range_start = XCL_CHAR_LIST_HIGH_16_START;
1026
      else range_start = XCL_CHAR_LIST_LOW_32_START;
1027
      }
1028
    }
1029
  else if ((type & XCL_BEGIN_WITH_RANGE) == 0)
1030
    {
1031
    PRIV(ord2utf)(range_start, start_buffer);
1032

1033
    /* In 8 bit mode XCL_CHAR_LIST_LOW_32_END and
1034
    XCL_CHAR_LIST_HIGH_32_END are not possible. */
1035
    if (list_ind == 1) range_end = XCL_CHAR_LIST_LOW_16_END;
1036
    else range_end = XCL_CHAR_LIST_HIGH_16_END;
1037

1038
    PRIV(ord2utf)(range_end, end_buffer);
1039
    end = end_buffer[0];
1040

1041
    for (start = start_buffer[0]; start <= end; start++)
1042
      start_bitmap[start / 8] |= (1u << (start & 7));
1043

1044
    range_start = ~(uint32_t)0;
1045
    }
1046

1047
  /* In 8 bit mode XCL_CHAR_LIST_HIGH_32_ADD is not possible. */
1048
  if (list_ind == 1) char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;
1049
  else char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
1050
  }
1051
}
1052
#endif
1053

1054

1055

1056
/*************************************************
1057
*      Create bitmap of starting code units      *
1058
*************************************************/
1059

1060
/* This function scans a compiled unanchored expression recursively and
1061
attempts to build a bitmap of the set of possible starting code units whose
1062
values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
1063
the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
1064
we pass a value of 16 rather than 32 as the final argument. (See comments in
1065
those functions for the reason.)
1066

1067
The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
1068
(a*)b where the group provides some optional starting code units but scanning
1069
must continue at the outer level to find at least one mandatory code unit. At
1070
the outermost level, this function fails unless the result is SSB_DONE.
1071

1072
We restrict recursion (for nested groups) to 1000 to avoid stack overflow
1073
issues.
1074

1075
Arguments:
1076
  re           points to the compiled regex block
1077
  code         points to an expression
1078
  utf          TRUE if in UTF mode
1079
  ucp          TRUE if in UCP mode
1080
  depthptr     pointer to recurse depth
1081

1082
Returns:       SSB_FAIL     => Failed to find any starting code units
1083
               SSB_DONE     => Found mandatory starting code units
1084
               SSB_CONTINUE => Found optional starting code units
1085
               SSB_UNKNOWN  => Hit an unrecognized opcode
1086
               SSB_TOODEEP  => Recursion is too deep
1087
*/
1088

1089
static int
1090
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
1091
  int *depthptr)
1092
{
1093
uint32_t c;
1094
int yield = SSB_DONE;
1095

1096
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1097
int table_limit = utf? 16:32;
1098
#else
1099
int table_limit = 32;
1100
#endif
1101

1102
*depthptr += 1;
1103
if (*depthptr > 1000) return SSB_TOODEEP;
1104

1105
do
1106
  {
1107
  BOOL try_next = TRUE;
1108
  PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
1109

1110
  if (*code == OP_CBRA || *code == OP_SCBRA ||
1111
      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
1112

1113
  while (try_next)    /* Loop for items in this branch */
1114
    {
1115
    int rc;
1116
    PCRE2_SPTR ncode;
1117
    const uint8_t *classmap = NULL;
1118
#ifdef SUPPORT_WIDE_CHARS
1119
    PCRE2_UCHAR xclassflags;
1120
#endif
1121

1122
    switch(*tcode)
1123
      {
1124
      /* If we reach something we don't understand, it means a new opcode has
1125
      been created that hasn't been added to this function. Hopefully this
1126
      problem will be discovered during testing. */
1127

1128
      default:
1129
      return SSB_UNKNOWN;
1130

1131
      /* Fail for a valid opcode that implies no starting bits. */
1132

1133
      case OP_ACCEPT:
1134
      case OP_ASSERT_ACCEPT:
1135
      case OP_ALLANY:
1136
      case OP_ANY:
1137
      case OP_ANYBYTE:
1138
      case OP_CIRCM:
1139
      case OP_CLOSE:
1140
      case OP_COMMIT:
1141
      case OP_COMMIT_ARG:
1142
      case OP_COND:
1143
      case OP_CREF:
1144
      case OP_FALSE:
1145
      case OP_TRUE:
1146
      case OP_DNCREF:
1147
      case OP_DNREF:
1148
      case OP_DNREFI:
1149
      case OP_DNRREF:
1150
      case OP_DOLL:
1151
      case OP_DOLLM:
1152
      case OP_END:
1153
      case OP_EOD:
1154
      case OP_EODN:
1155
      case OP_EXTUNI:
1156
      case OP_FAIL:
1157
      case OP_MARK:
1158
      case OP_NOT:
1159
      case OP_NOTEXACT:
1160
      case OP_NOTEXACTI:
1161
      case OP_NOTI:
1162
      case OP_NOTMINPLUS:
1163
      case OP_NOTMINPLUSI:
1164
      case OP_NOTMINQUERY:
1165
      case OP_NOTMINQUERYI:
1166
      case OP_NOTMINSTAR:
1167
      case OP_NOTMINSTARI:
1168
      case OP_NOTMINUPTO:
1169
      case OP_NOTMINUPTOI:
1170
      case OP_NOTPLUS:
1171
      case OP_NOTPLUSI:
1172
      case OP_NOTPOSPLUS:
1173
      case OP_NOTPOSPLUSI:
1174
      case OP_NOTPOSQUERY:
1175
      case OP_NOTPOSQUERYI:
1176
      case OP_NOTPOSSTAR:
1177
      case OP_NOTPOSSTARI:
1178
      case OP_NOTPOSUPTO:
1179
      case OP_NOTPOSUPTOI:
1180
      case OP_NOTPROP:
1181
      case OP_NOTQUERY:
1182
      case OP_NOTQUERYI:
1183
      case OP_NOTSTAR:
1184
      case OP_NOTSTARI:
1185
      case OP_NOTUPTO:
1186
      case OP_NOTUPTOI:
1187
      case OP_NOT_HSPACE:
1188
      case OP_NOT_VSPACE:
1189
      case OP_PRUNE:
1190
      case OP_PRUNE_ARG:
1191
      case OP_RECURSE:
1192
      case OP_REF:
1193
      case OP_REFI:
1194
      case OP_REVERSE:
1195
      case OP_VREVERSE:
1196
      case OP_RREF:
1197
      case OP_SCOND:
1198
      case OP_SET_SOM:
1199
      case OP_SKIP:
1200
      case OP_SKIP_ARG:
1201
      case OP_SOD:
1202
      case OP_SOM:
1203
      case OP_THEN:
1204
      case OP_THEN_ARG:
1205
      return SSB_FAIL;
1206

1207
      /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1208
      uses OP_CIRCM). Skip over it. */
1209

1210
      case OP_CIRC:
1211
      tcode += PRIV(OP_lengths)[OP_CIRC];
1212
      break;
1213

1214
      /* A "real" property test implies no starting bits, but the fake property
1215
      PT_CLIST identifies a list of characters. These lists are short, as they
1216
      are used for characters with more than one "other case", so there is no
1217
      point in recognizing them for OP_NOTPROP. */
1218

1219
      case OP_PROP:
1220
      if (tcode[1] != PT_CLIST) return SSB_FAIL;
1221
        {
1222
        const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1223
        while ((c = *p++) < NOTACHAR)
1224
          {
1225
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1226
          if (utf)
1227
            {
1228
            PCRE2_UCHAR buff[6];
1229
            (void)PRIV(ord2utf)(c, buff);
1230
            c = buff[0];
1231
            }
1232
#endif
1233
          if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1234
          }
1235
        }
1236
      try_next = FALSE;
1237
      break;
1238

1239
      /* We can ignore word boundary tests. */
1240

1241
      case OP_WORD_BOUNDARY:
1242
      case OP_NOT_WORD_BOUNDARY:
1243
      case OP_UCP_WORD_BOUNDARY:
1244
      case OP_NOT_UCP_WORD_BOUNDARY:
1245
      tcode++;
1246
      break;
1247

1248
      /* For a positive lookahead assertion, inspect what immediately follows,
1249
      ignoring intermediate assertions and callouts. If the next item is one
1250
      that sets a mandatory character, skip this assertion. Otherwise, treat it
1251
      the same as other bracket groups. */
1252

1253
      case OP_ASSERT:
1254
      case OP_ASSERT_NA:
1255
      ncode = tcode + GET(tcode, 1);
1256
      while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1257
      ncode += 1 + LINK_SIZE;
1258

1259
      /* Skip irrelevant items */
1260

1261
      for (BOOL done = FALSE; !done;)
1262
        {
1263
        switch (*ncode)
1264
          {
1265
          case OP_ASSERT:
1266
          case OP_ASSERT_NOT:
1267
          case OP_ASSERTBACK:
1268
          case OP_ASSERTBACK_NOT:
1269
          case OP_ASSERT_NA:
1270
          case OP_ASSERTBACK_NA:
1271
          case OP_ASSERT_SCS:
1272
          ncode += GET(ncode, 1);
1273
          while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1274
          ncode += 1 + LINK_SIZE;
1275
          break;
1276

1277
          case OP_WORD_BOUNDARY:
1278
          case OP_NOT_WORD_BOUNDARY:
1279
          case OP_UCP_WORD_BOUNDARY:
1280
          case OP_NOT_UCP_WORD_BOUNDARY:
1281
          ncode++;
1282
          break;
1283

1284
          case OP_CALLOUT:
1285
          ncode += PRIV(OP_lengths)[OP_CALLOUT];
1286
          break;
1287

1288
          case OP_CALLOUT_STR:
1289
          ncode += GET(ncode, 1 + 2*LINK_SIZE);
1290
          break;
1291

1292
          default:
1293
          done = TRUE;
1294
          break;
1295
          }
1296
        }
1297

1298
      /* Now check the next significant item. */
1299

1300
      switch(*ncode)
1301
        {
1302
        default:
1303
        break;
1304

1305
        case OP_PROP:
1306
        if (ncode[1] != PT_CLIST) break;
1307
        /* Fall through */
1308
        case OP_ANYNL:
1309
        case OP_CHAR:
1310
        case OP_CHARI:
1311
        case OP_EXACT:
1312
        case OP_EXACTI:
1313
        case OP_HSPACE:
1314
        case OP_MINPLUS:
1315
        case OP_MINPLUSI:
1316
        case OP_PLUS:
1317
        case OP_PLUSI:
1318
        case OP_POSPLUS:
1319
        case OP_POSPLUSI:
1320
        case OP_VSPACE:
1321
        /* Note that these types will only be present in non-UCP mode. */
1322
        case OP_DIGIT:
1323
        case OP_NOT_DIGIT:
1324
        case OP_WORDCHAR:
1325
        case OP_NOT_WORDCHAR:
1326
        case OP_WHITESPACE:
1327
        case OP_NOT_WHITESPACE:
1328
        tcode = ncode;
1329
        continue;   /* With the following significant opcode */
1330
        }
1331
      /* Fall through */
1332

1333
      /* For a group bracket or a positive assertion without an immediately
1334
      following mandatory setting, recurse to set bits from within the
1335
      subpattern. If it can't find anything, we have to give up. If it finds
1336
      some mandatory character(s), we are done for this branch. Otherwise,
1337
      carry on scanning after the subpattern. */
1338

1339
      case OP_BRA:
1340
      case OP_SBRA:
1341
      case OP_CBRA:
1342
      case OP_SCBRA:
1343
      case OP_BRAPOS:
1344
      case OP_SBRAPOS:
1345
      case OP_CBRAPOS:
1346
      case OP_SCBRAPOS:
1347
      case OP_ONCE:
1348
      case OP_SCRIPT_RUN:
1349
      rc = set_start_bits(re, tcode, utf, ucp, depthptr);
1350
      if (rc == SSB_DONE)
1351
        {
1352
        try_next = FALSE;
1353
        }
1354
      else if (rc == SSB_CONTINUE)
1355
        {
1356
        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1357
        tcode += 1 + LINK_SIZE;
1358
        }
1359
      else return rc;   /* FAIL, UNKNOWN, or TOODEEP */
1360
      break;
1361

1362
      /* If we hit ALT or KET, it means we haven't found anything mandatory in
1363
      this branch, though we might have found something optional. For ALT, we
1364
      continue with the next alternative, but we have to arrange that the final
1365
      result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1366
      return SSB_CONTINUE: if this is the top level, that indicates failure,
1367
      but after a nested subpattern, it causes scanning to continue. */
1368

1369
      case OP_ALT:
1370
      yield = SSB_CONTINUE;
1371
      try_next = FALSE;
1372
      break;
1373

1374
      case OP_KET:
1375
      case OP_KETRMAX:
1376
      case OP_KETRMIN:
1377
      case OP_KETRPOS:
1378
      return SSB_CONTINUE;
1379

1380
      /* Skip over callout */
1381

1382
      case OP_CALLOUT:
1383
      tcode += PRIV(OP_lengths)[OP_CALLOUT];
1384
      break;
1385

1386
      case OP_CALLOUT_STR:
1387
      tcode += GET(tcode, 1 + 2*LINK_SIZE);
1388
      break;
1389

1390
      /* Skip over lookbehind, negative lookahead, and scan substring
1391
      assertions */
1392

1393
      case OP_ASSERT_NOT:
1394
      case OP_ASSERTBACK:
1395
      case OP_ASSERTBACK_NOT:
1396
      case OP_ASSERTBACK_NA:
1397
      case OP_ASSERT_SCS:
1398
      do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1399
      tcode += 1 + LINK_SIZE;
1400
      break;
1401

1402
      /* BRAZERO does the bracket, but carries on. */
1403

1404
      case OP_BRAZERO:
1405
      case OP_BRAMINZERO:
1406
      case OP_BRAPOSZERO:
1407
      rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
1408
      if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
1409
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1410
      tcode += 1 + LINK_SIZE;
1411
      break;
1412

1413
      /* SKIPZERO skips the bracket. */
1414

1415
      case OP_SKIPZERO:
1416
      tcode++;
1417
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1418
      tcode += 1 + LINK_SIZE;
1419
      break;
1420

1421
      /* Single-char * or ? sets the bit and tries the next item */
1422

1423
      case OP_STAR:
1424
      case OP_MINSTAR:
1425
      case OP_POSSTAR:
1426
      case OP_QUERY:
1427
      case OP_MINQUERY:
1428
      case OP_POSQUERY:
1429
      tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1430
      break;
1431

1432
      case OP_STARI:
1433
      case OP_MINSTARI:
1434
      case OP_POSSTARI:
1435
      case OP_QUERYI:
1436
      case OP_MINQUERYI:
1437
      case OP_POSQUERYI:
1438
      tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1439
      break;
1440

1441
      /* Single-char upto sets the bit and tries the next */
1442

1443
      case OP_UPTO:
1444
      case OP_MINUPTO:
1445
      case OP_POSUPTO:
1446
      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
1447
      break;
1448

1449
      case OP_UPTOI:
1450
      case OP_MINUPTOI:
1451
      case OP_POSUPTOI:
1452
      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
1453
      break;
1454

1455
      /* At least one single char sets the bit and stops */
1456

1457
      case OP_EXACT:
1458
      tcode += IMM2_SIZE;
1459
      /* Fall through */
1460
      case OP_CHAR:
1461
      case OP_PLUS:
1462
      case OP_MINPLUS:
1463
      case OP_POSPLUS:
1464
      (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1465
      try_next = FALSE;
1466
      break;
1467

1468
      case OP_EXACTI:
1469
      tcode += IMM2_SIZE;
1470
      /* Fall through */
1471
      case OP_CHARI:
1472
      case OP_PLUSI:
1473
      case OP_MINPLUSI:
1474
      case OP_POSPLUSI:
1475
      (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1476
      try_next = FALSE;
1477
      break;
1478

1479
      /* Special spacing and line-terminating items. These recognize specific
1480
      lists of characters. The difference between VSPACE and ANYNL is that the
1481
      latter can match the two-character CRLF sequence, but that is not
1482
      relevant for finding the first character, so their code here is
1483
      identical. */
1484

1485
      case OP_HSPACE:
1486
      SET_BIT(CHAR_HT);
1487
      SET_BIT(CHAR_SPACE);
1488

1489
      /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1490
      the bits for 0xA0 and for code units >= 255, independently of UTF. */
1491

1492
#if PCRE2_CODE_UNIT_WIDTH != 8
1493
      SET_BIT(0xA0);
1494
      SET_BIT(0xFF);
1495
#else
1496
      /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1497
      units of horizontal space characters. */
1498

1499
#ifdef SUPPORT_UNICODE
1500
      if (utf)
1501
        {
1502
        SET_BIT(0xC2);  /* For U+00A0 */
1503
        SET_BIT(0xE1);  /* For U+1680, U+180E */
1504
        SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1505
        SET_BIT(0xE3);  /* For U+3000 */
1506
        }
1507
      else
1508
#endif
1509
      /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1510
      the code is EBCDIC. */
1511
        {
1512
#ifndef EBCDIC
1513
        SET_BIT(0xA0);
1514
#endif  /* Not EBCDIC */
1515
        }
1516
#endif  /* 8-bit support */
1517

1518
      try_next = FALSE;
1519
      break;
1520

1521
      case OP_ANYNL:
1522
      case OP_VSPACE:
1523
      SET_BIT(CHAR_LF);
1524
      SET_BIT(CHAR_VT);
1525
      SET_BIT(CHAR_FF);
1526
      SET_BIT(CHAR_CR);
1527

1528
      /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1529
      the bits for NEL and for code units >= 255, independently of UTF. */
1530

1531
#if PCRE2_CODE_UNIT_WIDTH != 8
1532
      SET_BIT(CHAR_NEL);
1533
      SET_BIT(0xFF);
1534
#else
1535
      /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1536
      units of vertical space characters. */
1537

1538
#ifdef SUPPORT_UNICODE
1539
      if (utf)
1540
        {
1541
        SET_BIT(0xC2);  /* For U+0085 (NEL) */
1542
        SET_BIT(0xE2);  /* For U+2028, U+2029 */
1543
        }
1544
      else
1545
#endif
1546
      /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1547
        {
1548
        SET_BIT(CHAR_NEL);
1549
        }
1550
#endif  /* 8-bit support */
1551

1552
      try_next = FALSE;
1553
      break;
1554

1555
      /* Single character types set the bits and stop. Note that if PCRE2_UCP
1556
      is set, we do not see these opcodes because \d etc are converted to
1557
      properties. Therefore, these apply in the case when only characters less
1558
      than 256 are recognized to match the types. */
1559

1560
      case OP_NOT_DIGIT:
1561
      set_nottype_bits(re, cbit_digit, table_limit);
1562
      try_next = FALSE;
1563
      break;
1564

1565
      case OP_DIGIT:
1566
      set_type_bits(re, cbit_digit, table_limit);
1567
      try_next = FALSE;
1568
      break;
1569

1570
      case OP_NOT_WHITESPACE:
1571
      set_nottype_bits(re, cbit_space, table_limit);
1572
      try_next = FALSE;
1573
      break;
1574

1575
      case OP_WHITESPACE:
1576
      set_type_bits(re, cbit_space, table_limit);
1577
      try_next = FALSE;
1578
      break;
1579

1580
      case OP_NOT_WORDCHAR:
1581
      set_nottype_bits(re, cbit_word, table_limit);
1582
      try_next = FALSE;
1583
      break;
1584

1585
      case OP_WORDCHAR:
1586
      set_type_bits(re, cbit_word, table_limit);
1587
      try_next = FALSE;
1588
      break;
1589

1590
      /* One or more character type fudges the pointer and restarts, knowing
1591
      it will hit a single character type and stop there. */
1592

1593
      case OP_TYPEPLUS:
1594
      case OP_TYPEMINPLUS:
1595
      case OP_TYPEPOSPLUS:
1596
      tcode++;
1597
      break;
1598

1599
      case OP_TYPEEXACT:
1600
      tcode += 1 + IMM2_SIZE;
1601
      break;
1602

1603
      /* Zero or more repeats of character types set the bits and then
1604
      try again. */
1605

1606
      case OP_TYPEUPTO:
1607
      case OP_TYPEMINUPTO:
1608
      case OP_TYPEPOSUPTO:
1609
      tcode += IMM2_SIZE;  /* Fall through */
1610

1611
      case OP_TYPESTAR:
1612
      case OP_TYPEMINSTAR:
1613
      case OP_TYPEPOSSTAR:
1614
      case OP_TYPEQUERY:
1615
      case OP_TYPEMINQUERY:
1616
      case OP_TYPEPOSQUERY:
1617
      switch(tcode[1])
1618
        {
1619
        default:
1620
        case OP_ANY:
1621
        case OP_ALLANY:
1622
        return SSB_FAIL;
1623

1624
        case OP_HSPACE:
1625
        SET_BIT(CHAR_HT);
1626
        SET_BIT(CHAR_SPACE);
1627

1628
        /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1629
        the bits for 0xA0 and for code units >= 255, independently of UTF. */
1630

1631
#if PCRE2_CODE_UNIT_WIDTH != 8
1632
        SET_BIT(0xA0);
1633
        SET_BIT(0xFF);
1634
#else
1635
        /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1636
        units of horizontal space characters. */
1637

1638
#ifdef SUPPORT_UNICODE
1639
        if (utf)
1640
          {
1641
          SET_BIT(0xC2);  /* For U+00A0 */
1642
          SET_BIT(0xE1);  /* For U+1680, U+180E */
1643
          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1644
          SET_BIT(0xE3);  /* For U+3000 */
1645
          }
1646
        else
1647
#endif
1648
        /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1649
        the code is EBCDIC. */
1650
          {
1651
#ifndef EBCDIC
1652
          SET_BIT(0xA0);
1653
#endif  /* Not EBCDIC */
1654
          }
1655
#endif  /* 8-bit support */
1656
        break;
1657

1658
        case OP_ANYNL:
1659
        case OP_VSPACE:
1660
        SET_BIT(CHAR_LF);
1661
        SET_BIT(CHAR_VT);
1662
        SET_BIT(CHAR_FF);
1663
        SET_BIT(CHAR_CR);
1664

1665
        /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1666
        the bits for NEL and for code units >= 255, independently of UTF. */
1667

1668
#if PCRE2_CODE_UNIT_WIDTH != 8
1669
        SET_BIT(CHAR_NEL);
1670
        SET_BIT(0xFF);
1671
#else
1672
        /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1673
        units of vertical space characters. */
1674

1675
#ifdef SUPPORT_UNICODE
1676
        if (utf)
1677
          {
1678
          SET_BIT(0xC2);  /* For U+0085 (NEL) */
1679
          SET_BIT(0xE2);  /* For U+2028, U+2029 */
1680
          }
1681
        else
1682
#endif
1683
        /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1684
          {
1685
          SET_BIT(CHAR_NEL);
1686
          }
1687
#endif  /* 8-bit support */
1688
        break;
1689

1690
        case OP_NOT_DIGIT:
1691
        set_nottype_bits(re, cbit_digit, table_limit);
1692
        break;
1693

1694
        case OP_DIGIT:
1695
        set_type_bits(re, cbit_digit, table_limit);
1696
        break;
1697

1698
        case OP_NOT_WHITESPACE:
1699
        set_nottype_bits(re, cbit_space, table_limit);
1700
        break;
1701

1702
        case OP_WHITESPACE:
1703
        set_type_bits(re, cbit_space, table_limit);
1704
        break;
1705

1706
        case OP_NOT_WORDCHAR:
1707
        set_nottype_bits(re, cbit_word, table_limit);
1708
        break;
1709

1710
        case OP_WORDCHAR:
1711
        set_type_bits(re, cbit_word, table_limit);
1712
        break;
1713
        }
1714

1715
      tcode += 2;
1716
      break;
1717

1718
      /* Set-based ECLASS: treat it the same as a "complex" XCLASS; give up. */
1719

1720
#ifdef SUPPORT_WIDE_CHARS
1721
      case OP_ECLASS:
1722
      return SSB_FAIL;
1723
#endif
1724

1725
      /* Extended class: if there are any property checks, or if this is a
1726
      negative XCLASS without a map, give up. If there are no property checks,
1727
      there must be wide characters on the XCLASS list, because otherwise an
1728
      XCLASS would not have been created. This means that code points >= 255
1729
      are potential starters. In the UTF-8 case we can scan them and set bits
1730
      for the relevant leading bytes. */
1731

1732
#ifdef SUPPORT_WIDE_CHARS
1733
      case OP_XCLASS:
1734
      xclassflags = tcode[1 + LINK_SIZE];
1735
      if ((xclassflags & XCL_HASPROP) != 0 ||
1736
          (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1737
        return SSB_FAIL;
1738

1739
      /* We have a positive XCLASS or a negative one without a map. Set up the
1740
      map pointer if there is one, and fall through. */
1741

1742
      classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
1743
        (const uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1744

1745
      /* In UTF-8 mode, scan the character list and set bits for leading bytes,
1746
      then jump to handle the map. */
1747

1748
#if PCRE2_CODE_UNIT_WIDTH == 8
1749
      if (utf && (xclassflags & XCL_NOT) == 0)
1750
        {
1751
        PCRE2_UCHAR b, e;
1752
        PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
1753
        tcode += GET(tcode, 1);
1754

1755
        if (*p >= XCL_LIST)
1756
          {
1757
          study_char_list(p, re->start_bitmap,
1758
            ((const uint8_t *)re + re->code_start));
1759
          goto HANDLE_CLASSMAP;
1760
          }
1761

1762
        for (;;) switch (*p++)
1763
          {
1764
          case XCL_SINGLE:
1765
          b = *p++;
1766
          while ((*p & 0xc0) == 0x80) p++;
1767
          re->start_bitmap[b/8] |= (1u << (b&7));
1768
          break;
1769

1770
          case XCL_RANGE:
1771
          b = *p++;
1772
          while ((*p & 0xc0) == 0x80) p++;
1773
          e = *p++;
1774
          while ((*p & 0xc0) == 0x80) p++;
1775
          for (; b <= e; b++)
1776
            re->start_bitmap[b/8] |= (1u << (b&7));
1777
          break;
1778

1779
          case XCL_END:
1780
          goto HANDLE_CLASSMAP;
1781

1782
          default:
1783
          PCRE2_DEBUG_UNREACHABLE();
1784
          return SSB_UNKNOWN;   /* Internal error, should not occur */
1785
          }
1786
        }
1787
#endif  /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
1788
#endif  /* SUPPORT_WIDE_CHARS */
1789

1790
      /* It seems that the fall through comment must be outside the #ifdef if
1791
      it is to avoid the gcc compiler warning. */
1792

1793
      /* Fall through */
1794

1795
      /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1796
      in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1797
      because it starts a character with a value > 255. In 8-bit non-UTF mode,
1798
      there is no difference between CLASS and NCLASS. In all other wide
1799
      character modes, set the 0xFF bit to indicate code units >= 255. */
1800

1801
      case OP_NCLASS:
1802
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1803
      if (utf)
1804
        {
1805
        re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
1806
        memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
1807
        }
1808
#elif PCRE2_CODE_UNIT_WIDTH != 8
1809
      SET_BIT(0xFF);                             /* For characters >= 255 */
1810
#endif
1811
      /* Fall through */
1812

1813
      /* Enter here for a positive non-XCLASS. If we have fallen through from
1814
      an XCLASS, classmap will already be set; just advance the code pointer.
1815
      Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1816

1817
      case OP_CLASS:
1818
      if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1819
        {
1820
        classmap = (const uint8_t *)(++tcode);
1821
        tcode += 32 / sizeof(PCRE2_UCHAR);
1822
        }
1823

1824
      /* When wide characters are supported, classmap may be NULL. In UTF-8
1825
      (sic) mode, the bits in a class bit map correspond to character values,
1826
      not to byte values. However, the bit map we are constructing is for byte
1827
      values. So we have to do a conversion for characters whose code point is
1828
      greater than 127. In fact, there are only two possible starting bytes for
1829
      characters in the range 128 - 255. */
1830

1831
#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
1832
      HANDLE_CLASSMAP:
1833
#endif
1834
      if (classmap != NULL)
1835
        {
1836
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1837
        if (utf)
1838
          {
1839
          for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1840
          for (c = 128; c < 256; c++)
1841
            {
1842
            if ((classmap[c/8] & (1u << (c&7))) != 0)
1843
              {
1844
              int d = (c >> 6) | 0xc0;                 /* Set bit for this starter */
1845
              re->start_bitmap[d/8] |= (1u << (d&7));  /* and then skip on to the */
1846
              c = (c & 0xc0) + 0x40 - 1;               /* next relevant character. */
1847
              }
1848
            }
1849
          }
1850
        else
1851
#endif
1852
        /* In all modes except UTF-8, the two bit maps are compatible. */
1853

1854
          {
1855
          for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1856
          }
1857
        }
1858

1859
      /* Act on what follows the class. For a zero minimum repeat, continue;
1860
      otherwise stop processing. */
1861

1862
      switch (*tcode)
1863
        {
1864
        case OP_CRSTAR:
1865
        case OP_CRMINSTAR:
1866
        case OP_CRQUERY:
1867
        case OP_CRMINQUERY:
1868
        case OP_CRPOSSTAR:
1869
        case OP_CRPOSQUERY:
1870
        tcode++;
1871
        break;
1872

1873
        case OP_CRRANGE:
1874
        case OP_CRMINRANGE:
1875
        case OP_CRPOSRANGE:
1876
        if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1877
          else try_next = FALSE;
1878
        break;
1879

1880
        default:
1881
        try_next = FALSE;
1882
        break;
1883
        }
1884
      break; /* End of class handling case */
1885
      }      /* End of switch for opcodes */
1886
    }        /* End of try_next loop */
1887

1888
  code += GET(code, 1);   /* Advance to next branch */
1889
  }
1890
while (*code == OP_ALT);
1891

1892
return yield;
1893
}
1894

1895

1896

1897
/*************************************************
1898
*          Study a compiled expression           *
1899
*************************************************/
1900

1901
/* This function is handed a compiled expression that it must study to produce
1902
information that will speed up the matching.
1903

1904
Argument:
1905
  re       points to the compiled expression
1906

1907
Returns:   0 normally; non-zero should never normally occur
1908
           1 unknown opcode in set_start_bits
1909
           2 missing capturing bracket
1910
           3 unknown opcode in find_minlength
1911
*/
1912

1913
int
1914
PRIV(study)(pcre2_real_code *re)
1915
{
1916
int count = 0;
1917
PCRE2_UCHAR *code;
1918
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1919
BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
1920

1921
/* Find start of compiled code */
1922

1923
code = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
1924

1925
/* For a pattern that has a first code unit, or a multiline pattern that
1926
matches only at "line start", there is no point in seeking a list of starting
1927
code units. */
1928

1929
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1930
  {
1931
  int depth = 0;
1932
  int rc = set_start_bits(re, code, utf, ucp, &depth);
1933
  if (rc == SSB_UNKNOWN)
1934
    {
1935
    PCRE2_DEBUG_UNREACHABLE();
1936
    return 1;
1937
    }
1938

1939
  /* If a list of starting code units was set up, scan the list to see if only
1940
  one or two were listed. Having only one listed is rare because usually a
1941
  single starting code unit will have been recognized and PCRE2_FIRSTSET set.
1942
  If two are listed, see if they are caseless versions of the same character;
1943
  if so we can replace the list with a caseless first code unit. This gives
1944
  better performance and is plausibly worth doing for patterns such as [Ww]ord
1945
  or (word|WORD). */
1946

1947
  if (rc == SSB_DONE)
1948
    {
1949
    int i;
1950
    int a = -1;
1951
    int b = -1;
1952
    uint8_t *p = re->start_bitmap;
1953
    uint32_t flags = PCRE2_FIRSTMAPSET;
1954

1955
    for (i = 0; i < 256; p++, i += 8)
1956
      {
1957
      uint8_t x = *p;
1958
      if (x != 0)
1959
        {
1960
        int c;
1961
        uint8_t y = x & (~x + 1);   /* Least significant bit */
1962
        if (y != x) goto DONE;      /* More than one bit set */
1963

1964
        /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
1965
        all wide characters", so we cannot use it here. */
1966

1967
#if PCRE2_CODE_UNIT_WIDTH != 8
1968
        if (i == 248 && x == 0x80) goto DONE;
1969
#endif
1970

1971
        /* Compute the character value */
1972

1973
        c = i;
1974
        switch (x)
1975
          {
1976
          case 1:   break;
1977
          case 2:   c += 1; break;  case 4:  c += 2; break;
1978
          case 8:   c += 3; break;  case 16: c += 4; break;
1979
          case 32:  c += 5; break;  case 64: c += 6; break;
1980
          case 128: c += 7; break;
1981
          }
1982

1983
        /* c contains the code unit value, in the range 0-255. In 8-bit UTF
1984
        mode, only values < 128 can be used. In all the other cases, c is a
1985
        character value. */
1986

1987
#if PCRE2_CODE_UNIT_WIDTH == 8
1988
        if (utf && c > 127) goto DONE;
1989
#endif
1990
        if (a < 0) a = c;   /* First one found, save in a */
1991
        else if (b < 0)     /* Second one found */
1992
          {
1993
          int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
1994

1995
#ifdef SUPPORT_UNICODE
1996
          if (utf || ucp)
1997
            {
1998
            if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
1999
            if (c > 127) d = UCD_OTHERCASE(c);
2000
            }
2001
#endif  /* SUPPORT_UNICODE */
2002

2003
          if (d != a) goto DONE;   /* Not the other case of a */
2004
          b = c;                   /* Save second in b */
2005
          }
2006
        else goto DONE;   /* More than two characters found */
2007
        }
2008
      }
2009

2010
    /* Replace the start code unit bits with a first code unit. If it is the
2011
    same as a required later code unit, then clear the required later code
2012
    unit. This is because a search for a required code unit starts after an
2013
    explicit first code unit, but at a code unit found from the bitmap.
2014
    Patterns such as /a*a/ don't work if both the start unit and required
2015
    unit are the same. */
2016

2017
    if (a >= 0) {
2018
      if ((re->flags & PCRE2_LASTSET) && (re->last_codeunit == (uint32_t)a || (b >= 0 && re->last_codeunit == (uint32_t)b))) {
2019
        re->flags &= ~(PCRE2_LASTSET | PCRE2_LASTCASELESS);
2020
        re->last_codeunit = 0;
2021
      }
2022
      re->first_codeunit = a;
2023
      flags = PCRE2_FIRSTSET;
2024
      if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
2025
    }
2026

2027
    DONE:
2028
    re->flags |= flags;
2029
    }
2030
  }
2031

2032
/* Find the minimum length of subject string. If the pattern can match an empty
2033
string, the minimum length is already known. If the pattern contains (*ACCEPT)
2034
all bets are off, and we don't even try to find a minimum length. If there are
2035
more back references than the size of the vector we are going to cache them in,
2036
do nothing. A pattern that complicated will probably take a long time to
2037
analyze and may in any case turn out to be too complicated. Note that back
2038
reference minima are held as 16-bit numbers. */
2039

2040
if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&
2041
     re->top_backref <= MAX_CACHE_BACKREF)
2042
  {
2043
  int min;
2044
  int backref_cache[MAX_CACHE_BACKREF+1];
2045
  backref_cache[0] = 0;    /* Highest one that is set */
2046
  min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
2047
  switch(min)
2048
    {
2049
    case -1:  /* \C in UTF mode or over-complex regex */
2050
    break;    /* Leave minlength unchanged (will be zero) */
2051

2052
    case -2:
2053
    PCRE2_DEBUG_UNREACHABLE();
2054
    return 2; /* missing capturing bracket */
2055

2056
    case -3:
2057
    PCRE2_DEBUG_UNREACHABLE();
2058
    return 3; /* unrecognized opcode */
2059

2060
    default:
2061
    re->minlength = (min > UINT16_MAX)? UINT16_MAX : min;
2062
    break;
2063
    }
2064
  }
2065

2066
return 0;
2067
}
2068

2069
/* End of pcre2_study.c */
2070

2071
Product

Resources

Company