CoCalc -- pcre2_auto

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_auto_possess.c
⁹⁸⁹⁸ views
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4

5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7

8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11

12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15

16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18

19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22

23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26

27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40

41
/* This module contains functions that scan a compiled pattern and change
42
repeats into possessive repeats where possible. */
43

44

45
#ifdef HAVE_CONFIG_H
46
#include "config.h"
47
#endif
48

49

50
#include "pcre2_internal.h"
51

52
/* This macro gives the maximum size of the list[] arrays that are used in
several places to keep track of UCD info. It should be kept in sync with the
value used by GenerateUcd.py. */
#define MAX_LIST 8
56

57
/*************************************************
58
*        Tables for auto-possessification        *
59
*************************************************/
60

61
/* This table is used to check whether auto-possessification is possible
62
between adjacent character-type opcodes. The left-hand (repeated) opcode is
63
used to select the row, and the right-hand opcode is use to select the column.
64
A value of 1 means that auto-possessification is OK. For example, the second
65
value in the first row means that \D+\d can be turned into \D++\d.
66

67
The Unicode property types (\P and \p) have to be present to fill out the table
68
because of what their opcode values are, but the table values should always be
69
zero because property types are handled separately in the code. The last four
70
columns apply to items that cannot be repeated, so there is no need to have
71
rows for them. Note that OP_DIGIT etc. are generated only when PCRE2_UCP is
72
*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
73

74
#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
75
#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
76

77
static const uint8_t autoposstab[APTROWS][APTCOLS] = {
78
/* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
79
  { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
80
  { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
81
  { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
82
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
83
  { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
84
  { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
85
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
86
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
87
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
88
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
89
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
90
  { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
91
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
92
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
93
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
94
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
95
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
96
};
97

98
#ifdef SUPPORT_UNICODE
99
/* This table is used to check whether auto-possessification is possible
100
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
101
left-hand (repeated) opcode is used to select the row, and the right-hand
102
opcode is used to select the column. The values are as follows:
103

104
  0   Always return FALSE (never auto-possessify)
105
  1   Character groups are distinct (possessify if both are OP_PROP)
106
  2   Check character categories in the same group (general or particular)
107
  3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
108

109
  4   Check left general category vs right particular category
110
  5   Check right general category vs left particular category
111

112
  6   Left alphanum vs right general category
113
  7   Left space vs right general category
114
  8   Left word vs right general category
115

116
  9   Right alphanum vs left general category
117
 10   Right space vs left general category
118
 11   Right word vs left general category
119

120
 12   Left alphanum vs right particular category
121
 13   Left space vs right particular category
122
 14   Left word vs right particular category
123

124
 15   Right alphanum vs left particular category
125
 16   Right space vs left particular category
126
 17   Right word vs left particular category
127
*/
128

129
static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {
130
/* LAMP GC  PC  SC  SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */
131
  { 3,  0,  0,  0,   0,    3,    1,      1,   0,    0,   0,    0,    0 },  /* PT_LAMP */
132
  { 0,  2,  4,  0,   0,    9,   10,     10,  11,    0,   0,    0,    0 },  /* PT_GC */
133
  { 0,  5,  2,  0,   0,   15,   16,     16,  17,    0,   0,    0,    0 },  /* PT_PC */
134
  { 0,  0,  0,  2,   2,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_SC */
135
  { 0,  0,  0,  2,   2,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_SCX */
136
  { 3,  6, 12,  0,   0,    3,    1,      1,   0,    0,   0,    0,    0 },  /* PT_ALNUM */
137
  { 1,  7, 13,  0,   0,    1,    3,      3,   1,    0,   0,    0,    0 },  /* PT_SPACE */
138
  { 1,  7, 13,  0,   0,    1,    3,      3,   1,    0,   0,    0,    0 },  /* PT_PXSPACE */
139
  { 0,  8, 14,  0,   0,    0,    1,      1,   3,    0,   0,    0,    0 },  /* PT_WORD */
140
  { 0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_CLIST */
141
  { 0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   3,    0,    0 },  /* PT_UCNC */
142
  { 0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_BIDICL */
143
  { 0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 }   /* PT_BOOL */
144
  /* PT_ANY does not need a record. */
145
};
146

147
/* This table is used to check whether auto-possessification is possible
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
specifies a general category and the other specifies a particular category. The
row is selected by the general category and the column by the particular
category. The value is 1 if the particular category is not part of the general
category. */

static const uint8_t catposstab[7][30] = {
/* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
  { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
  { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
};
164

165
/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
166
a general or particular category. The properties in each row are those
167
that apply to the character set in question. Duplication means that a little
168
unnecessary work is done when checking, but this keeps things much simpler
169
because they can all use the same code. For more details see the comment where
170
this table is used.
171

172
Note: SPACE and PXSPACE used to be different because Perl excluded VT from
173
"space", but from Perl 5.18 it's included, so both categories are treated the
174
same here. */
175

176
static const uint8_t posspropstab[3][4] = {
177
  { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
178
  { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
179
  { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
180
};
181
#endif  /* SUPPORT_UNICODE */
182

183

184

185
#ifdef SUPPORT_UNICODE
186
/*************************************************
187
*        Check a character and a property        *
188
*************************************************/
189

190
/* This function is called by compare_opcodes() when a property item is
191
adjacent to a fixed character.
192

193
Arguments:
194
  c            the character
195
  ptype        the property type
196
  pdata        the data for the type
197
  negated      TRUE if it's a negated property (\P or \p{^)
198

199
Returns:       TRUE if auto-possessifying is OK
200
*/
201

202
static BOOL
203
check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
204
  BOOL negated)
205
{
206
BOOL ok, rc;
207
const uint32_t *p;
208
const ucd_record *prop = GET_UCD(c);
209

210
switch(ptype)
211
  {
212
  case PT_LAMP:
213
  return (prop->chartype == ucp_Lu ||
214
          prop->chartype == ucp_Ll ||
215
          prop->chartype == ucp_Lt) == negated;
216

217
  case PT_GC:
218
  return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
219

220
  case PT_PC:
221
  return (pdata == prop->chartype) == negated;
222

223
  case PT_SC:
224
  return (pdata == prop->script) == negated;
225

226
  case PT_SCX:
227
  ok = (pdata == prop->script
228
        || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
229
  return ok == negated;
230

231
  /* These are specials */
232

233
  case PT_ALNUM:
234
  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
235
          PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
236

237
  /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
238
  means that Perl space and POSIX space are now identical. PCRE was changed
239
  at release 8.34. */
240

241
  case PT_SPACE:    /* Perl space */
242
  case PT_PXSPACE:  /* POSIX space */
243
  switch(c)
244
    {
245
    HSPACE_CASES:
246
    VSPACE_CASES:
247
    rc = negated;
248
    break;
249

250
    default:
251
    rc = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
252
    }
253
  return rc;
254

255
  case PT_WORD:
256
  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
257
          PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
258
          c == CHAR_UNDERSCORE) == negated;
259

260
  case PT_CLIST:
261
  p = PRIV(ucd_caseless_sets) + prop->caseset;
262
  for (;;)
263
    {
264
    if (c < *p) return !negated;
265
    if (c == *p++) return negated;
266
    }
267
  PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
268
  break;
269

270
  /* Haven't yet thought these through. */
271

272
  case PT_BIDICL:
273
  return FALSE;
274

275
  case PT_BOOL:
276
  return FALSE;
277
  }
278

279
return FALSE;
280
}
281
#endif  /* SUPPORT_UNICODE */
282

283

284

285
/*************************************************
286
*        Base opcode of repeated opcodes         *
287
*************************************************/
288

289
/* Returns the base opcode for repeated single character type opcodes. If the
290
opcode is not a repeated character type, it returns with the original value.
291

292
Arguments:  c opcode
293
Returns:    base opcode for the type
294
*/
295

296
static PCRE2_UCHAR
297
get_repeat_base(PCRE2_UCHAR c)
298
{
299
return (c > OP_TYPEPOSUPTO)? c :
300
       (c >= OP_TYPESTAR)?   OP_TYPESTAR :
301
       (c >= OP_NOTSTARI)?   OP_NOTSTARI :
302
       (c >= OP_NOTSTAR)?    OP_NOTSTAR :
303
       (c >= OP_STARI)?      OP_STARI :
304
                             OP_STAR;
305
}
306

307

308
/*************************************************
309
*        Fill the character property list        *
310
*************************************************/
311

312
/* Checks whether the code points to an opcode that can take part in auto-
313
possessification, and if so, fills a list with its properties.
314

315
Arguments:
316
  code        points to start of expression
317
  utf         TRUE if in UTF mode
318
  ucp         TRUE if in UCP mode
319
  fcc         points to the case-flipping table
320
  list        points to output list
321
              list[0] will be filled with the opcode
322
              list[1] will be non-zero if this opcode
323
                can match an empty character string
324
              list[2..7] depends on the opcode
325

326
Returns:      points to the start of the next opcode if *code is accepted
327
              NULL if *code is not accepted
328
*/
329

330
static PCRE2_SPTR
331
get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
332
  uint32_t *list)
333
{
334
PCRE2_UCHAR c = *code;
335
PCRE2_UCHAR base;
336
PCRE2_SPTR end;
337
PCRE2_SPTR class_end;
338
uint32_t chr;
339

340
#ifdef SUPPORT_UNICODE
341
uint32_t *clist_dest;
342
const uint32_t *clist_src;
343
#else
344
(void)utf;    /* Suppress "unused parameter" compiler warnings */
345
(void)ucp;
346
#endif
347

348
list[0] = c;
349
list[1] = FALSE;
350
code++;
351

352
if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
353
  {
354
  base = get_repeat_base(c);
355
  c -= (base - OP_STAR);
356

357
  if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
358
    code += IMM2_SIZE;
359

360
  list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT &&
361
             c != OP_POSPLUS);
362

363
  switch(base)
364
    {
365
    case OP_STAR:
366
    list[0] = OP_CHAR;
367
    break;
368

369
    case OP_STARI:
370
    list[0] = OP_CHARI;
371
    break;
372

373
    case OP_NOTSTAR:
374
    list[0] = OP_NOT;
375
    break;
376

377
    case OP_NOTSTARI:
378
    list[0] = OP_NOTI;
379
    break;
380

381
    case OP_TYPESTAR:
382
    list[0] = *code;
383
    code++;
384
    break;
385
    }
386
  c = list[0];
387
  }
388

389
switch(c)
390
  {
391
  case OP_NOT_DIGIT:
392
  case OP_DIGIT:
393
  case OP_NOT_WHITESPACE:
394
  case OP_WHITESPACE:
395
  case OP_NOT_WORDCHAR:
396
  case OP_WORDCHAR:
397
  case OP_ANY:
398
  case OP_ALLANY:
399
  case OP_ANYNL:
400
  case OP_NOT_HSPACE:
401
  case OP_HSPACE:
402
  case OP_NOT_VSPACE:
403
  case OP_VSPACE:
404
  case OP_EXTUNI:
405
  case OP_EODN:
406
  case OP_EOD:
407
  case OP_DOLL:
408
  case OP_DOLLM:
409
  return code;
410

411
  case OP_CHAR:
412
  case OP_NOT:
413
  GETCHARINCTEST(chr, code);
414
  list[2] = chr;
415
  list[3] = NOTACHAR;
416
  return code;
417

418
  case OP_CHARI:
419
  case OP_NOTI:
420
  list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
421
  GETCHARINCTEST(chr, code);
422
  list[2] = chr;
423

424
#ifdef SUPPORT_UNICODE
425
  if (chr < 128 || (chr < 256 && !utf && !ucp))
426
    list[3] = fcc[chr];
427
  else
428
    list[3] = UCD_OTHERCASE(chr);
429
#elif defined SUPPORT_WIDE_CHARS
430
  list[3] = (chr < 256) ? fcc[chr] : chr;
431
#else
432
  list[3] = fcc[chr];
433
#endif
434

435
  /* The othercase might be the same value. */
436

437
  if (chr == list[3])
438
    list[3] = NOTACHAR;
439
  else
440
    list[4] = NOTACHAR;
441
  return code;
442

443
#ifdef SUPPORT_UNICODE
444
  case OP_PROP:
445
  case OP_NOTPROP:
446
  if (code[0] != PT_CLIST)
447
    {
448
    list[2] = code[0];
449
    list[3] = code[1];
450
    return code + 2;
451
    }
452

453
  /* Convert only if we have enough space. */
454

455
  clist_src = PRIV(ucd_caseless_sets) + code[1];
456
  clist_dest = list + 2;
457
  code += 2;
458

459
  do {
460
     if (clist_dest >= list + MAX_LIST)
461
       {
462
       /* Early return if there is not enough space. GenerateUcd.py
463
       generated a list with more than 5 characters and something
464
       must be done about that going forward. */
465
       PCRE2_DEBUG_UNREACHABLE();   /* Remove if it ever triggers */
466
       list[2] = code[0];
467
       list[3] = code[1];
468
       return code;
469
       }
470
     *clist_dest++ = *clist_src;
471
     }
472
  while(*clist_src++ != NOTACHAR);
473

474
  /* All characters are stored. The terminating NOTACHAR is copied from the
475
  clist itself. */
476

477
  list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
478
  return code;
479
#endif
480

481
  case OP_NCLASS:
482
  case OP_CLASS:
483
#ifdef SUPPORT_WIDE_CHARS
484
  case OP_XCLASS:
485
  case OP_ECLASS:
486
  if (c == OP_XCLASS || c == OP_ECLASS)
487
    end = code + GET(code, 0) - 1;
488
  else
489
#endif
490
    end = code + 32 / sizeof(PCRE2_UCHAR);
491
  class_end = end;
492

493
  switch(*end)
494
    {
495
    case OP_CRSTAR:
496
    case OP_CRMINSTAR:
497
    case OP_CRQUERY:
498
    case OP_CRMINQUERY:
499
    case OP_CRPOSSTAR:
500
    case OP_CRPOSQUERY:
501
    list[1] = TRUE;
502
    end++;
503
    break;
504

505
    case OP_CRPLUS:
506
    case OP_CRMINPLUS:
507
    case OP_CRPOSPLUS:
508
    end++;
509
    break;
510

511
    case OP_CRRANGE:
512
    case OP_CRMINRANGE:
513
    case OP_CRPOSRANGE:
514
    list[1] = (GET2(end, 1) == 0);
515
    end += 1 + 2 * IMM2_SIZE;
516
    break;
517
    }
518
  list[2] = (uint32_t)(end - code);
519
  list[3] = (uint32_t)(end - class_end);
520
  return end;
521
  }
522

523
return NULL;    /* Opcode not accepted */
524
}
525

526

527

528
/*************************************************
529
*    Scan further character sets for match       *
530
*************************************************/
531

532
/* Checks whether the base and the current opcode have a common character, in
533
which case the base cannot be possessified.
534

535
Arguments:
536
  code        points to the byte code
537
  utf         TRUE in UTF mode
538
  ucp         TRUE in UCP mode
539
  cb          compile data block
540
  base_list   the data list of the base opcode
541
  base_end    the end of the base opcode
542
  rec_limit   points to recursion depth counter
543

544
Returns:      TRUE if the auto-possessification is possible
545
*/
546

547
static BOOL
548
compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
549
  const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
550
{
551
PCRE2_UCHAR c;
552
uint32_t list[MAX_LIST];
553
const uint32_t *chr_ptr;
554
const uint32_t *ochr_ptr;
555
const uint32_t *list_ptr;
556
PCRE2_SPTR next_code;
557
#ifdef SUPPORT_WIDE_CHARS
558
PCRE2_SPTR xclass_flags;
559
#endif
560
const uint8_t *class_bitset;
561
const uint8_t *set1, *set2, *set_end;
562
uint32_t chr;
563
BOOL accepted, invert_bits;
564
BOOL entered_a_group = FALSE;
565

566
if (--(*rec_limit) <= 0) return FALSE;  /* Recursion has gone too deep */
567

568
/* Note: the base_list[1] contains whether the current opcode has a greedy
569
(represented by a non-zero value) quantifier. This is different from
570
other character type lists, which store here that the character iterator
571
matches to an empty string (also represented by a non-zero value). */
572

573
for(;;)
574
  {
575
  PCRE2_SPTR bracode;
576

577
  /* All operations move the code pointer forward.
578
  Therefore infinite recursions are not possible. */
579

580
  c = *code;
581

582
  /* Skip over callouts */
583

584
  if (c == OP_CALLOUT)
585
    {
586
    code += PRIV(OP_lengths)[c];
587
    continue;
588
    }
589

590
  if (c == OP_CALLOUT_STR)
591
    {
592
    code += GET(code, 1 + 2*LINK_SIZE);
593
    continue;
594
    }
595

596
  /* At the end of a branch, skip to the end of the group and process it. */
597

598
  if (c == OP_ALT)
599
    {
600
    do code += GET(code, 1); while (*code == OP_ALT);
601
    c = *code;
602
    }
603

604
  /* Inspect the next opcode. */
605

606
  switch(c)
607
    {
608
    /* We can always possessify a greedy iterator at the end of the pattern,
609
    which is reached after skipping over the final OP_KET. A non-greedy
610
    iterator must never be possessified. */
611

612
    case OP_END:
613
    return base_list[1] != 0;
614

615
    /* When an iterator is at the end of certain kinds of group we can inspect
616
    what follows the group by skipping over the closing ket. Note that this
617
    does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given
618
    iteration is variable (could be another iteration or could be the next
619
    item). As these two opcodes are not listed in the next switch, they will
620
    end up as the next code to inspect, and return FALSE by virtue of being
621
    unsupported. */
622

623
    case OP_KET:
624
    case OP_KETRPOS:
625
    /* The non-greedy case cannot be converted to a possessive form. */
626

627
    if (base_list[1] == 0) return FALSE;
628

629
    /* If the bracket is capturing it might be referenced by an OP_RECURSE
630
    so its last iterator can never be possessified if the pattern contains
631
    recursions. (This could be improved by keeping a list of group numbers that
632
    are called by recursion.) */
633

634
    bracode = code - GET(code, 1);
635
    switch(*bracode)
636
      {
637
      case OP_CBRA:
638
      case OP_SCBRA:
639
      case OP_CBRAPOS:
640
      case OP_SCBRAPOS:
641
      if (cb->had_recurse) return FALSE;
642
      break;
643

644
      /* A script run might have to backtrack if the iterated item can match
645
      characters from more than one script. So give up unless repeating an
646
      explicit character. */
647

648
      case OP_SCRIPT_RUN:
649
      if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI)
650
        return FALSE;
651
      break;
652

653
      /* Atomic sub-patterns and forward assertions can always auto-possessify
654
      their last iterator. However, if the group was entered as a result of
655
      checking a previous iterator, this is not possible. */
656

657
      case OP_ASSERT:
658
      case OP_ASSERT_NOT:
659
      case OP_ONCE:
660
      return !entered_a_group;
661

662
      /* Fixed-length lookbehinds can be treated the same way, but variable
663
      length lookbehinds must not auto-possessify their last iterator. Note
664
      that in order to identify a variable length lookbehind we must check
665
      through all branches, because some may be of fixed length. */
666

667
      case OP_ASSERTBACK:
668
      case OP_ASSERTBACK_NOT:
669
      do
670
        {
671
        if (bracode[1+LINK_SIZE] == OP_VREVERSE) return FALSE;  /* Variable */
672
        bracode += GET(bracode, 1);
673
        }
674
      while (*bracode == OP_ALT);
675
      return !entered_a_group;  /* Not variable length */
676

677
      /* Non-atomic assertions - don't possessify last iterator. This needs
678
      more thought. */
679

680
      case OP_ASSERT_NA:
681
      case OP_ASSERTBACK_NA:
682
      return FALSE;
683
      }
684

685
    /* Skip over the bracket and inspect what comes next. */
686

687
    code += PRIV(OP_lengths)[c];
688
    continue;
689

690
    /* Handle cases where the next item is a group. */
691

692
    case OP_ONCE:
693
    case OP_BRA:
694
    case OP_CBRA:
695
    next_code = code + GET(code, 1);
696
    code += PRIV(OP_lengths)[c];
697

698
    /* Check each branch. We have to recurse a level for all but the last
699
    branch. */
700

701
    while (*next_code == OP_ALT)
702
      {
703
      if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
704
        return FALSE;
705
      code = next_code + 1 + LINK_SIZE;
706
      next_code += GET(next_code, 1);
707
      }
708

709
    entered_a_group = TRUE;
710
    continue;
711

712
    case OP_BRAZERO:
713
    case OP_BRAMINZERO:
714

715
    next_code = code + 1;
716
    if (*next_code != OP_BRA && *next_code != OP_CBRA &&
717
        *next_code != OP_ONCE) return FALSE;
718

719
    do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
720

721
    /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
722

723
    next_code += 1 + LINK_SIZE;
724
    if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,
725
         rec_limit))
726
      return FALSE;
727

728
    code += PRIV(OP_lengths)[c];
729
    continue;
730

731
    /* The next opcode does not need special handling; fall through and use it
732
    to see if the base can be possessified. */
733

734
    default:
735
    break;
736
    }
737

738
  /* We now have the next appropriate opcode to compare with the base. Check
739
  for a supported opcode, and load its properties. */
740

741
  code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
742
  if (code == NULL) return FALSE;    /* Unsupported */
743

744
  /* If either opcode is a small character list, set pointers for comparing
745
  characters from that list with another list, or with a property. */
746

747
  if (base_list[0] == OP_CHAR)
748
    {
749
    chr_ptr = base_list + 2;
750
    list_ptr = list;
751
    }
752
  else if (list[0] == OP_CHAR)
753
    {
754
    chr_ptr = list + 2;
755
    list_ptr = base_list;
756
    }
757

758
  /* Character bitsets can also be compared to certain opcodes. */
759

760
  else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
761
#if PCRE2_CODE_UNIT_WIDTH == 8
762
      /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
763
      || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
764
#endif
765
      )
766
    {
767
#if PCRE2_CODE_UNIT_WIDTH == 8
768
    if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
769
#else
770
    if (base_list[0] == OP_CLASS)
771
#endif
772
      {
773
      set1 = (const uint8_t *)(base_end - base_list[2]);
774
      list_ptr = list;
775
      }
776
    else
777
      {
778
      set1 = (const uint8_t *)(code - list[2]);
779
      list_ptr = base_list;
780
      }
781

782
    invert_bits = FALSE;
783
    switch(list_ptr[0])
784
      {
785
      case OP_CLASS:
786
      case OP_NCLASS:
787
      set2 = (const uint8_t *)
788
        ((list_ptr == list ? code : base_end) - list_ptr[2]);
789
      break;
790

791
#ifdef SUPPORT_WIDE_CHARS
792
      case OP_XCLASS:
793
      xclass_flags = (list_ptr == list ? code : base_end) -
794
        list_ptr[2] + LINK_SIZE;
795
      if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
796
      if ((*xclass_flags & XCL_MAP) == 0)
797
        {
798
        /* No bits are set for characters < 256. */
799
        if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0;
800
        /* Might be an empty repeat. */
801
        continue;
802
        }
803
      set2 = (const uint8_t *)(xclass_flags + 1);
804
      break;
805
#endif
806

807
      case OP_NOT_DIGIT:
808
      invert_bits = TRUE;
809
      /* Fall through */
810
      case OP_DIGIT:
811
      set2 = (const uint8_t *)(cb->cbits + cbit_digit);
812
      break;
813

814
      case OP_NOT_WHITESPACE:
815
      invert_bits = TRUE;
816
      /* Fall through */
817
      case OP_WHITESPACE:
818
      set2 = (const uint8_t *)(cb->cbits + cbit_space);
819
      break;
820

821
      case OP_NOT_WORDCHAR:
822
      invert_bits = TRUE;
823
      /* Fall through */
824
      case OP_WORDCHAR:
825
      set2 = (const uint8_t *)(cb->cbits + cbit_word);
826
      break;
827

828
      default:
829
      return FALSE;
830
      }
831

832
    /* Because the bit sets are unaligned bytes, we need to perform byte
833
    comparison here. */
834

835
    set_end = set1 + 32;
836
    if (invert_bits)
837
      {
838
      do
839
        {
840
        if ((*set1++ & ~(*set2++)) != 0) return FALSE;
841
        }
842
      while (set1 < set_end);
843
      }
844
    else
845
      {
846
      do
847
        {
848
        if ((*set1++ & *set2++) != 0) return FALSE;
849
        }
850
      while (set1 < set_end);
851
      }
852

853
    if (list[1] == 0) return TRUE;
854
    /* Might be an empty repeat. */
855
    continue;
856
    }
857

858
  /* Some property combinations are also acceptable. Unicode property opcodes are
859
  processed specially; the rest can be handled with a lookup table. */
860

861
  else
862
    {
863
    uint32_t leftop, rightop;
864

865
    leftop = base_list[0];
866
    rightop = list[0];
867

868
#ifdef SUPPORT_UNICODE
869
    accepted = FALSE; /* Always set in non-unicode case. */
870
    if (leftop == OP_PROP || leftop == OP_NOTPROP)
871
      {
872
      if (rightop == OP_EOD)
873
        accepted = TRUE;
874
      else if (rightop == OP_PROP || rightop == OP_NOTPROP)
875
        {
876
        int n;
877
        const uint8_t *p;
878
        BOOL same = leftop == rightop;
879
        BOOL lisprop = leftop == OP_PROP;
880
        BOOL risprop = rightop == OP_PROP;
881
        BOOL bothprop = lisprop && risprop;
882

883
        /* There's a table that specifies how each combination is to be
884
        processed:
885
          0   Always return FALSE (never auto-possessify)
886
          1   Character groups are distinct (possessify if both are OP_PROP)
887
          2   Check character categories in the same group (general or particular)
888
          3   Return TRUE if the two opcodes are not the same
889
          ... see comments below
890
        */
891

892
        n = propposstab[base_list[2]][list[2]];
893
        switch(n)
894
          {
895
          case 0: break;
896
          case 1: accepted = bothprop; break;
897
          case 2: accepted = (base_list[3] == list[3]) != same; break;
898
          case 3: accepted = !same; break;
899

900
          case 4:  /* Left general category, right particular category */
901
          accepted = risprop && catposstab[base_list[3]][list[3]] == same;
902
          break;
903

904
          case 5:  /* Right general category, left particular category */
905
          accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
906
          break;
907

908
          /* This code is logically tricky. Think hard before fiddling with it.
909
          The posspropstab table has four entries per row. Each row relates to
910
          one of PCRE's special properties such as ALNUM or SPACE or WORD.
911
          Only WORD actually needs all four entries, but using repeats for the
912
          others means they can all use the same code below.
913

914
          The first two entries in each row are Unicode general categories, and
915
          apply always, because all the characters they include are part of the
916
          PCRE character set. The third and fourth entries are a general and a
917
          particular category, respectively, that include one or more relevant
918
          characters. One or the other is used, depending on whether the check
919
          is for a general or a particular category. However, in both cases the
920
          category contains more characters than the specials that are defined
921
          for the property being tested against. Therefore, it cannot be used
922
          in a NOTPROP case.
923

924
          Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
925
          Underscore is covered by ucp_P or ucp_Po. */
926

927
          case 6:  /* Left alphanum vs right general category */
928
          case 7:  /* Left space vs right general category */
929
          case 8:  /* Left word vs right general category */
930
          p = posspropstab[n-6];
931
          accepted = risprop && lisprop ==
932
            (list[3] != p[0] &&
933
             list[3] != p[1] &&
934
            (list[3] != p[2] || !lisprop));
935
          break;
936

937
          case 9:   /* Right alphanum vs left general category */
938
          case 10:  /* Right space vs left general category */
939
          case 11:  /* Right word vs left general category */
940
          p = posspropstab[n-9];
941
          accepted = lisprop && risprop ==
942
            (base_list[3] != p[0] &&
943
             base_list[3] != p[1] &&
944
            (base_list[3] != p[2] || !risprop));
945
          break;
946

947
          case 12:  /* Left alphanum vs right particular category */
948
          case 13:  /* Left space vs right particular category */
949
          case 14:  /* Left word vs right particular category */
950
          p = posspropstab[n-12];
951
          accepted = risprop && lisprop ==
952
            (catposstab[p[0]][list[3]] &&
953
             catposstab[p[1]][list[3]] &&
954
            (list[3] != p[3] || !lisprop));
955
          break;
956

957
          case 15:  /* Right alphanum vs left particular category */
958
          case 16:  /* Right space vs left particular category */
959
          case 17:  /* Right word vs left particular category */
960
          p = posspropstab[n-15];
961
          accepted = lisprop && risprop ==
962
            (catposstab[p[0]][base_list[3]] &&
963
             catposstab[p[1]][base_list[3]] &&
964
            (base_list[3] != p[3] || !risprop));
965
          break;
966
          }
967
        }
968
      }
969

970
    else
971
#endif  /* SUPPORT_UNICODE */
972

973
    accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
974
           rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
975
           autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
976

977
    if (!accepted) return FALSE;
978

979
    if (list[1] == 0) return TRUE;
980
    /* Might be an empty repeat. */
981
    continue;
982
    }
983

984
  /* Control reaches here only if one of the items is a small character list.
985
  All characters are checked against the other side. */
986

987
  do
988
    {
989
    chr = *chr_ptr;
990

991
    switch(list_ptr[0])
992
      {
993
      case OP_CHAR:
994
      ochr_ptr = list_ptr + 2;
995
      do
996
        {
997
        if (chr == *ochr_ptr) return FALSE;
998
        ochr_ptr++;
999
        }
1000
      while(*ochr_ptr != NOTACHAR);
1001
      break;
1002

1003
      case OP_NOT:
1004
      ochr_ptr = list_ptr + 2;
1005
      do
1006
        {
1007
        if (chr == *ochr_ptr)
1008
          break;
1009
        ochr_ptr++;
1010
        }
1011
      while(*ochr_ptr != NOTACHAR);
1012
      if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
1013
      break;
1014

1015
      /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not*
1016
      set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
1017

1018
      case OP_DIGIT:
1019
      if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE;
1020
      break;
1021

1022
      case OP_NOT_DIGIT:
1023
      if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE;
1024
      break;
1025

1026
      case OP_WHITESPACE:
1027
      if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE;
1028
      break;
1029

1030
      case OP_NOT_WHITESPACE:
1031
      if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE;
1032
      break;
1033

1034
      case OP_WORDCHAR:
1035
      if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE;
1036
      break;
1037

1038
      case OP_NOT_WORDCHAR:
1039
      if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE;
1040
      break;
1041

1042
      case OP_HSPACE:
1043
      switch(chr)
1044
        {
1045
        HSPACE_CASES: return FALSE;
1046
        default: break;
1047
        }
1048
      break;
1049

1050
      case OP_NOT_HSPACE:
1051
      switch(chr)
1052
        {
1053
        HSPACE_CASES: break;
1054
        default: return FALSE;
1055
        }
1056
      break;
1057

1058
      case OP_ANYNL:
1059
      case OP_VSPACE:
1060
      switch(chr)
1061
        {
1062
        VSPACE_CASES: return FALSE;
1063
        default: break;
1064
        }
1065
      break;
1066

1067
      case OP_NOT_VSPACE:
1068
      switch(chr)
1069
        {
1070
        VSPACE_CASES: break;
1071
        default: return FALSE;
1072
        }
1073
      break;
1074

1075
      case OP_DOLL:
1076
      case OP_EODN:
1077
      switch (chr)
1078
        {
1079
        case CHAR_CR:
1080
        case CHAR_LF:
1081
        case CHAR_VT:
1082
        case CHAR_FF:
1083
        case CHAR_NEL:
1084
#ifndef EBCDIC
1085
        case 0x2028:
1086
        case 0x2029:
1087
#endif  /* Not EBCDIC */
1088
        return FALSE;
1089
        }
1090
      break;
1091

1092
      case OP_EOD:    /* Can always possessify before \z */
1093
      break;
1094

1095
#ifdef SUPPORT_UNICODE
1096
      case OP_PROP:
1097
      case OP_NOTPROP:
1098
      if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
1099
            list_ptr[0] == OP_NOTPROP))
1100
        return FALSE;
1101
      break;
1102
#endif
1103

1104
      case OP_NCLASS:
1105
      if (chr > 255) return FALSE;
1106
      /* Fall through */
1107

1108
      case OP_CLASS:
1109
      if (chr > 255) break;
1110
      class_bitset = (const uint8_t *)
1111
        ((list_ptr == list ? code : base_end) - list_ptr[2]);
1112
      if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE;
1113
      break;
1114

1115
#ifdef SUPPORT_WIDE_CHARS
1116
      case OP_XCLASS:
1117
      if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
1118
          list_ptr[2] + LINK_SIZE, (const uint8_t*)cb->start_code, utf))
1119
        return FALSE;
1120
      break;
1121

1122
      case OP_ECLASS:
1123
      if (PRIV(eclass)(chr,
1124
          (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE,
1125
          (list_ptr == list ? code : base_end) - list_ptr[3],
1126
          (const uint8_t*)cb->start_code, utf))
1127
        return FALSE;
1128
      break;
1129
#endif /* SUPPORT_WIDE_CHARS */
1130

1131
      default:
1132
      return FALSE;
1133
      }
1134

1135
    chr_ptr++;
1136
    }
1137
  while(*chr_ptr != NOTACHAR);
1138

1139
  /* At least one character must be matched from this opcode. */
1140

1141
  if (list[1] == 0) return TRUE;
1142
  }
1143

1144
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
1145
return FALSE;              /* Avoid compiler warnings */
1146
}
1147

1148

1149

1150
/*************************************************
1151
*    Scan compiled regex for auto-possession     *
1152
*************************************************/
1153

1154
/* Replaces single character iterations with their possessive alternatives
1155
if appropriate. This function modifies the compiled opcode! Hitting a
1156
non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a
1157
bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches
1158
overly complicated or large patterns. In these cases, the check just stops,
1159
leaving the remainder of the pattern unpossessified.
1160

1161
Arguments:
1162
  code        points to start of the byte code
1163
  cb          compile data block
1164

1165
Returns:      0 for success
1166
              -1 if a non-existant opcode is encountered
1167
*/
1168

1169
int
1170
PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
1171
{
1172
PCRE2_UCHAR c;
1173
PCRE2_SPTR end;
1174
PCRE2_UCHAR *repeat_opcode;
1175
uint32_t list[MAX_LIST];
1176
int rec_limit = 1000;  /* Was 10,000 but clang+ASAN uses a lot of stack. */
1177
BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
1178
BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
1179

1180
for (;;)
1181
  {
1182
  c = *code;
1183

1184
  if (c >= OP_TABLE_LENGTH)
1185
    {
1186
    PCRE2_DEBUG_UNREACHABLE();
1187
    return -1;   /* Something gone wrong */
1188
    }
1189

1190
  if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
1191
    {
1192
    c -= get_repeat_base(c) - OP_STAR;
1193
    end = (c <= OP_MINUPTO) ?
1194
      get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
1195
    list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
1196

1197
    if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,
1198
        &rec_limit))
1199
      {
1200
      switch(c)
1201
        {
1202
        case OP_STAR:
1203
        *code += OP_POSSTAR - OP_STAR;
1204
        break;
1205

1206
        case OP_MINSTAR:
1207
        *code += OP_POSSTAR - OP_MINSTAR;
1208
        break;
1209

1210
        case OP_PLUS:
1211
        *code += OP_POSPLUS - OP_PLUS;
1212
        break;
1213

1214
        case OP_MINPLUS:
1215
        *code += OP_POSPLUS - OP_MINPLUS;
1216
        break;
1217

1218
        case OP_QUERY:
1219
        *code += OP_POSQUERY - OP_QUERY;
1220
        break;
1221

1222
        case OP_MINQUERY:
1223
        *code += OP_POSQUERY - OP_MINQUERY;
1224
        break;
1225

1226
        case OP_UPTO:
1227
        *code += OP_POSUPTO - OP_UPTO;
1228
        break;
1229

1230
        case OP_MINUPTO:
1231
        *code += OP_POSUPTO - OP_MINUPTO;
1232
        break;
1233
        }
1234
      }
1235
    c = *code;
1236
    }
1237
  else if (c == OP_CLASS || c == OP_NCLASS
1238
#ifdef SUPPORT_WIDE_CHARS
1239
           || c == OP_XCLASS || c == OP_ECLASS
1240
#endif
1241
           )
1242
    {
1243
#ifdef SUPPORT_WIDE_CHARS
1244
    if (c == OP_XCLASS || c == OP_ECLASS)
1245
      repeat_opcode = code + GET(code, 1);
1246
    else
1247
#endif
1248
      repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
1249

1250
    c = *repeat_opcode;
1251
    if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
1252
      {
1253
      /* The return from get_chr_property_list() will never be NULL when
1254
      *code (aka c) is one of the four class opcodes. However, gcc with
1255
      -fanalyzer notes that a NULL return is possible, and grumbles. Hence we
1256
      put in a check. */
1257

1258
      end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
1259
      list[1] = (c & 1) == 0;
1260

1261
      if (end != NULL &&
1262
          compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
1263
        {
1264
        switch (c)
1265
          {
1266
          case OP_CRSTAR:
1267
          case OP_CRMINSTAR:
1268
          *repeat_opcode = OP_CRPOSSTAR;
1269
          break;
1270

1271
          case OP_CRPLUS:
1272
          case OP_CRMINPLUS:
1273
          *repeat_opcode = OP_CRPOSPLUS;
1274
          break;
1275

1276
          case OP_CRQUERY:
1277
          case OP_CRMINQUERY:
1278
          *repeat_opcode = OP_CRPOSQUERY;
1279
          break;
1280

1281
          case OP_CRRANGE:
1282
          case OP_CRMINRANGE:
1283
          *repeat_opcode = OP_CRPOSRANGE;
1284
          break;
1285
          }
1286
        }
1287
      }
1288
    c = *code;
1289
    }
1290

1291
  switch(c)
1292
    {
1293
    case OP_END:
1294
    return 0;
1295

1296
    case OP_TYPESTAR:
1297
    case OP_TYPEMINSTAR:
1298
    case OP_TYPEPLUS:
1299
    case OP_TYPEMINPLUS:
1300
    case OP_TYPEQUERY:
1301
    case OP_TYPEMINQUERY:
1302
    case OP_TYPEPOSSTAR:
1303
    case OP_TYPEPOSPLUS:
1304
    case OP_TYPEPOSQUERY:
1305
    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1306
    break;
1307

1308
    case OP_TYPEUPTO:
1309
    case OP_TYPEMINUPTO:
1310
    case OP_TYPEEXACT:
1311
    case OP_TYPEPOSUPTO:
1312
    if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1313
      code += 2;
1314
    break;
1315

1316
    case OP_CALLOUT_STR:
1317
    code += GET(code, 1 + 2*LINK_SIZE);
1318
    break;
1319

1320
#ifdef SUPPORT_WIDE_CHARS
1321
    case OP_XCLASS:
1322
    case OP_ECLASS:
1323
    code += GET(code, 1);
1324
    break;
1325
#endif
1326

1327
    case OP_MARK:
1328
    case OP_COMMIT_ARG:
1329
    case OP_PRUNE_ARG:
1330
    case OP_SKIP_ARG:
1331
    case OP_THEN_ARG:
1332
    code += code[1];
1333
    break;
1334
    }
1335

1336
  /* Add in the fixed length from the table */
1337

1338
  code += PRIV(OP_lengths)[c];
1339

1340
  /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
1341
  followed by a multi-byte character. The length in the table is a minimum, so
1342
  we have to arrange to skip the extra code units. */
1343

1344
#ifdef MAYBE_UTF_MULTI
1345
  if (utf) switch(c)
1346
    {
1347
    case OP_CHAR:
1348
    case OP_CHARI:
1349
    case OP_NOT:
1350
    case OP_NOTI:
1351
    case OP_STAR:
1352
    case OP_MINSTAR:
1353
    case OP_PLUS:
1354
    case OP_MINPLUS:
1355
    case OP_QUERY:
1356
    case OP_MINQUERY:
1357
    case OP_UPTO:
1358
    case OP_MINUPTO:
1359
    case OP_EXACT:
1360
    case OP_POSSTAR:
1361
    case OP_POSPLUS:
1362
    case OP_POSQUERY:
1363
    case OP_POSUPTO:
1364
    case OP_STARI:
1365
    case OP_MINSTARI:
1366
    case OP_PLUSI:
1367
    case OP_MINPLUSI:
1368
    case OP_QUERYI:
1369
    case OP_MINQUERYI:
1370
    case OP_UPTOI:
1371
    case OP_MINUPTOI:
1372
    case OP_EXACTI:
1373
    case OP_POSSTARI:
1374
    case OP_POSPLUSI:
1375
    case OP_POSQUERYI:
1376
    case OP_POSUPTOI:
1377
    case OP_NOTSTAR:
1378
    case OP_NOTMINSTAR:
1379
    case OP_NOTPLUS:
1380
    case OP_NOTMINPLUS:
1381
    case OP_NOTQUERY:
1382
    case OP_NOTMINQUERY:
1383
    case OP_NOTUPTO:
1384
    case OP_NOTMINUPTO:
1385
    case OP_NOTEXACT:
1386
    case OP_NOTPOSSTAR:
1387
    case OP_NOTPOSPLUS:
1388
    case OP_NOTPOSQUERY:
1389
    case OP_NOTPOSUPTO:
1390
    case OP_NOTSTARI:
1391
    case OP_NOTMINSTARI:
1392
    case OP_NOTPLUSI:
1393
    case OP_NOTMINPLUSI:
1394
    case OP_NOTQUERYI:
1395
    case OP_NOTMINQUERYI:
1396
    case OP_NOTUPTOI:
1397
    case OP_NOTMINUPTOI:
1398
    case OP_NOTEXACTI:
1399
    case OP_NOTPOSSTARI:
1400
    case OP_NOTPOSPLUSI:
1401
    case OP_NOTPOSQUERYI:
1402
    case OP_NOTPOSUPTOI:
1403
    if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
1404
    break;
1405
    }
1406
#else
1407
  (void)(utf);  /* Keep compiler happy by referencing function argument */
1408
#endif  /* SUPPORT_WIDE_CHARS */
1409
  }
1410
}
1411

1412
/* End of pcre2_auto_possess.c */
1413

1414
Product

Resources

Company