Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_auto_possess.c
9898 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
/* This module contains functions that scan a compiled pattern and change
42
repeats into possessive repeats where possible. */
43
44
45
#ifdef HAVE_CONFIG_H
46
#include "config.h"
47
#endif
48
49
50
#include "pcre2_internal.h"
51
52
/* This macro represents the max size of list[] and that is used to keep
53
track of UCD info in several places; it should be kept in sync with the
54
value used by GenerateUcd.py */
55
#define MAX_LIST 8
56
57
/*************************************************
58
* Tables for auto-possessification *
59
*************************************************/
60
61
/* This table is used to check whether auto-possessification is possible
62
between adjacent character-type opcodes. The left-hand (repeated) opcode is
63
used to select the row, and the right-hand opcode is used to select the column.
64
A value of 1 means that auto-possessification is OK. For example, the second
65
value in the first row means that \D+\d can be turned into \D++\d.
66
67
The Unicode property types (\P and \p) have to be present to fill out the table
68
because of what their opcode values are, but the table values should always be
69
zero because property types are handled separately in the code. The last four
70
columns apply to items that cannot be repeated, so there is no need to have
71
rows for them. Note that OP_DIGIT etc. are generated only when PCRE2_UCP is
72
*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
73
74
#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
75
#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
76
77
static const uint8_t autoposstab[APTROWS][APTCOLS] = {
78
/* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
79
{ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
80
{ 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
81
{ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
82
{ 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
83
{ 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
84
{ 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
85
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
86
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
87
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
88
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
89
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
90
{ 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
91
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
92
{ 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
93
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
94
{ 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
95
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
96
};
97
98
#ifdef SUPPORT_UNICODE
99
/* This table is used to check whether auto-possessification is possible
100
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
101
left-hand (repeated) opcode is used to select the row, and the right-hand
102
opcode is used to select the column. The values are as follows:
103
104
0 Always return FALSE (never auto-possessify)
105
1 Character groups are distinct (possessify if both are OP_PROP)
106
2 Check character categories in the same group (general or particular)
107
3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
108
109
4 Check left general category vs right particular category
110
5 Check right general category vs left particular category
111
112
6 Left alphanum vs right general category
113
7 Left space vs right general category
114
8 Left word vs right general category
115
116
9 Right alphanum vs left general category
117
10 Right space vs left general category
118
11 Right word vs left general category
119
120
12 Left alphanum vs right particular category
121
13 Left space vs right particular category
122
14 Left word vs right particular category
123
124
15 Right alphanum vs left particular category
125
16 Right space vs left particular category
126
17 Right word vs left particular category
127
*/
128
129
/* propposstab[left type][right type] selects the rule (0..17) used to decide
whether a repeated Unicode property item may be made possessive in front of
another property item. Rows are the left-hand (repeated) property type,
columns the right-hand one. PT_ANY has no row or column. */

static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {
/*                 LAMP GC  PC  SC  SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */
/* PT_LAMP    */ { 3,   0,  0,  0,  0,  3,    1,    1,      0,   0,    0,   0,     0 },
/* PT_GC      */ { 0,   2,  4,  0,  0,  9,    10,   10,     11,  0,    0,   0,     0 },
/* PT_PC      */ { 0,   5,  2,  0,  0,  15,   16,   16,     17,  0,    0,   0,     0 },
/* PT_SC      */ { 0,   0,  0,  2,  2,  0,    0,    0,      0,   0,    0,   0,     0 },
/* PT_SCX     */ { 0,   0,  0,  2,  2,  0,    0,    0,      0,   0,    0,   0,     0 },
/* PT_ALNUM   */ { 3,   6,  12, 0,  0,  3,    1,    1,      0,   0,    0,   0,     0 },
/* PT_SPACE   */ { 1,   7,  13, 0,  0,  1,    3,    3,      1,   0,    0,   0,     0 },
/* PT_PXSPACE */ { 1,   7,  13, 0,  0,  1,    3,    3,      1,   0,    0,   0,     0 },
/* PT_WORD    */ { 0,   8,  14, 0,  0,  0,    1,    1,      3,   0,    0,   0,     0 },
/* PT_CLIST   */ { 0,   0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0,     0 },
/* PT_UCNC    */ { 0,   0,  0,  0,  0,  0,    0,    0,      0,   0,    3,   0,     0 },
/* PT_BIDICL  */ { 0,   0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0,     0 },
/* PT_BOOL    */ { 0,   0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0,     0 }
};
146
147
/* This table is used to check whether auto-possessification is possible
148
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
149
specifies a general category and the other specifies a particular category. The
150
row is selected by the general category and the column by the particular
151
category. The value is 1 if the particular category is not part of the general
152
category. */
153
154
/* catposstab[general][particular] is 1 when the particular category does NOT
belong to the general category, and 0 when it does. The columns are laid out in
groups, one group per general category, in the same order as the rows. */

static const uint8_t catposstab[7][30] = {
/*         Cc Cf Cn Co Cs   Ll Lm Lo Lt Lu   Mc Me Mn   Nd Nl No   Pc Pd Pe Pf Pi Po Ps   Sc Sk Sm So   Zl Zp Zs */
/* C */  { 0, 0, 0, 0, 0,   1, 1, 1, 1, 1,   1, 1, 1,   1, 1, 1,   1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1 },
/* L */  { 1, 1, 1, 1, 1,   0, 0, 0, 0, 0,   1, 1, 1,   1, 1, 1,   1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1 },
/* M */  { 1, 1, 1, 1, 1,   1, 1, 1, 1, 1,   0, 0, 0,   1, 1, 1,   1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1 },
/* N */  { 1, 1, 1, 1, 1,   1, 1, 1, 1, 1,   1, 1, 1,   0, 0, 0,   1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1 },
/* P */  { 1, 1, 1, 1, 1,   1, 1, 1, 1, 1,   1, 1, 1,   1, 1, 1,   0, 0, 0, 0, 0, 0, 0,   1, 1, 1, 1,   1, 1, 1 },
/* S */  { 1, 1, 1, 1, 1,   1, 1, 1, 1, 1,   1, 1, 1,   1, 1, 1,   1, 1, 1, 1, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1 },
/* Z */  { 1, 1, 1, 1, 1,   1, 1, 1, 1, 1,   1, 1, 1,   1, 1, 1,   1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0 }
};
164
165
/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
166
a general or particular category. The properties in each row are those
167
that apply to the character set in question. Duplication means that a little
168
unnecessary work is done when checking, but this keeps things much simpler
169
because they can all use the same code. For more details see the comment where
170
this table is used.
171
172
Note: SPACE and PXSPACE used to be different because Perl excluded VT from
173
"space", but from Perl 5.18 it's included, so both categories are treated the
174
same here. */
175
176
static const uint8_t posspropstab[3][4] = {
177
{ ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
178
{ ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
179
{ ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
180
};
181
#endif /* SUPPORT_UNICODE */
182
183
184
185
#ifdef SUPPORT_UNICODE
186
/*************************************************
187
* Check a character and a property *
188
*************************************************/
189
190
/* This function is called by compare_opcodes() when a property item is
191
adjacent to a fixed character.
192
193
Arguments:
194
c the character
195
ptype the property type
196
pdata the data for the type
197
negated TRUE if it's a negated property (\P or \p{^)
198
199
Returns: TRUE if auto-possessifying is OK
200
*/
201
202
static BOOL
203
check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
204
BOOL negated)
205
{
206
BOOL ok, rc;
207
const uint32_t *p;
208
const ucd_record *prop = GET_UCD(c);
209
210
switch(ptype)
211
{
212
case PT_LAMP:
213
return (prop->chartype == ucp_Lu ||
214
prop->chartype == ucp_Ll ||
215
prop->chartype == ucp_Lt) == negated;
216
217
case PT_GC:
218
return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
219
220
case PT_PC:
221
return (pdata == prop->chartype) == negated;
222
223
case PT_SC:
224
return (pdata == prop->script) == negated;
225
226
case PT_SCX:
227
ok = (pdata == prop->script
228
|| MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
229
return ok == negated;
230
231
/* These are specials */
232
233
case PT_ALNUM:
234
return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
235
PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
236
237
/* Perl space used to exclude VT, but from Perl 5.18 it is included, which
238
means that Perl space and POSIX space are now identical. PCRE was changed
239
at release 8.34. */
240
241
case PT_SPACE: /* Perl space */
242
case PT_PXSPACE: /* POSIX space */
243
switch(c)
244
{
245
HSPACE_CASES:
246
VSPACE_CASES:
247
rc = negated;
248
break;
249
250
default:
251
rc = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
252
}
253
return rc;
254
255
case PT_WORD:
256
return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
257
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
258
c == CHAR_UNDERSCORE) == negated;
259
260
case PT_CLIST:
261
p = PRIV(ucd_caseless_sets) + prop->caseset;
262
for (;;)
263
{
264
if (c < *p) return !negated;
265
if (c == *p++) return negated;
266
}
267
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
268
break;
269
270
/* Haven't yet thought these through. */
271
272
case PT_BIDICL:
273
return FALSE;
274
275
case PT_BOOL:
276
return FALSE;
277
}
278
279
return FALSE;
280
}
281
#endif /* SUPPORT_UNICODE */
282
283
284
285
/*************************************************
286
* Base opcode of repeated opcodes *
287
*************************************************/
288
289
/* Returns the base opcode for repeated single character type opcodes. If the
290
opcode is not a repeated character type, it returns with the original value.
291
292
Arguments: c opcode
293
Returns: base opcode for the type
294
*/
295
296
static PCRE2_UCHAR
297
get_repeat_base(PCRE2_UCHAR c)
298
{
299
return (c > OP_TYPEPOSUPTO)? c :
300
(c >= OP_TYPESTAR)? OP_TYPESTAR :
301
(c >= OP_NOTSTARI)? OP_NOTSTARI :
302
(c >= OP_NOTSTAR)? OP_NOTSTAR :
303
(c >= OP_STARI)? OP_STARI :
304
OP_STAR;
305
}
306
307
308
/*************************************************
309
* Fill the character property list *
310
*************************************************/
311
312
/* Checks whether the code points to an opcode that can take part in auto-
313
possessification, and if so, fills a list with its properties.
314
315
Arguments:
316
code points to start of expression
317
utf TRUE if in UTF mode
318
ucp TRUE if in UCP mode
319
fcc points to the case-flipping table
320
list points to output list
321
list[0] will be filled with the opcode
322
list[1] will be non-zero if this opcode
323
can match an empty character string
324
list[2..7] depends on the opcode
325
326
Returns: points to the start of the next opcode if *code is accepted
327
NULL if *code is not accepted
328
*/
329
330
static PCRE2_SPTR
331
get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
332
uint32_t *list)
333
{
334
PCRE2_UCHAR c = *code;
335
PCRE2_UCHAR base;
336
PCRE2_SPTR end;
337
PCRE2_SPTR class_end;
338
uint32_t chr;
339
340
#ifdef SUPPORT_UNICODE
341
uint32_t *clist_dest;
342
const uint32_t *clist_src;
343
#else
344
(void)utf; /* Suppress "unused parameter" compiler warnings */
345
(void)ucp;
346
#endif
347
348
list[0] = c;
349
list[1] = FALSE;
350
code++;
351
352
if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
353
{
354
base = get_repeat_base(c);
355
c -= (base - OP_STAR);
356
357
if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
358
code += IMM2_SIZE;
359
360
list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT &&
361
c != OP_POSPLUS);
362
363
switch(base)
364
{
365
case OP_STAR:
366
list[0] = OP_CHAR;
367
break;
368
369
case OP_STARI:
370
list[0] = OP_CHARI;
371
break;
372
373
case OP_NOTSTAR:
374
list[0] = OP_NOT;
375
break;
376
377
case OP_NOTSTARI:
378
list[0] = OP_NOTI;
379
break;
380
381
case OP_TYPESTAR:
382
list[0] = *code;
383
code++;
384
break;
385
}
386
c = list[0];
387
}
388
389
switch(c)
390
{
391
case OP_NOT_DIGIT:
392
case OP_DIGIT:
393
case OP_NOT_WHITESPACE:
394
case OP_WHITESPACE:
395
case OP_NOT_WORDCHAR:
396
case OP_WORDCHAR:
397
case OP_ANY:
398
case OP_ALLANY:
399
case OP_ANYNL:
400
case OP_NOT_HSPACE:
401
case OP_HSPACE:
402
case OP_NOT_VSPACE:
403
case OP_VSPACE:
404
case OP_EXTUNI:
405
case OP_EODN:
406
case OP_EOD:
407
case OP_DOLL:
408
case OP_DOLLM:
409
return code;
410
411
case OP_CHAR:
412
case OP_NOT:
413
GETCHARINCTEST(chr, code);
414
list[2] = chr;
415
list[3] = NOTACHAR;
416
return code;
417
418
case OP_CHARI:
419
case OP_NOTI:
420
list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
421
GETCHARINCTEST(chr, code);
422
list[2] = chr;
423
424
#ifdef SUPPORT_UNICODE
425
if (chr < 128 || (chr < 256 && !utf && !ucp))
426
list[3] = fcc[chr];
427
else
428
list[3] = UCD_OTHERCASE(chr);
429
#elif defined SUPPORT_WIDE_CHARS
430
list[3] = (chr < 256) ? fcc[chr] : chr;
431
#else
432
list[3] = fcc[chr];
433
#endif
434
435
/* The othercase might be the same value. */
436
437
if (chr == list[3])
438
list[3] = NOTACHAR;
439
else
440
list[4] = NOTACHAR;
441
return code;
442
443
#ifdef SUPPORT_UNICODE
444
case OP_PROP:
445
case OP_NOTPROP:
446
if (code[0] != PT_CLIST)
447
{
448
list[2] = code[0];
449
list[3] = code[1];
450
return code + 2;
451
}
452
453
/* Convert only if we have enough space. */
454
455
clist_src = PRIV(ucd_caseless_sets) + code[1];
456
clist_dest = list + 2;
457
code += 2;
458
459
do {
460
if (clist_dest >= list + MAX_LIST)
461
{
462
/* Early return if there is not enough space. GenerateUcd.py
463
generated a list with more than 5 characters and something
464
must be done about that going forward. */
465
PCRE2_DEBUG_UNREACHABLE(); /* Remove if it ever triggers */
466
list[2] = code[0];
467
list[3] = code[1];
468
return code;
469
}
470
*clist_dest++ = *clist_src;
471
}
472
while(*clist_src++ != NOTACHAR);
473
474
/* All characters are stored. The terminating NOTACHAR is copied from the
475
clist itself. */
476
477
list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
478
return code;
479
#endif
480
481
case OP_NCLASS:
482
case OP_CLASS:
483
#ifdef SUPPORT_WIDE_CHARS
484
case OP_XCLASS:
485
case OP_ECLASS:
486
if (c == OP_XCLASS || c == OP_ECLASS)
487
end = code + GET(code, 0) - 1;
488
else
489
#endif
490
end = code + 32 / sizeof(PCRE2_UCHAR);
491
class_end = end;
492
493
switch(*end)
494
{
495
case OP_CRSTAR:
496
case OP_CRMINSTAR:
497
case OP_CRQUERY:
498
case OP_CRMINQUERY:
499
case OP_CRPOSSTAR:
500
case OP_CRPOSQUERY:
501
list[1] = TRUE;
502
end++;
503
break;
504
505
case OP_CRPLUS:
506
case OP_CRMINPLUS:
507
case OP_CRPOSPLUS:
508
end++;
509
break;
510
511
case OP_CRRANGE:
512
case OP_CRMINRANGE:
513
case OP_CRPOSRANGE:
514
list[1] = (GET2(end, 1) == 0);
515
end += 1 + 2 * IMM2_SIZE;
516
break;
517
}
518
list[2] = (uint32_t)(end - code);
519
list[3] = (uint32_t)(end - class_end);
520
return end;
521
}
522
523
return NULL; /* Opcode not accepted */
524
}
525
526
527
528
/*************************************************
529
* Scan further character sets for match *
530
*************************************************/
531
532
/* Checks whether the base and the current opcode have a common character, in
533
which case the base cannot be possessified.
534
535
Arguments:
536
code points to the byte code
537
utf TRUE in UTF mode
538
ucp TRUE in UCP mode
539
cb compile data block
540
base_list the data list of the base opcode
541
base_end the end of the base opcode
542
rec_limit points to recursion depth counter
543
544
Returns: TRUE if the auto-possessification is possible
545
*/
546
547
static BOOL
548
compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
549
const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
550
{
551
PCRE2_UCHAR c;
552
uint32_t list[MAX_LIST];
553
const uint32_t *chr_ptr;
554
const uint32_t *ochr_ptr;
555
const uint32_t *list_ptr;
556
PCRE2_SPTR next_code;
557
#ifdef SUPPORT_WIDE_CHARS
558
PCRE2_SPTR xclass_flags;
559
#endif
560
const uint8_t *class_bitset;
561
const uint8_t *set1, *set2, *set_end;
562
uint32_t chr;
563
BOOL accepted, invert_bits;
564
BOOL entered_a_group = FALSE;
565
566
if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */
567
568
/* Note: the base_list[1] contains whether the current opcode has a greedy
569
(represented by a non-zero value) quantifier. This is different from
570
other character type lists, which store here that the character iterator
571
matches to an empty string (also represented by a non-zero value). */
572
573
for(;;)
574
{
575
PCRE2_SPTR bracode;
576
577
/* All operations move the code pointer forward.
578
Therefore infinite recursions are not possible. */
579
580
c = *code;
581
582
/* Skip over callouts */
583
584
if (c == OP_CALLOUT)
585
{
586
code += PRIV(OP_lengths)[c];
587
continue;
588
}
589
590
if (c == OP_CALLOUT_STR)
591
{
592
code += GET(code, 1 + 2*LINK_SIZE);
593
continue;
594
}
595
596
/* At the end of a branch, skip to the end of the group and process it. */
597
598
if (c == OP_ALT)
599
{
600
do code += GET(code, 1); while (*code == OP_ALT);
601
c = *code;
602
}
603
604
/* Inspect the next opcode. */
605
606
switch(c)
607
{
608
/* We can always possessify a greedy iterator at the end of the pattern,
609
which is reached after skipping over the final OP_KET. A non-greedy
610
iterator must never be possessified. */
611
612
case OP_END:
613
return base_list[1] != 0;
614
615
/* When an iterator is at the end of certain kinds of group we can inspect
616
what follows the group by skipping over the closing ket. Note that this
617
does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given
618
iteration is variable (could be another iteration or could be the next
619
item). As these two opcodes are not listed in the next switch, they will
620
end up as the next code to inspect, and return FALSE by virtue of being
621
unsupported. */
622
623
case OP_KET:
624
case OP_KETRPOS:
625
/* The non-greedy case cannot be converted to a possessive form. */
626
627
if (base_list[1] == 0) return FALSE;
628
629
/* If the bracket is capturing it might be referenced by an OP_RECURSE
630
so its last iterator can never be possessified if the pattern contains
631
recursions. (This could be improved by keeping a list of group numbers that
632
are called by recursion.) */
633
634
bracode = code - GET(code, 1);
635
switch(*bracode)
636
{
637
case OP_CBRA:
638
case OP_SCBRA:
639
case OP_CBRAPOS:
640
case OP_SCBRAPOS:
641
if (cb->had_recurse) return FALSE;
642
break;
643
644
/* A script run might have to backtrack if the iterated item can match
645
characters from more than one script. So give up unless repeating an
646
explicit character. */
647
648
case OP_SCRIPT_RUN:
649
if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI)
650
return FALSE;
651
break;
652
653
/* Atomic sub-patterns and forward assertions can always auto-possessify
654
their last iterator. However, if the group was entered as a result of
655
checking a previous iterator, this is not possible. */
656
657
case OP_ASSERT:
658
case OP_ASSERT_NOT:
659
case OP_ONCE:
660
return !entered_a_group;
661
662
/* Fixed-length lookbehinds can be treated the same way, but variable
663
length lookbehinds must not auto-possessify their last iterator. Note
664
that in order to identify a variable length lookbehind we must check
665
through all branches, because some may be of fixed length. */
666
667
case OP_ASSERTBACK:
668
case OP_ASSERTBACK_NOT:
669
do
670
{
671
if (bracode[1+LINK_SIZE] == OP_VREVERSE) return FALSE; /* Variable */
672
bracode += GET(bracode, 1);
673
}
674
while (*bracode == OP_ALT);
675
return !entered_a_group; /* Not variable length */
676
677
/* Non-atomic assertions - don't possessify last iterator. This needs
678
more thought. */
679
680
case OP_ASSERT_NA:
681
case OP_ASSERTBACK_NA:
682
return FALSE;
683
}
684
685
/* Skip over the bracket and inspect what comes next. */
686
687
code += PRIV(OP_lengths)[c];
688
continue;
689
690
/* Handle cases where the next item is a group. */
691
692
case OP_ONCE:
693
case OP_BRA:
694
case OP_CBRA:
695
next_code = code + GET(code, 1);
696
code += PRIV(OP_lengths)[c];
697
698
/* Check each branch. We have to recurse a level for all but the last
699
branch. */
700
701
while (*next_code == OP_ALT)
702
{
703
if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
704
return FALSE;
705
code = next_code + 1 + LINK_SIZE;
706
next_code += GET(next_code, 1);
707
}
708
709
entered_a_group = TRUE;
710
continue;
711
712
case OP_BRAZERO:
713
case OP_BRAMINZERO:
714
715
next_code = code + 1;
716
if (*next_code != OP_BRA && *next_code != OP_CBRA &&
717
*next_code != OP_ONCE) return FALSE;
718
719
do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
720
721
/* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
722
723
next_code += 1 + LINK_SIZE;
724
if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,
725
rec_limit))
726
return FALSE;
727
728
code += PRIV(OP_lengths)[c];
729
continue;
730
731
/* The next opcode does not need special handling; fall through and use it
732
to see if the base can be possessified. */
733
734
default:
735
break;
736
}
737
738
/* We now have the next appropriate opcode to compare with the base. Check
739
for a supported opcode, and load its properties. */
740
741
code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
742
if (code == NULL) return FALSE; /* Unsupported */
743
744
/* If either opcode is a small character list, set pointers for comparing
745
characters from that list with another list, or with a property. */
746
747
if (base_list[0] == OP_CHAR)
748
{
749
chr_ptr = base_list + 2;
750
list_ptr = list;
751
}
752
else if (list[0] == OP_CHAR)
753
{
754
chr_ptr = list + 2;
755
list_ptr = base_list;
756
}
757
758
/* Character bitsets can also be compared to certain opcodes. */
759
760
else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
761
#if PCRE2_CODE_UNIT_WIDTH == 8
762
/* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
763
|| (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
764
#endif
765
)
766
{
767
#if PCRE2_CODE_UNIT_WIDTH == 8
768
if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
769
#else
770
if (base_list[0] == OP_CLASS)
771
#endif
772
{
773
set1 = (const uint8_t *)(base_end - base_list[2]);
774
list_ptr = list;
775
}
776
else
777
{
778
set1 = (const uint8_t *)(code - list[2]);
779
list_ptr = base_list;
780
}
781
782
invert_bits = FALSE;
783
switch(list_ptr[0])
784
{
785
case OP_CLASS:
786
case OP_NCLASS:
787
set2 = (const uint8_t *)
788
((list_ptr == list ? code : base_end) - list_ptr[2]);
789
break;
790
791
#ifdef SUPPORT_WIDE_CHARS
792
case OP_XCLASS:
793
xclass_flags = (list_ptr == list ? code : base_end) -
794
list_ptr[2] + LINK_SIZE;
795
if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
796
if ((*xclass_flags & XCL_MAP) == 0)
797
{
798
/* No bits are set for characters < 256. */
799
if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0;
800
/* Might be an empty repeat. */
801
continue;
802
}
803
set2 = (const uint8_t *)(xclass_flags + 1);
804
break;
805
#endif
806
807
case OP_NOT_DIGIT:
808
invert_bits = TRUE;
809
/* Fall through */
810
case OP_DIGIT:
811
set2 = (const uint8_t *)(cb->cbits + cbit_digit);
812
break;
813
814
case OP_NOT_WHITESPACE:
815
invert_bits = TRUE;
816
/* Fall through */
817
case OP_WHITESPACE:
818
set2 = (const uint8_t *)(cb->cbits + cbit_space);
819
break;
820
821
case OP_NOT_WORDCHAR:
822
invert_bits = TRUE;
823
/* Fall through */
824
case OP_WORDCHAR:
825
set2 = (const uint8_t *)(cb->cbits + cbit_word);
826
break;
827
828
default:
829
return FALSE;
830
}
831
832
/* Because the bit sets are unaligned bytes, we need to perform byte
833
comparison here. */
834
835
set_end = set1 + 32;
836
if (invert_bits)
837
{
838
do
839
{
840
if ((*set1++ & ~(*set2++)) != 0) return FALSE;
841
}
842
while (set1 < set_end);
843
}
844
else
845
{
846
do
847
{
848
if ((*set1++ & *set2++) != 0) return FALSE;
849
}
850
while (set1 < set_end);
851
}
852
853
if (list[1] == 0) return TRUE;
854
/* Might be an empty repeat. */
855
continue;
856
}
857
858
/* Some property combinations also acceptable. Unicode property opcodes are
859
processed specially; the rest can be handled with a lookup table. */
860
861
else
862
{
863
uint32_t leftop, rightop;
864
865
leftop = base_list[0];
866
rightop = list[0];
867
868
#ifdef SUPPORT_UNICODE
869
accepted = FALSE; /* Always set in non-unicode case. */
870
if (leftop == OP_PROP || leftop == OP_NOTPROP)
871
{
872
if (rightop == OP_EOD)
873
accepted = TRUE;
874
else if (rightop == OP_PROP || rightop == OP_NOTPROP)
875
{
876
int n;
877
const uint8_t *p;
878
BOOL same = leftop == rightop;
879
BOOL lisprop = leftop == OP_PROP;
880
BOOL risprop = rightop == OP_PROP;
881
BOOL bothprop = lisprop && risprop;
882
883
/* There's a table that specifies how each combination is to be
884
processed:
885
0 Always return FALSE (never auto-possessify)
886
1 Character groups are distinct (possessify if both are OP_PROP)
887
2 Check character categories in the same group (general or particular)
888
3 Return TRUE if the two opcodes are not the same
889
... see comments below
890
*/
891
892
n = propposstab[base_list[2]][list[2]];
893
switch(n)
894
{
895
case 0: break;
896
case 1: accepted = bothprop; break;
897
case 2: accepted = (base_list[3] == list[3]) != same; break;
898
case 3: accepted = !same; break;
899
900
case 4: /* Left general category, right particular category */
901
accepted = risprop && catposstab[base_list[3]][list[3]] == same;
902
break;
903
904
case 5: /* Right general category, left particular category */
905
accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
906
break;
907
908
/* This code is logically tricky. Think hard before fiddling with it.
909
The posspropstab table has four entries per row. Each row relates to
910
one of PCRE's special properties such as ALNUM or SPACE or WORD.
911
Only WORD actually needs all four entries, but using repeats for the
912
others means they can all use the same code below.
913
914
The first two entries in each row are Unicode general categories, and
915
apply always, because all the characters they include are part of the
916
PCRE character set. The third and fourth entries are a general and a
917
particular category, respectively, that include one or more relevant
918
characters. One or the other is used, depending on whether the check
919
is for a general or a particular category. However, in both cases the
920
category contains more characters than the specials that are defined
921
for the property being tested against. Therefore, it cannot be used
922
in a NOTPROP case.
923
924
Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
925
Underscore is covered by ucp_P or ucp_Po. */
926
927
case 6: /* Left alphanum vs right general category */
928
case 7: /* Left space vs right general category */
929
case 8: /* Left word vs right general category */
930
p = posspropstab[n-6];
931
accepted = risprop && lisprop ==
932
(list[3] != p[0] &&
933
list[3] != p[1] &&
934
(list[3] != p[2] || !lisprop));
935
break;
936
937
case 9: /* Right alphanum vs left general category */
938
case 10: /* Right space vs left general category */
939
case 11: /* Right word vs left general category */
940
p = posspropstab[n-9];
941
accepted = lisprop && risprop ==
942
(base_list[3] != p[0] &&
943
base_list[3] != p[1] &&
944
(base_list[3] != p[2] || !risprop));
945
break;
946
947
case 12: /* Left alphanum vs right particular category */
948
case 13: /* Left space vs right particular category */
949
case 14: /* Left word vs right particular category */
950
p = posspropstab[n-12];
951
accepted = risprop && lisprop ==
952
(catposstab[p[0]][list[3]] &&
953
catposstab[p[1]][list[3]] &&
954
(list[3] != p[3] || !lisprop));
955
break;
956
957
case 15: /* Right alphanum vs left particular category */
958
case 16: /* Right space vs left particular category */
959
case 17: /* Right word vs left particular category */
960
p = posspropstab[n-15];
961
accepted = lisprop && risprop ==
962
(catposstab[p[0]][base_list[3]] &&
963
catposstab[p[1]][base_list[3]] &&
964
(base_list[3] != p[3] || !risprop));
965
break;
966
}
967
}
968
}
969
970
else
971
#endif /* SUPPORT_UNICODE */
972
973
accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
974
rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
975
autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
976
977
if (!accepted) return FALSE;
978
979
if (list[1] == 0) return TRUE;
980
/* Might be an empty repeat. */
981
continue;
982
}
983
984
/* Control reaches here only if one of the items is a small character list.
985
All characters are checked against the other side. */
986
987
do
988
{
989
chr = *chr_ptr;
990
991
switch(list_ptr[0])
992
{
993
case OP_CHAR:
994
ochr_ptr = list_ptr + 2;
995
do
996
{
997
if (chr == *ochr_ptr) return FALSE;
998
ochr_ptr++;
999
}
1000
while(*ochr_ptr != NOTACHAR);
1001
break;
1002
1003
case OP_NOT:
1004
ochr_ptr = list_ptr + 2;
1005
do
1006
{
1007
if (chr == *ochr_ptr)
1008
break;
1009
ochr_ptr++;
1010
}
1011
while(*ochr_ptr != NOTACHAR);
1012
if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
1013
break;
1014
1015
/* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not*
1016
set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
1017
1018
case OP_DIGIT:
1019
if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE;
1020
break;
1021
1022
case OP_NOT_DIGIT:
1023
if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE;
1024
break;
1025
1026
case OP_WHITESPACE:
1027
if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE;
1028
break;
1029
1030
case OP_NOT_WHITESPACE:
1031
if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE;
1032
break;
1033
1034
case OP_WORDCHAR:
1035
if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE;
1036
break;
1037
1038
case OP_NOT_WORDCHAR:
1039
if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE;
1040
break;
1041
1042
case OP_HSPACE:
1043
switch(chr)
1044
{
1045
HSPACE_CASES: return FALSE;
1046
default: break;
1047
}
1048
break;
1049
1050
case OP_NOT_HSPACE:
1051
switch(chr)
1052
{
1053
HSPACE_CASES: break;
1054
default: return FALSE;
1055
}
1056
break;
1057
1058
case OP_ANYNL:
1059
case OP_VSPACE:
1060
switch(chr)
1061
{
1062
VSPACE_CASES: return FALSE;
1063
default: break;
1064
}
1065
break;
1066
1067
case OP_NOT_VSPACE:
1068
switch(chr)
1069
{
1070
VSPACE_CASES: break;
1071
default: return FALSE;
1072
}
1073
break;
1074
1075
case OP_DOLL:
1076
case OP_EODN:
1077
switch (chr)
1078
{
1079
case CHAR_CR:
1080
case CHAR_LF:
1081
case CHAR_VT:
1082
case CHAR_FF:
1083
case CHAR_NEL:
1084
#ifndef EBCDIC
1085
case 0x2028:
1086
case 0x2029:
1087
#endif /* Not EBCDIC */
1088
return FALSE;
1089
}
1090
break;
1091
1092
case OP_EOD: /* Can always possessify before \z */
1093
break;
1094
1095
#ifdef SUPPORT_UNICODE
1096
case OP_PROP:
1097
case OP_NOTPROP:
1098
if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
1099
list_ptr[0] == OP_NOTPROP))
1100
return FALSE;
1101
break;
1102
#endif
1103
1104
case OP_NCLASS:
1105
if (chr > 255) return FALSE;
1106
/* Fall through */
1107
1108
case OP_CLASS:
1109
if (chr > 255) break;
1110
class_bitset = (const uint8_t *)
1111
((list_ptr == list ? code : base_end) - list_ptr[2]);
1112
if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE;
1113
break;
1114
1115
#ifdef SUPPORT_WIDE_CHARS
1116
case OP_XCLASS:
1117
if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
1118
list_ptr[2] + LINK_SIZE, (const uint8_t*)cb->start_code, utf))
1119
return FALSE;
1120
break;
1121
1122
case OP_ECLASS:
1123
if (PRIV(eclass)(chr,
1124
(list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE,
1125
(list_ptr == list ? code : base_end) - list_ptr[3],
1126
(const uint8_t*)cb->start_code, utf))
1127
return FALSE;
1128
break;
1129
#endif /* SUPPORT_WIDE_CHARS */
1130
1131
default:
1132
return FALSE;
1133
}
1134
1135
chr_ptr++;
1136
}
1137
while(*chr_ptr != NOTACHAR);
1138
1139
/* At least one character must be matched from this opcode. */
1140
1141
if (list[1] == 0) return TRUE;
1142
}
1143
1144
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
1145
return FALSE; /* Avoid compiler warnings */
1146
}
1147
1148
1149
1150
/*************************************************
1151
* Scan compiled regex for auto-possession *
1152
*************************************************/
1153
1154
/* Replaces single character iterations with their possessive alternatives
1155
if appropriate. This function modifies the compiled opcode! Hitting a
1156
non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a
1157
bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches
1158
overly complicated or large patterns. In these cases, the check just stops,
1159
leaving the remainder of the pattern unpossessified.
1160
1161
Arguments:
1162
code points to start of the byte code
1163
cb compile data block
1164
1165
Returns: 0 for success
1166
-1 if a non-existent opcode is encountered
1167
*/
1168
1169
int
1170
PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
1171
{
1172
PCRE2_UCHAR c;
1173
PCRE2_SPTR end;
1174
PCRE2_UCHAR *repeat_opcode;
1175
uint32_t list[MAX_LIST];
1176
int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */
1177
BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
1178
BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
1179
1180
for (;;)
1181
{
1182
c = *code;
1183
1184
if (c >= OP_TABLE_LENGTH)
1185
{
1186
PCRE2_DEBUG_UNREACHABLE();
1187
return -1; /* Something gone wrong */
1188
}
1189
1190
if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
1191
{
1192
c -= get_repeat_base(c) - OP_STAR;
1193
end = (c <= OP_MINUPTO) ?
1194
get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
1195
list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
1196
1197
if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,
1198
&rec_limit))
1199
{
1200
switch(c)
1201
{
1202
case OP_STAR:
1203
*code += OP_POSSTAR - OP_STAR;
1204
break;
1205
1206
case OP_MINSTAR:
1207
*code += OP_POSSTAR - OP_MINSTAR;
1208
break;
1209
1210
case OP_PLUS:
1211
*code += OP_POSPLUS - OP_PLUS;
1212
break;
1213
1214
case OP_MINPLUS:
1215
*code += OP_POSPLUS - OP_MINPLUS;
1216
break;
1217
1218
case OP_QUERY:
1219
*code += OP_POSQUERY - OP_QUERY;
1220
break;
1221
1222
case OP_MINQUERY:
1223
*code += OP_POSQUERY - OP_MINQUERY;
1224
break;
1225
1226
case OP_UPTO:
1227
*code += OP_POSUPTO - OP_UPTO;
1228
break;
1229
1230
case OP_MINUPTO:
1231
*code += OP_POSUPTO - OP_MINUPTO;
1232
break;
1233
}
1234
}
1235
c = *code;
1236
}
1237
else if (c == OP_CLASS || c == OP_NCLASS
1238
#ifdef SUPPORT_WIDE_CHARS
1239
|| c == OP_XCLASS || c == OP_ECLASS
1240
#endif
1241
)
1242
{
1243
#ifdef SUPPORT_WIDE_CHARS
1244
if (c == OP_XCLASS || c == OP_ECLASS)
1245
repeat_opcode = code + GET(code, 1);
1246
else
1247
#endif
1248
repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
1249
1250
c = *repeat_opcode;
1251
if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
1252
{
1253
/* The return from get_chr_property_list() will never be NULL when
1254
*code (aka c) is one of the four class opcodes. However, gcc with
1255
-fanalyzer notes that a NULL return is possible, and grumbles. Hence we
1256
put in a check. */
1257
1258
end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
1259
list[1] = (c & 1) == 0;
1260
1261
if (end != NULL &&
1262
compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
1263
{
1264
switch (c)
1265
{
1266
case OP_CRSTAR:
1267
case OP_CRMINSTAR:
1268
*repeat_opcode = OP_CRPOSSTAR;
1269
break;
1270
1271
case OP_CRPLUS:
1272
case OP_CRMINPLUS:
1273
*repeat_opcode = OP_CRPOSPLUS;
1274
break;
1275
1276
case OP_CRQUERY:
1277
case OP_CRMINQUERY:
1278
*repeat_opcode = OP_CRPOSQUERY;
1279
break;
1280
1281
case OP_CRRANGE:
1282
case OP_CRMINRANGE:
1283
*repeat_opcode = OP_CRPOSRANGE;
1284
break;
1285
}
1286
}
1287
}
1288
c = *code;
1289
}
1290
1291
switch(c)
1292
{
1293
case OP_END:
1294
return 0;
1295
1296
case OP_TYPESTAR:
1297
case OP_TYPEMINSTAR:
1298
case OP_TYPEPLUS:
1299
case OP_TYPEMINPLUS:
1300
case OP_TYPEQUERY:
1301
case OP_TYPEMINQUERY:
1302
case OP_TYPEPOSSTAR:
1303
case OP_TYPEPOSPLUS:
1304
case OP_TYPEPOSQUERY:
1305
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1306
break;
1307
1308
case OP_TYPEUPTO:
1309
case OP_TYPEMINUPTO:
1310
case OP_TYPEEXACT:
1311
case OP_TYPEPOSUPTO:
1312
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1313
code += 2;
1314
break;
1315
1316
case OP_CALLOUT_STR:
1317
code += GET(code, 1 + 2*LINK_SIZE);
1318
break;
1319
1320
#ifdef SUPPORT_WIDE_CHARS
1321
case OP_XCLASS:
1322
case OP_ECLASS:
1323
code += GET(code, 1);
1324
break;
1325
#endif
1326
1327
case OP_MARK:
1328
case OP_COMMIT_ARG:
1329
case OP_PRUNE_ARG:
1330
case OP_SKIP_ARG:
1331
case OP_THEN_ARG:
1332
code += code[1];
1333
break;
1334
}
1335
1336
/* Add in the fixed length from the table */
1337
1338
code += PRIV(OP_lengths)[c];
1339
1340
/* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
1341
followed by a multi-byte character. The length in the table is a minimum, so
1342
we have to arrange to skip the extra code units. */
1343
1344
#ifdef MAYBE_UTF_MULTI
1345
if (utf) switch(c)
1346
{
1347
case OP_CHAR:
1348
case OP_CHARI:
1349
case OP_NOT:
1350
case OP_NOTI:
1351
case OP_STAR:
1352
case OP_MINSTAR:
1353
case OP_PLUS:
1354
case OP_MINPLUS:
1355
case OP_QUERY:
1356
case OP_MINQUERY:
1357
case OP_UPTO:
1358
case OP_MINUPTO:
1359
case OP_EXACT:
1360
case OP_POSSTAR:
1361
case OP_POSPLUS:
1362
case OP_POSQUERY:
1363
case OP_POSUPTO:
1364
case OP_STARI:
1365
case OP_MINSTARI:
1366
case OP_PLUSI:
1367
case OP_MINPLUSI:
1368
case OP_QUERYI:
1369
case OP_MINQUERYI:
1370
case OP_UPTOI:
1371
case OP_MINUPTOI:
1372
case OP_EXACTI:
1373
case OP_POSSTARI:
1374
case OP_POSPLUSI:
1375
case OP_POSQUERYI:
1376
case OP_POSUPTOI:
1377
case OP_NOTSTAR:
1378
case OP_NOTMINSTAR:
1379
case OP_NOTPLUS:
1380
case OP_NOTMINPLUS:
1381
case OP_NOTQUERY:
1382
case OP_NOTMINQUERY:
1383
case OP_NOTUPTO:
1384
case OP_NOTMINUPTO:
1385
case OP_NOTEXACT:
1386
case OP_NOTPOSSTAR:
1387
case OP_NOTPOSPLUS:
1388
case OP_NOTPOSQUERY:
1389
case OP_NOTPOSUPTO:
1390
case OP_NOTSTARI:
1391
case OP_NOTMINSTARI:
1392
case OP_NOTPLUSI:
1393
case OP_NOTMINPLUSI:
1394
case OP_NOTQUERYI:
1395
case OP_NOTMINQUERYI:
1396
case OP_NOTUPTOI:
1397
case OP_NOTMINUPTOI:
1398
case OP_NOTEXACTI:
1399
case OP_NOTPOSSTARI:
1400
case OP_NOTPOSPLUSI:
1401
case OP_NOTPOSQUERYI:
1402
case OP_NOTPOSUPTOI:
1403
if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
1404
break;
1405
}
1406
#else
1407
(void)(utf); /* Keep compiler happy by referencing function argument */
1408
#endif /* SUPPORT_WIDE_CHARS */
1409
}
1410
}
1411
1412
/* End of pcre2_auto_possess.c */
1413
1414