Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_compile.c
9898 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#ifdef HAVE_CONFIG_H
43
#include "config.h"
44
#endif
45
46
#define NLBLOCK cb /* Block containing newline information */
47
#define PSSTART start_pattern /* Field containing processed string start */
48
#define PSEND end_pattern /* Field containing processed string end */
49
50
#include "pcre2_compile.h"
51
52
/* In rare error cases debugging might require calling pcre2_printint(). */
53
54
#if 0
55
#ifdef EBCDIC
56
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57
#else
58
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59
#endif
60
#include "pcre2_printint.c"
61
#define DEBUG_CALL_PRINTINT
62
#endif
63
64
/* Other debugging code can be enabled by these defines. */
65
66
/* #define DEBUG_SHOW_CAPTURES */
67
/* #define DEBUG_SHOW_PARSED */
68
69
/* There are a few things that vary with different code unit sizes. Handle them
70
by defining macros in order to minimize #if usage. */
71
72
#if PCRE2_CODE_UNIT_WIDTH == 8
73
#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
74
#define XDIGIT(c) xdigitab[c]
75
76
#else /* Either 16-bit or 32-bit */
77
#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
78
79
#if PCRE2_CODE_UNIT_WIDTH == 16
80
#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
81
82
#else /* 32-bit */
83
#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
84
#endif
85
#endif
86
87
/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88
consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89
them will be able to (i.e. assume a 64-bit world). */
90
91
#if PCRE2_SIZE_MAX <= UINT32_MAX
92
#define PUTOFFSET(s,p) *p++ = s
93
#define GETOFFSET(s,p) s = *p++
94
#define GETPLUSOFFSET(s,p) s = *(++p)
95
#define READPLUSOFFSET(s,p) s = p[1]
96
#define SKIPOFFSET(p) p++
97
#define SIZEOFFSET 1
98
#else
99
#define PUTOFFSET(s,p) \
100
{ *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); }
101
#define GETOFFSET(s,p) \
102
{ s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; }
103
#define GETPLUSOFFSET(s,p) \
104
{ s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; }
105
#define READPLUSOFFSET(s,p) \
106
{ s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; }
107
#define SKIPOFFSET(p) p += 2
108
#define SIZEOFFSET 2
109
#endif
110
111
/* Function definitions to allow mutual recursion */
112
113
static int
114
compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
115
uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
116
open_capitem *, compile_block *, PCRE2_SIZE *);
117
118
static int
119
get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
120
compile_block *);
121
122
static BOOL
123
set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
124
compile_block *);
125
126
static int
127
check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
128
compile_block *, int *);
129
130
131
/*************************************************
132
* Code parameters and static tables *
133
*************************************************/
134
135
#define MAX_GROUP_NUMBER 65535u
136
#define MAX_REPEAT_COUNT 65535u
137
#define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1)
138
139
/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
140
different ways in the different pattern scans. The parsing and group-
141
identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
142
aligned for this. Having defined the size in code units, we set up
143
C16_WORK_SIZE as the number of elements in the 16-bit vector.
144
145
During the first compiling phase, when determining how much memory is required,
146
the regex is partly compiled into this space, but the compiled parts are
147
discarded as soon as they can be, so that hopefully there will never be an
148
overrun. The code does, however, check for an overrun, which can occur for
149
pathological patterns. The size of the workspace depends on LINK_SIZE because
150
the length of compiled items varies with this.
151
152
In the real compile phase, this workspace is not currently used. */
153
154
#define COMPILE_WORK_SIZE (3000*LINK_SIZE) /* Size in code units */
155
156
#define C16_WORK_SIZE \
157
((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
158
159
/* A uint32_t vector is used for caching information about the size of
160
capturing groups, to improve performance. A default is created on the stack of
161
this size. */
162
163
#define GROUPINFO_DEFAULT_SIZE 256
164
165
/* The overrun tests check for a slightly smaller size so that they detect the
166
overrun before it actually does run off the end of the data block. */
167
168
#define WORK_SIZE_SAFETY_MARGIN (100)
169
170
/* This value determines the size of the initial vector that is used for
171
remembering named groups during the pre-compile. It is allocated on the stack,
172
but if it is too small, it is expanded, in a similar way to the workspace. The
173
value is the number of slots in the list. */
174
175
#define NAMED_GROUP_LIST_SIZE 20
176
177
/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
178
of uint32_t. For short patterns this lives on the stack, with this size. Heap
179
memory is used for longer patterns. */
180
181
#define PARSED_PATTERN_DEFAULT_SIZE 1024
182
183
/* Maximum length value to check against when making sure that the variable
184
that holds the compiled pattern length does not overflow. We make it a bit less
185
than INT_MAX to allow for adding in group terminating code units, so that we
186
don't have to check them every time. */
187
188
#define OFLOW_MAX (INT_MAX - 20)
189
190
/* Table of extra lengths for each of the meta codes. Must be kept in step with
191
the definitions of the META_xxx codes (see the included headers). For some items these values are a basic length to which
192
a variable amount has to be added. */
193
194
static unsigned char meta_extra_lengths[] = {
195
0, /* META_END */
196
0, /* META_ALT */
197
0, /* META_ATOMIC */
198
0, /* META_BACKREF - more if group is >= 10 */
199
1+SIZEOFFSET, /* META_BACKREF_BYNAME */
200
1, /* META_BIGVALUE */
201
3, /* META_CALLOUT_NUMBER */
202
3+SIZEOFFSET, /* META_CALLOUT_STRING */
203
0, /* META_CAPTURE */
204
0, /* META_CIRCUMFLEX */
205
0, /* META_CLASS */
206
0, /* META_CLASS_EMPTY */
207
0, /* META_CLASS_EMPTY_NOT */
208
0, /* META_CLASS_END */
209
0, /* META_CLASS_NOT */
210
0, /* META_COND_ASSERT */
211
SIZEOFFSET, /* META_COND_DEFINE */
212
1+SIZEOFFSET, /* META_COND_NAME */
213
1+SIZEOFFSET, /* META_COND_NUMBER */
214
1+SIZEOFFSET, /* META_COND_RNAME */
215
1+SIZEOFFSET, /* META_COND_RNUMBER */
216
3, /* META_COND_VERSION */
217
SIZEOFFSET, /* META_OFFSET */
218
0, /* META_SCS */
219
1, /* META_SCS_NAME */
220
1, /* META_SCS_NUMBER */
221
0, /* META_DOLLAR */
222
0, /* META_DOT */
223
0, /* META_ESCAPE - one more for ESC_P and ESC_p */
224
0, /* META_KET */
225
0, /* META_NOCAPTURE */
226
2, /* META_OPTIONS */
227
1, /* META_POSIX */
228
1, /* META_POSIX_NEG */
229
0, /* META_RANGE_ESCAPED */
230
0, /* META_RANGE_LITERAL */
231
SIZEOFFSET, /* META_RECURSE */
232
1+SIZEOFFSET, /* META_RECURSE_BYNAME */
233
0, /* META_SCRIPT_RUN */
234
0, /* META_LOOKAHEAD */
235
0, /* META_LOOKAHEADNOT */
236
SIZEOFFSET, /* META_LOOKBEHIND */
237
SIZEOFFSET, /* META_LOOKBEHINDNOT */
238
0, /* META_LOOKAHEAD_NA */
239
SIZEOFFSET, /* META_LOOKBEHIND_NA */
240
1, /* META_MARK - plus the string length */
241
0, /* META_ACCEPT */
242
0, /* META_FAIL */
243
0, /* META_COMMIT */
244
1, /* META_COMMIT_ARG - plus the string length */
245
0, /* META_PRUNE */
246
1, /* META_PRUNE_ARG - plus the string length */
247
0, /* META_SKIP */
248
1, /* META_SKIP_ARG - plus the string length */
249
0, /* META_THEN */
250
1, /* META_THEN_ARG - plus the string length */
251
0, /* META_ASTERISK */
252
0, /* META_ASTERISK_PLUS */
253
0, /* META_ASTERISK_QUERY */
254
0, /* META_PLUS */
255
0, /* META_PLUS_PLUS */
256
0, /* META_PLUS_QUERY */
257
0, /* META_QUERY */
258
0, /* META_QUERY_PLUS */
259
0, /* META_QUERY_QUERY */
260
2, /* META_MINMAX */
261
2, /* META_MINMAX_PLUS */
262
2, /* META_MINMAX_QUERY */
263
0, /* META_ECLASS_AND */
264
0, /* META_ECLASS_OR */
265
0, /* META_ECLASS_SUB */
266
0, /* META_ECLASS_XOR */
267
0 /* META_ECLASS_NOT */
268
};
269
270
/* Types for skipping parts of a parsed pattern. */
271
272
/* Selectors for the kind of skip to perform over a parsed pattern. The values
are spelled out; they are the same 0, 1, 2 that implicit numbering gives. */
enum {
  PSKIP_ALT   = 0,
  PSKIP_CLASS = 1,
  PSKIP_KET   = 2
};
273
274
/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
275
variables, which are concerned with first and required code units. A value
276
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
277
matching xxcu variable is set, and the low valued bits are relevant. */
278
279
#define REQ_UNSET 0xffffffffu /* Not yet found anything */
280
#define REQ_NONE 0xfffffffeu /* Found not fixed character */
281
#define REQ_CASELESS 0x00000001u /* Code unit in xxcu is caseless */
282
#define REQ_VARY 0x00000002u /* Code unit is followed by non-literal */
283
284
/* These flags are used in the groupinfo vector. */
285
286
#define GI_SET_FIXED_LENGTH 0x80000000u
287
#define GI_NOT_FIXED_LENGTH 0x40000000u
288
#define GI_FIXED_LENGTH_MASK 0x0000ffffu
289
290
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
291
and is fast (a good compiler can turn it into a subtraction and unsigned
292
comparison). */
293
294
#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
295
296
/* Table to identify hex digits. The tables in chartables are dependent on the
297
locale, and may mark arbitrary characters as digits. We want to recognize only
298
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
299
costs 256 bytes, but it is a lot faster than doing character value tests (at
300
least in some simple cases I timed), and in some applications one wants PCRE2
301
to compile efficiently as well as match efficiently. The value in the table is
302
the binary hex digit value, or 0xff for non-hex digits. */
303
304
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
305
UTF-8 mode. */
306
307
#ifndef EBCDIC
308
/* Hex digit lookup for the non-EBCDIC build: one entry per character code
0-255. An entry is the digit's value (0x00-0x0f) for '0'-'9', 'A'-'F' and
'a'-'f', and 0xff for every other code. */
static const uint8_t xdigitab[] =
309
{
310
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 */
311
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */
312
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 */
313
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */
314
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - ' */
315
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ( - / */
316
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 */
317
0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 8 - ? */
318
0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* @ - G */
319
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H - O */
320
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* P - W */
321
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* X - _ */
322
0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* ` - g */
323
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h - o */
324
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* p - w */
325
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* x -127 */
326
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
327
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
328
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
329
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
330
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
331
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
332
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
333
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
334
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
335
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
336
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
337
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
338
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
339
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
340
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
341
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
342
343
#else
344
345
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
346
347
static const uint8_t xdigitab[] =
348
{
349
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 0 */
350
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */
351
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 10 */
352
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */
353
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 32- 39 20 */
354
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40- 47 */
355
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 48- 55 30 */
356
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 56- 63 */
357
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - 71 40 */
358
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 72- | */
359
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* & - 87 50 */
360
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 88- 95 */
361
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - -103 60 */
362
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ? */
363
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
364
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- " */
365
0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g 80 */
366
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h -143 */
367
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p 90 */
368
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* q -159 */
369
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x A0 */
370
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* y -175 */
371
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ^ -183 B0 */
372
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
373
0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* { - G C0 */
374
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H -207 */
375
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* } - P D0 */
376
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Q -223 */
377
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* \ - X E0 */
378
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Y -239 */
379
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 F0 */
380
0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/* 8 -255 */
381
#endif /* EBCDIC */
382
383
384
/* Table for handling alphanumeric escaped characters. Positive returns are
385
simple data values; negative values are for special things like \d and so on.
386
Zero means further processing is needed (for things like \x), or the escape is
387
invalid. */
388
389
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
390
in UTF-8 mode. It runs from '0' to 'z'. */
391
392
#ifndef EBCDIC
393
#define ESCAPES_FIRST CHAR_0
394
#define ESCAPES_LAST CHAR_z
395
/* Convert a lower case letter code to upper case by subtracting 32. The
argument is parenthesized so that an expression argument (for example a
conditional expression) is converted as a whole. */
#define UPPER_CASE(c) ((c)-32)
396
397
static const short int escapes[] = {
398
/* 0 */ 0, /* 1 */ 0,
399
/* 2 */ 0, /* 3 */ 0,
400
/* 4 */ 0, /* 5 */ 0,
401
/* 6 */ 0, /* 7 */ 0,
402
/* 8 */ 0, /* 9 */ 0,
403
/* : */ CHAR_COLON, /* ; */ CHAR_SEMICOLON,
404
/* < */ CHAR_LESS_THAN_SIGN, /* = */ CHAR_EQUALS_SIGN,
405
/* > */ CHAR_GREATER_THAN_SIGN, /* ? */ CHAR_QUESTION_MARK,
406
/* @ */ CHAR_COMMERCIAL_AT, /* A */ -ESC_A,
407
/* B */ -ESC_B, /* C */ -ESC_C,
408
/* D */ -ESC_D, /* E */ -ESC_E,
409
/* F */ 0, /* G */ -ESC_G,
410
/* H */ -ESC_H, /* I */ 0,
411
/* J */ 0, /* K */ -ESC_K,
412
/* L */ 0, /* M */ 0,
413
/* N */ -ESC_N, /* O */ 0,
414
/* P */ -ESC_P, /* Q */ -ESC_Q,
415
/* R */ -ESC_R, /* S */ -ESC_S,
416
/* T */ 0, /* U */ 0,
417
/* V */ -ESC_V, /* W */ -ESC_W,
418
/* X */ -ESC_X, /* Y */ 0,
419
/* Z */ -ESC_Z, /* [ */ CHAR_LEFT_SQUARE_BRACKET,
420
/* \ */ CHAR_BACKSLASH, /* ] */ CHAR_RIGHT_SQUARE_BRACKET,
421
/* ^ */ CHAR_CIRCUMFLEX_ACCENT, /* _ */ CHAR_UNDERSCORE,
422
/* ` */ CHAR_GRAVE_ACCENT, /* a */ CHAR_BEL,
423
/* b */ -ESC_b, /* c */ 0,
424
/* d */ -ESC_d, /* e */ CHAR_ESC,
425
/* f */ CHAR_FF, /* g */ 0,
426
/* h */ -ESC_h, /* i */ 0,
427
/* j */ 0, /* k */ -ESC_k,
428
/* l */ 0, /* m */ 0,
429
/* n */ CHAR_LF, /* o */ 0,
430
/* p */ -ESC_p, /* q */ 0,
431
/* r */ CHAR_CR, /* s */ -ESC_s,
432
/* t */ CHAR_HT, /* u */ 0,
433
/* v */ -ESC_v, /* w */ -ESC_w,
434
/* x */ 0, /* y */ 0,
435
/* z */ -ESC_z
436
};
437
438
#else
439
440
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
441
It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
442
is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
443
because it is defined as 'a', which of course picks up the ASCII value. */
444
445
#if 'a' == 0x81 /* Check for a real EBCDIC environment */
446
#define ESCAPES_FIRST CHAR_a
447
#define ESCAPES_LAST CHAR_9
448
/* Convert a lower case letter code to upper case by adding 64 (used in the
real EBCDIC branch). The argument is parenthesized so that an expression
argument is converted as a whole. */
#define UPPER_CASE(c) ((c)+64)
449
#else /* Testing in an ASCII environment */
450
#define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */
451
#define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */
452
/* Case conversion used when the EBCDIC code is being tested in an ASCII
environment: subtract 32. The argument is parenthesized so that an expression
argument is converted as a whole. */
#define UPPER_CASE(c) ((c)-32)
453
#endif
454
455
static const short int escapes[] = {
456
/* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0,
457
/* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0,
458
/* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p,
459
/* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0,
460
/* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0,
461
/* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0,
462
/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
463
/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
464
/* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G,
465
/* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0,
466
/* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P,
467
/* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0,
468
/* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X,
469
/* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0,
470
/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
471
/* F8 */ 0, 0
472
};
473
474
/* We also need a table of characters that may follow \c in an EBCDIC
475
environment for characters 0-31. */
476
477
static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
478
479
#endif /* EBCDIC */
480
481
482
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
483
searched linearly. Put all the names into a single string, in order to reduce
484
the number of relocations when a shared library is dynamically linked. The
485
string is built from string macros so that it works in UTF-8 mode on EBCDIC
486
platforms. */
487
488
/* Describes one entry in the table of (*VERB) names. */
typedef struct verbitem {
489
unsigned int len; /* Length of verb name */
490
uint32_t meta; /* Base META_ code */
491
/* Argument requirement, as used in the verbs[] table: > 0 => the verb must
have an argument; < 0 => optional argument, converted to pre-MARK; 0 =>
optional argument, the META code is bumped if one is found. */
int has_arg;
492
} verbitem;
493
494
static const char verbnames[] =
495
"\0" /* Empty name is a shorthand for MARK */
496
STRING_MARK0
497
STRING_ACCEPT0
498
STRING_F0
499
STRING_FAIL0
500
STRING_COMMIT0
501
STRING_PRUNE0
502
STRING_SKIP0
503
STRING_THEN;
504
505
/* One entry per name in verbnames, in the same order. Each entry gives the
name length, the base META_ code, and the argument requirement. */
static const verbitem verbs[] = {
506
{ 0, META_MARK, +1 }, /* empty name (shorthand for MARK); > 0 => must have an argument */
507
{ 4, META_MARK, +1 }, /* MARK */
508
{ 6, META_ACCEPT, -1 }, /* ACCEPT; < 0 => Optional argument, convert to pre-MARK */
509
{ 1, META_FAIL, -1 }, /* F */
510
{ 4, META_FAIL, -1 }, /* FAIL */
511
{ 6, META_COMMIT, 0 }, /* COMMIT */
512
{ 5, META_PRUNE, 0 }, /* PRUNE; 0 => Optional argument; bump META code if found */
513
{ 4, META_SKIP, 0 }, /* SKIP */
514
{ 4, META_THEN, 0 } /* THEN */
515
};
516
517
/* Number of entries in the verbs table. */
static const int verbcount = sizeof(verbs)/sizeof(verbs[0]);
518
519
/* Verb opcodes, indexed by their META code offset from META_MARK. */
520
521
static const uint32_t verbops[] = {
522
OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
523
OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
524
525
/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
526
527
typedef struct alasitem {
528
unsigned int len; /* Length of name */
529
uint32_t meta; /* Base META_ code */
530
} alasitem;
531
532
static const char alasnames[] =
533
STRING_pla0
534
STRING_plb0
535
STRING_napla0
536
STRING_naplb0
537
STRING_nla0
538
STRING_nlb0
539
STRING_positive_lookahead0
540
STRING_positive_lookbehind0
541
STRING_non_atomic_positive_lookahead0
542
STRING_non_atomic_positive_lookbehind0
543
STRING_negative_lookahead0
544
STRING_negative_lookbehind0
545
STRING_scs0
546
STRING_scan_substring0
547
STRING_atomic0
548
STRING_sr0
549
STRING_asr0
550
STRING_script_run0
551
STRING_atomic_script_run;
552
553
/* One entry per name in alasnames, in the same order. Each entry gives the
name length and the META_ code it maps to; short and long spellings of the
same assertion share a META_ code. */
static const alasitem alasmeta[] = {
554
{ 3, META_LOOKAHEAD }, /* pla */
555
{ 3, META_LOOKBEHIND }, /* plb */
556
{ 5, META_LOOKAHEAD_NA }, /* napla */
557
{ 5, META_LOOKBEHIND_NA }, /* naplb */
558
{ 3, META_LOOKAHEADNOT }, /* nla */
559
{ 3, META_LOOKBEHINDNOT }, /* nlb */
560
{ 18, META_LOOKAHEAD }, /* positive_lookahead */
561
{ 19, META_LOOKBEHIND }, /* positive_lookbehind */
562
{ 29, META_LOOKAHEAD_NA }, /* non_atomic_positive_lookahead */
563
{ 30, META_LOOKBEHIND_NA }, /* non_atomic_positive_lookbehind */
564
{ 18, META_LOOKAHEADNOT }, /* negative_lookahead */
565
{ 19, META_LOOKBEHINDNOT }, /* negative_lookbehind */
566
{ 3, META_SCS }, /* scs */
567
{ 14, META_SCS }, /* scan_substring */
568
{ 6, META_ATOMIC }, /* atomic */
569
{ 2, META_SCRIPT_RUN }, /* sr = script run */
570
{ 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
571
{ 10, META_SCRIPT_RUN }, /* script run */
572
{ 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */
573
};
574
575
/* Number of entries in the alasmeta table. */
static const int alascount = sizeof(alasmeta)/sizeof(alasmeta[0]);
576
577
/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
578
579
static uint32_t chartypeoffset[] = {
580
OP_STAR - OP_STAR, OP_STARI - OP_STAR,
581
OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
582
583
/* Tables of names of POSIX character classes and their lengths. The names are
584
now all in a single string, to reduce the number of relocations when a shared
585
library is dynamically loaded. The list of lengths is terminated by a zero
586
length entry. The first three must be alpha, lower, upper, as this is assumed
587
for handling case independence.
588
589
The indices for several classes are stored in pcre2_compile.h - these must
590
be kept in sync with posix_names, posix_name_lengths, posix_class_maps,
591
and posix_substitutes. */
592
593
static const char posix_names[] =
594
STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
595
STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
596
STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
597
STRING_word0 STRING_xdigit;
598
599
/* Lengths of the POSIX class names, in the same order as posix_names. A zero
entry terminates the list. */
static const uint8_t posix_name_lengths[] = {
  5,  /* alpha */
  5,  /* lower */
  5,  /* upper */
  5,  /* alnum */
  5,  /* ascii */
  5,  /* blank */
  5,  /* cntrl */
  5,  /* digit */
  5,  /* graph */
  5,  /* print */
  5,  /* punct */
  5,  /* space */
  4,  /* word */
  6,  /* xdigit */
  0   /* terminator */
};
601
602
/* Table of class bit maps for each POSIX class. Each class is formed from a
603
base map, with an optional addition or removal of another map. Then, for some
604
classes, there is some additional tweaking: for [:blank:] the vertical space
605
characters are removed, and for [:alpha:] and [:alnum:] the underscore
606
character is removed. The triples in the table consist of the base map offset,
607
second map offset or -1 if no second map, and a non-negative value for map
608
addition or a negative value for map subtraction (if there are two maps). The
609
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
610
remove vertical space characters, 2 => remove underscore. */
611
612
const int PRIV(posix_class_maps)[] = {
613
cbit_word, cbit_digit, -2, /* alpha */
614
cbit_lower, -1, 0, /* lower */
615
cbit_upper, -1, 0, /* upper */
616
cbit_word, -1, 2, /* alnum - word without underscore */
617
cbit_print, cbit_cntrl, 0, /* ascii */
618
cbit_space, -1, 1, /* blank - a GNU extension */
619
cbit_cntrl, -1, 0, /* cntrl */
620
cbit_digit, -1, 0, /* digit */
621
cbit_graph, -1, 0, /* graph */
622
cbit_print, -1, 0, /* print */
623
cbit_punct, -1, 0, /* punct */
624
cbit_space, -1, 0, /* space */
625
cbit_word, -1, 0, /* word - a Perl extension */
626
cbit_xdigit, -1, 0 /* xdigit */
627
};
628
629
#ifdef SUPPORT_UNICODE
630
631
/* The POSIX class Unicode property substitutes that are used in UCP mode must
632
be in the order of the POSIX class names, defined above. */
633
634
static int posix_substitutes[] = {
635
PT_GC, ucp_L, /* alpha */
636
PT_PC, ucp_Ll, /* lower */
637
PT_PC, ucp_Lu, /* upper */
638
PT_ALNUM, 0, /* alnum */
639
-1, 0, /* ascii, treat as non-UCP */
640
-1, 1, /* blank, treat as \h */
641
PT_PC, ucp_Cc, /* cntrl */
642
PT_PC, ucp_Nd, /* digit */
643
PT_PXGRAPH, 0, /* graph */
644
PT_PXPRINT, 0, /* print */
645
PT_PXPUNCT, 0, /* punct */
646
PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */
647
PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */
648
PT_PXXDIGIT, 0 /* xdigit */ /* Perl has additional hex digits */
649
};
650
#endif /* SUPPORT_UNICODE */
651
652
/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
653
are allowed. */
654
655
#define PUBLIC_LITERAL_COMPILE_OPTIONS \
656
(PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
657
PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \
658
PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)
659
660
#define PUBLIC_COMPILE_OPTIONS \
661
(PUBLIC_LITERAL_COMPILE_OPTIONS| \
662
PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
663
PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
664
PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
665
PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
666
PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
667
PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_ALT_EXTENDED_CLASS)
668
669
#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
670
(PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD| \
671
PCRE2_EXTRA_CASELESS_RESTRICT|PCRE2_EXTRA_TURKISH_CASING)
672
673
#define PUBLIC_COMPILE_EXTRA_OPTIONS \
674
(PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
675
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
676
PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
677
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \
678
PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \
679
PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_PYTHON_OCTAL|PCRE2_EXTRA_NO_BS0| \
680
PCRE2_EXTRA_NEVER_CALLOUT)
681
682
/* This is a table of start-of-pattern options such as (*UTF) and settings such
683
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
684
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
685
generic and always supported. */
686
687
/* Kinds of start-of-pattern item; stored in the "type" field of a pso entry.
The values are spelled out; they match the implicit numbering 0..8. */
enum {
  PSO_OPT   = 0,  /* Value is an option bit */
  PSO_XOPT  = 1,  /* Value is an xoption bit */
  PSO_FLG   = 2,  /* Value is a flag bit */
  PSO_NL    = 3,  /* Value is a newline type */
  PSO_BSR   = 4,  /* Value is a \R type */
  PSO_LIMH  = 5,  /* Read integer value for heap limit */
  PSO_LIMM  = 6,  /* Read integer value for match limit */
  PSO_LIMD  = 7,  /* Read integer value for depth limit */
  PSO_OPTMZ = 8   /* Value is an optimization bit */
};
697
698
/* One entry in the table of start-of-pattern options and settings. */
typedef struct pso {
699
/* Item text to match; judging by the STRING_xxx_RIGHTPAR and STRING_xxx_EQ
macro names used in pso_list, it includes the closing ')' or '='. */
const char *name;
700
uint16_t length; /* Length of name */
701
uint16_t type; /* One of the PSO_ values */
702
/* Bit or type value selected by "type"; pso_list stores 0 here for the
PSO_LIMx entries, whose value is read from the pattern instead. */
uint32_t value;
703
} pso;
704
705
/* NB: STRING_UTFn_RIGHTPAR contains the length as well */
706
707
static const pso pso_list[] = {
708
{ STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF },
709
{ STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF },
710
{ STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP },
711
{ STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET },
712
{ STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET },
713
{ STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPTMZ, PCRE2_OPTIM_AUTO_POSSESS },
714
{ STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPTMZ, PCRE2_OPTIM_DOTSTAR_ANCHOR },
715
{ STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },
716
{ STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPTMZ, PCRE2_OPTIM_START_OPTIMIZE },
717
{ STRING_CASELESS_RESTRICT_RIGHTPAR, 18, PSO_XOPT, PCRE2_EXTRA_CASELESS_RESTRICT },
718
{ STRING_TURKISH_CASING_RIGHTPAR, 15, PSO_XOPT, PCRE2_EXTRA_TURKISH_CASING },
719
{ STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 },
720
{ STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },
721
{ STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 },
722
{ STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 },
723
{ STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR },
724
{ STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF },
725
{ STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF },
726
{ STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY },
727
{ STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL },
728
{ STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF },
729
{ STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF },
730
{ STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE }
731
};
732
733
/* This table is used when converting repeating opcodes into possessified
734
versions as a result of an explicit possessive quantifier such as ++. A zero
735
value means there is no possessified version - in those cases the item in
736
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
737
because all relevant opcodes are less than that. */
738
739
static const uint8_t opcode_possessify[] = {
740
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
741
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
742
743
0, /* NOTI */
744
OP_POSSTAR, 0, /* STAR, MINSTAR */
745
OP_POSPLUS, 0, /* PLUS, MINPLUS */
746
OP_POSQUERY, 0, /* QUERY, MINQUERY */
747
OP_POSUPTO, 0, /* UPTO, MINUPTO */
748
0, /* EXACT */
749
0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
750
751
OP_POSSTARI, 0, /* STARI, MINSTARI */
752
OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
753
OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
754
OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
755
0, /* EXACTI */
756
0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
757
758
OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
759
OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
760
OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
761
OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
762
0, /* NOTEXACT */
763
0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
764
765
OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
766
OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
767
OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
768
OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
769
0, /* NOTEXACTI */
770
0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
771
772
OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
773
OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
774
OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
775
OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
776
0, /* TYPEEXACT */
777
0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
778
779
OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
780
OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
781
OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
782
OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
783
0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
784
785
0, 0, 0, 0, /* CLASS, NCLASS, XCLASS, ECLASS */
786
0, 0, /* REF, REFI */
787
0, 0, /* DNREF, DNREFI */
788
0, 0, /* RECURSE, CALLOUT */
789
};
790
791
/* Compile-time check that the table has the correct size. */
792
STATIC_ASSERT(sizeof(opcode_possessify) == OP_CALLOUT+1, opcode_possessify);
793
794
795
#ifdef DEBUG_SHOW_PARSED
796
/*************************************************
797
* Show the parsed pattern for debugging *
798
*************************************************/
799
800
/* For debugging the pre-scan, this code, which outputs the parsed data vector,
801
can be enabled. */
802
803
/* Write a human-readable dump of the parsed pattern vector to stderr, one item
per line. Each line starts with the item's index in the vector and its raw
value in hex. Values below META_END are shown as a character when they are in
the printable ASCII range; everything else is decoded by META code, consuming
whatever extra data words that code carries. The dump stops at META_END, or at
the first META code that is not recognized here.

The conversions used below assume that uint32_t is unsigned int (%u, %x) and
that PCRE2_SIZE is size_t (%zu).

Argument:
  cb          the compile block whose parsed_pattern is to be shown

Returns:      nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Anything below META_END is not a META item; only printable ASCII is
  echoed as a character. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  /* Note that pptr is advanced past the META word before the case body runs,
  so within each case it addresses the item's first data word (if any). */

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* For references below 10 the offset is taken from the compile block
    instead of from the vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      /* One data word follows: type in the top 16 bits, value in the low 16. */
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? CHAR_P:CHAR_p,
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      /* Not found in the table: show a question mark instead. */
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    /* The three general repeat forms each carry two data words: min, max. */

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    /* The lookbehind items are followed by two data words, of which only the
    first is shown. */

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    /* Three data words: the two "next" values first, then the number. */

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Here the offset comes first and the number follows it, so the number is
    read ahead at index SIZEOFFSET and stepped over afterwards. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Three data words: a flag (zero means "=", otherwise ">="), then the two
    parts of the version number. */

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_OFFSET:
    fprintf(stderr, "META_OFFSET offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_SCS:
    fprintf(stderr, "META (*scan_substring:");
    break;

    case META_SCS_NAME:
    fprintf(stderr, "META_SCS_NAME length=%u relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    case META_SCS_NUMBER:
    fprintf(stderr, "META_SCS_NUMBER %u relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    /* The verbs with arguments share one printer: a length word followed by
    that many code-unit words. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;

    case META_ECLASS_AND: fprintf(stderr, "META_ECLASS_AND"); break;
    case META_ECLASS_OR: fprintf(stderr, "META_ECLASS_OR"); break;
    case META_ECLASS_SUB: fprintf(stderr, "META_ECLASS_SUB"); break;
    case META_ECLASS_XOR: fprintf(stderr, "META_ECLASS_XOR"); break;
    case META_ECLASS_NOT: fprintf(stderr, "META_ECLASS_NOT"); break;
    }

  fprintf(stderr, "\n");
  }

/* Control never gets here: the loop is left only by the returns above. */
}
1087
#endif /* DEBUG_SHOW_PARSED */
1088
1089
1090
1091
/*************************************************
1092
* Copy compiled code *
1093
*************************************************/
1094
1095
/* Compiled JIT code cannot be copied, so the new compiled block has no
1096
associated JIT data. */
1097
1098
/* Make a byte-for-byte duplicate of a compiled pattern block, using the
allocator recorded in the original. The duplicate refers to the same character
tables as the original and has no JIT data.

Argument:
  code        the compiled pattern to duplicate, or NULL

Returns:      the new block, or NULL if code is NULL or allocation fails
*/

PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code *code)
{
pcre2_code *duplicate;

if (code == NULL) return NULL;

duplicate = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
if (duplicate == NULL) return NULL;

memcpy(duplicate, code, code->blocksize);
duplicate->executable_jit = NULL;      /* JIT code is never carried over */

/* When PCRE2_DEREF_TABLES is set, a use count is stored directly after the
tables. The duplicate is one more user of those same tables, so bump it. */

if ((code->flags & PCRE2_DEREF_TABLES) != 0)
  {
  PCRE2_SIZE *users = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
  *users += 1;
  }

return duplicate;
}
1121
1122
1123
1124
/*************************************************
1125
* Copy compiled code and character tables *
1126
*************************************************/
1127
1128
/* Compiled JIT code cannot be copied, so the new compiled block has no
1129
associated JIT data. This version of code_copy also makes a separate copy of
1130
the character tables. */
1131
1132
/* Duplicate a compiled pattern block and give the duplicate its own private
copy of the character tables. The new tables are allocated with room for a
trailing use count, which starts at 1, and the duplicate is flagged with
PCRE2_DEREF_TABLES. The duplicate has no JIT data.

Argument:
  code        the compiled pattern to duplicate, or NULL

Returns:      the new block, or NULL if code is NULL or either allocation
              fails (nothing is left allocated in that case)
*/

PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code *code)
{
pcre2_code *duplicate;
uint8_t *private_tables;

if (code == NULL) return NULL;

/* First the pattern block itself. */

duplicate = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
if (duplicate == NULL) return NULL;

memcpy(duplicate, code, code->blocksize);
duplicate->executable_jit = NULL;

/* Then the tables, with space for the use count behind them. If this fails,
the block obtained above must be handed back. */

private_tables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
  code->memctl.memory_data);
if (private_tables == NULL)
  {
  code->memctl.free((void *)duplicate, code->memctl.memory_data);
  return NULL;
  }

memcpy(private_tables, code->tables, TABLES_LENGTH);
*((PCRE2_SIZE *)(private_tables + TABLES_LENGTH)) = 1;

duplicate->tables = private_tables;
duplicate->flags |= PCRE2_DEREF_TABLES;

return duplicate;
}
1160
1161
1162
1163
/*************************************************
1164
* Free compiled code *
1165
*************************************************/
1166
1167
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1168
pcre2_code_free(pcre2_code *code)
1169
{
1170
PCRE2_SIZE* ref_count;
1171
1172
if (code != NULL)
1173
{
1174
#ifdef SUPPORT_JIT
1175
if (code->executable_jit != NULL)
1176
PRIV(jit_free)(code->executable_jit, &code->memctl);
1177
#endif
1178
1179
if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1180
{
1181
/* Decoded tables belong to the codes after deserialization, and they must
1182
be freed when there are no more references to them. The *ref_count should
1183
always be > 0. */
1184
1185
ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1186
if (*ref_count > 0)
1187
{
1188
(*ref_count)--;
1189
if (*ref_count == 0)
1190
code->memctl.free((void *)code->tables, code->memctl.memory_data);
1191
}
1192
}
1193
1194
code->memctl.free(code, code->memctl.memory_data);
1195
}
1196
}
1197
1198
1199
1200
/*************************************************
1201
* Read a number, possibly signed *
1202
*************************************************/
1203
1204
/* This function is used to read numbers in the pattern. The initial pointer
1205
must be at the sign or first digit of the number. When relative values
1206
(introduced by + or -) are allowed, they are relative group numbers, and the
1207
result must be greater than zero.
1208
1209
Arguments:
1210
ptrptr points to the character pointer variable
1211
ptrend points to the end of the input string
1212
allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this
1213
max_value the largest number allowed;
1214
you must not pass a value for max_value larger than
1215
INT_MAX/10 - 1 because this function relies on max_value to
1216
avoid integer overflow
1217
max_error the error to give for an over-large number
1218
intptr where to put the result
1219
errorcodeptr where to put an error code
1220
1221
Returns: TRUE - a number was read
1222
FALSE - errorcode == 0 => no number was found
1223
errorcode != 0 => an error occurred
1224
*/
1225
1226
static BOOL
1227
read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1228
uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1229
{
1230
int sign = 0;
1231
uint32_t n = 0;
1232
PCRE2_SPTR ptr = *ptrptr;
1233
BOOL yield = FALSE;
1234
1235
PCRE2_ASSERT(max_value <= INT_MAX/10 - 1);
1236
1237
*errorcodeptr = 0;
1238
1239
if (allow_sign >= 0 && ptr < ptrend)
1240
{
1241
if (*ptr == CHAR_PLUS)
1242
{
1243
sign = +1;
1244
max_value -= allow_sign;
1245
ptr++;
1246
}
1247
else if (*ptr == CHAR_MINUS)
1248
{
1249
sign = -1;
1250
ptr++;
1251
}
1252
}
1253
1254
if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1255
while (ptr < ptrend && IS_DIGIT(*ptr))
1256
{
1257
n = n * 10 + (*ptr++ - CHAR_0);
1258
if (n > max_value)
1259
{
1260
*errorcodeptr = max_error;
1261
while (ptr < ptrend && IS_DIGIT(*ptr)) ptr++;
1262
goto EXIT;
1263
}
1264
}
1265
1266
if (allow_sign >= 0 && sign != 0)
1267
{
1268
if (n == 0)
1269
{
1270
*errorcodeptr = ERR26; /* +0 and -0 are not allowed */
1271
goto EXIT;
1272
}
1273
1274
if (sign > 0) n += allow_sign;
1275
else if (n > (uint32_t)allow_sign)
1276
{
1277
*errorcodeptr = ERR15; /* Non-existent subpattern */
1278
goto EXIT;
1279
}
1280
else n = allow_sign + 1 - n;
1281
}
1282
1283
yield = TRUE;
1284
1285
EXIT:
1286
*intptr = n;
1287
*ptrptr = ptr;
1288
return yield;
1289
}
1290
1291
1292
1293
/*************************************************
1294
* Read repeat counts *
1295
*************************************************/
1296
1297
/* Read an item of the form {n,m} and return the values when non-NULL pointers
1298
are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1299
larger value is used for "unlimited". We have to use signed arguments for
1300
read_number() because it is capable of returning a signed value. As of Perl
1301
5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1302
tabs after { and before } and between the numbers and the comma, so we do too.
1303
1304
Arguments:
1305
ptrptr points to pointer to character after '{'
1306
ptrend pointer to end of input
1307
minp if not NULL, pointer to int for min
1308
maxp if not NULL, pointer to int for max
1309
errorcodeptr points to error code variable
1310
1311
Returns: FALSE if not a repeat quantifier, errorcode set zero
1312
FALSE on error, with errorcode set non-zero
1313
TRUE on success, with pointer updated to point after '}'
1314
*/
1315
1316
static BOOL
1317
read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1318
uint32_t *maxp, int *errorcodeptr)
1319
{
1320
PCRE2_SPTR p = *ptrptr;
1321
PCRE2_SPTR pp;
1322
BOOL yield = FALSE;
1323
BOOL had_minimum = FALSE;
1324
int32_t min = 0;
1325
int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1326
1327
*errorcodeptr = 0;
1328
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1329
1330
/* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1331
such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1332
error. */
1333
1334
pp = p;
1335
if (pp < ptrend && IS_DIGIT(*pp))
1336
{
1337
had_minimum = TRUE;
1338
while (++pp < ptrend && IS_DIGIT(*pp)) {}
1339
}
1340
1341
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1342
if (pp >= ptrend) return FALSE;
1343
1344
if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1345
{
1346
if (!had_minimum) return FALSE;
1347
}
1348
else
1349
{
1350
if (*pp++ != CHAR_COMMA) return FALSE;
1351
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1352
if (pp >= ptrend) return FALSE;
1353
if (IS_DIGIT(*pp))
1354
{
1355
while (++pp < ptrend && IS_DIGIT(*pp)) {}
1356
}
1357
else if (!had_minimum) return FALSE;
1358
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1359
if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1360
}
1361
1362
/* Now process the quantifier for real. We know it must be {n} or {n,} or {,m}
1363
or {n,m}. The only error that read_number() can return is for a number that is
1364
too big. If *errorcodeptr is returned as zero it means no number was found. */
1365
1366
/* Deal with {,m} or n too big. If we successfully read m there is no need to
1367
check m >= n because n defaults to zero. */
1368
1369
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1370
{
1371
if (*errorcodeptr != 0) goto EXIT; /* n too big */
1372
p++; /* Skip comma and subsequent spaces */
1373
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1374
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1375
{
1376
if (*errorcodeptr != 0) goto EXIT; /* m too big */
1377
}
1378
}
1379
1380
/* Have read one number. Deal with {n} or {n,} or {n,m} */
1381
1382
else
1383
{
1384
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1385
if (*p == CHAR_RIGHT_CURLY_BRACKET)
1386
{
1387
max = min;
1388
}
1389
else /* Handle {n,} or {n,m} */
1390
{
1391
p++; /* Skip comma and subsequent spaces */
1392
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1393
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1394
{
1395
if (*errorcodeptr != 0) goto EXIT; /* m too big */
1396
}
1397
1398
if (max < min)
1399
{
1400
*errorcodeptr = ERR4;
1401
goto EXIT;
1402
}
1403
}
1404
}
1405
1406
/* Valid quantifier exists */
1407
1408
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1409
p++;
1410
yield = TRUE;
1411
if (minp != NULL) *minp = (uint32_t)min;
1412
if (maxp != NULL) *maxp = (uint32_t)max;
1413
1414
/* Update the pattern pointer */
1415
1416
EXIT:
1417
*ptrptr = p;
1418
return yield;
1419
}
1420
1421
1422
1423
/*************************************************
1424
* Handle escapes *
1425
*************************************************/
1426
1427
/* This function is called when a \ has been encountered. It either returns a
1428
positive value for a simple escape such as \d, or 0 for a data character, which
1429
is placed in chptr. A backreference to group n is returned as -(n+1). On
1430
entry, ptr is pointing at the character after \. On exit, it points after the
1431
final code unit of the escape sequence.
1432
1433
This function is also called from pcre2_substitute() to handle escape sequences
1434
in replacement strings. In this case, the cb argument is NULL, and in the case
1435
of escapes that have further processing, only sequences that define a data
1436
character are recognised. The options argument is the final value of the
1437
compiled pattern's options.
1438
1439
Arguments:
1440
ptrptr points to the input position pointer
1441
ptrend points to the end of the input
1442
chptr points to a returned data character
1443
errorcodeptr points to the errorcode variable (containing zero)
1444
options the current options bits
1445
xoptions the current extra options bits
1446
bracount the number of capturing parentheses encountered so far
1447
isclass TRUE if in a character class
1448
cb compile data block or NULL when called from pcre2_substitute()
1449
1450
Returns: zero => a data character
1451
positive => a special escape sequence
1452
negative => a numerical back reference
1453
on error, errorcodeptr is set non-zero
1454
*/
1455
1456
int
1457
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1458
int *errorcodeptr, uint32_t options, uint32_t xoptions, uint32_t bracount,
1459
BOOL isclass, compile_block *cb)
1460
{
1461
BOOL utf = (options & PCRE2_UTF) != 0;
1462
BOOL alt_bsux =
1463
((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1464
PCRE2_SPTR ptr = *ptrptr;
1465
uint32_t c, cc;
1466
int escape = 0;
1467
int i;
1468
1469
/* If backslash is at the end of the string, it's an error. */
1470
1471
if (ptr >= ptrend)
1472
{
1473
*errorcodeptr = ERR1;
1474
return 0;
1475
}
1476
1477
GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1478
*errorcodeptr = 0; /* Be optimistic */
1479
1480
/* Non-alphanumerics are literals, so we just leave the value in c. An initial
1481
value test saves a memory lookup for code points outside the alphanumeric
1482
range. */
1483
1484
if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
1485
1486
/* Otherwise, do a table lookup. Non-zero values need little processing here. A
1487
positive value is a literal value for something like \n. A negative value is
1488
the negation of one of the ESC_ macros that is passed back for handling by the
1489
calling function. Some extra checking is needed for \N because only \N{U+dddd}
1490
is supported. If the value is zero, further processing is handled below. */
1491
1492
else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1493
{
1494
if (i > 0)
1495
{
1496
c = (uint32_t)i;
1497
if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1498
c = CHAR_LF;
1499
}
1500
else /* Negative table entry */
1501
{
1502
escape = -i; /* Else return a special escape */
1503
if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1504
cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
1505
1506
/* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1507
Unicode code points, as well as plain \N for "not newline". PCRE does not
1508
support \N{name}. However, it does support quantification such as \N{2,3},
1509
so if \N{ is not followed by U+dddd we check for a quantifier. */
1510
1511
if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1512
{
1513
PCRE2_SPTR p = ptr + 1;
1514
1515
/* Perl ignores spaces and tabs after { */
1516
1517
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1518
1519
/* \N{U+ can be handled by the \x{ code. However, this construction is
1520
not valid in EBCDIC environments because it specifies a Unicode
1521
character, not a codepoint in the local code. For example \N{U+0041}
1522
must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1523
casing semantics for the entire pattern, so allow it only in UTF (i.e.
1524
Unicode) mode. */
1525
1526
if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1527
{
1528
#ifndef EBCDIC
1529
if (utf)
1530
{
1531
ptr = p + 2;
1532
escape = 0; /* Not a fancy escape after all */
1533
goto COME_FROM_NU;
1534
}
1535
#endif
1536
*errorcodeptr = ERR93;
1537
}
1538
1539
/* Give an error in contexts where quantifiers are not allowed
1540
(character classes; substitution strings). */
1541
1542
else if (isclass || cb == NULL)
1543
{
1544
*errorcodeptr = ERR37;
1545
}
1546
1547
/* Give an error if what follows is not a quantifier, but don't override
1548
an error set by the quantifier reader (e.g. number overflow). */
1549
1550
else
1551
{
1552
if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1553
*errorcodeptr == 0)
1554
*errorcodeptr = ERR37;
1555
}
1556
}
1557
}
1558
}
1559
1560
/* Escapes that need further processing, including those that are unknown, have
1561
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1562
\o, and \x are recognized (\u and \U can never appear as they are used for case
1563
forcing). */
1564
1565
else
1566
{
1567
int s;
1568
PCRE2_SPTR oldptr;
1569
BOOL overflow;
1570
1571
/* Filter calls from pcre2_substitute(). */
1572
1573
if (cb == NULL)
1574
{
1575
if (c < CHAR_0 ||
1576
(c > CHAR_9 && (c != CHAR_c && c != CHAR_o && c != CHAR_x && c != CHAR_g)))
1577
{
1578
*errorcodeptr = ERR3;
1579
return 0;
1580
}
1581
alt_bsux = FALSE; /* Do not modify \x handling */
1582
}
1583
1584
switch (c)
1585
{
1586
/* A number of Perl escapes are not handled by PCRE. We give an explicit
1587
error. */
1588
1589
case CHAR_F:
1590
case CHAR_l:
1591
case CHAR_L:
1592
*errorcodeptr = ERR37;
1593
break;
1594
1595
/* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1596
is set. Otherwise, \u must be followed by exactly four hex digits or, if
1597
PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1598
Otherwise it is a lowercase u letter. This gives some compatibility with
1599
ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1600
allowed. When \u{ is not followed by hex digits, a special return is given
1601
because otherwise \u{ 12} (for example) would be treated as u{12}. */
1602
1603
case CHAR_u:
1604
if (!alt_bsux) *errorcodeptr = ERR37; else
1605
{
1606
uint32_t xc;
1607
1608
if (ptr >= ptrend) break;
1609
if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1610
(xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1611
{
1612
PCRE2_SPTR hptr = ptr + 1;
1613
1614
cc = 0;
1615
while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1616
{
1617
if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
1618
{
1619
*errorcodeptr = ERR77;
1620
ptr = hptr; /* Show where */
1621
break; /* *hptr != } will cause another break below */
1622
}
1623
cc = (cc << 4) | xc;
1624
hptr++;
1625
}
1626
1627
if (hptr == ptr + 1 || /* No hex digits */
1628
hptr >= ptrend || /* Hit end of input */
1629
*hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
1630
{
1631
if (isclass) break; /* In a class, just treat as '\u' literal */
1632
escape = ESC_ub; /* Special return */
1633
ptr++; /* Skip { */
1634
break; /* Hex escape not recognized */
1635
}
1636
1637
c = cc; /* Accept the code point */
1638
ptr = hptr + 1;
1639
}
1640
1641
else /* Must be exactly 4 hex digits */
1642
{
1643
if (ptrend - ptr < 4) break; /* Less than 4 chars */
1644
if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1645
if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1646
cc = (cc << 4) | xc;
1647
if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
1648
cc = (cc << 4) | xc;
1649
if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
1650
c = (cc << 4) | xc;
1651
ptr += 4;
1652
}
1653
1654
if (utf)
1655
{
1656
if (c > 0x10ffffU) *errorcodeptr = ERR77;
1657
else
1658
if (c >= 0xd800 && c <= 0xdfff &&
1659
(xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1660
*errorcodeptr = ERR73;
1661
}
1662
else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1663
}
1664
break;
1665
1666
/* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1667
in which case it is an upper case letter. */
1668
1669
case CHAR_U:
1670
if (!alt_bsux) *errorcodeptr = ERR37;
1671
break;
1672
1673
/* In a character class, \g is just a literal "g". Outside a character
1674
class, \g must be followed by one of a number of specific things:
1675
1676
(1) A number, either plain or braced. If positive, it is an absolute
1677
backreference. If negative, it is a relative backreference. This is a Perl
1678
5.10 feature.
1679
1680
(2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1681
is part of Perl's movement towards a unified syntax for back references. As
1682
this is synonymous with \k{name}, we fudge it up by pretending it really
1683
was \k{name}.
1684
1685
(3) For Oniguruma compatibility we also support \g followed by a name or a
1686
number either in angle brackets or in single quotes. However, these are
1687
(possibly recursive) subroutine calls, _not_ backreferences. We return
1688
the ESC_g code.
1689
1690
Summary: Return a negative number for a numerical back reference (offset
1691
by 1), ESC_k for a named back reference, and ESC_g for a named or
1692
numbered subroutine call.
1693
1694
The above describes the \g behaviour inside patterns. Inside replacement
1695
strings (pcre2_substitute) we support only \g<nameornum> for Python
1696
compatibility. Return ESC_g for the named case, and -(num+1) for the
1697
numbered case.
1698
*/
1699
1700
case CHAR_g:
1701
if (isclass) break;
1702
1703
if (ptr >= ptrend)
1704
{
1705
*errorcodeptr = ERR57;
1706
break;
1707
}
1708
1709
if (cb == NULL)
1710
{
1711
PCRE2_SPTR p;
1712
/* Substitution strings */
1713
if (*ptr != CHAR_LESS_THAN_SIGN)
1714
{
1715
*errorcodeptr = ERR57;
1716
break;
1717
}
1718
1719
p = ptr + 1;
1720
1721
if (!read_number(&p, ptrend, -1, MAX_GROUP_NUMBER, ERR61, &s,
1722
errorcodeptr))
1723
{
1724
if (*errorcodeptr == 0) escape = ESC_g; /* No number found */
1725
break;
1726
}
1727
1728
if (p >= ptrend || *p != CHAR_GREATER_THAN_SIGN)
1729
{
1730
/* not advancing ptr; report error at the \g character */
1731
*errorcodeptr = ERR57;
1732
break;
1733
}
1734
1735
/* This is the reason that back references are returned as -(s+1) rather
1736
than just -s. In a pattern, \0 is not a back reference, but \g<0> is
1737
valid in a substitution string, so this must be representable. */
1738
ptr = p + 1;
1739
escape = -(s+1);
1740
break;
1741
}
1742
1743
if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1744
{
1745
escape = ESC_g;
1746
break;
1747
}
1748
1749
/* If there is a brace delimiter, try to read a numerical reference. If
1750
there isn't one, assume we have a name and treat it as \k. */
1751
1752
if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1753
{
1754
PCRE2_SPTR p = ptr + 1;
1755
1756
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1757
if (!read_number(&p, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1758
errorcodeptr))
1759
{
1760
if (*errorcodeptr == 0) escape = ESC_k; /* No number found */
1761
break;
1762
}
1763
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1764
1765
if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1766
{
1767
/* not advancing ptr; report error at the \g character */
1768
*errorcodeptr = ERR57;
1769
break;
1770
}
1771
ptr = p + 1;
1772
}
1773
1774
/* Read an undelimited number */
1775
1776
else
1777
{
1778
if (!read_number(&ptr, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1779
errorcodeptr))
1780
{
1781
if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */
1782
break;
1783
}
1784
}
1785
1786
if (s <= 0)
1787
{
1788
*errorcodeptr = ERR15;
1789
break;
1790
}
1791
1792
escape = -(s+1);
1793
break;
1794
1795
/* The handling of escape sequences consisting of a string of digits
1796
starting with one that is not zero is not straightforward. Perl has changed
1797
over the years. Nowadays \g{} for backreferences and \o{} for octal are
1798
recommended to avoid the ambiguities in the old syntax.
1799
1800
Outside a character class, the digits are read as a decimal number. If the
1801
number is less than 10, or if there are that many previous extracting left
1802
brackets, it is a back reference. Otherwise, up to three octal digits are
1803
read to form an escaped character code. Thus \123 is likely to be octal 123
1804
(cf \0123, which is octal 012 followed by the literal 3). This is the "Perl
1805
style" of handling ambiguous octal/backrefences such as \12.
1806
1807
There is an alternative disambiguation strategy, selected by
1808
PCRE2_EXTRA_PYTHON_OCTAL, which follows Python's behaviour. An octal must
1809
have either a leading zero, or exactly three octal digits; otherwise it's
1810
a backreference. The disambiguation is stable, and does not depend on how
1811
many capture groups are defined (it's simply an invalid backreference if
1812
there is no corresponding capture group). Additionally, octal values above
1813
\377 (\xff) are rejected.
1814
1815
Inside a character class, \ followed by a digit is always either a literal
1816
8 or 9 or an octal number. */
1817
1818
case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1819
case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1820
1821
if (isclass)
1822
{
1823
/* Fall through to octal handling; never a backreference inside a class. */
1824
}
1825
else if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0)
1826
{
1827
/* Python-style disambiguation. */
1828
if (ptr[-1] <= CHAR_7 && ptr + 1 < ptrend && ptr[0] >= CHAR_0 &&
1829
ptr[0] <= CHAR_7 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1830
{
1831
/* We peeked a three-digit octal, so fall through */
1832
}
1833
else
1834
{
1835
/* We are at a digit, so the only possible error from read_number() is
1836
a number that is too large. */
1837
ptr--; /* Back to the digit */
1838
1839
if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1840
{
1841
*errorcodeptr = ERR61;
1842
break;
1843
}
1844
1845
escape = -(s+1);
1846
break;
1847
}
1848
}
1849
else
1850
{
1851
/* Perl-style disambiguation. */
1852
oldptr = ptr;
1853
ptr--; /* Back to the digit */
1854
1855
/* As we know we are at a digit, the only possible error from
1856
read_number() is a number that is too large to be a group number. Because
1857
that number might be still valid if read as an octal, errorcodeptr is not
1858
set on failure and therefore a sentinel value of INT_MAX is used instead
1859
of the original value, and will be used later to properly set the error,
1860
if not falling through. */
1861
1862
if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1863
s = INT_MAX;
1864
1865
/* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1866
are octal escapes if there are not that many previous captures. */
1867
1868
if (s < 10 || c >= CHAR_8 || (unsigned)s <= bracount)
1869
{
1870
/* s > MAX_GROUP_NUMBER should not be possible because of read_number(),
1871
but we keep it just to be safe and because it will also catch the
1872
sentinel value that was set on failure by that function. */
1873
1874
if ((unsigned)s > MAX_GROUP_NUMBER)
1875
{
1876
PCRE2_ASSERT(s == INT_MAX);
1877
*errorcodeptr = ERR61;
1878
}
1879
else escape = -(s+1); /* Indicates a back reference */
1880
break;
1881
}
1882
1883
ptr = oldptr; /* Put the pointer back and fall through */
1884
}
1885
1886
/* Handle a digit following \ when the number is not a back reference, or
1887
we are within a character class. If the first digit is 8 or 9, Perl used to
1888
generate a binary zero and then treat the digit as a following literal. At
1889
least by Perl 5.18 this changed so as not to insert the binary zero. */
1890
1891
if (c >= CHAR_8) break;
1892
1893
/* Fall through */
1894
1895
/* \0 always starts an octal number, but we may drop through to here with a
1896
larger first octal digit. The original code used just to take the least
1897
significant 8 bits of octal numbers (I think this is what early Perls used
1898
to do). Nowadays we allow for larger numbers in UTF-8 mode and 16/32-bit mode,
1899
but no more than 3 octal digits. */
1900
1901
case CHAR_0:
1902
c -= CHAR_0;
1903
while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1904
c = c * 8 + *ptr++ - CHAR_0;
1905
if (c > 0xff)
1906
{
1907
if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0) *errorcodeptr = ERR102;
1908
#if PCRE2_CODE_UNIT_WIDTH == 8
1909
else if (!utf) *errorcodeptr = ERR51;
1910
#endif
1911
}
1912
1913
/* PCRE2_EXTRA_NO_BS0 disables the NUL escape '\0' but doesn't affect
1914
two- or three-character octal escapes \00 and \000, nor \x00. */
1915
1916
if ((xoptions & PCRE2_EXTRA_NO_BS0) != 0 && c == 0 && i == 1)
1917
*errorcodeptr = ERR98;
1918
break;
1919
1920
/* \o is a relatively new Perl feature, supporting a more general way of
1921
specifying character codes in octal. The only supported form is \o{ddd},
1922
with optional spaces or tabs after { and before }. */
1923
1924
case CHAR_o:
1925
if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1926
{
1927
ptr--;
1928
*errorcodeptr = ERR55;
1929
break;
1930
}
1931
1932
while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1933
if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1934
{
1935
*errorcodeptr = ERR78;
1936
break;
1937
}
1938
1939
c = 0;
1940
overflow = FALSE;
1941
while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1942
{
1943
cc = *ptr++;
1944
if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1945
#if PCRE2_CODE_UNIT_WIDTH == 32
1946
if (c >= 0x20000000u) { overflow = TRUE; break; }
1947
#endif
1948
c = (c << 3) + (cc - CHAR_0);
1949
#if PCRE2_CODE_UNIT_WIDTH == 8
1950
if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1951
#elif PCRE2_CODE_UNIT_WIDTH == 16
1952
if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1953
#elif PCRE2_CODE_UNIT_WIDTH == 32
1954
if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1955
#endif
1956
}
1957
1958
while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1959
1960
if (overflow)
1961
{
1962
while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1963
*errorcodeptr = ERR34;
1964
}
1965
else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1966
{
1967
if (utf && c >= 0xd800 && c <= 0xdfff &&
1968
(xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1969
{
1970
ptr--;
1971
*errorcodeptr = ERR73;
1972
}
1973
}
1974
else
1975
{
1976
ptr--;
1977
*errorcodeptr = ERR64;
1978
}
1979
break;
1980
1981
/* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1982
by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1983
1984
case CHAR_x:
1985
if (alt_bsux)
1986
{
1987
uint32_t xc;
1988
if (ptrend - ptr < 2) break; /* Less than 2 characters */
1989
if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1990
if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1991
c = (cc << 4) | xc;
1992
ptr += 2;
1993
}
1994
1995
/* Handle \x in Perl's style. \x{ddd} is a character code which can be
1996
greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1997
digits. If not, { used to be treated as a data character. However, Perl
1998
seems to read hex digits up to the first non-such, and ignore the rest, so
1999
that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
2000
now gives an error. */
2001
2002
else
2003
{
2004
if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
2005
{
2006
ptr++;
2007
while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2008
2009
#ifndef EBCDIC
2010
COME_FROM_NU:
2011
#endif
2012
if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
2013
{
2014
*errorcodeptr = ERR78;
2015
break;
2016
}
2017
c = 0;
2018
overflow = FALSE;
2019
2020
while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2021
{
2022
ptr++;
2023
if (c == 0 && cc == 0) continue; /* Leading zeroes */
2024
#if PCRE2_CODE_UNIT_WIDTH == 32
2025
if (c >= 0x10000000l) { overflow = TRUE; break; }
2026
#endif
2027
c = (c << 4) | cc;
2028
if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2029
{
2030
overflow = TRUE;
2031
break;
2032
}
2033
}
2034
2035
/* Perl ignores spaces and tabs before } */
2036
2037
while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2038
2039
/* On overflow, skip remaining hex digits */
2040
2041
if (overflow)
2042
{
2043
while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2044
*errorcodeptr = ERR34;
2045
}
2046
else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
2047
{
2048
if (utf && c >= 0xd800 && c <= 0xdfff &&
2049
(xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2050
{
2051
ptr--;
2052
*errorcodeptr = ERR73;
2053
}
2054
}
2055
2056
/* If the sequence of hex digits (followed by optional space) does not
2057
end with '}', give an error. We used just to recognize this construct
2058
and fall through to the normal \x handling, but nowadays Perl gives an
2059
error, which seems much more sensible, so we do too. */
2060
2061
else
2062
{
2063
ptr--;
2064
*errorcodeptr = ERR67;
2065
}
2066
} /* End of \x{} processing */
2067
2068
/* Read a up to two hex digits after \x */
2069
2070
else
2071
{
2072
/* Perl has the surprising/broken behaviour that \x without following
2073
hex digits is treated as an escape for NUL. Their source code laments
2074
this but keeps it for backwards compatibility. A warning is printed
2075
when "use warnings" is enabled. Because we don't have warnings, we
2076
simply forbid it. */
2077
if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff)
2078
{
2079
/* Not a hex digit */
2080
*errorcodeptr = ERR78;
2081
break;
2082
}
2083
ptr++;
2084
c = cc;
2085
2086
/* With "use re 'strict'" Perl actually requires exactly two digits (error
2087
for \x, \xA and \xAAA). While \x was already rejected, this seems overly
2088
strict, and there seems little incentive to align with that, given the
2089
backwards-compatibility cost.
2090
2091
For comparison, note that other engines disagree. For example:
2092
- Java allows 1 or 2 hex digits. Error if 0 digits. No error if >2 digits
2093
- .NET requires 2 hex digits. Error if 0, 1 digits. No error if >2 digits.
2094
*/
2095
if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
2096
ptr++;
2097
c = (c << 4) | cc;
2098
} /* End of \xdd handling */
2099
} /* End of Perl-style \x handling */
2100
break;
2101
2102
/* The handling of \c is different in ASCII and EBCDIC environments. In an
2103
ASCII (or Unicode) environment, an error is given if the character
2104
following \c is not a printable ASCII character. Otherwise, the following
2105
character is upper-cased if it is a letter, and after that the 0x40 bit is
2106
flipped. The result is the value of the escape.
2107
2108
In an EBCDIC environment the handling of \c is compatible with the
2109
specification in the perlebcdic document. The following character must be
2110
a letter or one of small number of special characters. These provide a
2111
means of defining the character values 0-31.
2112
2113
For testing the EBCDIC handling of \c in an ASCII environment, recognize
2114
the EBCDIC value of 'c' explicitly. */
2115
2116
#if defined EBCDIC && 'a' != 0x81
2117
case 0x83:
2118
#else
2119
case CHAR_c:
2120
#endif
2121
if (ptr >= ptrend)
2122
{
2123
*errorcodeptr = ERR2;
2124
break;
2125
}
2126
c = *ptr;
2127
if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2128
2129
/* Handle \c in an ASCII/Unicode environment. */
2130
2131
#ifndef EBCDIC /* ASCII/UTF-8 coding */
2132
if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
2133
{
2134
*errorcodeptr = ERR68;
2135
break;
2136
}
2137
c ^= 0x40;
2138
2139
/* Handle \c in an EBCDIC environment. The special case \c? is converted to
2140
255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2141
POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2142
The other valid sequences correspond to a list of specific characters. */
2143
2144
#else
2145
if (c == CHAR_QUESTION_MARK)
2146
c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2147
else
2148
{
2149
for (i = 0; i < 32; i++)
2150
{
2151
if (c == ebcdic_escape_c[i]) break;
2152
}
2153
if (i < 32) c = i; else *errorcodeptr = ERR68;
2154
}
2155
#endif /* EBCDIC */
2156
2157
ptr++;
2158
break;
2159
2160
/* Any other alphanumeric following \ is an error. Perl gives an error only
2161
if in warning mode, but PCRE doesn't have a warning mode. */
2162
2163
default:
2164
*errorcodeptr = ERR3;
2165
*ptrptr = ptr - 1; /* Point to the character at fault */
2166
return 0;
2167
}
2168
}
2169
2170
/* Set the pointer to the next character before returning. */
2171
2172
*ptrptr = ptr;
2173
*chptr = c;
2174
return escape;
2175
}
2176
2177
2178
2179
#ifdef SUPPORT_UNICODE
2180
/*************************************************
2181
* Handle \P and \p *
2182
*************************************************/
2183
2184
/* This function is called after \P or \p has been encountered, provided that
2185
PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2186
contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2187
after the final code unit of the escape sequence.
2188
2189
Arguments:
2190
ptrptr the pattern position pointer
2191
negptr a boolean that is set TRUE for negation else FALSE
2192
ptypeptr an unsigned int that is set to the type value
2193
pdataptr an unsigned int that is set to the detailed property value
2194
errorcodeptr the error code variable
2195
cb the compile data
2196
2197
Returns: TRUE if the type value was found, or FALSE for an invalid type
2198
*/
2199
2200
static BOOL
2201
get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2202
uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2203
{
2204
PCRE2_UCHAR c;
2205
PCRE2_SIZE i, bot, top;
2206
PCRE2_SPTR ptr = *ptrptr;
2207
PCRE2_UCHAR name[50];
2208
PCRE2_UCHAR *vptr = NULL;
2209
uint16_t ptscript = PT_NOTSCRIPT;
2210
2211
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2212
c = *ptr++;
2213
*negptr = FALSE;
2214
2215
/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2216
negation. We must be handling Unicode encoding here, though we may be compiling
2217
for UTF-8 input in an EBCDIC environment. (PCRE2 does not support both EBCDIC
2218
input and Unicode input in the same build.) In accordance with Unicode's "loose
2219
matching" rules, ASCII white space, hyphens, and underscores are ignored. We
2220
don't use isspace() or tolower() because (a) code points may be greater than
2221
255, and (b) they wouldn't work when compiling for Unicode in an EBCDIC
2222
environment. */
2223
2224
if (c == CHAR_LEFT_CURLY_BRACKET)
2225
{
2226
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2227
2228
for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2229
{
2230
REDO:
2231
2232
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2233
c = *ptr++;
2234
2235
/* Skip ignorable Unicode characters. */
2236
2237
while (c == CHAR_UNDERSCORE || c == CHAR_MINUS || c == CHAR_SPACE ||
2238
(c >= CHAR_HT && c <= CHAR_CR))
2239
{
2240
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2241
c = *ptr++;
2242
}
2243
2244
/* The first significant character being circumflex negates the meaning of
2245
the item. */
2246
2247
if (i == 0 && !*negptr && c == CHAR_CIRCUMFLEX_ACCENT)
2248
{
2249
*negptr = TRUE;
2250
goto REDO;
2251
}
2252
2253
if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2254
2255
/* Names consist of ASCII letters and digits, but equals and colon may also
2256
occur as a name/value separator. We must also allow for \p{L&}. A simple
2257
check for a value between '&' and 'z' suffices because anything else in a
2258
name or value will cause an "unknown property" error anyway. */
2259
2260
if (c < CHAR_AMPERSAND || c > CHAR_z) goto ERROR_RETURN;
2261
2262
/* Lower case a capital letter or remember where the name/value separator
2263
is. */
2264
2265
if (c >= CHAR_A && c <= CHAR_Z) c |= 0x20;
2266
else if ((c == CHAR_COLON || c == CHAR_EQUALS_SIGN) && vptr == NULL)
2267
vptr = name + i;
2268
2269
name[i] = c;
2270
}
2271
2272
/* Error if the loop didn't end with '}' - either we hit the end of the
2273
pattern or the name was longer than any legal property name. */
2274
2275
if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2276
name[i] = 0;
2277
}
2278
2279
/* If { doesn't follow \p or \P there is just one following character, which
2280
must be an ASCII letter. */
2281
2282
else if (c >= CHAR_A && c <= CHAR_Z)
2283
{
2284
name[0] = c | 0x20; /* Lower case */
2285
name[1] = 0;
2286
}
2287
else if (c >= CHAR_a && c <= CHAR_z)
2288
{
2289
name[0] = c;
2290
name[1] = 0;
2291
}
2292
else goto ERROR_RETURN;
2293
2294
*ptrptr = ptr; /* Update pattern pointer */
2295
2296
/* If the property contains ':' or '=' we have class name and value separately
2297
specified. The following are supported:
2298
2299
. Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2300
. Script (synonym sc) for which the property name is the script name
2301
. Script_Extensions (synonym scx), ditto
2302
2303
As this is a small number, we currently just check the names directly. If this
2304
grows, a sorted table and a switch will be neater.
2305
2306
For both the script properties, set a PT_xxx value so that (1) they can be
2307
distinguished and (2) invalid script names that happen to be the name of
2308
another property can be diagnosed. */
2309
2310
if (vptr != NULL)
2311
{
2312
int offset = 0;
2313
PCRE2_UCHAR sname[8];
2314
2315
*vptr = 0; /* Terminate property name */
2316
if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2317
PRIV(strcmp_c8)(name, STRING_bc) == 0)
2318
{
2319
offset = 4;
2320
sname[0] = CHAR_b;
2321
sname[1] = CHAR_i; /* There is no strcpy_c8 function */
2322
sname[2] = CHAR_d;
2323
sname[3] = CHAR_i;
2324
}
2325
2326
else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2327
PRIV(strcmp_c8)(name, STRING_sc) == 0)
2328
ptscript = PT_SC;
2329
2330
else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2331
PRIV(strcmp_c8)(name, STRING_scx) == 0)
2332
ptscript = PT_SCX;
2333
2334
else
2335
{
2336
*errorcodeptr = ERR47;
2337
return FALSE;
2338
}
2339
2340
/* Adjust the string in name[] as needed */
2341
2342
memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2343
if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2344
}
2345
2346
/* Search for a recognized property using binary chop. */
2347
2348
bot = 0;
2349
top = PRIV(utt_size);
2350
2351
while (bot < top)
2352
{
2353
int r;
2354
i = (bot + top) >> 1;
2355
r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2356
2357
/* When a matching property is found, some extra checking is needed when the
2358
\p{xx:yy} syntax is used and xx is either sc or scx. */
2359
2360
if (r == 0)
2361
{
2362
*pdataptr = PRIV(utt)[i].value;
2363
if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2364
{
2365
*ptypeptr = PRIV(utt)[i].type;
2366
return TRUE;
2367
}
2368
2369
switch (PRIV(utt)[i].type)
2370
{
2371
case PT_SC:
2372
*ptypeptr = PT_SC;
2373
return TRUE;
2374
2375
case PT_SCX:
2376
*ptypeptr = ptscript;
2377
return TRUE;
2378
}
2379
2380
break; /* Non-script found */
2381
}
2382
2383
if (r > 0) bot = i + 1; else top = i;
2384
}
2385
2386
*errorcodeptr = ERR47; /* Unrecognized property */
2387
return FALSE;
2388
2389
ERROR_RETURN: /* Malformed \P or \p */
2390
*errorcodeptr = ERR46;
2391
*ptrptr = ptr;
2392
return FALSE;
2393
}
2394
#endif
2395
2396
2397
2398
/*************************************************
2399
* Check for POSIX class syntax *
2400
*************************************************/
2401
2402
/* This function is called when the sequence "[:" or "[." or "[=" is
2403
encountered in a character class. It checks whether this is followed by a
2404
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2405
reach an unescaped ']' without the special preceding character, return FALSE.
2406
2407
Originally, this function only recognized a sequence of letters between the
2408
terminators, but it seems that Perl recognizes any sequence of characters,
2409
though of course unknown POSIX names are subsequently rejected. Perl gives an
2410
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2411
didn't consider this to be a POSIX class. Likewise for [:1234:].
2412
2413
The problem in trying to be exactly like Perl is in the handling of escapes. We
2414
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2415
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2416
below handles the special cases \\ and \], but does not try to do any other
2417
escape processing. This makes it different from Perl for cases such as
2418
[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2419
not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2420
when Perl does, I think.
2421
2422
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2423
It seems that the appearance of a nested POSIX class supersedes an apparent
2424
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2425
a digit. This is handled by returning FALSE if the start of a new group with
2426
the same terminator is encountered, since the next closing sequence must close
2427
the nested group, not the outer one.
2428
2429
In Perl, unescaped square brackets may also appear as part of class names. For
2430
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2431
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2432
seem right at all. PCRE does not allow closing square brackets in POSIX class
2433
names.
2434
2435
Arguments:
2436
ptr pointer to the character after the initial [ (colon, dot, equals)
2437
ptrend pointer to the end of the pattern
2438
endptr where to return a pointer to the terminating ':', '.', or '='
2439
2440
Returns: TRUE or FALSE
2441
*/
2442
2443
static BOOL
2444
check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2445
{
2446
PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */
2447
terminator = *ptr++; /* compiler warns about "non-constant" initializer. */
2448
2449
for (; ptrend - ptr >= 2; ptr++)
2450
{
2451
if (*ptr == CHAR_BACKSLASH &&
2452
(ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2453
ptr++;
2454
2455
else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2456
*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2457
2458
else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2459
{
2460
*endptr = ptr;
2461
return TRUE;
2462
}
2463
}
2464
2465
return FALSE;
2466
}
2467
2468
2469
2470
/*************************************************
2471
* Check POSIX class name *
2472
*************************************************/
2473
2474
/* This function is called to check the name given in a POSIX-style class entry
2475
such as [:alnum:].
2476
2477
Arguments:
2478
ptr points to the first letter
2479
len the length of the name
2480
2481
Returns: a value representing the name, or -1 if unknown
2482
*/
2483
2484
static int
2485
check_posix_name(PCRE2_SPTR ptr, int len)
2486
{
2487
const char *pn = posix_names;
2488
int yield = 0;
2489
while (posix_name_lengths[yield] != 0)
2490
{
2491
if (len == posix_name_lengths[yield] &&
2492
PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2493
pn += posix_name_lengths[yield] + 1;
2494
yield++;
2495
}
2496
return -1;
2497
}
2498
2499
2500
2501
/*************************************************
2502
* Read a subpattern or VERB name *
2503
*************************************************/
2504
2505
/* This function is called from parse_regex() below whenever it needs to read
2506
the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2507
pointer must be to the preceding character. If that character is '*' we are
2508
reading a verb or alpha assertion name. The pointer is updated to point after
2509
the name, for a VERB or alpha assertion name, or after tha name's terminator
2510
for a subpattern name. Returning both the offset and the name pointer is
2511
redundant information, but some callers use one and some the other, so it is
2512
simplest just to return both. When the name is in braces, spaces and tabs are
2513
allowed (and ignored) at either end.
2514
2515
Arguments:
2516
ptrptr points to the character pointer variable
2517
ptrend points to the end of the input string
2518
utf true if the input is UTF-encoded
2519
terminator the terminator of a subpattern name must be this
2520
offsetptr where to put the offset from the start of the pattern
2521
nameptr where to put a pointer to the name in the input
2522
namelenptr where to put the length of the name
2523
errcodeptr where to put an error code
2524
cb pointer to the compile data block
2525
2526
Returns: TRUE if a name was read
2527
FALSE otherwise, with error code set
2528
*/
2529
2530
static BOOL
2531
read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2532
PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2533
int *errorcodeptr, compile_block *cb)
2534
{
2535
PCRE2_SPTR ptr = *ptrptr;
2536
BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2537
BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2538
2539
if (is_braced)
2540
while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2541
2542
if (ptr >= ptrend) /* No characters in name */
2543
{
2544
*errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2545
ERR60; /* Verb not recognized or malformed */
2546
goto FAILED;
2547
}
2548
2549
*nameptr = ptr;
2550
*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2551
2552
/* If this logic were ever to change, the matching function in pcre2_substitute.c
2553
ought to be updated to match. */
2554
2555
/* In UTF mode, a group name may contain letters and decimal digits as defined
2556
by Unicode properties, and underscores, but must not start with a digit. */
2557
2558
#ifdef SUPPORT_UNICODE
2559
if (utf && is_group)
2560
{
2561
uint32_t c, type;
2562
2563
GETCHAR(c, ptr);
2564
type = UCD_CHARTYPE(c);
2565
2566
if (type == ucp_Nd)
2567
{
2568
*errorcodeptr = ERR44;
2569
goto FAILED;
2570
}
2571
2572
for(;;)
2573
{
2574
if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2575
c != CHAR_UNDERSCORE) break;
2576
ptr++;
2577
FORWARDCHARTEST(ptr, ptrend);
2578
if (ptr >= ptrend) break;
2579
GETCHAR(c, ptr);
2580
type = UCD_CHARTYPE(c);
2581
}
2582
}
2583
else
2584
#else
2585
(void)utf; /* Avoid compiler warning */
2586
#endif /* SUPPORT_UNICODE */
2587
2588
/* Handle non-group names and group names in non-UTF modes. A group name must
2589
not start with a digit. If either of the others start with a digit it just
2590
won't be recognized. */
2591
2592
{
2593
if (is_group && IS_DIGIT(*ptr))
2594
{
2595
*errorcodeptr = ERR44;
2596
goto FAILED;
2597
}
2598
2599
while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2600
{
2601
ptr++;
2602
}
2603
}
2604
2605
/* Check name length */
2606
2607
if (ptr > *nameptr + MAX_NAME_SIZE)
2608
{
2609
*errorcodeptr = ERR48;
2610
goto FAILED;
2611
}
2612
*namelenptr = (uint32_t)(ptr - *nameptr);
2613
2614
/* Subpattern names must not be empty, and their terminator is checked here.
2615
(What follows a verb or alpha assertion name is checked separately.) */
2616
2617
if (is_group)
2618
{
2619
if (ptr == *nameptr)
2620
{
2621
*errorcodeptr = ERR62; /* Subpattern name expected */
2622
goto FAILED;
2623
}
2624
if (is_braced)
2625
while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2626
if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2627
{
2628
*errorcodeptr = ERR42;
2629
goto FAILED;
2630
}
2631
ptr++;
2632
}
2633
2634
*ptrptr = ptr;
2635
return TRUE;
2636
2637
FAILED:
2638
*ptrptr = ptr;
2639
return FALSE;
2640
}
2641
2642
2643
2644
/*************************************************
2645
* Manage callouts at start of cycle *
2646
*************************************************/
2647
2648
/* At the start of a new item in parse_regex() we are able to record the
2649
details of the previous item in a prior callout, and also to set up an
2650
automatic callout if enabled. Avoid having two adjacent automatic callouts,
2651
which would otherwise happen for items such as \Q that contribute nothing to
2652
the parsed pattern.
2653
2654
Arguments:
2655
ptr current pattern pointer
2656
pcalloutptr points to a pointer to previous callout, or NULL
2657
auto_callout TRUE if auto_callouts are enabled
2658
parsed_pattern the parsed pattern pointer
2659
cb compile block
2660
2661
Returns: possibly updated parsed_pattern pointer.
2662
*/
2663
2664
static uint32_t *
2665
manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2666
uint32_t *parsed_pattern, compile_block *cb)
2667
{
2668
uint32_t *previous_callout = *pcalloutptr;
2669
2670
if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2671
cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2672
2673
if (!auto_callout) previous_callout = NULL; else
2674
{
2675
if (previous_callout == NULL ||
2676
previous_callout != parsed_pattern - 4 ||
2677
previous_callout[3] != 255)
2678
{
2679
previous_callout = parsed_pattern; /* Set up new automatic callout */
2680
parsed_pattern += 4;
2681
previous_callout[0] = META_CALLOUT_NUMBER;
2682
previous_callout[2] = 0;
2683
previous_callout[3] = 255;
2684
}
2685
previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2686
}
2687
2688
*pcalloutptr = previous_callout;
2689
return parsed_pattern;
2690
}
2691
2692
2693
2694
/*************************************************
2695
* Handle \d, \D, \s, \S, \w, \W *
2696
*************************************************/
2697
2698
/* This function is called from parse_regex() below, both for freestanding
2699
escapes, and those within classes, to handle those escapes that may change when
2700
Unicode property support is requested. Note that PCRE2_UCP will never be set
2701
without Unicode support because that is checked when pcre2_compile() is called.
2702
2703
Arguments:
2704
escape the ESC_... value
2705
parsed_pattern where to add the code
2706
options options bits
2707
xoptions extra options bits
2708
2709
Returns: updated value of parsed_pattern
2710
*/
2711
static uint32_t *
2712
handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2713
uint32_t xoptions)
2714
{
2715
uint32_t ascii_option = 0;
2716
uint32_t prop = ESC_p;
2717
2718
switch(escape)
2719
{
2720
case ESC_D:
2721
prop = ESC_P;
2722
/* Fall through */
2723
case ESC_d:
2724
ascii_option = PCRE2_EXTRA_ASCII_BSD;
2725
break;
2726
2727
case ESC_S:
2728
prop = ESC_P;
2729
/* Fall through */
2730
case ESC_s:
2731
ascii_option = PCRE2_EXTRA_ASCII_BSS;
2732
break;
2733
2734
case ESC_W:
2735
prop = ESC_P;
2736
/* Fall through */
2737
case ESC_w:
2738
ascii_option = PCRE2_EXTRA_ASCII_BSW;
2739
break;
2740
}
2741
2742
if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2743
{
2744
*parsed_pattern++ = META_ESCAPE + escape;
2745
}
2746
else
2747
{
2748
*parsed_pattern++ = META_ESCAPE + prop;
2749
switch(escape)
2750
{
2751
case ESC_d:
2752
case ESC_D:
2753
*parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2754
break;
2755
2756
case ESC_s:
2757
case ESC_S:
2758
*parsed_pattern++ = PT_SPACE << 16;
2759
break;
2760
2761
case ESC_w:
2762
case ESC_W:
2763
*parsed_pattern++ = PT_WORD << 16;
2764
break;
2765
}
2766
}
2767
2768
return parsed_pattern;
2769
}
2770
2771
2772
2773
/*************************************************
2774
* Maximum size of parsed_pattern for given input *
2775
*************************************************/
2776
2777
/* This function is called from parse_regex() below, to determine the amount
2778
of memory to allocate for parsed_pattern. It is also called to check whether
2779
the amount of data written respects the amount of memory allocated.
2780
2781
Arguments:
2782
ptr points to the start of the pattern
2783
ptrend points to the end of the pattern
2784
utf TRUE in UTF mode
2785
options the options bits
2786
2787
Returns: the number of uint32_t units for parsed_pattern
2788
*/
2789
static ptrdiff_t
2790
max_parsed_pattern(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, BOOL utf,
2791
uint32_t options)
2792
{
2793
PCRE2_SIZE big32count = 0;
2794
ptrdiff_t parsed_size_needed;
2795
2796
/* When PCRE2_AUTO_CALLOUT is not set, in all but one case the number of
2797
unsigned 32-bit ints written out to the parsed pattern is bounded by the length
2798
of the pattern. The exceptional case is when running in 32-bit, non-UTF mode,
2799
when literal characters greater than META_END (0x80000000) have to be coded as
2800
two units. In this case, therefore, we scan the pattern to check for such
2801
values. */
2802
2803
#if PCRE2_CODE_UNIT_WIDTH == 32
2804
if (!utf)
2805
{
2806
PCRE2_SPTR p;
2807
for (p = ptr; p < ptrend; p++) if (*p >= META_END) big32count++;
2808
}
2809
#else
2810
(void)utf; /* Avoid compiler warning */
2811
#endif
2812
2813
parsed_size_needed = (ptrend - ptr) + big32count;
2814
2815
/* When PCRE2_AUTO_CALLOUT is set we have to assume a numerical callout (4
2816
elements) for each character. This is overkill, but memory is plentiful these
2817
days. */
2818
2819
if ((options & PCRE2_AUTO_CALLOUT) != 0)
2820
parsed_size_needed += (ptrend - ptr) * 4;
2821
2822
return parsed_size_needed;
2823
}
2824
2825
2826
2827
/*************************************************
*      Parse regex and identify named groups     *
*************************************************/

/* This function is called first of all. It scans the pattern and does two
things: (1) It identifies capturing groups and makes a table of named capturing
groups so that information about them is fully available to both the compiling
scans. (2) It writes a parsed version of the pattern with comments omitted and
escapes processed into the parsed_pattern vector.

Arguments:
  ptr             points to the start of the pattern
  options         compiling dynamic options (may change during the scan)
  xoptions        extra options bits
  has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
  cb              pointer to the compile data block

Returns:   zero on success or a non-zero error code, with the
             error offset placed in the cb field
*/
2846
2847
/* A structure and some flags for dealing with nested groups. The structure
holds four 16-bit fields followed by the two 32-bit option words.
NOTE(review): the per-field notes below are inferred from the field names only;
the code that fills the structure is not part of this section - confirm. */

typedef struct nest_save {
  uint16_t  nest_depth;     /* presumably the group nesting depth of the entry */
  uint16_t  reset_group;    /* presumably a saved capture group number */
  uint16_t  max_group;      /* presumably the highest group number seen */
  uint16_t  flags;          /* presumably a combination of the NSF_ bits below */
  uint32_t  options;        /* saved options bits */
  uint32_t  xoptions;       /* saved extra options bits */
} nest_save;

/* Distinct single-bit values. */

#define NSF_RESET          0x0001u
#define NSF_CONDASSERT     0x0002u
#define NSF_ATOMICSR       0x0004u
2861
2862
/* Options that are changeable within the pattern must be tracked during
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
but all must be tracked so that META_OPTIONS items set the correct values for
the main compiling phase. */

#define PARSE_TRACKED_OPTIONS \
  (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_EXTENDED| \
   PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|PCRE2_UNGREEDY)

/* The corresponding mask of PCRE2_EXTRA_ option bits. */

#define PARSE_TRACKED_EXTRA_OPTIONS \
  (PCRE2_EXTRA_CASELESS_RESTRICT|PCRE2_EXTRA_ASCII_BSD| \
   PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_DIGIT| \
   PCRE2_EXTRA_ASCII_POSIX)
2874
2875
/* States used for analyzing ranges in character classes. The two OK values
must be last. */

enum {
  RANGE_NO,             /* State after '[' (initial), or '[a-z'; hyphen is literal */
  RANGE_STARTED,        /* State after '[1-'; last-emitted code is META_RANGE_XYZ */
  RANGE_FORBID_NO,      /* State after '[\d'; '-]' is allowed but not '-1]' */
  RANGE_FORBID_STARTED, /* State after '[\d-'*/
  RANGE_OK_ESCAPED,     /* State after '[\1'; hyphen may be a range */
  RANGE_OK_LITERAL      /* State after '[1'; hyphen may be a range */
};
2886
2887
/* States used for analyzing operators and operands in extended character
classes. */

enum {
  CLASS_OP_EMPTY,    /* At start of an expression; empty previous contents */
  CLASS_OP_OPERAND,  /* Have preceding operand; after "z" a "--" can follow */
  CLASS_OP_OPERATOR  /* Have preceding operator; after "--" operand must follow */
};
2895
2896
/* States used for determining the parse mode in character classes. The two
PERL_EXT values must be last, because the parser tests for them with
"class_mode_state >= CLASS_MODE_PERL_EXT". */

enum {
  CLASS_MODE_NORMAL,        /* Ordinary PCRE2 '[...]' class. */
  CLASS_MODE_ALT_EXT,       /* UTS#18-style extended '[...]' class. */
  CLASS_MODE_PERL_EXT,      /* Perl extended '(?[...])' class. */
  CLASS_MODE_PERL_EXT_LEAF  /* Leaf within extended '(?[ [...] ])' class. */
};
2905
2906
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
be quantified.

The macro stores c through p (advancing p), preceded in 32-bit mode by
META_BIGVALUE when c >= META_END, and sets the local variable okquantifier to
TRUE. It expands to a single do ... while (0) statement, so it must be followed
by a semicolon and is safe as the body of an unbraced if/else. In 32-bit mode
c is evaluated more than once, so it must not have side effects. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  do \
    { \
    if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
    *(p)++ = (c); \
    okquantifier = TRUE; \
    } \
  while (0)
#else
#define PARSED_LITERAL(c, p) \
  do \
    { \
    *(p)++ = (c); \
    okquantifier = TRUE; \
    } \
  while (0)
#endif
2920
2921
/* Here's the actual function. */
2922
2923
static int parse_regex(PCRE2_SPTR ptr, uint32_t options, uint32_t xoptions,
2924
BOOL *has_lookbehind, compile_block *cb)
2925
{
2926
uint32_t c;
2927
uint32_t delimiter;
2928
uint32_t namelen;
2929
uint32_t class_range_state;
2930
uint32_t class_op_state;
2931
uint32_t class_mode_state;
2932
uint32_t *class_start;
2933
uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */
2934
uint32_t *verbstartptr = NULL;
2935
uint32_t *previous_callout = NULL;
2936
uint32_t *parsed_pattern = cb->parsed_pattern;
2937
uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2938
uint32_t *this_parsed_item = NULL;
2939
uint32_t *prev_parsed_item = NULL;
2940
uint32_t meta_quantifier = 0;
2941
uint32_t add_after_mark = 0;
2942
uint16_t nest_depth = 0;
2943
int16_t class_depth_m1 = -1; /* The m1 means minus 1. */
2944
int16_t class_maxdepth_m1 = -1;
2945
int after_manual_callout = 0;
2946
int expect_cond_assert = 0;
2947
int errorcode = 0;
2948
int escape;
2949
int i;
2950
BOOL inescq = FALSE;
2951
BOOL inverbname = FALSE;
2952
BOOL utf = (options & PCRE2_UTF) != 0;
2953
BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2954
BOOL isdupname;
2955
BOOL negate_class;
2956
BOOL okquantifier = FALSE;
2957
PCRE2_SPTR thisptr;
2958
PCRE2_SPTR name;
2959
PCRE2_SPTR ptrend = cb->end_pattern;
2960
PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
2961
PCRE2_SPTR class_range_forbid_ptr = NULL;
2962
named_group *ng;
2963
nest_save *top_nest, *end_nests;
2964
#ifdef PCRE2_DEBUG
2965
uint32_t *parsed_pattern_check;
2966
ptrdiff_t parsed_pattern_extra = 0;
2967
ptrdiff_t parsed_pattern_extra_check = 0;
2968
PCRE2_SPTR ptr_check;
2969
#endif
2970
2971
PCRE2_ASSERT(parsed_pattern != NULL);
2972
2973
/* Insert leading items for word and line matching (features provided for the
2974
benefit of pcre2grep). */
2975
2976
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
2977
{
2978
*parsed_pattern++ = META_CIRCUMFLEX;
2979
*parsed_pattern++ = META_NOCAPTURE;
2980
}
2981
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
2982
{
2983
*parsed_pattern++ = META_ESCAPE + ESC_b;
2984
*parsed_pattern++ = META_NOCAPTURE;
2985
}
2986
2987
#ifdef PCRE2_DEBUG
2988
parsed_pattern_check = parsed_pattern;
2989
ptr_check = ptr;
2990
#endif
2991
2992
/* If the pattern is actually a literal string, process it separately to avoid
2993
cluttering up the main loop. */
2994
2995
if ((options & PCRE2_LITERAL) != 0)
2996
{
2997
while (ptr < ptrend)
2998
{
2999
if (parsed_pattern >= parsed_pattern_end)
3000
{
3001
PCRE2_DEBUG_UNREACHABLE();
3002
errorcode = ERR63; /* Internal error (parsed pattern overflow) */
3003
goto FAILED;
3004
}
3005
thisptr = ptr;
3006
GETCHARINCTEST(c, ptr);
3007
if (auto_callout)
3008
parsed_pattern = manage_callouts(thisptr, &previous_callout,
3009
auto_callout, parsed_pattern, cb);
3010
PARSED_LITERAL(c, parsed_pattern);
3011
}
3012
goto PARSED_END;
3013
}
3014
3015
/* Process a real regex which may contain meta-characters. */
3016
3017
top_nest = NULL;
3018
end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
3019
3020
/* The size of the nest_save structure might not be a factor of the size of the
3021
workspace. Therefore we must round down end_nests so as to correctly avoid
3022
creating a nest_save that spans the end of the workspace. */
3023
3024
end_nests = (nest_save *)((char *)end_nests -
3025
((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
3026
3027
/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
3028
3029
if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
3030
3031
/* Now scan the pattern */
3032
3033
while (ptr < ptrend)
3034
{
3035
int prev_expect_cond_assert;
3036
uint32_t min_repeat = 0, max_repeat = 0;
3037
uint32_t set, unset, *optset;
3038
uint32_t xset, xunset, *xoptset;
3039
uint32_t terminator;
3040
uint32_t prev_meta_quantifier;
3041
BOOL prev_okquantifier;
3042
PCRE2_SPTR tempptr;
3043
PCRE2_SIZE offset;
3044
3045
if (nest_depth > cb->cx->parens_nest_limit)
3046
{
3047
errorcode = ERR19;
3048
goto FAILED; /* Parentheses too deeply nested */
3049
}
3050
3051
/* Check that we haven't emitted too much into parsed_pattern. We allocate
3052
a suitably-sized buffer upfront, then do unchecked writes to it. If we only
3053
write a little bit too much, everything will appear to be OK, because the
3054
upfront size is an overestimate... but a malicious pattern could end up
3055
forcing a write past the buffer end. We must catch this during
3056
development. */
3057
3058
#ifdef PCRE2_DEBUG
3059
/* Strong post-write check. Won't help in release builds - at this point
3060
the write has already occurred so it's too late. However, should stop us
3061
committing unsafe code. */
3062
PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
3063
(parsed_pattern_extra - parsed_pattern_extra_check) <=
3064
max_parsed_pattern(ptr_check, ptr, utf, options));
3065
parsed_pattern_check = parsed_pattern;
3066
parsed_pattern_extra_check = parsed_pattern_extra;
3067
ptr_check = ptr;
3068
#endif
3069
3070
if (parsed_pattern >= parsed_pattern_end)
3071
{
3072
/* Weak pre-write check; only ensures parsed_pattern[0] is writeable
3073
(but the code below can write many chars). Better than nothing. */
3074
PCRE2_DEBUG_UNREACHABLE();
3075
errorcode = ERR63; /* Internal error (parsed pattern overflow) */
3076
goto FAILED;
3077
}
3078
3079
/* If the last time round this loop something was added, parsed_pattern will
3080
no longer be equal to this_parsed_item. Remember where the previous item
3081
started and reset for the next item. Note that sometimes round the loop,
3082
nothing gets added (e.g. for ignored white space). */
3083
3084
if (this_parsed_item != parsed_pattern)
3085
{
3086
prev_parsed_item = this_parsed_item;
3087
this_parsed_item = parsed_pattern;
3088
}
3089
3090
/* Get next input character, save its position for callout handling. */
3091
3092
thisptr = ptr;
3093
GETCHARINCTEST(c, ptr);
3094
3095
/* Copy quoted literals until \E, allowing for the possibility of automatic
3096
callouts, except when processing a (*VERB) "name". */
3097
3098
if (inescq)
3099
{
3100
if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3101
{
3102
inescq = FALSE;
3103
ptr++; /* Skip E */
3104
}
3105
else
3106
{
3107
if (expect_cond_assert > 0) /* A literal is not allowed if we are */
3108
{ /* expecting a conditional assertion, */
3109
ptr--; /* but an empty \Q\E sequence is OK. */
3110
errorcode = ERR28;
3111
goto FAILED;
3112
}
3113
if (inverbname)
3114
{ /* Don't use PARSED_LITERAL() because it */
3115
#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
3116
if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3117
#endif
3118
*parsed_pattern++ = c;
3119
}
3120
else
3121
{
3122
if (after_manual_callout-- <= 0)
3123
parsed_pattern = manage_callouts(thisptr, &previous_callout,
3124
auto_callout, parsed_pattern, cb);
3125
PARSED_LITERAL(c, parsed_pattern);
3126
}
3127
meta_quantifier = 0;
3128
}
3129
continue; /* Next character */
3130
}
3131
3132
/* If we are processing the "name" part of a (*VERB:NAME) item, all
3133
characters up to the closing parenthesis are literals except when
3134
PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
3135
and \E and escaped characters are allowed (no character types such as \d). If
3136
PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
3137
this by not entering the special (*VERB:NAME) processing - they are then
3138
picked up below. Note that c is a character, not a code unit, so we must not
3139
use MAX_255 to test its size because MAX_255 tests code units and is assumed
3140
TRUE in 8-bit mode. */
3141
3142
if (inverbname &&
3143
(
3144
/* EITHER: not both options set */
3145
((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
3146
(PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
3147
#ifdef SUPPORT_UNICODE
3148
/* OR: character > 255 AND not Unicode Pattern White Space */
3149
(c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
3150
#endif
3151
/* OR: not a # comment or isspace() white space */
3152
(c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
3153
#ifdef SUPPORT_UNICODE
3154
/* and not CHAR_NEL when Unicode is supported */
3155
&& c != CHAR_NEL
3156
#endif
3157
)))
3158
{
3159
PCRE2_SIZE verbnamelength;
3160
3161
switch(c)
3162
{
3163
default: /* Don't use PARSED_LITERAL() because it */
3164
#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
3165
if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3166
#endif
3167
*parsed_pattern++ = c;
3168
break;
3169
3170
case CHAR_RIGHT_PARENTHESIS:
3171
inverbname = FALSE;
3172
/* This is the length in characters */
3173
verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
3174
/* But the limit on the length is in code units */
3175
if (ptr - verbnamestart - 1 > (int)MAX_MARK)
3176
{
3177
ptr--;
3178
errorcode = ERR76;
3179
goto FAILED;
3180
}
3181
*verblengthptr = (uint32_t)verbnamelength;
3182
3183
/* If this name was on a verb such as (*ACCEPT) which does not continue,
3184
a (*MARK) was generated for the name. We now add the original verb as the
3185
next item. */
3186
3187
if (add_after_mark != 0)
3188
{
3189
*parsed_pattern++ = add_after_mark;
3190
add_after_mark = 0;
3191
}
3192
break;
3193
3194
case CHAR_BACKSLASH:
3195
if ((options & PCRE2_ALT_VERBNAMES) != 0)
3196
{
3197
escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3198
xoptions, cb->bracount, FALSE, cb);
3199
if (errorcode != 0) goto FAILED;
3200
}
3201
else escape = 0; /* Treat all as literal */
3202
3203
switch(escape)
3204
{
3205
case 0: /* Don't use PARSED_LITERAL() because it */
3206
#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
3207
if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3208
#endif
3209
*parsed_pattern++ = c;
3210
break;
3211
3212
case ESC_ub:
3213
*parsed_pattern++ = CHAR_u;
3214
PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3215
break;
3216
3217
case ESC_Q:
3218
inescq = TRUE;
3219
break;
3220
3221
case ESC_E: /* Ignore */
3222
break;
3223
3224
default:
3225
errorcode = ERR40; /* Invalid in verb name */
3226
goto FAILED;
3227
}
3228
}
3229
continue; /* Next character in pattern */
3230
}
3231
3232
/* Not a verb name character. At this point we must process everything that
3233
must not change the quantification state. This is mainly comments, but we
3234
handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3235
A+, as in Perl. An isolated \E is ignored. */
3236
3237
if (c == CHAR_BACKSLASH && ptr < ptrend)
3238
{
3239
if (*ptr == CHAR_Q || *ptr == CHAR_E)
3240
{
3241
inescq = *ptr == CHAR_Q;
3242
ptr++;
3243
continue;
3244
}
3245
}
3246
3247
/* Skip over whitespace and # comments in extended mode. Note that c is a
3248
character, not a code unit, so we must not use MAX_255 to test its size
3249
because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3250
whitespace characters are those designated as "Pattern White Space" by
3251
Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3252
U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3253
subset of space characters that match \h and \v. */
3254
3255
if ((options & PCRE2_EXTENDED) != 0)
3256
{
3257
if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3258
#ifdef SUPPORT_UNICODE
3259
if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3260
#endif
3261
if (c == CHAR_NUMBER_SIGN)
3262
{
3263
while (ptr < ptrend)
3264
{
3265
if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
3266
{ /* IS_NEWLINE sets cb->nllen. */
3267
ptr += cb->nllen;
3268
break;
3269
}
3270
ptr++;
3271
#ifdef SUPPORT_UNICODE
3272
if (utf) FORWARDCHARTEST(ptr, ptrend);
3273
#endif
3274
}
3275
continue; /* Next character in pattern */
3276
}
3277
}
3278
3279
/* Skip over bracketed comments */
3280
3281
if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3282
ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3283
{
3284
while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3285
if (ptr >= ptrend)
3286
{
3287
errorcode = ERR18; /* A special error for missing ) in a comment */
3288
goto FAILED; /* to make it easier to debug. */
3289
}
3290
ptr++;
3291
continue; /* Next character in pattern */
3292
}
3293
3294
/* If the next item is not a quantifier, fill in length of any previous
3295
callout and create an auto callout if required. */
3296
3297
if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3298
(c != CHAR_LEFT_CURLY_BRACKET ||
3299
(tempptr = ptr,
3300
!read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3301
{
3302
if (after_manual_callout-- <= 0)
3303
{
3304
parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3305
parsed_pattern, cb);
3306
this_parsed_item = parsed_pattern; /* New start for current item */
3307
}
3308
}
3309
3310
/* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3311
assertion, possibly preceded by a callout. If the value is 1, we have just
3312
had the callout and expect an assertion. There must be at least 3 more
3313
characters in all cases. When expect_cond_assert is 2, we know that the
3314
current character is an opening parenthesis, as otherwise we wouldn't be
3315
here. However, when it is 1, we need to check, and it's easiest just to check
3316
always. Note that expect_cond_assert may be negative, since all callouts just
3317
decrement it. */
3318
3319
if (expect_cond_assert > 0)
3320
{
3321
BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3322
(ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3323
if (ok)
3324
{
3325
if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */
3326
{
3327
ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3328
}
3329
else switch(ptr[1]) /* Traditional symbolic format */
3330
{
3331
case CHAR_C:
3332
ok = expect_cond_assert == 2;
3333
break;
3334
3335
case CHAR_EQUALS_SIGN:
3336
case CHAR_EXCLAMATION_MARK:
3337
break;
3338
3339
case CHAR_LESS_THAN_SIGN:
3340
ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3341
break;
3342
3343
default:
3344
ok = FALSE;
3345
}
3346
}
3347
3348
if (!ok)
3349
{
3350
ptr--; /* Adjust error offset */
3351
errorcode = ERR28;
3352
goto FAILED;
3353
}
3354
}
3355
3356
/* Remember whether we are expecting a conditional assertion, and set the
3357
default for this item. */
3358
3359
prev_expect_cond_assert = expect_cond_assert;
3360
expect_cond_assert = 0;
3361
3362
/* Remember quantification status for the previous significant item, then set
3363
default for this item. */
3364
3365
prev_okquantifier = okquantifier;
3366
prev_meta_quantifier = meta_quantifier;
3367
okquantifier = FALSE;
3368
meta_quantifier = 0;
3369
3370
/* If the previous significant item was a quantifier, adjust the parsed code
3371
if there is a following modifier. The base meta value is always followed by
3372
the PLUS and QUERY values, in that order. We do this here rather than after
3373
reading a quantifier so that intervening comments and /x whitespace can be
3374
ignored without having to replicate code. */
3375
3376
if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3377
{
3378
parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3379
prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3380
0x00020000u : 0x00010000u);
3381
continue; /* Next character in pattern */
3382
}
3383
3384
/* Process the next item in the main part of a pattern. */
3385
3386
switch(c)
3387
{
3388
default: /* Non-special character */
3389
PARSED_LITERAL(c, parsed_pattern);
3390
break;
3391
3392
3393
/* ---- Escape sequence ---- */
3394
3395
case CHAR_BACKSLASH:
3396
tempptr = ptr;
3397
escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3398
xoptions, cb->bracount, FALSE, cb);
3399
if (errorcode != 0)
3400
{
3401
ESCAPE_FAILED:
3402
if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3403
goto FAILED;
3404
ptr = tempptr;
3405
if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3406
{
3407
GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3408
}
3409
escape = 0; /* Treat as literal character */
3410
}
3411
3412
/* The escape was a data escape or literal character. */
3413
3414
if (escape == 0)
3415
{
3416
PARSED_LITERAL(c, parsed_pattern);
3417
}
3418
3419
/* The escape was a back (or forward) reference. We keep the offset in
3420
order to give a more useful diagnostic for a bad forward reference. For
3421
references to groups numbered less than 10 we can't use more than two items
3422
in parsed_pattern because they may be just two characters in the input (and
3423
in a 64-bit world an offset may need two elements). So for them, the offset
3424
of the first occurrence is held in a special vector. */
3425
3426
else if (escape < 0)
3427
{
3428
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3429
escape = -escape - 1;
3430
*parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3431
if (escape < 10)
3432
{
3433
if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3434
cb->small_ref_offset[escape] = offset;
3435
}
3436
else
3437
{
3438
PUTOFFSET(offset, parsed_pattern);
3439
}
3440
okquantifier = TRUE;
3441
}
3442
3443
/* The escape was a character class such as \d etc. or other special
3444
escape indicator such as \A or \X. Most of them generate just a single
3445
parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3446
value. They are supported only when Unicode is available. The type and
3447
value are packed into a single 32-bit value so that the whole sequences
3448
uses only two elements in the parsed_vector. This is because the same
3449
coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3450
set.
3451
3452
There are also some cases where the escape sequence is followed by a name:
3453
\k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3454
and \g'name' are subroutine calls by name; \g{name} is a synonym for
3455
\k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3456
and returned as a negative value (handled above). A name is coded as an
3457
offset into the pattern and a length. */
3458
3459
else switch (escape)
3460
{
3461
case ESC_C:
3462
#ifdef NEVER_BACKSLASH_C
3463
errorcode = ERR85;
3464
goto ESCAPE_FAILED;
3465
#else
3466
if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3467
{
3468
errorcode = ERR83;
3469
goto ESCAPE_FAILED;
3470
}
3471
#endif
3472
okquantifier = TRUE;
3473
*parsed_pattern++ = META_ESCAPE + escape;
3474
break;
3475
3476
/* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3477
when \u{ is not followed by hex digits and }. It requests two literal
3478
characters, u and { and we need this, as otherwise \u{ 12} (for example)
3479
would be treated as u{12} now that spaces are allowed in quantifiers. */
3480
3481
case ESC_ub:
3482
*parsed_pattern++ = CHAR_u;
3483
PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3484
break;
3485
3486
case ESC_X:
3487
#ifndef SUPPORT_UNICODE
3488
errorcode = ERR45; /* Supported only with Unicode support */
3489
goto ESCAPE_FAILED;
3490
#endif
3491
case ESC_H:
3492
case ESC_h:
3493
case ESC_N:
3494
case ESC_R:
3495
case ESC_V:
3496
case ESC_v:
3497
okquantifier = TRUE;
3498
*parsed_pattern++ = META_ESCAPE + escape;
3499
break;
3500
3501
default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3502
*parsed_pattern++ = META_ESCAPE + escape;
3503
break;
3504
3505
/* Escapes that may change in UCP mode. */
3506
3507
case ESC_d:
3508
case ESC_D:
3509
case ESC_s:
3510
case ESC_S:
3511
case ESC_w:
3512
case ESC_W:
3513
okquantifier = TRUE;
3514
parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3515
xoptions);
3516
break;
3517
3518
/* Unicode property matching */
3519
3520
case ESC_P:
3521
case ESC_p:
3522
#ifdef SUPPORT_UNICODE
3523
{
3524
BOOL negated;
3525
uint16_t ptype = 0, pdata = 0;
3526
if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3527
goto ESCAPE_FAILED;
3528
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3529
*parsed_pattern++ = META_ESCAPE + escape;
3530
*parsed_pattern++ = (ptype << 16) | pdata;
3531
okquantifier = TRUE;
3532
}
3533
#else
3534
errorcode = ERR45;
3535
goto ESCAPE_FAILED;
3536
#endif
3537
break; /* End \P and \p */
3538
3539
/* When \g is used with quotes or angle brackets as delimiters, it is a
3540
numerical or named subroutine call, and control comes here. When used
3541
with brace delimiters it is a numerical back reference and does not come
3542
here because check_escape() returns it directly as a reference. \k is
3543
always a named back reference. */
3544
3545
case ESC_g:
3546
case ESC_k:
3547
if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3548
*ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3549
{
3550
errorcode = (escape == ESC_g)? ERR57 : ERR69;
3551
goto ESCAPE_FAILED;
3552
}
3553
terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3554
CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3555
CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3556
3557
/* For a non-braced \g, check for a numerical recursion. */
3558
3559
if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3560
{
3561
PCRE2_SPTR p = ptr + 1;
3562
3563
if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3564
&errorcode))
3565
{
3566
if (p >= ptrend || *p != terminator)
3567
{
3568
errorcode = ERR57;
3569
goto ESCAPE_FAILED;
3570
}
3571
ptr = p;
3572
goto SET_RECURSION;
3573
}
3574
if (errorcode != 0) goto ESCAPE_FAILED;
3575
}
3576
3577
/* Not a numerical recursion. Perl allows spaces and tabs after { and
3578
before } but not for other delimiters. */
3579
3580
if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3581
&errorcode, cb)) goto ESCAPE_FAILED;
3582
3583
/* \k and \g when used with braces are back references, whereas \g used
3584
with quotes or angle brackets is a recursion */
3585
3586
*parsed_pattern++ =
3587
(escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3588
META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3589
*parsed_pattern++ = namelen;
3590
3591
PUTOFFSET(offset, parsed_pattern);
3592
okquantifier = TRUE;
3593
break; /* End special escape processing */
3594
}
3595
break; /* End escape sequence processing */
3596
3597
3598
/* ---- Single-character special items ---- */
3599
3600
case CHAR_CIRCUMFLEX_ACCENT:
3601
*parsed_pattern++ = META_CIRCUMFLEX;
3602
break;
3603
3604
case CHAR_DOLLAR_SIGN:
3605
*parsed_pattern++ = META_DOLLAR;
3606
break;
3607
3608
case CHAR_DOT:
3609
*parsed_pattern++ = META_DOT;
3610
okquantifier = TRUE;
3611
break;
3612
3613
3614
/* ---- Single-character quantifiers ---- */
3615
3616
case CHAR_ASTERISK:
3617
meta_quantifier = META_ASTERISK;
3618
goto CHECK_QUANTIFIER;
3619
3620
case CHAR_PLUS:
3621
meta_quantifier = META_PLUS;
3622
goto CHECK_QUANTIFIER;
3623
3624
case CHAR_QUESTION_MARK:
3625
meta_quantifier = META_QUERY;
3626
goto CHECK_QUANTIFIER;
3627
3628
3629
/* ---- Potential {n,m} quantifier ---- */
3630
3631
case CHAR_LEFT_CURLY_BRACKET:
3632
if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3633
&errorcode))
3634
{
3635
if (errorcode != 0) goto FAILED; /* Error in quantifier. */
3636
PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */
3637
break; /* No more quantifier processing */
3638
}
3639
meta_quantifier = META_MINMAX;
3640
/* Fall through */
3641
3642
3643
/* ---- Quantifier post-processing ---- */
3644
3645
/* Check that a quantifier is allowed after the previous item. This
3646
guarantees that there is a previous item. */
3647
3648
CHECK_QUANTIFIER:
3649
if (!prev_okquantifier)
3650
{
3651
errorcode = ERR9;
3652
goto FAILED_BACK; // TODO https://github.com/PCRE2Project/pcre2/issues/549
3653
}
3654
3655
/* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3656
quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3657
sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3658
wrapping it in non-capturing brackets, but we have to allow for a preceding
3659
(*MARK) for when (*ACCEPT) has an argument. */
3660
3661
if (*prev_parsed_item == META_ACCEPT)
3662
{
3663
uint32_t *p;
3664
for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3665
*verbstartptr = META_NOCAPTURE;
3666
parsed_pattern[1] = META_KET;
3667
parsed_pattern += 2;
3668
3669
#ifdef PCRE2_DEBUG
3670
PCRE2_ASSERT(parsed_pattern_extra >= 2);
3671
parsed_pattern_extra -= 2;
3672
#endif
3673
}
3674
3675
/* Now we can put the quantifier into the parsed pattern vector. At this
3676
stage, we have only the basic quantifier. The check for a following + or ?
3677
modifier happens at the top of the loop, after any intervening comments
3678
have been removed. */
3679
3680
*parsed_pattern++ = meta_quantifier;
3681
if (c == CHAR_LEFT_CURLY_BRACKET)
3682
{
3683
*parsed_pattern++ = min_repeat;
3684
*parsed_pattern++ = max_repeat;
3685
}
3686
break;
3687
3688
3689
/* ---- Character class ---- */
3690
3691
case CHAR_LEFT_SQUARE_BRACKET:
3692
3693
/* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3694
used for "start of word" and "end of word". As these are otherwise illegal
3695
sequences, we don't break anything by recognizing them. They are replaced
3696
by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3697
erroneous and are handled by the normal code below. */
3698
3699
if (ptrend - ptr >= 6 &&
3700
(PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3701
PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3702
{
3703
*parsed_pattern++ = META_ESCAPE + ESC_b;
3704
3705
if (ptr[2] == CHAR_LESS_THAN_SIGN)
3706
{
3707
*parsed_pattern++ = META_LOOKAHEAD;
3708
}
3709
else
3710
{
3711
*parsed_pattern++ = META_LOOKBEHIND;
3712
*has_lookbehind = TRUE;
3713
3714
/* The offset is used only for the "non-fixed length" error; this won't
3715
occur here, so just store zero. */
3716
3717
PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3718
}
3719
3720
if ((options & PCRE2_UCP) == 0)
3721
*parsed_pattern++ = META_ESCAPE + ESC_w;
3722
else
3723
{
3724
*parsed_pattern++ = META_ESCAPE + ESC_p;
3725
*parsed_pattern++ = PT_WORD << 16;
3726
}
3727
*parsed_pattern++ = META_KET;
3728
ptr += 6;
3729
okquantifier = TRUE;
3730
break;
3731
}
3732
3733
/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3734
they are encountered at the top level, so we'll do that too. */
3735
3736
if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3737
*ptr == CHAR_EQUALS_SIGN) &&
3738
check_posix_syntax(ptr, ptrend, &tempptr))
3739
{
3740
errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3741
goto FAILED;
3742
}
3743
3744
class_mode_state = ((options & PCRE2_ALT_EXTENDED_CLASS) != 0)?
3745
CLASS_MODE_ALT_EXT : CLASS_MODE_NORMAL;
3746
3747
/* Jump here from '(?[...])'. That jump must initialize class_mode_state,
3748
set c to the '[' character, and ptr to just after the '['. */
3749
3750
FROM_PERL_EXTENDED_CLASS:
3751
okquantifier = TRUE;
3752
3753
/* In an EBCDIC environment, Perl treats alphabetic ranges specially
3754
because there are holes in the encoding, and simply using the range A-Z
3755
(for example) would include the characters in the holes. This applies only
3756
to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3757
in this respect. In order to accommodate this, we keep track of whether
3758
character values are literal or not, and a state variable for handling
3759
ranges. */
3760
3761
/* Loop for the contents of the class. Classes may be nested, if
3762
PCRE2_ALT_EXTENDED_CLASS is set, or the class is of the form (?[...]). */
3763
3764
/* c is still set to '[' so the loop will handle the start of the class. */
3765
3766
class_depth_m1 = -1;
3767
class_maxdepth_m1 = -1;
3768
class_range_state = RANGE_NO;
3769
class_op_state = CLASS_OP_EMPTY;
3770
class_start = NULL;
3771
3772
for (;;)
3773
{
3774
BOOL char_is_literal = TRUE;
3775
3776
/* Inside \Q...\E everything is literal except \E */
3777
3778
if (inescq)
3779
{
3780
if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3781
{
3782
inescq = FALSE; /* Reset literal state */
3783
ptr++; /* Skip the 'E' */
3784
goto CLASS_CONTINUE;
3785
}
3786
3787
/* Surprisingly, you cannot use \Q..\E to escape a character inside a
3788
Perl extended class. However, empty \Q\E sequences are allowed, so here
3789
we're only giving an error if the \Q..\E is non-empty. */
3790
3791
if (class_mode_state == CLASS_MODE_PERL_EXT)
3792
{
3793
errorcode = ERR116;
3794
goto FAILED;
3795
}
3796
3797
goto CLASS_LITERAL;
3798
}
3799
3800
/* Skip over space and tab (only) in extended-more mode, or anywhere
3801
inside a Perl extended class (which implies /xx). */
3802
3803
if ((c == CHAR_SPACE || c == CHAR_HT) &&
3804
((options & PCRE2_EXTENDED_MORE) != 0 ||
3805
class_mode_state >= CLASS_MODE_PERL_EXT))
3806
goto CLASS_CONTINUE;
3807
3808
/* Handle POSIX class names. Perl allows a negation extension of the
3809
form [:^name:]. A square bracket that doesn't match the syntax is
3810
treated as a literal. We also recognize the POSIX constructions
3811
[.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3812
5.6 and 5.8 do. */
3813
3814
if (class_depth_m1 >= 0 &&
3815
c == CHAR_LEFT_SQUARE_BRACKET &&
3816
ptrend - ptr >= 3 &&
3817
(*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3818
*ptr == CHAR_EQUALS_SIGN) &&
3819
check_posix_syntax(ptr, ptrend, &tempptr))
3820
{
3821
BOOL posix_negate = FALSE;
3822
int posix_class;
3823
3824
/* Perl treats a hyphen before a POSIX class as a literal, not the
3825
start of a range. However, it gives a warning in its warning mode. PCRE
3826
does not have a warning mode, so we give an error, because this is
3827
likely an error on the user's part. */
3828
3829
if (class_range_state == RANGE_STARTED)
3830
{
3831
ptr = tempptr + 2;
3832
errorcode = ERR50;
3833
goto FAILED;
3834
}
3835
3836
/* Perl treats a hyphen after a POSIX class as a literal, not the
3837
start of a range. However, it gives a warning in its warning mode
3838
unless the hyphen is the last character in the class. PCRE does not
3839
have a warning mode, so we give an error, because this is likely an
3840
error on the user's part.
3841
3842
Roll back to the hyphen for the error position. */
3843
3844
if (class_range_state == RANGE_FORBID_STARTED)
3845
{
3846
ptr = class_range_forbid_ptr;
3847
errorcode = ERR50;
3848
goto FAILED;
3849
}
3850
3851
/* Disallow implicit union in Perl extended classes. */
3852
3853
if (class_op_state == CLASS_OP_OPERAND &&
3854
class_mode_state == CLASS_MODE_PERL_EXT)
3855
{
3856
ptr = tempptr + 2;
3857
errorcode = ERR113;
3858
goto FAILED;
3859
}
3860
3861
if (*ptr != CHAR_COLON)
3862
{
3863
ptr = tempptr + 2;
3864
errorcode = ERR13;
3865
goto FAILED;
3866
}
3867
3868
if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3869
{
3870
posix_negate = TRUE;
3871
ptr++;
3872
}
3873
3874
posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3875
ptr = tempptr + 2;
3876
if (posix_class < 0)
3877
{
3878
errorcode = ERR30;
3879
goto FAILED;
3880
}
3881
3882
/* Set "a hyphen is forbidden to be the start of a range". For the '-]'
3883
case, the hyphen is treated as a literal, but for '-1' it is disallowed
3884
(because it would be interpreted as range). */
3885
3886
class_range_state = RANGE_FORBID_NO;
3887
class_op_state = CLASS_OP_OPERAND;
3888
3889
/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
3890
of the POSIX classes are converted to use Unicode properties \p or \P
3891
or, in one case, \h or \H. The substitutes table has two values per
3892
class, containing the type and value of a \p or \P item. The special
3893
cases are specified with a negative type: a non-zero value causes \h or
3894
\H to be used, and a zero value falls through to behave like a non-UCP
3895
POSIX class. There are now also some extra options that force ASCII for
3896
some classes. */
3897
3898
#ifdef SUPPORT_UNICODE
3899
if ((options & PCRE2_UCP) != 0 &&
3900
(xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
3901
!((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
3902
(posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
3903
{
3904
int ptype = posix_substitutes[2*posix_class];
3905
int pvalue = posix_substitutes[2*posix_class + 1];
3906
3907
if (ptype >= 0)
3908
{
3909
*parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3910
*parsed_pattern++ = (ptype << 16) | pvalue;
3911
goto CLASS_CONTINUE;
3912
}
3913
3914
if (pvalue != 0)
3915
{
3916
*parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3917
goto CLASS_CONTINUE;
3918
}
3919
3920
/* Fall through */
3921
}
3922
#endif /* SUPPORT_UNICODE */
3923
3924
/* Non-UCP POSIX class */
3925
3926
*parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3927
*parsed_pattern++ = posix_class;
3928
}
3929
3930
/* Check for the start of the outermost class, or the start of a nested class. */
3931
3932
else if ((c == CHAR_LEFT_SQUARE_BRACKET &&
3933
(class_depth_m1 < 0 || class_mode_state == CLASS_MODE_ALT_EXT ||
3934
class_mode_state == CLASS_MODE_PERL_EXT)) ||
3935
(c == CHAR_LEFT_PARENTHESIS &&
3936
class_mode_state == CLASS_MODE_PERL_EXT))
3937
{
3938
uint32_t start_c = c;
3939
uint32_t new_class_mode_state;
3940
3941
/* Update the class mode, if moving into a 'leaf' inside a Perl extended
3942
class. */
3943
3944
if (start_c == CHAR_LEFT_SQUARE_BRACKET &&
3945
class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 >= 0)
3946
new_class_mode_state = CLASS_MODE_PERL_EXT_LEAF;
3947
else
3948
new_class_mode_state = class_mode_state;
3949
3950
/* Tidy up the other class before starting the nested class. */
3951
/* -[ beginning a nested class is a literal '-' */
3952
3953
if (class_range_state == RANGE_STARTED)
3954
parsed_pattern[-1] = CHAR_MINUS;
3955
3956
/* Disallow implicit union in Perl extended classes. */
3957
3958
if (class_op_state == CLASS_OP_OPERAND &&
3959
class_mode_state == CLASS_MODE_PERL_EXT)
3960
{
3961
errorcode = ERR113;
3962
goto FAILED;
3963
}
3964
3965
/* Validate nesting depth */
3966
if (class_depth_m1 >= ECLASS_NEST_LIMIT - 1)
3967
{
3968
errorcode = ERR107;
3969
goto FAILED; /* Classes too deeply nested */
3970
}
3971
3972
/* Process the character class start. If the first character is '^', set
3973
the negation flag. If the first few characters (either before or after ^)
3974
are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3975
This makes for compatibility with Perl. */
3976
3977
negate_class = FALSE;
3978
for (;;)
3979
{
3980
if (ptr >= ptrend)
3981
{
3982
if (start_c == CHAR_LEFT_PARENTHESIS)
3983
errorcode = ERR14; /* Missing terminating ')' */
3984
else
3985
errorcode = ERR6; /* Missing terminating ']' */
3986
goto FAILED;
3987
}
3988
3989
GETCHARINCTEST(c, ptr);
3990
if (new_class_mode_state == CLASS_MODE_PERL_EXT) break;
3991
else if (c == CHAR_BACKSLASH)
3992
{
3993
if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3994
else if (ptrend - ptr >= 3 &&
3995
PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3996
ptr += 3;
3997
else
3998
break;
3999
}
4000
else if ((c == CHAR_SPACE || c == CHAR_HT) && /* Note: just these two */
4001
((options & PCRE2_EXTENDED_MORE) != 0 ||
4002
new_class_mode_state >= CLASS_MODE_PERL_EXT))
4003
continue;
4004
else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4005
negate_class = TRUE;
4006
else break;
4007
}
4008
4009
/* Now the real contents of the class; c has the first "real" character.
4010
Empty classes are permitted only if the option is set, and if it's not
4011
a Perl-extended class. */
4012
4013
if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4014
(cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0 &&
4015
new_class_mode_state < CLASS_MODE_PERL_EXT)
4016
{
4017
PCRE2_ASSERT(start_c == CHAR_LEFT_SQUARE_BRACKET);
4018
4019
if (class_start != NULL)
4020
{
4021
PCRE2_ASSERT(class_depth_m1 >= 0);
4022
/* Represents that the class is an extended class. */
4023
*class_start |= CLASS_IS_ECLASS;
4024
class_start = NULL;
4025
}
4026
4027
*parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
4028
4029
/* Leave nesting depth unchanged; but check for zero depth to handle the
4030
very first (top-level) class being empty. */
4031
if (class_depth_m1 < 0) break;
4032
4033
class_range_state = RANGE_NO; /* for processing the containing class */
4034
class_op_state = CLASS_OP_OPERAND;
4035
goto CLASS_CONTINUE;
4036
}
4037
4038
/* Enter a non-empty class. */
4039
4040
if (class_start != NULL)
4041
{
4042
PCRE2_ASSERT(class_depth_m1 >= 0);
4043
/* Represents that the class is an extended class. */
4044
*class_start |= CLASS_IS_ECLASS;
4045
class_start = NULL;
4046
}
4047
4048
class_start = parsed_pattern;
4049
*parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
4050
class_range_state = RANGE_NO;
4051
class_op_state = CLASS_OP_EMPTY;
4052
class_mode_state = new_class_mode_state;
4053
++class_depth_m1;
4054
if (class_maxdepth_m1 < class_depth_m1)
4055
class_maxdepth_m1 = class_depth_m1;
4056
/* Reset; no op seen yet at new depth. */
4057
cb->class_op_used[class_depth_m1] = 0;
4058
4059
/* Implement the special start-of-class literal meaning of ']'. */
4060
if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4061
new_class_mode_state != CLASS_MODE_PERL_EXT)
4062
{
4063
class_range_state = RANGE_OK_LITERAL;
4064
class_op_state = CLASS_OP_OPERAND;
4065
PARSED_LITERAL(c, parsed_pattern);
4066
goto CLASS_CONTINUE;
4067
}
4068
4069
continue; /* We have already loaded c with the next character */
4070
}
4071
4072
/* Check for the end of the class. */
4073
4074
else if (c == CHAR_RIGHT_SQUARE_BRACKET ||
4075
(c == CHAR_RIGHT_PARENTHESIS && class_mode_state == CLASS_MODE_PERL_EXT))
4076
{
4077
/* In Perl extended mode, the ']' can only be used to match the
4078
opening '[', and ')' must match an opening parenthesis. */
4079
if (class_mode_state == CLASS_MODE_PERL_EXT)
4080
{
4081
if (c == CHAR_RIGHT_SQUARE_BRACKET && class_depth_m1 != 0)
4082
{
4083
errorcode = ERR14;
4084
goto FAILED_BACK;
4085
}
4086
if (c == CHAR_RIGHT_PARENTHESIS && class_depth_m1 < 1)
4087
{
4088
errorcode = ERR22;
4089
goto FAILED;
4090
}
4091
}
4092
4093
/* Check no trailing operator. */
4094
if (class_op_state == CLASS_OP_OPERATOR)
4095
{
4096
errorcode = ERR110;
4097
goto FAILED;
4098
}
4099
4100
/* Check no empty expression for Perl extended expressions. */
4101
if (class_mode_state == CLASS_MODE_PERL_EXT &&
4102
class_op_state == CLASS_OP_EMPTY)
4103
{
4104
errorcode = ERR114;
4105
goto FAILED;
4106
}
4107
4108
/* -] at the end of a class is a literal '-' */
4109
if (class_range_state == RANGE_STARTED)
4110
parsed_pattern[-1] = CHAR_MINUS;
4111
4112
*parsed_pattern++ = META_CLASS_END;
4113
4114
if (--class_depth_m1 < 0)
4115
{
4116
/* Check for and consume ')' after '(?[...]'. */
4117
PCRE2_ASSERT(class_mode_state != CLASS_MODE_PERL_EXT_LEAF);
4118
if (class_mode_state == CLASS_MODE_PERL_EXT)
4119
{
4120
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4121
{
4122
errorcode = ERR115;
4123
goto FAILED;
4124
}
4125
4126
ptr++;
4127
}
4128
4129
break;
4130
}
4131
4132
class_range_state = RANGE_NO; /* for processing the containing class */
4133
class_op_state = CLASS_OP_OPERAND;
4134
if (class_mode_state == CLASS_MODE_PERL_EXT_LEAF)
4135
class_mode_state = CLASS_MODE_PERL_EXT;
4136
/* The extended class flag has already
4137
been set for the parent class. */
4138
class_start = NULL;
4139
}
4140
4141
/* Handle a Perl set binary operator */
4142
4143
else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4144
(c == CHAR_PLUS || c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4145
c == CHAR_AMPERSAND || c == CHAR_CIRCUMFLEX_ACCENT))
4146
{
4147
/* Check that there was a preceding operand. */
4148
if (class_op_state != CLASS_OP_OPERAND)
4149
{
4150
errorcode = ERR109;
4151
goto FAILED;
4152
}
4153
4154
if (class_start != NULL)
4155
{
4156
PCRE2_ASSERT(class_depth_m1 >= 0);
4157
/* Represents that the class is an extended class. */
4158
*class_start |= CLASS_IS_ECLASS;
4159
class_start = NULL;
4160
}
4161
4162
PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4163
class_range_state != RANGE_FORBID_STARTED);
4164
4165
*parsed_pattern++ = c == CHAR_PLUS? META_ECLASS_OR :
4166
c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4167
c == CHAR_MINUS? META_ECLASS_SUB :
4168
c == CHAR_AMPERSAND? META_ECLASS_AND :
4169
META_ECLASS_XOR;
4170
class_range_state = RANGE_NO;
4171
class_op_state = CLASS_OP_OPERATOR;
4172
}
4173
4174
/* Handle a Perl set unary operator */
4175
4176
else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4177
c == CHAR_EXCLAMATION_MARK)
4178
{
4179
/* Check that the "!" has not got a preceding operand (i.e. it's the
4180
start of the class, or follows an operator). */
4181
if (class_op_state == CLASS_OP_OPERAND)
4182
{
4183
errorcode = ERR113;
4184
goto FAILED;
4185
}
4186
4187
if (class_start != NULL)
4188
{
4189
PCRE2_ASSERT(class_depth_m1 >= 0);
4190
/* Represents that the class is an extended class. */
4191
*class_start |= CLASS_IS_ECLASS;
4192
class_start = NULL;
4193
}
4194
4195
PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4196
class_range_state != RANGE_FORBID_STARTED);
4197
4198
*parsed_pattern++ = META_ECLASS_NOT;
4199
class_range_state = RANGE_NO;
4200
class_op_state = CLASS_OP_OPERATOR;
4201
}
4202
4203
/* Handle a UTS#18 set operator */
4204
4205
else if (class_mode_state == CLASS_MODE_ALT_EXT &&
4206
(c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4207
c == CHAR_AMPERSAND || c == CHAR_TILDE) &&
4208
ptr < ptrend && *ptr == c)
4209
{
4210
++ptr;
4211
4212
/* Check there isn't a triple-repetition. */
4213
if (ptr < ptrend && *ptr == c)
4214
{
4215
while (ptr < ptrend && *ptr == c) ++ptr; /* Improve error offset. */
4216
errorcode = ERR108;
4217
goto FAILED;
4218
}
4219
4220
/* Check for a preceding operand. */
4221
if (class_op_state != CLASS_OP_OPERAND)
4222
{
4223
errorcode = ERR109;
4224
goto FAILED;
4225
}
4226
4227
/* Check for mixed precedence. Forbid [A--B&&C]. */
4228
if (cb->class_op_used[class_depth_m1] != 0 &&
4229
cb->class_op_used[class_depth_m1] != (uint8_t)c)
4230
{
4231
errorcode = ERR111;
4232
goto FAILED;
4233
}
4234
4235
if (class_start != NULL)
4236
{
4237
PCRE2_ASSERT(class_depth_m1 >= 0);
4238
/* Represents that the class is an extended class. */
4239
*class_start |= CLASS_IS_ECLASS;
4240
class_start = NULL;
4241
}
4242
4243
/* Dangling '-' before an operator is a literal */
4244
if (class_range_state == RANGE_STARTED)
4245
parsed_pattern[-1] = CHAR_MINUS;
4246
4247
*parsed_pattern++ = c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4248
c == CHAR_MINUS? META_ECLASS_SUB :
4249
c == CHAR_AMPERSAND? META_ECLASS_AND :
4250
META_ECLASS_XOR;
4251
class_range_state = RANGE_NO;
4252
class_op_state = CLASS_OP_OPERATOR;
4253
cb->class_op_used[class_depth_m1] = (uint8_t)c;
4254
}
4255
4256
/* Handle escapes in a class */
4257
4258
else if (c == CHAR_BACKSLASH)
4259
{
4260
tempptr = ptr;
4261
escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
4262
xoptions, cb->bracount, TRUE, cb);
4263
4264
if (errorcode != 0)
4265
{
4266
if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0 ||
4267
class_mode_state >= CLASS_MODE_PERL_EXT)
4268
goto FAILED;
4269
ptr = tempptr;
4270
if (ptr >= ptrend) c = CHAR_BACKSLASH; else
4271
{
4272
GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
4273
}
4274
escape = 0; /* Treat as literal character */
4275
}
4276
4277
switch(escape)
4278
{
4279
case 0: /* Escaped character code point is in c */
4280
char_is_literal = FALSE;
4281
goto CLASS_LITERAL; /* (a few lines above) */
4282
4283
case ESC_b:
4284
c = CHAR_BS; /* \b is backspace in a class */
4285
char_is_literal = FALSE;
4286
goto CLASS_LITERAL;
4287
4288
case ESC_k:
4289
c = CHAR_k; /* \k is not special in a class, just like \g */
4290
char_is_literal = FALSE;
4291
goto CLASS_LITERAL;
4292
4293
case ESC_Q:
4294
inescq = TRUE; /* Enter literal mode */
4295
goto CLASS_CONTINUE;
4296
4297
case ESC_E: /* Ignore orphan \E */
4298
goto CLASS_CONTINUE;
4299
4300
case ESC_B: /* Always an error in a class */
4301
case ESC_R:
4302
case ESC_X:
4303
errorcode = ERR7;
4304
ptr--; // TODO https://github.com/PCRE2Project/pcre2/issues/549
4305
goto FAILED;
4306
4307
case ESC_N: /* Not permitted by Perl either */
4308
errorcode = ERR71;
4309
goto FAILED;
4310
4311
case ESC_H:
4312
case ESC_h:
4313
case ESC_V:
4314
case ESC_v:
4315
*parsed_pattern++ = META_ESCAPE + escape;
4316
break;
4317
4318
/* These escapes may be converted to Unicode property tests when
4319
PCRE2_UCP is set. */
4320
4321
case ESC_d:
4322
case ESC_D:
4323
case ESC_s:
4324
case ESC_S:
4325
case ESC_w:
4326
case ESC_W:
4327
parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
4328
xoptions);
4329
break;
4330
4331
/* Explicit Unicode property matching */
4332
4333
case ESC_P:
4334
case ESC_p:
4335
#ifdef SUPPORT_UNICODE
4336
{
4337
BOOL negated;
4338
uint16_t ptype = 0, pdata = 0;
4339
if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
4340
goto FAILED;
4341
4342
/* In caseless matching, particular characteristics Lu, Ll, and Lt
4343
get converted to the general characteristic L&. That is, upper,
4344
lower, and title case letters are all conflated. */
4345
4346
if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
4347
(pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
4348
{
4349
ptype = PT_LAMP;
4350
pdata = 0;
4351
}
4352
4353
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
4354
*parsed_pattern++ = META_ESCAPE + escape;
4355
*parsed_pattern++ = (ptype << 16) | pdata;
4356
}
4357
#else
4358
errorcode = ERR45;
4359
goto FAILED;
4360
#endif
4361
break; /* End \P and \p */
4362
4363
/* All others are not allowed in a class */
4364
4365
default:
4366
PCRE2_DEBUG_UNREACHABLE();
4367
/* Fall through */
4368
4369
case ESC_A:
4370
case ESC_Z:
4371
case ESC_z:
4372
case ESC_G:
4373
case ESC_K:
4374
case ESC_C:
4375
errorcode = ERR7;
4376
ptr--; // TODO https://github.com/PCRE2Project/pcre2/issues/549
4377
goto FAILED;
4378
}
4379
4380
/* All the switch-cases above which end in "break" describe a set
4381
of characters. None may start a range. */
4382
4383
/* The second part of a range can be a single-character escape
4384
sequence (detected above), but not any of the other escapes. Perl
4385
treats a hyphen as a literal in such circumstances. However, in Perl's
4386
warning mode, a warning is given, so PCRE now faults it, as it is
4387
almost certainly a mistake on the user's part. */
4388
4389
if (class_range_state == RANGE_STARTED)
4390
{
4391
errorcode = ERR50;
4392
goto FAILED;
4393
}
4394
4395
/* Perl gives a warning unless the hyphen following a multi-character
4396
escape is the last character in the class. PCRE throws an error. */
4397
4398
if (class_range_state == RANGE_FORBID_STARTED)
4399
{
4400
ptr = class_range_forbid_ptr;
4401
errorcode = ERR50;
4402
goto FAILED;
4403
}
4404
4405
/* Disallow implicit union in Perl extended classes. */
4406
4407
if (class_op_state == CLASS_OP_OPERAND &&
4408
class_mode_state == CLASS_MODE_PERL_EXT)
4409
{
4410
errorcode = ERR113;
4411
goto FAILED;
4412
}
4413
4414
class_range_state = RANGE_FORBID_NO;
4415
class_op_state = CLASS_OP_OPERAND;
4416
}
4417
4418
/* Forbid unescaped literals, and the special meaning of '-', inside a
4419
Perl extended class. */
4420
4421
else if (class_mode_state == CLASS_MODE_PERL_EXT)
4422
{
4423
errorcode = ERR116;
4424
goto FAILED;
4425
}
4426
4427
/* Handle potential start of range */
4428
4429
else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
4430
{
4431
*parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
4432
META_RANGE_LITERAL : META_RANGE_ESCAPED;
4433
class_range_state = RANGE_STARTED;
4434
}
4435
4436
/* Handle forbidden start of range */
4437
4438
else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)
4439
{
4440
*parsed_pattern++ = CHAR_MINUS;
4441
class_range_state = RANGE_FORBID_STARTED;
4442
class_range_forbid_ptr = ptr;
4443
}
4444
4445
/* Handle a literal character */
4446
4447
else
4448
{
4449
CLASS_LITERAL:
4450
4451
/* Disallow implicit union in Perl extended classes. */
4452
4453
if (class_op_state == CLASS_OP_OPERAND &&
4454
class_mode_state == CLASS_MODE_PERL_EXT)
4455
{
4456
errorcode = ERR113;
4457
goto FAILED;
4458
}
4459
4460
if (class_range_state == RANGE_STARTED)
4461
{
4462
if (c == parsed_pattern[-2]) /* Optimize one-char range */
4463
parsed_pattern--;
4464
else if (parsed_pattern[-2] > c) /* Check range is in order */
4465
{
4466
errorcode = ERR8;
4467
goto FAILED_BACK; // TODO https://github.com/PCRE2Project/pcre2/issues/549
4468
}
4469
else
4470
{
4471
if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
4472
parsed_pattern[-1] = META_RANGE_ESCAPED;
4473
PARSED_LITERAL(c, parsed_pattern);
4474
}
4475
class_range_state = RANGE_NO;
4476
class_op_state = CLASS_OP_OPERAND;
4477
}
4478
else if (class_range_state == RANGE_FORBID_STARTED)
4479
{
4480
ptr = class_range_forbid_ptr;
4481
errorcode = ERR50;
4482
goto FAILED;
4483
}
4484
else /* Potential start of range */
4485
{
4486
class_range_state = char_is_literal?
4487
RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
4488
class_op_state = CLASS_OP_OPERAND;
4489
PARSED_LITERAL(c, parsed_pattern);
4490
}
4491
}
4492
4493
/* Proceed to next thing in the class. */
4494
4495
CLASS_CONTINUE:
4496
if (ptr >= ptrend)
4497
{
4498
if (class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 > 0)
4499
errorcode = ERR14; /* Missing terminating ')' */
4500
if (class_mode_state == CLASS_MODE_ALT_EXT &&
4501
class_depth_m1 == 0 && class_maxdepth_m1 == 1)
4502
errorcode = ERR112; /* Missing terminating ']', but we saw '[ [ ]...' */
4503
else
4504
errorcode = ERR6; /* Missing terminating ']' */
4505
goto FAILED;
4506
}
4507
GETCHARINCTEST(c, ptr);
4508
} /* End of class-processing loop */
4509
4510
break; /* End of character class */
4511
4512
4513
/* ---- Opening parenthesis ---- */
4514
4515
case CHAR_LEFT_PARENTHESIS:
4516
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4517
4518
/* If ( is not followed by ? it is either a capture or a special verb or an
4519
alpha assertion or a positive non-atomic lookahead. */
4520
4521
if (*ptr != CHAR_QUESTION_MARK)
4522
{
4523
const char *vn;
4524
4525
/* Handle capturing brackets (or non-capturing if auto-capture is turned
4526
off). */
4527
4528
if (*ptr != CHAR_ASTERISK)
4529
{
4530
nest_depth++;
4531
if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
4532
{
4533
if (cb->bracount >= MAX_GROUP_NUMBER)
4534
{
4535
errorcode = ERR97;
4536
goto FAILED;
4537
}
4538
cb->bracount++;
4539
*parsed_pattern++ = META_CAPTURE | cb->bracount;
4540
}
4541
else *parsed_pattern++ = META_NOCAPTURE;
4542
}
4543
4544
/* Do nothing for (* followed by end of pattern or ) so it gives a "bad
4545
quantifier" error rather than "(*MARK) must have an argument". */
4546
4547
else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
4548
break;
4549
4550
/* Handle "alpha assertions" such as (*pla:...). Most of these are
4551
synonyms for the historical symbolic assertions, but the script run and
4552
non-atomic lookaround ones are new. They are distinguished by starting
4553
with a lower case letter. Checking both ends of the alphabet makes this
4554
work in all character codes. */
4555
4556
else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
4557
{
4558
uint32_t meta;
4559
4560
vn = alasnames;
4561
if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4562
&errorcode, cb)) goto FAILED;
4563
if (ptr >= ptrend || *ptr != CHAR_COLON)
4564
{
4565
errorcode = ERR95; /* Malformed */
4566
goto FAILED;
4567
}
4568
4569
/* Scan the table of alpha assertion names */
4570
4571
for (i = 0; i < alascount; i++)
4572
{
4573
if (namelen == alasmeta[i].len &&
4574
PRIV(strncmp_c8)(name, vn, namelen) == 0)
4575
break;
4576
vn += alasmeta[i].len + 1;
4577
}
4578
4579
if (i >= alascount)
4580
{
4581
errorcode = ERR95; /* Alpha assertion not recognized */
4582
goto FAILED;
4583
}
4584
4585
/* Check for expecting an assertion condition. If so, only atomic
4586
lookaround assertions are valid. */
4587
4588
meta = alasmeta[i].meta;
4589
if (prev_expect_cond_assert > 0 &&
4590
(meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
4591
{
4592
errorcode = ERR28; /* Atomic assertion expected */
4593
goto FAILED;
4594
}
4595
4596
/* The lookaround alphabetic synonyms can mostly be handled by jumping
4597
to the code that handles the traditional symbolic forms. */
4598
4599
switch(meta)
4600
{
4601
default:
4602
PCRE2_DEBUG_UNREACHABLE();
4603
errorcode = ERR89; /* Unknown code; should never occur because */
4604
goto FAILED; /* the meta values come from a table above. */
4605
4606
case META_ATOMIC:
4607
goto ATOMIC_GROUP;
4608
4609
case META_LOOKAHEAD:
4610
goto POSITIVE_LOOK_AHEAD;
4611
4612
case META_LOOKAHEAD_NA:
4613
goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4614
4615
case META_LOOKAHEADNOT:
4616
goto NEGATIVE_LOOK_AHEAD;
4617
4618
case META_SCS:
4619
if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4620
4621
if (*ptr != CHAR_LEFT_PARENTHESIS)
4622
{
4623
errorcode = ERR15;
4624
goto FAILED;
4625
}
4626
4627
ptr++;
4628
*parsed_pattern++ = META_SCS;
4629
/* Temporary variable, zero in the first iteration. */
4630
offset = 0;
4631
4632
for (;;)
4633
{
4634
PCRE2_SIZE next_offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4635
4636
/* Handle (*scan_substring:([+-]number)... */
4637
if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,
4638
&i, &errorcode))
4639
{
4640
PCRE2_ASSERT(i >= 0);
4641
if (i <= 0)
4642
{
4643
errorcode = ERR15;
4644
goto FAILED;
4645
}
4646
meta = META_SCS_NUMBER;
4647
namelen = (uint32_t)i;
4648
}
4649
else if (errorcode != 0) goto FAILED; /* Number too big */
4650
else
4651
{
4652
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4653
4654
/* Handle (*scan_substring:('name') or (*scan_substring:(<name>) */
4655
if (*ptr == CHAR_LESS_THAN_SIGN)
4656
terminator = CHAR_GREATER_THAN_SIGN;
4657
else if (*ptr == CHAR_APOSTROPHE)
4658
terminator = CHAR_APOSTROPHE;
4659
else
4660
{
4661
errorcode = ERR15;
4662
goto FAILED;
4663
}
4664
4665
if (!read_name(&ptr, ptrend, utf, terminator, &next_offset,
4666
&name, &namelen, &errorcode, cb)) goto FAILED;
4667
4668
meta = META_SCS_NAME;
4669
}
4670
4671
PCRE2_ASSERT(next_offset > 0);
4672
if (offset == 0 || (next_offset - offset) >= 0x10000)
4673
{
4674
*parsed_pattern++ = META_OFFSET;
4675
PUTOFFSET(next_offset, parsed_pattern);
4676
offset = next_offset;
4677
}
4678
4679
/* The offset is encoded as a relative offset, because for some
4680
inputs such as ",2" in (*scs:(1,2,3)...), we only have space for
4681
two uint32_t values, and an opcode and absolute offset may require
4682
three uint32_t values. */
4683
*parsed_pattern++ = meta | (uint32_t)(next_offset - offset);
4684
*parsed_pattern++ = namelen;
4685
offset = next_offset;
4686
4687
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4688
4689
if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
4690
4691
if (*ptr != CHAR_COMMA)
4692
{
4693
errorcode = ERR24;
4694
goto FAILED;
4695
}
4696
4697
ptr++;
4698
}
4699
ptr++;
4700
goto POST_ASSERTION;
4701
4702
case META_LOOKBEHIND:
4703
case META_LOOKBEHINDNOT:
4704
case META_LOOKBEHIND_NA:
4705
*parsed_pattern++ = meta;
4706
ptr--;
4707
goto POST_LOOKBEHIND;
4708
4709
/* The script run facilities are handled here. Unicode support is
4710
required (give an error if not, as this is a security issue). Always
4711
record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4712
META_ATOMIC and remember that we need two META_KETs at the end. */
4713
4714
case META_SCRIPT_RUN:
4715
case META_ATOMIC_SCRIPT_RUN:
4716
#ifdef SUPPORT_UNICODE
4717
*parsed_pattern++ = META_SCRIPT_RUN;
4718
nest_depth++;
4719
ptr++;
4720
if (meta == META_ATOMIC_SCRIPT_RUN)
4721
{
4722
*parsed_pattern++ = META_ATOMIC;
4723
if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4724
else if (++top_nest >= end_nests)
4725
{
4726
errorcode = ERR84;
4727
goto FAILED;
4728
}
4729
top_nest->nest_depth = nest_depth;
4730
top_nest->flags = NSF_ATOMICSR;
4731
top_nest->options = options & PARSE_TRACKED_OPTIONS;
4732
top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4733
4734
#ifdef PCRE2_DEBUG
4735
/* We'll write out two META_KETs for a single ")" in the input
4736
pattern, so we reserve space for that in our bounds check. */
4737
parsed_pattern_extra++;
4738
#endif
4739
}
4740
break;
4741
#else /* SUPPORT_UNICODE */
4742
errorcode = ERR96;
4743
goto FAILED;
4744
#endif
4745
}
4746
}
4747
4748
4749
/* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4750
4751
else
4752
{
4753
vn = verbnames;
4754
if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4755
&errorcode, cb)) goto FAILED;
4756
if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4757
*ptr != CHAR_RIGHT_PARENTHESIS))
4758
{
4759
errorcode = ERR60; /* Malformed */
4760
goto FAILED;
4761
}
4762
4763
/* Scan the table of verb names */
4764
4765
for (i = 0; i < verbcount; i++)
4766
{
4767
if (namelen == verbs[i].len &&
4768
PRIV(strncmp_c8)(name, vn, namelen) == 0)
4769
break;
4770
vn += verbs[i].len + 1;
4771
}
4772
4773
if (i >= verbcount)
4774
{
4775
errorcode = ERR60; /* Verb not recognized */
4776
goto FAILED;
4777
}
4778
4779
/* An empty argument is treated as no argument. */
4780
4781
if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4782
ptr[1] == CHAR_RIGHT_PARENTHESIS)
4783
ptr++; /* Advance to the closing parens */
4784
4785
/* Check for mandatory non-empty argument; this is (*MARK) */
4786
4787
if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4788
{
4789
errorcode = ERR66;
4790
goto FAILED;
4791
}
4792
4793
/* Remember where this verb, possibly with a preceding (*MARK), starts,
4794
for handling quantified (*ACCEPT). */
4795
4796
verbstartptr = parsed_pattern;
4797
okquantifier = (verbs[i].meta == META_ACCEPT);
4798
#ifdef PCRE2_DEBUG
4799
/* Reserve space in our bounds check for optionally wrapping the (*ACCEPT)
4800
with a non-capturing bracket, if there is a following quantifier. */
4801
if (okquantifier) parsed_pattern_extra += 2;
4802
#endif
4803
4804
/* It appears that Perl allows any characters whatsoever, other than a
4805
closing parenthesis, to appear in arguments ("names"), so we no longer
4806
insist on letters, digits, and underscores. Perl does not, however, do
4807
any interpretation within arguments, and has no means of including a
4808
closing parenthesis. PCRE supports escape processing but only when it
4809
is requested by an option. We set inverbname TRUE here, and let the
4810
main loop take care of this so that escape and \x processing is done by
4811
the main code above. */
4812
4813
if (*ptr++ == CHAR_COLON) /* Skip past : or ) */
4814
{
4815
/* Some optional arguments can be treated as a preceding (*MARK) */
4816
4817
if (verbs[i].has_arg < 0)
4818
{
4819
add_after_mark = verbs[i].meta;
4820
*parsed_pattern++ = META_MARK;
4821
}
4822
4823
/* The remaining verbs with arguments (except *MARK) need a different
4824
opcode. */
4825
4826
else
4827
{
4828
*parsed_pattern++ = verbs[i].meta +
4829
((verbs[i].meta != META_MARK)? 0x00010000u:0);
4830
}
4831
4832
/* Set up for reading the name in the main loop. */
4833
4834
verblengthptr = parsed_pattern++;
4835
verbnamestart = ptr;
4836
inverbname = TRUE;
4837
}
4838
else /* No verb "name" argument */
4839
{
4840
*parsed_pattern++ = verbs[i].meta;
4841
}
4842
} /* End of (*VERB) handling */
4843
break; /* Done with this parenthesis */
4844
} /* End of groups that don't start with (? */
4845
4846
4847
/* ---- Items starting (? ---- */
4848
4849
/* The type of item is determined by what follows (?. Handle (?| and option
4850
changes under "default" because both need a new block on the nest stack.
4851
Comments starting with (?# are handled above. Note that there is some
4852
ambiguity about the sequence (?- because if a digit follows it's a relative
4853
recursion or subroutine call whereas otherwise it's an option unsetting. */
4854
4855
if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4856
4857
switch(*ptr)
4858
{
4859
default:
4860
if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4861
goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */
4862
4863
/* We now have either (?| or a (possibly empty) option setting,
4864
optionally followed by a non-capturing group. */
4865
4866
nest_depth++;
4867
if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4868
else if (++top_nest >= end_nests)
4869
{
4870
errorcode = ERR84;
4871
goto FAILED;
4872
}
4873
top_nest->nest_depth = nest_depth;
4874
top_nest->flags = 0;
4875
top_nest->options = options & PARSE_TRACKED_OPTIONS;
4876
top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4877
4878
/* Start of non-capturing group that resets the capture count for each
4879
branch. */
4880
4881
if (*ptr == CHAR_VERTICAL_LINE)
4882
{
4883
top_nest->reset_group = (uint16_t)cb->bracount;
4884
top_nest->max_group = (uint16_t)cb->bracount;
4885
top_nest->flags |= NSF_RESET;
4886
cb->external_flags |= PCRE2_DUPCAPUSED;
4887
*parsed_pattern++ = META_NOCAPTURE;
4888
ptr++;
4889
}
4890
4891
/* Scan for options imnrsxJU to be set or unset. */
4892
4893
else
4894
{
4895
BOOL hyphenok = TRUE;
4896
uint32_t oldoptions = options;
4897
uint32_t oldxoptions = xoptions;
4898
4899
top_nest->reset_group = 0;
4900
top_nest->max_group = 0;
4901
set = unset = 0;
4902
optset = &set;
4903
xset = xunset = 0;
4904
xoptset = &xset;
4905
4906
/* ^ at the start unsets irmnsx and disables the subsequent use of - */
4907
4908
if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4909
{
4910
options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4911
PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4912
xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
4913
hyphenok = FALSE;
4914
ptr++;
4915
}
4916
4917
while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4918
*ptr != CHAR_COLON)
4919
{
4920
switch (*ptr++)
4921
{
4922
case CHAR_MINUS:
4923
if (!hyphenok)
4924
{
4925
errorcode = ERR94;
4926
ptr--; /* Correct the offset */
4927
goto FAILED;
4928
}
4929
optset = &unset;
4930
xoptset = &xunset;
4931
hyphenok = FALSE;
4932
break;
4933
4934
/* There are some two-character sequences that start with 'a'. */
4935
4936
case CHAR_a:
4937
if (ptr < ptrend)
4938
{
4939
if (*ptr == CHAR_D)
4940
{
4941
*xoptset |= PCRE2_EXTRA_ASCII_BSD;
4942
ptr++;
4943
break;
4944
}
4945
if (*ptr == CHAR_P)
4946
{
4947
*xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
4948
ptr++;
4949
break;
4950
}
4951
if (*ptr == CHAR_S)
4952
{
4953
*xoptset |= PCRE2_EXTRA_ASCII_BSS;
4954
ptr++;
4955
break;
4956
}
4957
if (*ptr == CHAR_T)
4958
{
4959
*xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
4960
ptr++;
4961
break;
4962
}
4963
if (*ptr == CHAR_W)
4964
{
4965
*xoptset |= PCRE2_EXTRA_ASCII_BSW;
4966
ptr++;
4967
break;
4968
}
4969
}
4970
*xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
4971
PCRE2_EXTRA_ASCII_BSW|
4972
PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
4973
break;
4974
4975
case CHAR_J: /* Record that it changed in the external options */
4976
*optset |= PCRE2_DUPNAMES;
4977
cb->external_flags |= PCRE2_JCHANGED;
4978
break;
4979
4980
case CHAR_i: *optset |= PCRE2_CASELESS; break;
4981
case CHAR_m: *optset |= PCRE2_MULTILINE; break;
4982
case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
4983
case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
4984
case CHAR_s: *optset |= PCRE2_DOTALL; break;
4985
case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4986
4987
/* If x appears twice it sets the extended extended option. */
4988
4989
case CHAR_x:
4990
*optset |= PCRE2_EXTENDED;
4991
if (ptr < ptrend && *ptr == CHAR_x)
4992
{
4993
*optset |= PCRE2_EXTENDED_MORE;
4994
ptr++;
4995
}
4996
break;
4997
4998
default:
4999
errorcode = ERR11;
5000
ptr--; /* Correct the offset */
5001
goto FAILED;
5002
}
5003
}
5004
5005
/* If we are setting extended without extended-more, ensure that any
5006
existing extended-more gets unset. Also, unsetting extended must also
5007
unset extended-more. */
5008
5009
if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
5010
(unset & PCRE2_EXTENDED) != 0)
5011
unset |= PCRE2_EXTENDED_MORE;
5012
5013
options = (options | set) & (~unset);
5014
xoptions = (xoptions | xset) & (~xunset);
5015
5016
/* If the options ended with ')' this is not the start of a nested
5017
group with option changes, so the options change at this level.
5018
In this case, if the previous level set up a nest block, discard the
5019
one we have just created. Otherwise adjust it for the previous level.
5020
If the options ended with ':' we are starting a non-capturing group,
5021
possibly with an options setting. */
5022
5023
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5024
if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
5025
{
5026
nest_depth--; /* This is not a nested group after all. */
5027
if (top_nest > (nest_save *)(cb->start_workspace) &&
5028
(top_nest-1)->nest_depth == nest_depth) top_nest--;
5029
else top_nest->nest_depth = nest_depth;
5030
}
5031
else *parsed_pattern++ = META_NOCAPTURE;
5032
5033
/* If nothing changed, no need to record. */
5034
5035
if (options != oldoptions || xoptions != oldxoptions)
5036
{
5037
*parsed_pattern++ = META_OPTIONS;
5038
*parsed_pattern++ = options;
5039
*parsed_pattern++ = xoptions;
5040
}
5041
} /* End options processing */
5042
break; /* End default case after (? */
5043
5044
5045
/* ---- Python syntax support ---- */
5046
5047
case CHAR_P:
5048
if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5049
5050
/* (?P<name> is the same as (?<name>, which defines a named group. */
5051
5052
if (*ptr == CHAR_LESS_THAN_SIGN)
5053
{
5054
terminator = CHAR_GREATER_THAN_SIGN;
5055
goto DEFINE_NAME;
5056
}
5057
5058
/* (?P>name) is the same as (?&name), which is a recursion or subroutine
5059
call. */
5060
5061
if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
5062
5063
/* (?P=name) is the same as \k<name>, a back reference by name. Anything
5064
else after (?P is an error. */
5065
5066
if (*ptr != CHAR_EQUALS_SIGN)
5067
{
5068
errorcode = ERR41;
5069
goto FAILED;
5070
}
5071
if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
5072
&namelen, &errorcode, cb)) goto FAILED;
5073
*parsed_pattern++ = META_BACKREF_BYNAME;
5074
*parsed_pattern++ = namelen;
5075
PUTOFFSET(offset, parsed_pattern);
5076
okquantifier = TRUE;
5077
break; /* End of (?P processing */
5078
5079
5080
/* ---- Recursion/subroutine calls by number ---- */
5081
5082
case CHAR_R:
5083
i = 0; /* (?R) == (?R0) */
5084
ptr++;
5085
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5086
{
5087
errorcode = ERR58;
5088
goto FAILED;
5089
}
5090
goto SET_RECURSION;
5091
5092
/* An item starting (?- followed by a digit comes here via the "default"
5093
case because (?- followed by a non-digit is an options setting. */
5094
5095
case CHAR_PLUS:
5096
if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
5097
{
5098
errorcode = ERR29; /* Missing number */
5099
goto FAILED;
5100
}
5101
/* Fall through */
5102
5103
case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5104
case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5105
RECURSION_BYNUMBER:
5106
if (!read_number(&ptr, ptrend,
5107
(IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
5108
MAX_GROUP_NUMBER, ERR61,
5109
&i, &errorcode)) goto FAILED;
5110
PCRE2_ASSERT(i >= 0); /* NB (?0) is permitted, represented by i=0 */
5111
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5112
goto UNCLOSED_PARENTHESIS;
5113
5114
SET_RECURSION:
5115
*parsed_pattern++ = META_RECURSE | (uint32_t)i;
5116
offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5117
ptr++;
5118
PUTOFFSET(offset, parsed_pattern);
5119
okquantifier = TRUE;
5120
break; /* End of recursive call by number handling */
5121
5122
5123
/* ---- Recursion/subroutine calls by name ---- */
5124
5125
case CHAR_AMPERSAND:
5126
RECURSE_BY_NAME:
5127
if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
5128
&namelen, &errorcode, cb)) goto FAILED;
5129
*parsed_pattern++ = META_RECURSE_BYNAME;
5130
*parsed_pattern++ = namelen;
5131
PUTOFFSET(offset, parsed_pattern);
5132
okquantifier = TRUE;
5133
break;
5134
5135
/* ---- Callout with numerical or string argument ---- */
5136
5137
case CHAR_C:
5138
if ((xoptions & PCRE2_EXTRA_NEVER_CALLOUT) != 0)
5139
{
5140
errorcode = ERR103;
5141
goto FAILED;
5142
}
5143
5144
if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5145
5146
/* If the previous item was a condition starting (?(? an assertion,
5147
optionally preceded by a callout, is expected. This is checked later on,
5148
during actual compilation. However we need to identify this kind of
5149
assertion in this pass because it must not be qualified. The value of
5150
expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5151
for a callout - still leaving a positive value that identifies the
5152
assertion. Multiple callouts or any other items will make it zero or
5153
less, which doesn't matter because they will cause an error later. */
5154
5155
expect_cond_assert = prev_expect_cond_assert - 1;
5156
5157
/* If previous_callout is not NULL, it means this follows a previous
5158
callout. If it was a manual callout, do nothing; this means its "length
5159
of next pattern item" field will remain zero. If it was an automatic
5160
callout, abolish it. */
5161
5162
if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
5163
previous_callout == parsed_pattern - 4 &&
5164
parsed_pattern[-1] == 255)
5165
parsed_pattern = previous_callout;
5166
5167
/* Save for updating next pattern item length, and skip one item before
5168
completing. */
5169
5170
previous_callout = parsed_pattern;
5171
after_manual_callout = 1;
5172
5173
/* Handle a string argument; specific delimiter is required. */
5174
5175
if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
5176
{
5177
PCRE2_SIZE calloutlength;
5178
PCRE2_SPTR startptr = ptr;
5179
5180
delimiter = 0;
5181
for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
5182
{
5183
if (*ptr == PRIV(callout_start_delims)[i])
5184
{
5185
delimiter = PRIV(callout_end_delims)[i];
5186
break;
5187
}
5188
}
5189
if (delimiter == 0)
5190
{
5191
errorcode = ERR82;
5192
goto FAILED;
5193
}
5194
5195
*parsed_pattern = META_CALLOUT_STRING;
5196
parsed_pattern += 3; /* Skip pattern info */
5197
5198
for (;;)
5199
{
5200
if (++ptr >= ptrend)
5201
{
5202
errorcode = ERR81;
5203
ptr = startptr; /* To give a more useful message */
5204
goto FAILED;
5205
}
5206
if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
5207
break;
5208
}
5209
5210
calloutlength = (PCRE2_SIZE)(ptr - startptr);
5211
if (calloutlength > UINT32_MAX)
5212
{
5213
errorcode = ERR72;
5214
goto FAILED;
5215
}
5216
*parsed_pattern++ = (uint32_t)calloutlength;
5217
offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
5218
PUTOFFSET(offset, parsed_pattern);
5219
}
5220
5221
/* Handle a callout with an optional numerical argument, which must be
5222
less than or equal to 255. A missing argument gives 0. */
5223
5224
else
5225
{
5226
int n = 0;
5227
*parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */
5228
parsed_pattern += 3; /* Skip pattern info */
5229
while (ptr < ptrend && IS_DIGIT(*ptr))
5230
{
5231
n = n * 10 + (*ptr++ - CHAR_0);
5232
if (n > 255)
5233
{
5234
errorcode = ERR38;
5235
goto FAILED;
5236
}
5237
}
5238
*parsed_pattern++ = n;
5239
}
5240
5241
/* Both formats must have a closing parenthesis */
5242
5243
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5244
{
5245
errorcode = ERR39;
5246
goto FAILED;
5247
}
5248
ptr++;
5249
5250
/* Remember the offset to the next item in the pattern, and set a default
5251
length. This should get updated after the next item is read. */
5252
5253
previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
5254
previous_callout[2] = 0;
5255
break; /* End callout */
5256
5257
5258
/* ---- Conditional group ---- */
5259
5260
/* A condition can be an assertion, a number (referring to a numbered
5261
group's having been set), a name (referring to a named group), or 'R',
5262
referring to overall recursion. R<digits> and R&name are also permitted
5263
for recursion state tests. Numbers may be preceded by + or - to specify a
5264
relative group number.
5265
5266
There are several syntaxes for testing a named group: (?(name)) is used
5267
by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5268
5269
There are two unfortunate ambiguities. 'R' can be the recursive thing or
5270
the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
5271
the Perl DEFINE feature or the Python named test. We look for a name
5272
first; if not found, we try the other case.
5273
5274
For compatibility with auto-callouts, we allow a callout to be specified
5275
before a condition that is an assertion. */
5276
5277
case CHAR_LEFT_PARENTHESIS:
5278
if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5279
nest_depth++;
5280
5281
/* If the next character is ? or * there must be an assertion next
5282
(optionally preceded by a callout). We do not check this here, but
5283
instead we set expect_cond_assert to 2. If this is still greater than
5284
zero (callouts decrement it) when the next assertion is read, it will be
5285
marked as a condition that must not be repeated. A value greater than
5286
zero also causes checking that an assertion (possibly with callout)
5287
follows. */
5288
5289
if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
5290
{
5291
*parsed_pattern++ = META_COND_ASSERT;
5292
ptr--; /* Pull pointer back to the opening parenthesis. */
5293
expect_cond_assert = 2;
5294
break; /* End of conditional */
5295
}
5296
5297
/* Handle (?([+-]number)... */
5298
5299
if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
5300
&errorcode))
5301
{
5302
PCRE2_ASSERT(i >= 0);
5303
if (i <= 0)
5304
{
5305
errorcode = ERR15;
5306
goto FAILED;
5307
}
5308
*parsed_pattern++ = META_COND_NUMBER;
5309
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5310
PUTOFFSET(offset, parsed_pattern);
5311
*parsed_pattern++ = i;
5312
}
5313
else if (errorcode != 0) goto FAILED; /* Number too big */
5314
5315
/* No number found. Handle the special case (?(VERSION[>]=n.m)... */
5316
5317
else if (ptrend - ptr >= 10 &&
5318
PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
5319
ptr[7] != CHAR_RIGHT_PARENTHESIS)
5320
{
5321
uint32_t ge = 0;
5322
int major = 0;
5323
int minor = 0;
5324
5325
ptr += 7;
5326
if (*ptr == CHAR_GREATER_THAN_SIGN)
5327
{
5328
ge = 1;
5329
ptr++;
5330
}
5331
5332
/* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
5333
references its argument twice. */
5334
5335
if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
5336
goto BAD_VERSION_CONDITION;
5337
5338
if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
5339
goto FAILED;
5340
5341
if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
5342
if (*ptr == CHAR_DOT)
5343
{
5344
if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
5345
minor = (*ptr++ - CHAR_0) * 10;
5346
if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
5347
if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
5348
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5349
goto BAD_VERSION_CONDITION;
5350
}
5351
5352
*parsed_pattern++ = META_COND_VERSION;
5353
*parsed_pattern++ = ge;
5354
*parsed_pattern++ = major;
5355
*parsed_pattern++ = minor;
5356
}
5357
5358
/* All the remaining cases now require us to read a name. We cannot at
5359
this stage distinguish ambiguous cases such as (?(R12) which might be a
5360
recursion test by number or a name, because the named groups have not yet
5361
all been identified. Those cases are treated as names, but given a
5362
different META code. */
5363
5364
else
5365
{
5366
BOOL was_r_ampersand = FALSE;
5367
5368
if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
5369
{
5370
terminator = CHAR_RIGHT_PARENTHESIS;
5371
was_r_ampersand = TRUE;
5372
ptr++;
5373
}
5374
else if (*ptr == CHAR_LESS_THAN_SIGN)
5375
terminator = CHAR_GREATER_THAN_SIGN;
5376
else if (*ptr == CHAR_APOSTROPHE)
5377
terminator = CHAR_APOSTROPHE;
5378
else
5379
{
5380
terminator = CHAR_RIGHT_PARENTHESIS;
5381
ptr--; /* Point to char before name */
5382
}
5383
if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5384
&errorcode, cb)) goto FAILED;
5385
5386
/* Handle (?(R&name) */
5387
5388
if (was_r_ampersand)
5389
{
5390
*parsed_pattern = META_COND_RNAME;
5391
ptr--; /* Back to closing parens */
5392
}
5393
5394
/* Handle (?(name). If the name is "DEFINE" we identify it with a
5395
special code. Likewise if the name consists of R followed only by
5396
digits. Otherwise, handle it like a quoted name. */
5397
5398
else if (terminator == CHAR_RIGHT_PARENTHESIS)
5399
{
5400
if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
5401
*parsed_pattern = META_COND_DEFINE;
5402
else
5403
{
5404
for (i = 1; i < (int)namelen; i++)
5405
if (!IS_DIGIT(name[i])) break;
5406
*parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
5407
META_COND_RNUMBER : META_COND_NAME;
5408
}
5409
ptr--; /* Back to closing parens */
5410
}
5411
5412
/* Handle (?('name') or (?(<name>) */
5413
5414
else *parsed_pattern = META_COND_NAME;
5415
5416
/* All these cases except DEFINE end with the name length and offset;
5417
DEFINE just has an offset (for the "too many branches" error). */
5418
5419
if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
5420
PUTOFFSET(offset, parsed_pattern);
5421
} /* End cases that read a name */
5422
5423
/* Check the closing parenthesis of the condition */
5424
5425
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5426
{
5427
errorcode = ERR24;
5428
goto FAILED;
5429
}
5430
ptr++;
5431
break; /* End of condition processing */
5432
5433
5434
/* ---- Atomic group ---- */
5435
5436
case CHAR_GREATER_THAN_SIGN:
5437
ATOMIC_GROUP: /* Come from (*atomic: */
5438
*parsed_pattern++ = META_ATOMIC;
5439
nest_depth++;
5440
ptr++;
5441
break;
5442
5443
5444
/* ---- Lookahead assertions ---- */
5445
5446
case CHAR_EQUALS_SIGN:
5447
POSITIVE_LOOK_AHEAD: /* Come from (*pla: */
5448
*parsed_pattern++ = META_LOOKAHEAD;
5449
ptr++;
5450
goto POST_ASSERTION;
5451
5452
case CHAR_ASTERISK:
5453
POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (*napla: */
5454
*parsed_pattern++ = META_LOOKAHEAD_NA;
5455
ptr++;
5456
goto POST_ASSERTION;
5457
5458
case CHAR_EXCLAMATION_MARK:
5459
NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */
5460
*parsed_pattern++ = META_LOOKAHEADNOT;
5461
ptr++;
5462
goto POST_ASSERTION;
5463
5464
5465
/* ---- Lookbehind assertions ---- */
5466
5467
/* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
5468
is the start of the name of a capturing group. */
5469
5470
case CHAR_LESS_THAN_SIGN:
5471
if (ptrend - ptr <= 1 ||
5472
(ptr[1] != CHAR_EQUALS_SIGN &&
5473
ptr[1] != CHAR_EXCLAMATION_MARK &&
5474
ptr[1] != CHAR_ASTERISK))
5475
{
5476
terminator = CHAR_GREATER_THAN_SIGN;
5477
goto DEFINE_NAME;
5478
}
5479
*parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
5480
META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
5481
META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
5482
5483
POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
5484
*has_lookbehind = TRUE;
5485
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5486
PUTOFFSET(offset, parsed_pattern);
5487
ptr += 2;
5488
/* Fall through */
5489
5490
/* If the previous item was a condition starting (?(? an assertion,
5491
optionally preceded by a callout, is expected. This is checked later on,
5492
during actual compilation. However we need to identify this kind of
5493
assertion in this pass because it must not be qualified. The value of
5494
expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5495
for a callout - still leaving a positive value that identifies the
5496
assertion. Multiple callouts or any other items will make it zero or
5497
less, which doesn't matter because they will cause an error later. */
5498
5499
POST_ASSERTION:
5500
nest_depth++;
5501
if (prev_expect_cond_assert > 0)
5502
{
5503
if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
5504
else if (++top_nest >= end_nests)
5505
{
5506
errorcode = ERR84;
5507
goto FAILED;
5508
}
5509
top_nest->nest_depth = nest_depth;
5510
top_nest->flags = NSF_CONDASSERT;
5511
top_nest->options = options & PARSE_TRACKED_OPTIONS;
5512
top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
5513
}
5514
break;
5515
5516
5517
/* ---- Define a named group ---- */
5518
5519
/* A named group may be defined as (?'name') or (?<name>). In the latter
5520
case we jump to DEFINE_NAME from the disambiguation of (?< above with the
5521
terminator set to '>'. */
5522
5523
case CHAR_APOSTROPHE:
5524
terminator = CHAR_APOSTROPHE; /* Terminator */
5525
5526
DEFINE_NAME:
5527
if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5528
&errorcode, cb)) goto FAILED;
5529
5530
/* We have a name for this capturing group. It is also assigned a number,
5531
which is its primary means of identification. */
5532
5533
if (cb->bracount >= MAX_GROUP_NUMBER)
5534
{
5535
errorcode = ERR97;
5536
goto FAILED;
5537
}
5538
cb->bracount++;
5539
*parsed_pattern++ = META_CAPTURE | cb->bracount;
5540
nest_depth++;
5541
5542
/* Check not too many names */
5543
5544
if (cb->names_found >= MAX_NAME_COUNT)
5545
{
5546
errorcode = ERR49;
5547
goto FAILED;
5548
}
5549
5550
/* Adjust the entry size to accommodate the longest name found. */
5551
5552
if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
5553
cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
5554
5555
/* Scan the list to check for duplicates. For duplicate names, if the
5556
number is the same, break the loop, which causes the name to be
5557
discarded; otherwise, if DUPNAMES is not set, give an error.
5558
If it is set, allow the name with a different number, but continue
5559
scanning in case this is a duplicate with the same number. For
5560
non-duplicate names, give an error if the number is duplicated. */
5561
5562
isdupname = FALSE;
5563
ng = cb->named_groups;
5564
for (i = 0; i < cb->names_found; i++, ng++)
5565
{
5566
if (namelen == ng->length &&
5567
PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
5568
{
5569
if (ng->number == cb->bracount) break;
5570
if ((options & PCRE2_DUPNAMES) == 0)
5571
{
5572
errorcode = ERR43;
5573
goto FAILED;
5574
}
5575
isdupname = ng->isdup = TRUE; /* Mark as a duplicate */
5576
cb->dupnames = TRUE; /* Duplicate names exist */
5577
}
5578
else if (ng->number == cb->bracount)
5579
{
5580
errorcode = ERR65;
5581
goto FAILED;
5582
}
5583
}
5584
5585
if (i < cb->names_found) break; /* Ignore duplicate with same number */
5586
5587
/* Increase the list size if necessary */
5588
5589
if (cb->names_found >= cb->named_group_list_size)
5590
{
5591
uint32_t newsize = cb->named_group_list_size * 2;
5592
named_group *newspace =
5593
cb->cx->memctl.malloc(newsize * sizeof(named_group),
5594
cb->cx->memctl.memory_data);
5595
if (newspace == NULL)
5596
{
5597
errorcode = ERR21;
5598
goto FAILED;
5599
}
5600
5601
memcpy(newspace, cb->named_groups,
5602
cb->named_group_list_size * sizeof(named_group));
5603
if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
5604
cb->cx->memctl.free((void *)cb->named_groups,
5605
cb->cx->memctl.memory_data);
5606
cb->named_groups = newspace;
5607
cb->named_group_list_size = newsize;
5608
}
5609
5610
/* Add this name to the list */
5611
5612
cb->named_groups[cb->names_found].name = name;
5613
cb->named_groups[cb->names_found].length = (uint16_t)namelen;
5614
cb->named_groups[cb->names_found].number = cb->bracount;
5615
cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
5616
cb->names_found++;
5617
break;
5618
5619
5620
/* ---- Perl extended character class ---- */
5621
5622
/* These are of the form '(?[...])'. We handle these via the same parser
5623
that consumes ordinary '[...]' classes, but with a flag set to activate
5624
the extended behaviour. */
5625
5626
case CHAR_LEFT_SQUARE_BRACKET:
5627
class_mode_state = CLASS_MODE_PERL_EXT;
5628
c = *ptr++;
5629
goto FROM_PERL_EXTENDED_CLASS;
5630
} /* End of (? switch */
5631
break; /* End of ( handling */
5632
5633
5634
/* ---- Branch terminators ---- */
5635
5636
/* Alternation: reset the capture count if we are in a (?| group. */
5637
5638
case CHAR_VERTICAL_LINE:
5639
if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
5640
(top_nest->flags & NSF_RESET) != 0)
5641
{
5642
if (cb->bracount > top_nest->max_group)
5643
top_nest->max_group = (uint16_t)cb->bracount;
5644
cb->bracount = top_nest->reset_group;
5645
}
5646
*parsed_pattern++ = META_ALT;
5647
break;
5648
5649
/* End of group; reset the capture count to the maximum if we are in a (?|
5650
group and/or reset the options that are tracked during parsing. Disallow
5651
quantifier for a condition that is an assertion. */
5652
5653
case CHAR_RIGHT_PARENTHESIS:
5654
okquantifier = TRUE;
5655
if (top_nest != NULL && top_nest->nest_depth == nest_depth)
5656
{
5657
options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
5658
xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
5659
if ((top_nest->flags & NSF_RESET) != 0 &&
5660
top_nest->max_group > cb->bracount)
5661
cb->bracount = top_nest->max_group;
5662
if ((top_nest->flags & NSF_CONDASSERT) != 0)
5663
okquantifier = FALSE;
5664
5665
if ((top_nest->flags & NSF_ATOMICSR) != 0)
5666
{
5667
*parsed_pattern++ = META_KET;
5668
5669
#ifdef PCRE2_DEBUG
5670
PCRE2_ASSERT(parsed_pattern_extra > 0);
5671
parsed_pattern_extra--;
5672
#endif
5673
}
5674
5675
if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
5676
else top_nest--;
5677
}
5678
if (nest_depth == 0) /* Unmatched closing parenthesis */
5679
{
5680
errorcode = ERR22;
5681
goto FAILED_BACK; // TODO https://github.com/PCRE2Project/pcre2/issues/549
5682
}
5683
nest_depth--;
5684
*parsed_pattern++ = META_KET;
5685
break;
5686
} /* End of switch on pattern character */
5687
} /* End of main character scan loop */
5688
5689
/* End of pattern reached. Check for missing ) at the end of a verb name. */
5690
5691
if (inverbname && ptr >= ptrend)
5692
{
5693
errorcode = ERR60;
5694
goto FAILED;
5695
}
5696
5697
5698
PARSED_END:
5699
5700
PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
5701
(parsed_pattern_extra - parsed_pattern_extra_check) <=
5702
max_parsed_pattern(ptr_check, ptr, utf, options));
5703
5704
/* Manage callout for the final item */
5705
5706
parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
5707
parsed_pattern, cb);
5708
5709
/* Insert trailing items for word and line matching (features provided for the
5710
benefit of pcre2grep). */
5711
5712
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
5713
{
5714
*parsed_pattern++ = META_KET;
5715
*parsed_pattern++ = META_DOLLAR;
5716
}
5717
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5718
{
5719
*parsed_pattern++ = META_KET;
5720
*parsed_pattern++ = META_ESCAPE + ESC_b;
5721
}
5722
5723
/* Terminate the parsed pattern, then return success if all groups are closed.
5724
Otherwise we have unclosed parentheses. */
5725
5726
if (parsed_pattern >= parsed_pattern_end)
5727
{
5728
PCRE2_DEBUG_UNREACHABLE();
5729
errorcode = ERR63; /* Internal error (parsed pattern overflow) */
5730
goto FAILED;
5731
}
5732
5733
*parsed_pattern = META_END;
5734
if (nest_depth == 0) return 0;
5735
5736
UNCLOSED_PARENTHESIS:
5737
errorcode = ERR14;
5738
5739
/* Come here for all failures. */
5740
5741
FAILED:
5742
cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5743
return errorcode;
5744
5745
/* Some errors need to indicate the previous character. */
5746
5747
FAILED_BACK:
5748
ptr--;
5749
goto FAILED;
5750
5751
/* This failure happens several times. */
5752
5753
BAD_VERSION_CONDITION:
5754
errorcode = ERR79;
5755
goto FAILED;
5756
}
5757
5758
5759
5760
/*************************************************
5761
* Find first significant opcode *
5762
*************************************************/
5763
5764
/* This is called by several functions that scan a compiled expression looking
5765
for a fixed first character, or an anchoring opcode etc. It skips over things
5766
that do not influence this. For some calls, it makes sense to skip negative
5767
forward and all backward assertions, and also the \b assertion; for others it
5768
does not.
5769
5770
Arguments:
5771
code pointer to the start of the group
5772
skipassert TRUE if certain assertions are to be skipped
5773
5774
Returns: pointer to the first significant opcode
5775
*/
5776
5777
static const PCRE2_UCHAR*
5778
first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5779
{
5780
for (;;)
5781
{
5782
switch ((int)*code)
5783
{
5784
case OP_ASSERT_NOT:
5785
case OP_ASSERTBACK:
5786
case OP_ASSERTBACK_NOT:
5787
case OP_ASSERTBACK_NA:
5788
if (!skipassert) return code;
5789
do code += GET(code, 1); while (*code == OP_ALT);
5790
code += PRIV(OP_lengths)[*code];
5791
break;
5792
5793
case OP_WORD_BOUNDARY:
5794
case OP_NOT_WORD_BOUNDARY:
5795
case OP_UCP_WORD_BOUNDARY:
5796
case OP_NOT_UCP_WORD_BOUNDARY:
5797
if (!skipassert) return code;
5798
/* Fall through */
5799
5800
case OP_CALLOUT:
5801
case OP_CREF:
5802
case OP_DNCREF:
5803
case OP_RREF:
5804
case OP_DNRREF:
5805
case OP_FALSE:
5806
case OP_TRUE:
5807
code += PRIV(OP_lengths)[*code];
5808
break;
5809
5810
case OP_CALLOUT_STR:
5811
code += GET(code, 1 + 2*LINK_SIZE);
5812
break;
5813
5814
case OP_SKIPZERO:
5815
code += 2 + GET(code, 2) + LINK_SIZE;
5816
break;
5817
5818
case OP_COND:
5819
case OP_SCOND:
5820
if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */
5821
code[GET(code, 1)] != OP_KET) /* More than one branch */
5822
return code;
5823
code += GET(code, 1) + 1 + LINK_SIZE;
5824
break;
5825
5826
case OP_MARK:
5827
case OP_COMMIT_ARG:
5828
case OP_PRUNE_ARG:
5829
case OP_SKIP_ARG:
5830
case OP_THEN_ARG:
5831
code += code[1] + PRIV(OP_lengths)[*code];
5832
break;
5833
5834
default:
5835
return code;
5836
}
5837
}
5838
5839
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
5840
}
5841
5842
5843
5844
/*************************************************
5845
* Find details of duplicate group names *
5846
*************************************************/
5847
5848
/* This is called from compile_branch() when it needs to know the index and
5849
count of duplicates in the names table when processing named backreferences,
5850
either directly, or as conditions.
5851
5852
Arguments:
5853
name points to the name
5854
length the length of the name
5855
indexptr where to put the index
5856
countptr where to put the count of duplicates
5857
errorcodeptr where to put an error code
5858
cb the compile block
5859
5860
Returns: TRUE if OK, FALSE if not, error code set
5861
*/
5862
5863
static BOOL
5864
find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5865
int *countptr, int *errorcodeptr, compile_block *cb)
5866
{
5867
uint32_t i, groupnumber;
5868
int count;
5869
PCRE2_UCHAR *slot = cb->name_table;
5870
5871
/* Find the first entry in the table */
5872
5873
for (i = 0; i < cb->names_found; i++)
5874
{
5875
if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5876
slot[IMM2_SIZE+length] == 0) break;
5877
slot += cb->name_entry_size;
5878
}
5879
5880
/* This should not occur, because this function is called only when we know we
5881
have duplicate names. Give an internal error. */
5882
5883
if (i >= cb->names_found)
5884
{
5885
PCRE2_DEBUG_UNREACHABLE();
5886
*errorcodeptr = ERR53;
5887
cb->erroroffset = name - cb->start_pattern;
5888
return FALSE;
5889
}
5890
5891
/* Record the index and then see how many duplicates there are, updating the
5892
backref map and maximum back reference as we do. */
5893
5894
*indexptr = i;
5895
count = 0;
5896
5897
for (;;)
5898
{
5899
count++;
5900
groupnumber = GET2(slot,0);
5901
cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5902
if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5903
if (++i >= cb->names_found) break;
5904
slot += cb->name_entry_size;
5905
if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5906
(slot+IMM2_SIZE)[length] != 0) break;
5907
}
5908
5909
*countptr = count;
5910
return TRUE;
5911
}
5912
5913
5914
5915
/*************************************************
5916
* Compile one branch *
5917
*************************************************/
5918
5919
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5920
the options are changed during the branch, the pointer is used to change the
5921
external options bits. This function is used during the pre-compile phase when
5922
we are trying to find out the amount of memory needed, as well as during the
5923
real compile phase. The value of lengthptr distinguishes the two phases.
5924
5925
Arguments:
5926
optionsptr pointer to the option bits
5927
xoptionsptr pointer to the extra option bits
5928
codeptr points to the pointer to the current code point
5929
pptrptr points to the current parsed pattern pointer
5930
errorcodeptr points to error code variable
5931
firstcuptr place to put the first required code unit
5932
firstcuflagsptr place to put the first code unit flags
5933
reqcuptr place to put the last required code unit
5934
reqcuflagsptr place to put the last required code unit flags
5935
bcptr points to current branch chain
5936
open_caps points to current capitem
5937
cb contains pointers to tables etc.
5938
lengthptr NULL during the real compile phase
5939
points to length accumulator during pre-compile phase
5940
5941
Returns: 0 There's been an error, *errorcodeptr is non-zero
5942
+1 Success, this branch must match at least one character
5943
-1 Success, this branch may match an empty string
5944
*/
5945
5946
static int
5947
compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
5948
PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
5949
uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
5950
uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
5951
compile_block *cb, PCRE2_SIZE *lengthptr)
5952
{
5953
int bravalue = 0;
5954
int okreturn = -1;
5955
int group_return = 0;
5956
uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
5957
uint32_t greedy_default, greedy_non_default;
5958
uint32_t repeat_type, op_type;
5959
uint32_t options = *optionsptr; /* May change dynamically */
5960
uint32_t xoptions = *xoptionsptr; /* May change dynamically */
5961
uint32_t firstcu, reqcu;
5962
uint32_t zeroreqcu, zerofirstcu;
5963
uint32_t *pptr = *pptrptr;
5964
uint32_t meta, meta_arg;
5965
uint32_t firstcuflags, reqcuflags;
5966
uint32_t zeroreqcuflags, zerofirstcuflags;
5967
uint32_t req_caseopt, reqvary, tempreqvary;
5968
/* Some opcodes, such as META_SCS_NUMBER or META_SCS_NAME,
5969
depend on the previous value of offset. */
5970
PCRE2_SIZE offset = 0;
5971
PCRE2_SIZE length_prevgroup = 0;
5972
PCRE2_UCHAR *code = *codeptr;
5973
PCRE2_UCHAR *last_code = code;
5974
PCRE2_UCHAR *orig_code = code;
5975
PCRE2_UCHAR *tempcode;
5976
PCRE2_UCHAR *previous = NULL;
5977
PCRE2_UCHAR op_previous;
5978
BOOL groupsetfirstcu = FALSE;
5979
BOOL had_accept = FALSE;
5980
BOOL matched_char = FALSE;
5981
BOOL previous_matched_char = FALSE;
5982
BOOL reset_caseful = FALSE;
5983
5984
/* We can fish out the UTF setting once and for all into a BOOL, but we must
5985
not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
5986
as we process the pattern. */
5987
5988
#ifdef SUPPORT_UNICODE
5989
BOOL utf = (options & PCRE2_UTF) != 0;
5990
BOOL ucp = (options & PCRE2_UCP) != 0;
5991
#else /* No Unicode support */
5992
BOOL utf = FALSE;
5993
#endif
5994
5995
/* Set up the default and non-default settings for greediness */
5996
5997
greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5998
greedy_non_default = greedy_default ^ 1;
5999
6000
/* Initialize no first unit, no required unit. REQ_UNSET means "no char
6001
matching encountered yet". It gets changed to REQ_NONE if we hit something that
6002
matches a non-fixed first unit; reqcu just remains unset if we never find one.
6003
6004
When we hit a repeat whose minimum is zero, we may have to adjust these values
6005
to take the zero repeat into account. This is implemented by setting them to
6006
zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
6007
item types that can be repeated set these backoff variables appropriately. */
6008
6009
firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
6010
firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
6011
6012
/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
6013
according to the current setting of the caseless flag. The REQ_CASELESS value
6014
leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
6015
to record the case status of the value. This is used only for ASCII characters.
6016
*/
6017
6018
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6019
6020
/* Switch on next META item until the end of the branch */
6021
6022
for (;; pptr++)
6023
{
6024
BOOL possessive_quantifier;
6025
BOOL note_group_empty;
6026
uint32_t mclength;
6027
uint32_t skipunits;
6028
uint32_t subreqcu, subfirstcu;
6029
uint32_t groupnumber;
6030
uint32_t verbarglen, verbculen;
6031
uint32_t subreqcuflags, subfirstcuflags;
6032
open_capitem *oc;
6033
PCRE2_UCHAR mcbuffer[8];
6034
6035
/* Get next META item in the pattern and its potential argument. */
6036
6037
meta = META_CODE(*pptr);
6038
meta_arg = META_DATA(*pptr);
6039
6040
/* If we are in the pre-compile phase, accumulate the length used for the
6041
previous cycle of this loop, unless the next item is a quantifier. */
6042
6043
if (lengthptr != NULL)
6044
{
6045
if (code > cb->start_workspace + cb->workspace_size -
6046
WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
6047
{
6048
if (code >= cb->start_workspace + cb->workspace_size)
6049
{
6050
PCRE2_DEBUG_UNREACHABLE();
6051
*errorcodeptr = ERR52; /* Over-ran workspace - internal error */
6052
}
6053
else
6054
*errorcodeptr = ERR86;
6055
return 0;
6056
}
6057
6058
/* There is at least one situation where code goes backwards: this is the
6059
case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
6060
is processed, the whole class is eliminated. However, it is created first,
6061
so we have to allow memory for it. Therefore, don't ever reduce the length
6062
at this point. */
6063
6064
if (code < last_code) code = last_code;
6065
6066
/* If the next thing is not a quantifier, we add the length of the previous
6067
item into the total, and reset the code pointer to the start of the
6068
workspace. Otherwise leave the previous item available to be quantified. */
6069
6070
if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6071
{
6072
if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
6073
{
6074
*errorcodeptr = ERR20; /* Integer overflow */
6075
return 0;
6076
}
6077
*lengthptr += (PCRE2_SIZE)(code - orig_code);
6078
if (*lengthptr > MAX_PATTERN_SIZE)
6079
{
6080
*errorcodeptr = ERR20; /* Pattern is too large */
6081
return 0;
6082
}
6083
code = orig_code;
6084
}
6085
6086
/* Remember where this code item starts so we can catch the "backwards"
6087
case above next time round. */
6088
6089
last_code = code;
6090
}
6091
6092
/* Process the next parsed pattern item. If it is not a quantifier, remember
6093
where it starts so that it can be quantified when a quantifier follows.
6094
Checking for the legality of quantifiers happens in parse_regex(), except for
6095
a quantifier after an assertion that is a condition. */
6096
6097
if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6098
{
6099
previous = code;
6100
if (matched_char && !had_accept) okreturn = 1;
6101
}
6102
6103
previous_matched_char = matched_char;
6104
matched_char = FALSE;
6105
note_group_empty = FALSE;
6106
skipunits = 0; /* Default value for most subgroups */
6107
6108
switch(meta)
6109
{
6110
/* ===================================================================*/
6111
/* The branch terminates at pattern end or | or ) */
6112
6113
case META_END:
6114
case META_ALT:
6115
case META_KET:
6116
*firstcuptr = firstcu;
6117
*firstcuflagsptr = firstcuflags;
6118
*reqcuptr = reqcu;
6119
*reqcuflagsptr = reqcuflags;
6120
*codeptr = code;
6121
*pptrptr = pptr;
6122
return okreturn;
6123
6124
6125
/* ===================================================================*/
6126
/* Handle single-character metacharacters. In multiline mode, ^ disables
6127
the setting of any following char as a first character. */
6128
6129
case META_CIRCUMFLEX:
6130
if ((options & PCRE2_MULTILINE) != 0)
6131
{
6132
if (firstcuflags == REQ_UNSET)
6133
zerofirstcuflags = firstcuflags = REQ_NONE;
6134
*code++ = OP_CIRCM;
6135
}
6136
else *code++ = OP_CIRC;
6137
break;
6138
6139
case META_DOLLAR:
6140
*code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
6141
break;
6142
6143
/* There can never be a first char if '.' is first, whatever happens about
6144
repeats. The value of reqcu doesn't change either. */
6145
6146
case META_DOT:
6147
matched_char = TRUE;
6148
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6149
zerofirstcu = firstcu;
6150
zerofirstcuflags = firstcuflags;
6151
zeroreqcu = reqcu;
6152
zeroreqcuflags = reqcuflags;
6153
*code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
6154
break;
6155
6156
6157
/* ===================================================================*/
6158
/* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
6159
Otherwise, an initial ']' is taken as a data character. When empty classes
6160
are allowed, [] must generate an empty class - we have no dedicated opcode
6161
to optimise the representation, but it's a rare case (the '(*FAIL)'
6162
construct would be a clearer way for a pattern author to represent a
6163
non-matching branch, but it does have different semantics to '[]' if both
6164
are followed by a quantifier). The empty-negated [^] matches any character,
6165
so is useful: generate OP_ALLANY for this. */
6166
6167
case META_CLASS_EMPTY:
6168
case META_CLASS_EMPTY_NOT:
6169
matched_char = TRUE;
6170
if (meta == META_CLASS_EMPTY_NOT) *code++ = OP_ALLANY;
6171
else
6172
{
6173
*code++ = OP_CLASS;
6174
memset(code, 0, 32);
6175
code += 32 / sizeof(PCRE2_UCHAR);
6176
}
6177
6178
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6179
zerofirstcu = firstcu;
6180
zerofirstcuflags = firstcuflags;
6181
break;
6182
6183
6184
/* ===================================================================*/
6185
/* Non-empty character class. If the included characters are all < 256, we
6186
build a 32-byte bitmap of the permitted characters, except in the special
6187
case where there is only one such character. For negated classes, we build
6188
the map as usual, then invert it at the end. However, we use a different
6189
opcode so that data characters > 255 can be handled correctly.
6190
6191
If the class contains characters outside the 0-255 range, a different
6192
opcode is compiled. It may optionally have a bit map for characters < 256,
6193
but those above are explicitly listed afterwards. A flag code unit tells
6194
whether the bitmap is present, and whether this is a negated class or
6195
not. */
6196
6197
case META_CLASS_NOT:
6198
case META_CLASS:
6199
matched_char = TRUE;
6200
6201
/* Check for complex extended classes and handle them separately. */
6202
6203
if ((*pptr & CLASS_IS_ECLASS) != 0)
6204
{
6205
if (!PRIV(compile_class_nested)(options, xoptions, &pptr, &code,
6206
errorcodeptr, cb, lengthptr))
6207
return 0;
6208
goto CLASS_END_PROCESSING;
6209
}
6210
6211
/* We can optimize the case of a single character in a class by generating
6212
OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
6213
negative. In the negative case there can be no first char if this item is
6214
first, whatever repeat count may follow. In the case of reqcu, save the
6215
previous value for reinstating. */
6216
6217
/* NOTE: at present this optimization is not effective if the only
6218
character in a class in 32-bit, non-UCP mode has its top bit set. */
6219
6220
if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
6221
{
6222
uint32_t c = pptr[1];
6223
6224
pptr += 2; /* Move on to class end */
6225
if (meta == META_CLASS) /* A positive one-char class can be */
6226
{ /* handled as a normal literal character. */
6227
meta = c; /* Set up the character */
6228
goto NORMAL_CHAR_SET;
6229
}
6230
6231
/* Handle a negative one-character class */
6232
6233
zeroreqcu = reqcu;
6234
zeroreqcuflags = reqcuflags;
6235
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6236
zerofirstcu = firstcu;
6237
zerofirstcuflags = firstcuflags;
6238
6239
/* For caseless UTF or UCP mode, check whether this character has more
6240
than one other case. If so, generate a special OP_NOTPROP item instead of
6241
OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
6242
caseless set that starts with an ASCII character. If the character is
6243
affected by the special Turkish rules, hardcode the not-matching
6244
characters using a caseset. */
6245
6246
#ifdef SUPPORT_UNICODE
6247
if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
6248
{
6249
uint32_t caseset;
6250
6251
if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6252
PCRE2_EXTRA_TURKISH_CASING &&
6253
UCD_ANY_I(c))
6254
{
6255
caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
6256
}
6257
else if ((caseset = UCD_CASESET(c)) != 0 &&
6258
(xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6259
PRIV(ucd_caseless_sets)[caseset] < 128)
6260
{
6261
caseset = 0; /* Ignore the caseless set if it's restricted. */
6262
}
6263
6264
if (caseset != 0)
6265
{
6266
*code++ = OP_NOTPROP;
6267
*code++ = PT_CLIST;
6268
*code++ = caseset;
6269
break; /* We are finished with this class */
6270
}
6271
}
6272
#endif
6273
/* Char has only one other (usable) case, or UCP not available */
6274
6275
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
6276
code += PUTCHAR(c, code);
6277
break; /* We are finished with this class */
6278
} /* End of 1-char optimization */
6279
6280
/* Handle character classes that contain more than just one literal
6281
character. If there are exactly two characters in a positive class, see if
6282
they are case partners. This can be optimized to generate a caseless single
6283
character match (which also sets first/required code units if relevant).
6284
When casing restrictions apply, ignore a caseless set if both characters
6285
are ASCII. When Turkish casing applies, an 'i' does not match its normal
6286
Unicode "othercase". */
6287
6288
if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
6289
pptr[3] == META_CLASS_END)
6290
{
6291
uint32_t c = pptr[1];
6292
6293
#ifdef SUPPORT_UNICODE
6294
if ((UCD_CASESET(c) == 0 ||
6295
((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6296
c < 128 && pptr[2] < 128)) &&
6297
!((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6298
PCRE2_EXTRA_TURKISH_CASING &&
6299
UCD_ANY_I(c)))
6300
#endif
6301
{
6302
uint32_t d;
6303
6304
#ifdef SUPPORT_UNICODE
6305
if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
6306
#endif
6307
{
6308
#if PCRE2_CODE_UNIT_WIDTH != 8
6309
if (c > 255) d = c; else
6310
#endif
6311
d = TABLE_GET(c, cb->fcc, c);
6312
}
6313
6314
if (c != d && pptr[2] == d)
6315
{
6316
pptr += 3; /* Move on to class end */
6317
meta = c;
6318
if ((options & PCRE2_CASELESS) == 0)
6319
{
6320
reset_caseful = TRUE;
6321
options |= PCRE2_CASELESS;
6322
req_caseopt = REQ_CASELESS;
6323
}
6324
goto CLASS_CASELESS_CHAR;
6325
}
6326
}
6327
}
6328
6329
/* Now emit the OP_CLASS/OP_NCLASS/OP_XCLASS/OP_ALLANY opcode. */
6330
6331
pptr = PRIV(compile_class_not_nested)(options, xoptions, pptr + 1,
6332
&code, meta == META_CLASS_NOT, NULL,
6333
errorcodeptr, cb, lengthptr);
6334
if (pptr == NULL) return 0;
6335
PCRE2_ASSERT(*pptr == META_CLASS_END);
6336
6337
CLASS_END_PROCESSING:
6338
6339
/* If this class is the first thing in the branch, there can be no first
6340
char setting, whatever the repeat count. Any reqcu setting must remain
6341
unchanged after any kind of repeat. */
6342
6343
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6344
zerofirstcu = firstcu;
6345
zerofirstcuflags = firstcuflags;
6346
zeroreqcu = reqcu;
6347
zeroreqcuflags = reqcuflags;
6348
break; /* End of class processing */
6349
6350
6351
/* ===================================================================*/
6352
/* Deal with (*VERB)s. */
6353
6354
/* Check for open captures before ACCEPT and close those that are within
6355
the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6356
assertion. In the first pass, just accumulate the length required;
6357
otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6358
workspace overflow. Do not set firstcu after *ACCEPT. */
6359
6360
case META_ACCEPT:
6361
cb->had_accept = had_accept = TRUE;
6362
for (oc = open_caps;
6363
oc != NULL && oc->assert_depth >= cb->assert_depth;
6364
oc = oc->next)
6365
{
6366
if (lengthptr != NULL)
6367
{
6368
*lengthptr += CU2BYTES(1) + IMM2_SIZE;
6369
}
6370
else
6371
{
6372
*code++ = OP_CLOSE;
6373
PUT2INC(code, 0, oc->number);
6374
}
6375
}
6376
*code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6377
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6378
break;
6379
6380
case META_PRUNE:
6381
case META_SKIP:
6382
cb->had_pruneorskip = TRUE;
6383
/* Fall through */
6384
case META_COMMIT:
6385
case META_FAIL:
6386
*code++ = verbops[(meta - META_MARK) >> 16];
6387
break;
6388
6389
case META_THEN:
6390
cb->external_flags |= PCRE2_HASTHEN;
6391
*code++ = OP_THEN;
6392
break;
6393
6394
/* Handle verbs with arguments. Arguments can be very long, especially in
6395
16- and 32-bit modes, and can overflow the workspace in the first pass.
6396
However, the argument length is constrained to be small enough to fit in
6397
one code unit. This check happens in parse_regex(). In the first pass,
6398
instead of putting the argument into memory, we just update the length
6399
counter and set up an empty argument. */
6400
6401
case META_THEN_ARG:
6402
cb->external_flags |= PCRE2_HASTHEN;
6403
goto VERB_ARG;
6404
6405
case META_PRUNE_ARG:
6406
case META_SKIP_ARG:
6407
cb->had_pruneorskip = TRUE;
6408
/* Fall through */
6409
case META_MARK:
6410
case META_COMMIT_ARG:
6411
VERB_ARG:
6412
*code++ = verbops[(meta - META_MARK) >> 16];
6413
/* The length is in characters. */
6414
verbarglen = *(++pptr);
6415
verbculen = 0;
6416
tempcode = code++;
6417
for (int i = 0; i < (int)verbarglen; i++)
6418
{
6419
meta = *(++pptr);
6420
#ifdef SUPPORT_UNICODE
6421
if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6422
#endif
6423
{
6424
mclength = 1;
6425
mcbuffer[0] = meta;
6426
}
6427
if (lengthptr != NULL) *lengthptr += mclength; else
6428
{
6429
memcpy(code, mcbuffer, CU2BYTES(mclength));
6430
code += mclength;
6431
verbculen += mclength;
6432
}
6433
}
6434
6435
*tempcode = verbculen; /* Fill in the code unit length */
6436
*code++ = 0; /* Terminating zero */
6437
break;
6438
6439
6440
/* ===================================================================*/
6441
/* Handle options change. The new setting must be passed back for use in
6442
subsequent branches. Reset the greedy defaults and the case value for
6443
firstcu and reqcu. */
6444
6445
case META_OPTIONS:
6446
*optionsptr = options = *(++pptr);
6447
*xoptionsptr = xoptions = *(++pptr);
6448
greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6449
greedy_non_default = greedy_default ^ 1;
6450
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6451
break;
6452
6453
case META_OFFSET:
6454
GETPLUSOFFSET(offset, pptr);
6455
break;
6456
6457
case META_SCS:
6458
bravalue = OP_ASSERT_SCS;
6459
cb->assert_depth += 1;
6460
goto GROUP_PROCESS;
6461
6462
6463
/* ===================================================================*/
6464
/* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6465
because it could be a numerical check on recursion, or a name check on a
6466
group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6467
we can handle it either way. We first try for a name; if not found, process
6468
the number. */
6469
6470
case META_COND_RNUMBER: /* (?(Rdigits) */
6471
case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */
6472
case META_COND_RNAME: /* (?(R&name) - test for recursion */
6473
case META_SCS_NAME: /* Name of scan substring */
6474
bravalue = OP_COND;
6475
{
6476
int count, index;
6477
unsigned int i;
6478
PCRE2_SPTR name;
6479
named_group *ng = cb->named_groups;
6480
uint32_t length = *(++pptr);
6481
6482
if (meta == META_SCS_NAME)
6483
offset += meta_arg;
6484
else
6485
GETPLUSOFFSET(offset, pptr);
6486
name = cb->start_pattern + offset;
6487
6488
/* In the first pass, the names generated in the pre-pass are available,
6489
but the main name table has not yet been created. Scan the list of names
6490
generated in the pre-pass in order to get a number and whether or not
6491
this name is duplicated. If it is not duplicated, we can handle it as a
6492
numerical group. */
6493
6494
for (i = 0; i < cb->names_found; i++, ng++)
6495
if (length == ng->length &&
6496
PRIV(strncmp)(name, ng->name, length) == 0) break;
6497
6498
if (i >= cb->names_found)
6499
{
6500
/* If the name was not found we have a bad reference, unless we are
6501
dealing with R<digits>, which is treated as a recursion test by
6502
number. */
6503
6504
groupnumber = 0;
6505
if (meta == META_COND_RNUMBER)
6506
{
6507
for (i = 1; i < length; i++)
6508
{
6509
groupnumber = groupnumber * 10 + (name[i] - CHAR_0);
6510
if (groupnumber > MAX_GROUP_NUMBER)
6511
{
6512
*errorcodeptr = ERR61;
6513
cb->erroroffset = offset + i;
6514
return 0;
6515
}
6516
}
6517
}
6518
6519
if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6520
{
6521
*errorcodeptr = ERR15;
6522
cb->erroroffset = offset;
6523
return 0;
6524
}
6525
6526
/* (?Rdigits) treated as a recursion reference by number. A value of
6527
zero (which is the result of both (?R) and (?R0)) means "any", and is
6528
translated into RREF_ANY (which is 0xffff). */
6529
6530
if (groupnumber == 0) groupnumber = RREF_ANY;
6531
code[1+LINK_SIZE] = OP_RREF;
6532
PUT2(code, 2+LINK_SIZE, groupnumber);
6533
skipunits = 1+IMM2_SIZE;
6534
goto GROUP_PROCESS_NOTE_EMPTY;
6535
}
6536
else if (!ng->isdup)
6537
{
6538
/* Otherwise we found a name that is not duplicated */
6539
if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6540
6541
if (meta == META_SCS_NAME)
6542
{
6543
code[0] = OP_CREF;
6544
PUT2(code, 1, ng->number);
6545
code += 1+IMM2_SIZE;
6546
break;
6547
}
6548
6549
code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6550
PUT2(code, 2+LINK_SIZE, ng->number);
6551
skipunits = 1+IMM2_SIZE;
6552
if (meta != META_SCS_NAME) goto GROUP_PROCESS_NOTE_EMPTY;
6553
cb->assert_depth += 1;
6554
goto GROUP_PROCESS;
6555
}
6556
6557
/* We have a duplicated name. In the compile pass we have to search the
6558
main table in order to get the index and count values. */
6559
6560
count = 0; /* Values for first pass (avoids compiler warning) */
6561
index = 0;
6562
if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6563
&count, errorcodeptr, cb)) return 0;
6564
6565
if (meta == META_SCS_NAME)
6566
{
6567
code[0] = OP_DNCREF;
6568
PUT2(code, 1, index);
6569
PUT2(code, 1+IMM2_SIZE, count);
6570
code += 1+2*IMM2_SIZE;
6571
break;
6572
}
6573
6574
/* A duplicated name was found. Note that if an R<digits> name is found
6575
(META_COND_RNUMBER), it is a reference test, not a recursion test. */
6576
6577
code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_DNRREF : OP_DNCREF;
6578
6579
/* Insert appropriate data values. */
6580
skipunits = 1+2*IMM2_SIZE;
6581
PUT2(code, 2+LINK_SIZE, index);
6582
PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6583
}
6584
6585
PCRE2_ASSERT(meta != META_SCS_NAME);
6586
goto GROUP_PROCESS_NOTE_EMPTY;
6587
6588
/* The DEFINE condition is always false. Its internal groups may never
6589
be called, so matched_char must remain false, hence the jump to
6590
GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6591
6592
case META_COND_DEFINE:
6593
bravalue = OP_COND;
6594
GETPLUSOFFSET(offset, pptr);
6595
code[1+LINK_SIZE] = OP_DEFINE;
6596
skipunits = 1;
6597
goto GROUP_PROCESS;
6598
6599
/* Conditional test of a group's being set. */
6600
6601
case META_COND_NUMBER:
6602
case META_SCS_NUMBER:
6603
bravalue = OP_COND;
6604
if (meta == META_SCS_NUMBER)
6605
offset += meta_arg;
6606
else
6607
GETPLUSOFFSET(offset, pptr);
6608
6609
groupnumber = *(++pptr);
6610
if (groupnumber > cb->bracount)
6611
{
6612
*errorcodeptr = ERR15;
6613
cb->erroroffset = offset;
6614
return 0;
6615
}
6616
if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6617
6618
if (meta == META_SCS_NUMBER)
6619
{
6620
code[0] = OP_CREF;
6621
PUT2(code, 1, groupnumber);
6622
code += 1+IMM2_SIZE;
6623
break;
6624
}
6625
6626
/* Point at initial ( for too many branches error */
6627
offset -= 2;
6628
code[1+LINK_SIZE] = OP_CREF;
6629
skipunits = 1+IMM2_SIZE;
6630
PUT2(code, 2+LINK_SIZE, groupnumber);
6631
goto GROUP_PROCESS_NOTE_EMPTY;
6632
6633
/* Test for the PCRE2 version. */
6634
6635
case META_COND_VERSION:
6636
bravalue = OP_COND;
6637
if (pptr[1] > 0)
6638
code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6639
(PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6640
OP_TRUE : OP_FALSE;
6641
else
6642
code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6643
OP_TRUE : OP_FALSE;
6644
skipunits = 1;
6645
pptr += 3;
6646
goto GROUP_PROCESS_NOTE_EMPTY;
6647
6648
/* The condition is an assertion, possibly preceded by a callout. */
6649
6650
case META_COND_ASSERT:
6651
bravalue = OP_COND;
6652
goto GROUP_PROCESS_NOTE_EMPTY;
6653
6654
6655
/* ===================================================================*/
6656
/* Handle all kinds of nested bracketed groups. The non-capturing,
6657
non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6658
6659
case META_LOOKAHEAD:
6660
bravalue = OP_ASSERT;
6661
cb->assert_depth += 1;
6662
goto GROUP_PROCESS;
6663
6664
case META_LOOKAHEAD_NA:
6665
bravalue = OP_ASSERT_NA;
6666
cb->assert_depth += 1;
6667
goto GROUP_PROCESS;
6668
6669
/* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6670
thing to do, but Perl allows all assertions to be quantified, and when
6671
they contain capturing parentheses there may be a potential use for
6672
this feature. Not that that applies to a quantified (?!) but we allow
6673
it for uniformity. */
6674
6675
case META_LOOKAHEADNOT:
6676
if (pptr[1] == META_KET &&
6677
(pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6678
{
6679
*code++ = OP_FAIL;
6680
pptr++;
6681
}
6682
else
6683
{
6684
bravalue = OP_ASSERT_NOT;
6685
cb->assert_depth += 1;
6686
goto GROUP_PROCESS;
6687
}
6688
break;
6689
6690
case META_LOOKBEHIND:
6691
bravalue = OP_ASSERTBACK;
6692
cb->assert_depth += 1;
6693
goto GROUP_PROCESS;
6694
6695
case META_LOOKBEHINDNOT:
6696
bravalue = OP_ASSERTBACK_NOT;
6697
cb->assert_depth += 1;
6698
goto GROUP_PROCESS;
6699
6700
case META_LOOKBEHIND_NA:
6701
bravalue = OP_ASSERTBACK_NA;
6702
cb->assert_depth += 1;
6703
goto GROUP_PROCESS;
6704
6705
case META_ATOMIC:
6706
bravalue = OP_ONCE;
6707
goto GROUP_PROCESS_NOTE_EMPTY;
6708
6709
case META_SCRIPT_RUN:
6710
bravalue = OP_SCRIPT_RUN;
6711
goto GROUP_PROCESS_NOTE_EMPTY;
6712
6713
case META_NOCAPTURE:
6714
bravalue = OP_BRA;
6715
/* Fall through */
6716
6717
/* Process nested bracketed regex. The nesting depth is maintained for the
6718
benefit of the stackguard function. The test for too deep nesting is now
6719
done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6720
others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6721
note of whether or not they may match an empty string. */
6722
6723
GROUP_PROCESS_NOTE_EMPTY:
6724
note_group_empty = TRUE;
6725
6726
GROUP_PROCESS:
6727
cb->parens_depth += 1;
6728
*code = bravalue;
6729
pptr++;
6730
tempcode = code;
6731
tempreqvary = cb->req_varyopt; /* Save value before group */
6732
length_prevgroup = 0; /* Initialize for pre-compile phase */
6733
6734
if ((group_return =
6735
compile_regex(
6736
options, /* The options state */
6737
xoptions, /* The extra options state */
6738
&tempcode, /* Where to put code (updated) */
6739
&pptr, /* Input pointer (updated) */
6740
errorcodeptr, /* Where to put an error message */
6741
skipunits, /* Skip over bracket number */
6742
&subfirstcu, /* For possible first char */
6743
&subfirstcuflags,
6744
&subreqcu, /* For possible last char */
6745
&subreqcuflags,
6746
bcptr, /* Current branch chain */
6747
open_caps, /* Pointer to capture stack */
6748
cb, /* Compile data block */
6749
(lengthptr == NULL)? NULL : /* Actual compile phase */
6750
&length_prevgroup /* Pre-compile phase */
6751
)) == 0)
6752
return 0; /* Error */
6753
6754
cb->parens_depth -= 1;
6755
6756
/* If that was a non-conditional significant group (not an assertion, not a
6757
DEFINE) that matches at least one character, then the current item matches
6758
a character. Conditionals are handled below. */
6759
6760
if (note_group_empty && bravalue != OP_COND && group_return > 0)
6761
matched_char = TRUE;
6762
6763
/* If we've just compiled an assertion, pop the assert depth. */
6764
6765
if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERT_SCS)
6766
cb->assert_depth -= 1;
6767
6768
/* At the end of compiling, code is still pointing to the start of the
6769
group, while tempcode has been updated to point past the end of the group.
6770
The parsed pattern pointer (pptr) is on the closing META_KET.
6771
6772
If this is a conditional bracket, check that there are no more than
6773
two branches in the group, or just one if it's a DEFINE group. We do this
6774
in the real compile phase, not in the pre-pass, where the whole group may
6775
not be available. */
6776
6777
if (bravalue == OP_COND && lengthptr == NULL)
6778
{
6779
PCRE2_UCHAR *tc = code;
6780
int condcount = 0;
6781
6782
do {
6783
condcount++;
6784
tc += GET(tc,1);
6785
}
6786
while (*tc != OP_KET);
6787
6788
/* A DEFINE group is never obeyed inline (the "condition" is always
6789
false). It must have only one branch. Having checked this, change the
6790
opcode to OP_FALSE. */
6791
6792
if (code[LINK_SIZE+1] == OP_DEFINE)
6793
{
6794
if (condcount > 1)
6795
{
6796
cb->erroroffset = offset;
6797
*errorcodeptr = ERR54;
6798
return 0;
6799
}
6800
code[LINK_SIZE+1] = OP_FALSE;
6801
bravalue = OP_DEFINE; /* A flag to suppress char handling below */
6802
}
6803
6804
/* A "normal" conditional group. If there is just one branch, we must not
6805
make use of its firstcu or reqcu, because this is equivalent to an
6806
empty second branch. Also, it may match an empty string. If there are two
6807
branches, this item must match a character if the group must. */
6808
6809
else
6810
{
6811
if (condcount > 2)
6812
{
6813
cb->erroroffset = offset;
6814
*errorcodeptr = ERR27;
6815
return 0;
6816
}
6817
if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6818
else if (group_return > 0) matched_char = TRUE;
6819
}
6820
}
6821
6822
/* In the pre-compile phase, update the length by the length of the group,
6823
less the brackets at either end. Then reduce the compiled code to just a
6824
set of non-capturing brackets so that it doesn't use much memory if it is
6825
duplicated by a quantifier.*/
6826
6827
if (lengthptr != NULL)
6828
{
6829
if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6830
{
6831
*errorcodeptr = ERR20;
6832
return 0;
6833
}
6834
*lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6835
code++; /* This already contains bravalue */
6836
PUTINC(code, 0, 1 + LINK_SIZE);
6837
*code++ = OP_KET;
6838
PUTINC(code, 0, 1 + LINK_SIZE);
6839
break; /* No need to waste time with special character handling */
6840
}
6841
6842
/* Otherwise update the main code pointer to the end of the group. */
6843
6844
code = tempcode;
6845
6846
/* For a DEFINE group, required and first character settings are not
6847
relevant. */
6848
6849
if (bravalue == OP_DEFINE) break;
6850
6851
/* Handle updating of the required and first code units for other types of
6852
group. Update for normal brackets of all kinds, and conditions with two
6853
branches (see code above). If the bracket is followed by a quantifier with
6854
zero repeat, we have to back off. Hence the definition of zeroreqcu and
6855
zerofirstcu outside the main loop so that they can be accessed for the back
6856
off. */
6857
6858
zeroreqcu = reqcu;
6859
zeroreqcuflags = reqcuflags;
6860
zerofirstcu = firstcu;
6861
zerofirstcuflags = firstcuflags;
6862
groupsetfirstcu = FALSE;
6863
6864
if (bravalue >= OP_ONCE) /* Not an assertion */
6865
{
6866
/* If we have not yet set a firstcu in this branch, take it from the
6867
subpattern, remembering that it was set here so that a repeat of more
6868
than one can replicate it as reqcu if necessary. If the subpattern has
6869
no firstcu, set "none" for the whole branch. In both cases, a zero
6870
repeat forces firstcu to "none". */
6871
6872
if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6873
{
6874
if (subfirstcuflags < REQ_NONE)
6875
{
6876
firstcu = subfirstcu;
6877
firstcuflags = subfirstcuflags;
6878
groupsetfirstcu = TRUE;
6879
}
6880
else firstcuflags = REQ_NONE;
6881
zerofirstcuflags = REQ_NONE;
6882
}
6883
6884
/* If firstcu was previously set, convert the subpattern's firstcu
6885
into reqcu if there wasn't one, using the vary flag that was in
6886
existence beforehand. */
6887
6888
else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6889
{
6890
subreqcu = subfirstcu;
6891
subreqcuflags = subfirstcuflags | tempreqvary;
6892
}
6893
6894
/* If the subpattern set a required code unit (or set a first code unit
6895
that isn't really the first code unit - see above), set it. */
6896
6897
if (subreqcuflags < REQ_NONE)
6898
{
6899
reqcu = subreqcu;
6900
reqcuflags = subreqcuflags;
6901
}
6902
}
6903
6904
/* For a forward assertion, we take the reqcu, if set, provided that the
6905
group has also set a firstcu. This can be helpful if the pattern that
6906
follows the assertion doesn't set a different char. For example, it's
6907
useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
6908
because it leads to an incorrect effect for patterns such as /(?=a)a.+/ when
6909
the "real" "a" would then become a reqcu instead of a firstcu. This is
6910
overcome by a scan at the end if there's no firstcu, looking for an
6911
asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6912
we must only take the reqcu when the group also set a firstcu. Otherwise,
6913
in that example, 'X' ends up set for both. */
6914
6915
else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
6916
subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
6917
{
6918
reqcu = subreqcu;
6919
reqcuflags = subreqcuflags;
6920
}
6921
6922
break; /* End of nested group handling */
6923
6924
6925
/* ===================================================================*/
6926
/* Handle named backreferences and recursions. */
6927
6928
case META_BACKREF_BYNAME:
6929
case META_RECURSE_BYNAME:
6930
{
6931
int count, index;
6932
PCRE2_SPTR name;
6933
BOOL is_dupname = FALSE;
6934
named_group *ng = cb->named_groups;
6935
uint32_t length = *(++pptr);
6936
6937
GETPLUSOFFSET(offset, pptr);
6938
name = cb->start_pattern + offset;
6939
6940
/* In the first pass, the names generated in the pre-pass are available,
6941
but the main name table has not yet been created. Scan the list of names
6942
generated in the pre-pass in order to get a number and whether or not
6943
this name is duplicated. */
6944
6945
groupnumber = 0;
6946
for (unsigned int i = 0; i < cb->names_found; i++, ng++)
6947
{
6948
if (length == ng->length &&
6949
PRIV(strncmp)(name, ng->name, length) == 0)
6950
{
6951
is_dupname = ng->isdup;
6952
groupnumber = ng->number;
6953
6954
/* For a recursion, that's all that is needed. We can now go to
6955
the code that handles numerical recursion, applying it to the first
6956
group with the given name. */
6957
6958
if (meta == META_RECURSE_BYNAME)
6959
{
6960
meta_arg = groupnumber;
6961
goto HANDLE_NUMERICAL_RECURSION;
6962
}
6963
6964
/* For a back reference, update the back reference map and the
6965
maximum back reference. */
6966
6967
cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6968
if (groupnumber > cb->top_backref)
6969
cb->top_backref = groupnumber;
6970
}
6971
}
6972
6973
/* If the name was not found we have a bad reference. */
6974
6975
if (groupnumber == 0)
6976
{
6977
*errorcodeptr = ERR15;
6978
cb->erroroffset = offset;
6979
return 0;
6980
}
6981
6982
/* If a back reference name is not duplicated, we can handle it as
6983
a numerical reference. */
6984
6985
if (!is_dupname)
6986
{
6987
meta_arg = groupnumber;
6988
goto HANDLE_SINGLE_REFERENCE;
6989
}
6990
6991
/* If a back reference name is duplicated, we generate a different
6992
opcode to a numerical back reference. In the second pass we must
6993
search for the index and count in the final name table. */
6994
6995
count = 0; /* Values for first pass (avoids compiler warning) */
6996
index = 0;
6997
if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6998
&count, errorcodeptr, cb)) return 0;
6999
7000
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7001
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7002
PUT2INC(code, 0, index);
7003
PUT2INC(code, 0, count);
7004
if ((options & PCRE2_CASELESS) != 0)
7005
*code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
7006
REFI_FLAG_CASELESS_RESTRICT : 0) |
7007
(((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
7008
REFI_FLAG_TURKISH_CASING : 0);
7009
}
7010
break;
7011
7012
7013
/* ===================================================================*/
7014
/* Handle a numerical callout. */
7015
7016
case META_CALLOUT_NUMBER:
7017
code[0] = OP_CALLOUT;
7018
PUT(code, 1, pptr[1]); /* Offset to next pattern item */
7019
PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
7020
code[1 + 2*LINK_SIZE] = pptr[3];
7021
pptr += 3;
7022
code += PRIV(OP_lengths)[OP_CALLOUT];
7023
break;
7024
7025
7026
/* ===================================================================*/
7027
/* Handle a callout with a string argument. In the pre-pass we just compute
7028
the length without generating anything. The length in pptr[3] includes both
7029
delimiters; in the actual compile only the first one is copied, but a
7030
terminating zero is added. Any doubled delimiters within the string make
7031
this an overestimate, but it is not worth bothering about. */
7032
7033
case META_CALLOUT_STRING:
7034
if (lengthptr != NULL)
7035
{
7036
*lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7037
pptr += 3;
7038
SKIPOFFSET(pptr);
7039
}
7040
7041
/* In the real compile we can copy the string. The starting delimiter is
7042
included so that the client can discover it if they want. We also pass the
7043
start offset to help a script language give better error messages. */
7044
7045
else
7046
{
7047
PCRE2_SPTR pp;
7048
uint32_t delimiter;
7049
uint32_t length = pptr[3];
7050
PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7051
7052
code[0] = OP_CALLOUT_STR;
7053
PUT(code, 1, pptr[1]); /* Offset to next pattern item */
7054
PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
7055
7056
pptr += 3;
7057
GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */
7058
pp = cb->start_pattern + offset;
7059
delimiter = *callout_string++ = *pp++;
7060
if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7061
delimiter = CHAR_RIGHT_CURLY_BRACKET;
7062
PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */
7063
7064
/* The syntax of the pattern was checked in the parsing scan. The length
7065
includes both delimiters, but we have passed the opening one just above,
7066
so we reduce length before testing it. The test is for > 1 because we do
7067
not want to copy the final delimiter. This also ensures that pp[1] is
7068
accessible. */
7069
7070
while (--length > 1)
7071
{
7072
if (*pp == delimiter && pp[1] == delimiter)
7073
{
7074
*callout_string++ = delimiter;
7075
pp += 2;
7076
length--;
7077
}
7078
else *callout_string++ = *pp++;
7079
}
7080
*callout_string++ = CHAR_NUL;
7081
7082
/* Set the length of the entire item, then advance to its end. */
7083
7084
PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7085
code = callout_string;
7086
}
7087
break;
7088
7089
7090
/* ===================================================================*/
7091
/* Handle repetition. The different types are all sorted out in the parsing
7092
pass. */
7093
7094
case META_MINMAX_PLUS:
7095
case META_MINMAX_QUERY:
7096
case META_MINMAX:
7097
repeat_min = *(++pptr);
7098
repeat_max = *(++pptr);
7099
goto REPEAT;
7100
7101
case META_ASTERISK:
7102
case META_ASTERISK_PLUS:
7103
case META_ASTERISK_QUERY:
7104
repeat_min = 0;
7105
repeat_max = REPEAT_UNLIMITED;
7106
goto REPEAT;
7107
7108
case META_PLUS:
7109
case META_PLUS_PLUS:
7110
case META_PLUS_QUERY:
7111
repeat_min = 1;
7112
repeat_max = REPEAT_UNLIMITED;
7113
goto REPEAT;
7114
7115
case META_QUERY:
7116
case META_QUERY_PLUS:
7117
case META_QUERY_QUERY:
7118
repeat_min = 0;
7119
repeat_max = 1;
7120
7121
REPEAT:
7122
if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7123
7124
/* Remember whether this is a variable length repeat, and default to
7125
single-char opcodes. */
7126
7127
reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7128
7129
/* Adjust first and required code units for a zero repeat. */
7130
7131
if (repeat_min == 0)
7132
{
7133
firstcu = zerofirstcu;
7134
firstcuflags = zerofirstcuflags;
7135
reqcu = zeroreqcu;
7136
reqcuflags = zeroreqcuflags;
7137
}
7138
7139
/* Note the greediness and possessiveness. */
7140
7141
switch (meta)
7142
{
7143
case META_MINMAX_PLUS:
7144
case META_ASTERISK_PLUS:
7145
case META_PLUS_PLUS:
7146
case META_QUERY_PLUS:
7147
repeat_type = 0; /* Force greedy */
7148
possessive_quantifier = TRUE;
7149
break;
7150
7151
case META_MINMAX_QUERY:
7152
case META_ASTERISK_QUERY:
7153
case META_PLUS_QUERY:
7154
case META_QUERY_QUERY:
7155
repeat_type = greedy_non_default;
7156
possessive_quantifier = FALSE;
7157
break;
7158
7159
default:
7160
repeat_type = greedy_default;
7161
possessive_quantifier = FALSE;
7162
break;
7163
}
7164
7165
/* Save start of previous item, in case we have to move it up in order to
7166
insert something before it, and remember what it was. */
7167
7168
PCRE2_ASSERT(previous != NULL);
7169
tempcode = previous;
7170
op_previous = *previous;
7171
7172
/* Now handle repetition for the different types of item. If the repeat
7173
minimum and the repeat maximum are both 1, we can ignore the quantifier for
7174
non-parenthesized items, as they have only one alternative. For anything in
7175
parentheses, we must not ignore if {1} is possessive. */
7176
7177
switch (op_previous)
7178
{
7179
/* If previous was a character or negated character match, abolish the
7180
item and generate a repeat item instead. If a char item has a minimum of
7181
more than one, ensure that it is set in reqcu - it might not be if a
7182
sequence such as x{3} is the first thing in a branch because the x will
7183
have gone into firstcu instead. */
7184
7185
case OP_CHAR:
7186
case OP_CHARI:
7187
case OP_NOT:
7188
case OP_NOTI:
7189
if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7190
op_type = chartypeoffset[op_previous - OP_CHAR];
7191
7192
/* Deal with UTF characters that take up more than one code unit. */
7193
7194
#ifdef MAYBE_UTF_MULTI
7195
if (utf && NOT_FIRSTCU(code[-1]))
7196
{
7197
PCRE2_UCHAR *lastchar = code - 1;
7198
BACKCHAR(lastchar);
7199
mclength = (uint32_t)(code - lastchar); /* Length of UTF character */
7200
memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */
7201
}
7202
else
7203
#endif /* MAYBE_UTF_MULTI */
7204
7205
/* Handle the case of a single code unit - either with no UTF support, or
7206
with UTF disabled, or for a single-code-unit UTF character. In the latter
7207
case, for a repeated positive match, get the caseless flag for the
7208
required code unit from the previous character, because a class like [Aa]
7209
sets a caseless A but by now the req_caseopt flag has been reset. */
7210
7211
{
7212
mcbuffer[0] = code[-1];
7213
mclength = 1;
7214
if (op_previous <= OP_CHARI && repeat_min > 1)
7215
{
7216
reqcu = mcbuffer[0];
7217
reqcuflags = cb->req_varyopt;
7218
if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7219
}
7220
}
7221
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
7222
7223
/* If previous was a character class or a back reference, we put the
7224
repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7225
7226
#ifdef SUPPORT_WIDE_CHARS
7227
case OP_XCLASS:
7228
case OP_ECLASS:
7229
#endif
7230
case OP_CLASS:
7231
case OP_NCLASS:
7232
case OP_REF:
7233
case OP_REFI:
7234
case OP_DNREF:
7235
case OP_DNREFI:
7236
7237
if (repeat_max == 0)
7238
{
7239
code = previous;
7240
goto END_REPEAT;
7241
}
7242
if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7243
7244
if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7245
*code++ = OP_CRSTAR + repeat_type;
7246
else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7247
*code++ = OP_CRPLUS + repeat_type;
7248
else if (repeat_min == 0 && repeat_max == 1)
7249
*code++ = OP_CRQUERY + repeat_type;
7250
else
7251
{
7252
*code++ = OP_CRRANGE + repeat_type;
7253
PUT2INC(code, 0, repeat_min);
7254
if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */
7255
PUT2INC(code, 0, repeat_max);
7256
}
7257
break;
7258
7259
/* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7260
because pcre2_match() could not handle backtracking into recursively
7261
called groups. Now that this backtracking is available, we no longer need
7262
to do this. However, we still need to replicate recursions as we do for
7263
groups so as to have independent backtracking points. We can replicate
7264
for the minimum number of repeats directly. For optional repeats we now
7265
wrap the recursion in OP_BRA brackets and make use of the bracket
7266
repetition. */
7267
7268
case OP_RECURSE:
7269
if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7270
goto END_REPEAT;
7271
7272
/* Generate unwrapped repeats for a non-zero minimum, except when the
7273
minimum is 1 and the maximum unlimited, because that can be handled with
7274
OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7275
minimum, we just need to generate the appropriate additional copies.
7276
Otherwise we need to generate one more, to simulate the situation when
7277
the minimum is zero. */
7278
7279
if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7280
{
7281
int replicate = repeat_min;
7282
if (repeat_min == repeat_max) replicate--;
7283
7284
/* In the pre-compile phase, we don't actually do the replication. We
7285
just adjust the length as if we had. Do some paranoid checks for
7286
potential integer overflow. */
7287
7288
if (lengthptr != NULL)
7289
{
7290
PCRE2_SIZE delta;
7291
if (PRIV(ckd_smul)(&delta, replicate, 1 + LINK_SIZE) ||
7292
OFLOW_MAX - *lengthptr < delta)
7293
{
7294
*errorcodeptr = ERR20;
7295
return 0;
7296
}
7297
*lengthptr += delta;
7298
}
7299
7300
else for (int i = 0; i < replicate; i++)
7301
{
7302
memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7303
previous = code;
7304
code += 1 + LINK_SIZE;
7305
}
7306
7307
/* If the number of repeats is fixed, we are done. Otherwise, adjust
7308
the counts and fall through. */
7309
7310
if (repeat_min == repeat_max) break;
7311
if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7312
repeat_min = 0;
7313
}
7314
7315
/* Wrap the recursion call in OP_BRA brackets. */
7316
7317
(void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7318
op_previous = *previous = OP_BRA;
7319
PUT(previous, 1, 2 + 2*LINK_SIZE);
7320
previous[2 + 2*LINK_SIZE] = OP_KET;
7321
PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7322
code += 2 + 2 * LINK_SIZE;
7323
length_prevgroup = 3 + 3*LINK_SIZE;
7324
group_return = -1; /* Set "may match empty string" */
7325
7326
/* Now treat as a repeated OP_BRA. */
7327
/* Fall through */
7328
7329
/* If previous was a bracket group, we may have to replicate it in
7330
certain cases. Note that at this point we can encounter only the "basic"
7331
bracket opcodes such as BRA and CBRA, as this is the place where they get
7332
converted into the more special varieties such as BRAPOS and SBRA.
7333
Originally, PCRE did not allow repetition of assertions, but now it does,
7334
for Perl compatibility. */
7335
7336
case OP_ASSERT:
7337
case OP_ASSERT_NOT:
7338
case OP_ASSERT_NA:
7339
case OP_ASSERTBACK:
7340
case OP_ASSERTBACK_NOT:
7341
case OP_ASSERTBACK_NA:
7342
case OP_ASSERT_SCS:
7343
case OP_ONCE:
7344
case OP_SCRIPT_RUN:
7345
case OP_BRA:
7346
case OP_CBRA:
7347
case OP_COND:
7348
{
7349
int len = (int)(code - previous);
7350
PCRE2_UCHAR *bralink = NULL;
7351
PCRE2_UCHAR *brazeroptr = NULL;
7352
7353
if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7354
goto END_REPEAT;
7355
7356
/* Repeating a DEFINE group (or any group where the condition is always
7357
FALSE and there is only one branch) is pointless, but Perl allows the
7358
syntax, so we just ignore the repeat. */
7359
7360
if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7361
previous[GET(previous, 1)] != OP_ALT)
7362
goto END_REPEAT;
7363
7364
/* Perl allows all assertions to be quantified, and when they contain
7365
capturing parentheses and/or are optional there are potential uses for
7366
this feature. PCRE2 used to force the maximum quantifier to 1 on the
7367
invalid grounds that further repetition was never useful. This was
7368
always a bit pointless, since an assertion could be wrapped with a
7369
repeated group to achieve the effect. General repetition is now
7370
permitted, but if the maximum is unlimited it is set to one more than
7371
the minimum. */
7372
7373
if (op_previous < OP_ONCE) /* Assertion */
7374
{
7375
if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7376
}
7377
7378
/* The case of a zero minimum is special because of the need to stick
7379
OP_BRAZERO in front of it, and because the group appears once in the
7380
data, whereas in other cases it appears the minimum number of times. For
7381
this reason, it is simplest to treat this case separately, as otherwise
7382
the code gets far too messy. There are several special subcases when the
7383
minimum is zero. */
7384
7385
if (repeat_min == 0)
7386
{
7387
/* If the maximum is also zero, we used to just omit the group from
7388
the output altogether, like this:
7389
7390
** if (repeat_max == 0)
7391
** {
7392
** code = previous;
7393
** goto END_REPEAT;
7394
** }
7395
7396
However, that fails when a group or a subgroup within it is
7397
referenced as a subroutine from elsewhere in the pattern, so now we
7398
stick in OP_SKIPZERO in front of it so that it is skipped on
7399
execution. As we don't have a list of which groups are referenced, we
7400
cannot do this selectively.
7401
7402
If the maximum is 1 or unlimited, we just have to stick in the
7403
BRAZERO and do no more at this point. */
7404
7405
if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7406
{
7407
(void)memmove(previous + 1, previous, CU2BYTES(len));
7408
code++;
7409
if (repeat_max == 0)
7410
{
7411
*previous++ = OP_SKIPZERO;
7412
goto END_REPEAT;
7413
}
7414
brazeroptr = previous; /* Save for possessive optimizing */
7415
*previous++ = OP_BRAZERO + repeat_type;
7416
}
7417
7418
/* If the maximum is greater than 1 and limited, we have to replicate
7419
in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7420
The first one has to be handled carefully because it's the original
7421
copy, which has to be moved up. The remainder can be handled by code
7422
that is common with the non-zero minimum case below. We have to
7423
adjust the value of repeat_max, since one less copy is required. */
7424
7425
else
7426
{
7427
int linkoffset;
7428
(void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7429
code += 2 + LINK_SIZE;
7430
*previous++ = OP_BRAZERO + repeat_type;
7431
*previous++ = OP_BRA;
7432
7433
/* We chain together the bracket link offset fields that have to be
7434
filled in later when the ends of the brackets are reached. */
7435
7436
linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7437
bralink = previous;
7438
PUTINC(previous, 0, linkoffset);
7439
}
7440
7441
if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7442
}
7443
7444
/* If the minimum is greater than zero, replicate the group as many
7445
times as necessary, and adjust the maximum to the number of subsequent
7446
copies that we need. */
7447
7448
else
7449
{
7450
if (repeat_min > 1)
7451
{
7452
/* In the pre-compile phase, we don't actually do the replication.
7453
We just adjust the length as if we had. Do some paranoid checks for
7454
potential integer overflow. */
7455
7456
if (lengthptr != NULL)
7457
{
7458
PCRE2_SIZE delta;
7459
if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7460
(int)length_prevgroup) ||
7461
OFLOW_MAX - *lengthptr < delta)
7462
{
7463
*errorcodeptr = ERR20;
7464
return 0;
7465
}
7466
*lengthptr += delta;
7467
}
7468
7469
/* This is compiling for real. If there is a set first code unit
7470
for the group, and we have not yet set a "required code unit", set
7471
it. */
7472
7473
else
7474
{
7475
if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7476
{
7477
reqcu = firstcu;
7478
reqcuflags = firstcuflags;
7479
}
7480
for (uint32_t i = 1; i < repeat_min; i++)
7481
{
7482
memcpy(code, previous, CU2BYTES(len));
7483
code += len;
7484
}
7485
}
7486
}
7487
7488
if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7489
}
7490
7491
/* This code is common to both the zero and non-zero minimum cases. If
7492
the maximum is limited, it replicates the group in a nested fashion,
7493
remembering the bracket starts on a stack. In the case of a zero
7494
minimum, the first one was set up above. In all cases the repeat_max
7495
now specifies the number of additional copies needed. Again, we must
7496
remember to replicate entries on the forward reference list. */
7497
7498
if (repeat_max != REPEAT_UNLIMITED)
7499
{
7500
/* In the pre-compile phase, we don't actually do the replication. We
7501
just adjust the length as if we had. For each repetition we must add
7502
1 to the length for BRAZERO and for all but the last repetition we
7503
must add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
7504
paranoid checks to avoid integer overflow. */
7505
7506
if (lengthptr != NULL && repeat_max > 0)
7507
{
7508
PCRE2_SIZE delta;
7509
if (PRIV(ckd_smul)(&delta, repeat_max,
7510
(int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7511
OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7512
{
7513
*errorcodeptr = ERR20;
7514
return 0;
7515
}
7516
delta -= (2 + 2*LINK_SIZE); /* Last one doesn't nest */
7517
*lengthptr += delta;
7518
}
7519
7520
/* This is compiling for real */
7521
7522
else for (uint32_t i = repeat_max; i >= 1; i--)
7523
{
7524
*code++ = OP_BRAZERO + repeat_type;
7525
7526
/* All but the final copy start a new nesting, maintaining the
7527
chain of brackets outstanding. */
7528
7529
if (i != 1)
7530
{
7531
int linkoffset;
7532
*code++ = OP_BRA;
7533
linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7534
bralink = code;
7535
PUTINC(code, 0, linkoffset);
7536
}
7537
7538
memcpy(code, previous, CU2BYTES(len));
7539
code += len;
7540
}
7541
7542
/* Now chain through the pending brackets, and fill in their length
7543
fields (which are holding the chain links pro tem). */
7544
7545
while (bralink != NULL)
7546
{
7547
int oldlinkoffset;
7548
int linkoffset = (int)(code - bralink + 1);
7549
PCRE2_UCHAR *bra = code - linkoffset;
7550
oldlinkoffset = GET(bra, 1);
7551
bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7552
*code++ = OP_KET;
7553
PUTINC(code, 0, linkoffset);
7554
PUT(bra, 1, linkoffset);
7555
}
7556
}
7557
7558
/* If the maximum is unlimited, set a repeater in the final copy. For
7559
SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7560
possessively repeated ONCE brackets can be converted into non-capturing
7561
brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7562
saves having to deal with possessive ONCEs specially.
7563
7564
Otherwise, when we are doing the actual compile phase, check to see
7565
whether this group is one that could match an empty string. If so,
7566
convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7567
that runtime checking can be done. [This check is also applied to ONCE
7568
and SCRIPT_RUN groups at runtime, but in a different way.]
7569
7570
Then, if the quantifier was possessive and the bracket is not a
7571
conditional, we convert the BRA code to the POS form, and the KET code
7572
to KETRPOS. (It turns out to be convenient at runtime to detect this
7573
kind of subpattern at both the start and at the end.) The use of
7574
special opcodes makes it possible to reduce greatly the stack usage in
7575
pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7576
OP_BRAPOSZERO.
7577
7578
Then, if the minimum number of matches is 1 or 0, cancel the possessive
7579
flag so that the default action below, of wrapping everything inside
7580
atomic brackets, does not happen. When the minimum is greater than 1,
7581
there will be earlier copies of the group, and so we still have to wrap
7582
the whole thing. */
7583
7584
else
7585
{
7586
PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7587
PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7588
7589
/* Convert possessive ONCE brackets to non-capturing */
7590
7591
if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7592
7593
/* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7594
to do is to set the KET. */
7595
7596
if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7597
*ketcode = OP_KETRMAX + repeat_type;
7598
7599
/* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7600
(which have been converted to non-capturing above). */
7601
7602
else
7603
{
7604
/* In the compile phase, adjust the opcode if the group can match
7605
an empty string. For a conditional group with only one branch, the
7606
value of group_return will not show "could be empty", so we must
7607
check that separately. */
7608
7609
if (lengthptr == NULL)
7610
{
7611
if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7612
if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7613
*bracode = OP_SCOND;
7614
}
7615
7616
/* Handle possessive quantifiers. */
7617
7618
if (possessive_quantifier)
7619
{
7620
/* For COND brackets, we wrap the whole thing in a possessively
7621
repeated non-capturing bracket, because we have not invented POS
7622
versions of the COND opcodes. */
7623
7624
if (*bracode == OP_COND || *bracode == OP_SCOND)
7625
{
7626
int nlen = (int)(code - bracode);
7627
(void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7628
code += 1 + LINK_SIZE;
7629
nlen += 1 + LINK_SIZE;
7630
*bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7631
*code++ = OP_KETRPOS;
7632
PUTINC(code, 0, nlen);
7633
PUT(bracode, 1, nlen);
7634
}
7635
7636
/* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7637
7638
else
7639
{
7640
*bracode += 1; /* Switch to xxxPOS opcodes */
7641
*ketcode = OP_KETRPOS;
7642
}
7643
7644
/* If the minimum is zero, mark it as possessive, then unset the
7645
possessive flag when the minimum is 0 or 1. */
7646
7647
if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7648
if (repeat_min < 2) possessive_quantifier = FALSE;
7649
}
7650
7651
/* Non-possessive quantifier */
7652
7653
else *ketcode = OP_KETRMAX + repeat_type;
7654
}
7655
}
7656
}
7657
break;
7658
7659
/* If previous was a character type match (\d or similar), abolish it and
7660
create a suitable repeat item. The code is shared with single-character
7661
repeats by setting op_type to add a suitable offset into repeat_type.
7662
Note that the Unicode property types will be present only when
7663
SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7664
here because it just makes it horribly messy. */
7665
7666
default:
7667
if (op_previous >= OP_EODN || op_previous <= OP_WORD_BOUNDARY)
7668
{
7669
PCRE2_DEBUG_UNREACHABLE();
7670
*errorcodeptr = ERR10; /* Not a character type - internal error */
7671
return 0;
7672
}
7673
else
7674
{
7675
int prop_type, prop_value;
7676
PCRE2_UCHAR *oldcode;
7677
7678
if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7679
7680
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
7681
mclength = 0; /* Not a character */
7682
7683
if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7684
{
7685
prop_type = previous[1];
7686
prop_value = previous[2];
7687
}
7688
else
7689
{
7690
/* Come here from just above with a character in mcbuffer/mclength.
7691
You must also set op_type before the jump. */
7692
OUTPUT_SINGLE_REPEAT:
7693
prop_type = prop_value = -1;
7694
}
7695
7696
/* At this point, if prop_type == prop_value == -1 we either have a
7697
character in mcbuffer when mclength is greater than zero, or we have
7698
mclength zero, in which case there is a non-property character type in
7699
op_previous. If prop_type/value are not negative, we have a property
7700
character type in op_previous. */
7701
7702
oldcode = code; /* Save where we were */
7703
code = previous; /* Usually overwrite previous item */
7704
7705
/* If the maximum is zero then the minimum must also be zero; Perl allows
7706
this case, so we do too - by simply omitting the item altogether. */
7707
7708
if (repeat_max == 0) goto END_REPEAT;
7709
7710
/* Combine the op_type with the repeat_type */
7711
7712
repeat_type += op_type;
7713
7714
/* A minimum of zero is handled either as the special case * or ?, or as
7715
an UPTO, with the maximum given. */
7716
7717
if (repeat_min == 0)
7718
{
7719
if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7720
else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7721
else
7722
{
7723
*code++ = OP_UPTO + repeat_type;
7724
PUT2INC(code, 0, repeat_max);
7725
}
7726
}
7727
7728
/* A repeat minimum of 1 is optimized into some special cases. If the
7729
maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7730
left in place and, if the maximum is greater than 1, we use OP_UPTO with
7731
one less than the maximum. */
7732
7733
else if (repeat_min == 1)
7734
{
7735
if (repeat_max == REPEAT_UNLIMITED)
7736
*code++ = OP_PLUS + repeat_type;
7737
else
7738
{
7739
code = oldcode; /* Leave previous item in place */
7740
if (repeat_max == 1) goto END_REPEAT;
7741
*code++ = OP_UPTO + repeat_type;
7742
PUT2INC(code, 0, repeat_max - 1);
7743
}
7744
}
7745
7746
/* The case {n,n} is just an EXACT, while the general case {n,m} is
7747
handled as an EXACT followed by an UPTO or STAR or QUERY. */
7748
7749
else
7750
{
7751
*code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
7752
PUT2INC(code, 0, repeat_min);
7753
7754
/* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7755
and then generate the second opcode. For a repeated Unicode property
7756
match, there are two extra values that define the required property,
7757
and mclength is set zero to indicate this. */
7758
7759
if (repeat_max != repeat_min)
7760
{
7761
if (mclength > 0)
7762
{
7763
memcpy(code, mcbuffer, CU2BYTES(mclength));
7764
code += mclength;
7765
}
7766
else
7767
{
7768
*code++ = op_previous;
7769
if (prop_type >= 0)
7770
{
7771
*code++ = prop_type;
7772
*code++ = prop_value;
7773
}
7774
}
7775
7776
/* Now set up the following opcode */
7777
7778
if (repeat_max == REPEAT_UNLIMITED)
7779
*code++ = OP_STAR + repeat_type;
7780
else
7781
{
7782
repeat_max -= repeat_min;
7783
if (repeat_max == 1)
7784
{
7785
*code++ = OP_QUERY + repeat_type;
7786
}
7787
else
7788
{
7789
*code++ = OP_UPTO + repeat_type;
7790
PUT2INC(code, 0, repeat_max);
7791
}
7792
}
7793
}
7794
}
7795
7796
/* Fill in the character or character type for the final opcode. */
7797
7798
if (mclength > 0)
7799
{
7800
memcpy(code, mcbuffer, CU2BYTES(mclength));
7801
code += mclength;
7802
}
7803
else
7804
{
7805
*code++ = op_previous;
7806
if (prop_type >= 0)
7807
{
7808
*code++ = prop_type;
7809
*code++ = prop_value;
7810
}
7811
}
7812
}
7813
break;
7814
} /* End of switch on different op_previous values */
7815
7816
7817
/* If the character following a repeat is '+', possessive_quantifier is
7818
TRUE. For some opcodes, there are special alternative opcodes for this
7819
case. For anything else, we wrap the entire repeated item inside OP_ONCE
7820
brackets. Logically, the '+' notation is just syntactic sugar, taken from
7821
Sun's Java package, but the special opcodes can optimize it.
7822
7823
Some (but not all) possessively repeated subpatterns have already been
7824
completely handled in the code just above. For them, possessive_quantifier
7825
is always FALSE at this stage. Note that the repeated item starts at
7826
tempcode, not at previous, which might be the first part of a string whose
7827
(former) last char we repeated. */
7828
7829
if (possessive_quantifier)
7830
{
7831
int len;
7832
7833
/* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7834
However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7835
{5,}, or {5,10}). We skip over an EXACT item; if the length of what
7836
remains is greater than zero, there's a further opcode that can be
7837
handled. If not, do nothing, leaving the EXACT alone. */
7838
7839
switch(*tempcode)
7840
{
7841
case OP_TYPEEXACT:
7842
tempcode += PRIV(OP_lengths)[*tempcode] +
7843
((tempcode[1 + IMM2_SIZE] == OP_PROP
7844
|| tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7845
break;
7846
7847
/* CHAR opcodes are used for exacts whose count is 1. */
7848
7849
case OP_CHAR:
7850
case OP_CHARI:
7851
case OP_NOT:
7852
case OP_NOTI:
7853
case OP_EXACT:
7854
case OP_EXACTI:
7855
case OP_NOTEXACT:
7856
case OP_NOTEXACTI:
7857
tempcode += PRIV(OP_lengths)[*tempcode];
7858
#ifdef SUPPORT_UNICODE
7859
if (utf && HAS_EXTRALEN(tempcode[-1]))
7860
tempcode += GET_EXTRALEN(tempcode[-1]);
7861
#endif
7862
break;
7863
7864
/* For the class opcodes, the repeat operator appears at the end;
7865
adjust tempcode to point to it. */
7866
7867
case OP_CLASS:
7868
case OP_NCLASS:
7869
tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7870
break;
7871
7872
#ifdef SUPPORT_WIDE_CHARS
7873
case OP_XCLASS:
7874
case OP_ECLASS:
7875
tempcode += GET(tempcode, 1);
7876
break;
7877
#endif
7878
}
7879
7880
/* If tempcode is equal to code (which points to the end of the repeated
7881
item), it means we have skipped an EXACT item but there is no following
7882
QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7883
all other cases, tempcode will be pointing to the repeat opcode, and will
7884
be less than code, so the value of len will be greater than 0. */
7885
7886
len = (int)(code - tempcode);
7887
if (len > 0)
7888
{
7889
unsigned int repcode = *tempcode;
7890
7891
/* There is a table for possessifying opcodes, all of which are less
7892
than OP_CALLOUT. A zero entry means there is no possessified version.
7893
*/
7894
7895
if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7896
*tempcode = opcode_possessify[repcode];
7897
7898
/* For opcodes without a special possessified version, wrap the item in
7899
ONCE brackets. */
7900
7901
else
7902
{
7903
(void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7904
code += 1 + LINK_SIZE;
7905
len += 1 + LINK_SIZE;
7906
tempcode[0] = OP_ONCE;
7907
*code++ = OP_KET;
7908
PUTINC(code, 0, len);
7909
PUT(tempcode, 1, len);
7910
}
7911
}
7912
}
7913
7914
/* We set the "follows varying string" flag for subsequently encountered
7915
reqcus if it isn't already set and we have just passed a varying length
7916
item. */
7917
7918
END_REPEAT:
7919
cb->req_varyopt |= reqvary;
7920
break;
7921
7922
7923
/* ===================================================================*/
7924
/* Handle a 32-bit data character with a value greater than META_END. */
7925
7926
case META_BIGVALUE:
7927
pptr++;
7928
goto NORMAL_CHAR;
7929
7930
7931
/* ===============================================================*/
7932
/* Handle a back reference by number, which is the meta argument. The
7933
pattern offsets for back references to group numbers less than 10 are held
7934
in a special vector, to avoid using more than two parsed pattern elements
7935
in 64-bit environments. We only need the offset to the first occurrence,
7936
because if that doesn't fail, subsequent ones will also be OK. */
7937
7938
case META_BACKREF:
7939
if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7940
else GETPLUSOFFSET(offset, pptr);
7941
7942
if (meta_arg > cb->bracount)
7943
{
7944
cb->erroroffset = offset;
7945
*errorcodeptr = ERR15; /* Non-existent subpattern */
7946
return 0;
7947
}
7948
7949
/* Come here from named backref handling when the reference is to a
7950
single group (that is, not to a duplicated name). The back reference
7951
data will have already been updated. We must disable firstcu if not
7952
set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7953
later. */
7954
7955
HANDLE_SINGLE_REFERENCE:
7956
if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7957
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7958
PUT2INC(code, 0, meta_arg);
7959
if ((options & PCRE2_CASELESS) != 0)
7960
*code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
7961
REFI_FLAG_CASELESS_RESTRICT : 0) |
7962
(((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
7963
REFI_FLAG_TURKISH_CASING : 0);
7964
7965
/* Update the map of back references, and keep the highest one. We
7966
could do this in parse_regex() for numerical back references, but not
7967
for named back references, because we don't know the numbers to which
7968
named back references refer. So we do it all in this function. */
7969
7970
cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7971
if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7972
break;
7973
7974
7975
/* ===============================================================*/
7976
/* Handle recursion by inserting the number of the called group (which is
7977
the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7978
scanned and these numbers are replaced by offsets within the pattern. It is
7979
done like this to avoid problems with forward references and adjusting
7980
offsets when groups are duplicated and moved (as discovered in previous
7981
implementations). Note that a recursion does not have a set first
7982
character. */
7983
7984
case META_RECURSE:
7985
GETPLUSOFFSET(offset, pptr);
7986
if (meta_arg > cb->bracount)
7987
{
7988
cb->erroroffset = offset;
7989
*errorcodeptr = ERR15; /* Non-existent subpattern */
7990
return 0;
7991
}
7992
HANDLE_NUMERICAL_RECURSION:
7993
*code = OP_RECURSE;
7994
PUT(code, 1, meta_arg);
7995
code += 1 + LINK_SIZE;
7996
groupsetfirstcu = FALSE;
7997
cb->had_recurse = TRUE;
7998
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7999
zerofirstcu = firstcu;
8000
zerofirstcuflags = firstcuflags;
8001
break;
8002
8003
8004
/* ===============================================================*/
8005
/* Handle capturing parentheses; the number is the meta argument. */
8006
8007
case META_CAPTURE:
8008
bravalue = OP_CBRA;
8009
skipunits = IMM2_SIZE;
8010
PUT2(code, 1+LINK_SIZE, meta_arg);
8011
cb->lastcapture = meta_arg;
8012
goto GROUP_PROCESS_NOTE_EMPTY;
8013
8014
8015
/* ===============================================================*/
8016
/* Handle escape sequence items. For ones like \d, the ESC_values are
8017
arranged to be the same as the corresponding OP_values in the default case
8018
when PCRE2_UCP is not set (which is the only case in which they will appear
8019
here).
8020
8021
Note: \Q and \E are never seen here, as they were dealt with in
8022
parse_pattern(). Neither are numerical back references or recursions, which
8023
were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8024
\g, when followed by names, are turned into META_BACKREF_BYNAME or
8025
META_RECURSE_BYNAME. */
8026
8027
case META_ESCAPE:
8028
8029
/* We can test for escape sequences that consume a character because their
8030
values lie between ESC_b and ESC_Z; this may have to change if any new ones
8031
are ever created. For these sequences, we disable the setting of a first
8032
character if it hasn't already been set. */
8033
8034
if (meta_arg > ESC_b && meta_arg < ESC_Z)
8035
{
8036
matched_char = TRUE;
8037
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8038
}
8039
8040
/* Set values to reset to if this is followed by a zero repeat. */
8041
8042
zerofirstcu = firstcu;
8043
zerofirstcuflags = firstcuflags;
8044
zeroreqcu = reqcu;
8045
zeroreqcuflags = reqcuflags;
8046
8047
/* If Unicode is not supported, \P and \p are not allowed and are
8048
faulted at parse time, so will never appear here. */
8049
8050
#ifdef SUPPORT_UNICODE
8051
if (meta_arg == ESC_P || meta_arg == ESC_p)
8052
{
8053
uint32_t ptype = *(++pptr) >> 16;
8054
uint32_t pdata = *pptr & 0xffff;
8055
8056
/* In caseless matching, particular characteristics Lu, Ll, and Lt get
8057
converted to the general characteristic L&. That is, upper, lower, and
8058
title case letters are all conflated. */
8059
8060
if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
8061
(pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
8062
{
8063
ptype = PT_LAMP;
8064
pdata = 0;
8065
}
8066
8067
/* The special case of \p{Any} is compiled to OP_ALLANY and \P{Any}
8068
is compiled to [] so as to benefit from the auto-anchoring code. */
8069
8070
if (ptype == PT_ANY)
8071
{
8072
if (meta_arg == ESC_P)
8073
{
8074
*code++ = OP_CLASS;
8075
memset(code, 0, 32);
8076
code += 32 / sizeof(PCRE2_UCHAR);
8077
}
8078
else
8079
*code++ = OP_ALLANY;
8080
}
8081
else
8082
{
8083
*code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8084
*code++ = ptype;
8085
*code++ = pdata;
8086
}
8087
break; /* End META_ESCAPE */
8088
}
8089
#endif
8090
8091
/* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8092
done. However, there's an option, in case anyone was relying on it. */
8093
8094
if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8095
(xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8096
{
8097
*errorcodeptr = ERR99;
8098
return 0;
8099
}
8100
8101
/* For the rest (including \X when Unicode is supported - if not it's
8102
faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8103
not set; if it is set, most of them do not show up here because they are
8104
converted into Unicode property tests in parse_regex().
8105
8106
In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8107
instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8108
There are special UCP codes for \B and \b which are used in UCP mode unless
8109
"word" matching is being forced to ASCII.
8110
8111
Note that \b and \B do a one-character lookbehind, and \A also behaves as
8112
if it does. */
8113
8114
switch(meta_arg)
8115
{
8116
case ESC_C:
8117
cb->external_flags |= PCRE2_HASBKC; /* Record */
8118
#if PCRE2_CODE_UNIT_WIDTH == 32
8119
meta_arg = OP_ALLANY;
8120
#else
8121
if (!utf) meta_arg = OP_ALLANY;
8122
#endif
8123
break;
8124
8125
case ESC_B:
8126
case ESC_b:
8127
if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8128
meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8129
OP_UCP_WORD_BOUNDARY;
8130
/* Fall through */
8131
8132
case ESC_A:
8133
if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8134
break;
8135
}
8136
8137
*code++ = meta_arg;
8138
break; /* End META_ESCAPE */
8139
8140
8141
/* ===================================================================*/
8142
/* Handle an unrecognized meta value. A parsed pattern value less than
8143
META_END is a literal. Otherwise we have a problem. */
8144
8145
default:
8146
if (meta >= META_END)
8147
{
8148
PCRE2_DEBUG_UNREACHABLE();
8149
*errorcodeptr = ERR89; /* Internal error - unrecognized. */
8150
return 0;
8151
}
8152
8153
/* Handle a literal character. We come here by goto in the case of a
8154
32-bit, non-UTF character whose value is greater than META_END. */
8155
8156
NORMAL_CHAR:
8157
meta = *pptr; /* Get the full 32 bits */
8158
NORMAL_CHAR_SET: /* Character is already in meta */
8159
matched_char = TRUE;
8160
8161
/* For caseless UTF or UCP mode, check whether this character has more than
8162
one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8163
When casing restrictions apply, ignore caseless sets that start with an
8164
ASCII character. If the character is affected by the special Turkish rules,
8165
hardcode the matching characters using a caseset. */
8166
8167
#ifdef SUPPORT_UNICODE
8168
if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8169
{
8170
uint32_t caseset;
8171
8172
if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
8173
PCRE2_EXTRA_TURKISH_CASING &&
8174
UCD_ANY_I(meta))
8175
{
8176
caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(meta)? 0 : 3);
8177
}
8178
else if ((caseset = UCD_CASESET(meta)) != 0 &&
8179
(xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
8180
PRIV(ucd_caseless_sets)[caseset] < 128)
8181
{
8182
caseset = 0; /* Ignore the caseless set if it's restricted. */
8183
}
8184
8185
if (caseset != 0)
8186
{
8187
*code++ = OP_PROP;
8188
*code++ = PT_CLIST;
8189
*code++ = caseset;
8190
if (firstcuflags == REQ_UNSET)
8191
firstcuflags = zerofirstcuflags = REQ_NONE;
8192
break; /* End handling this meta item */
8193
}
8194
}
8195
#endif
8196
8197
/* Caseful matches, or caseless and not one of the multicase characters. We
8198
come here by goto in the case of a positive class that contains only
8199
case-partners of a character with just two cases; matched_char has already
8200
been set TRUE and options fudged if necessary. */
8201
8202
CLASS_CASELESS_CHAR:
8203
8204
/* Get the character's code units into mcbuffer, with the length in
8205
mclength. When not in UTF mode, the length is always 1. */
8206
8207
#ifdef SUPPORT_UNICODE
8208
if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8209
#endif
8210
{
8211
mclength = 1;
8212
mcbuffer[0] = meta;
8213
}
8214
8215
/* Generate the appropriate code */
8216
8217
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8218
memcpy(code, mcbuffer, CU2BYTES(mclength));
8219
code += mclength;
8220
8221
/* Remember if \r or \n were seen */
8222
8223
if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8224
cb->external_flags |= PCRE2_HASCRORLF;
8225
8226
/* Set the first and required code units appropriately. If no previous
8227
first code unit, set it from this character, but revert to none on a zero
8228
repeat. Otherwise, leave the firstcu value alone, and don't change it on
8229
a zero repeat. */
8230
8231
if (firstcuflags == REQ_UNSET)
8232
{
8233
zerofirstcuflags = REQ_NONE;
8234
zeroreqcu = reqcu;
8235
zeroreqcuflags = reqcuflags;
8236
8237
/* If the character is more than one code unit long, we can set a single
8238
firstcu only if it is not to be matched caselessly. Multiple possible
8239
starting code units may be picked up later in the studying code. */
8240
8241
if (mclength == 1 || req_caseopt == 0)
8242
{
8243
firstcu = mcbuffer[0];
8244
firstcuflags = req_caseopt;
8245
if (mclength != 1)
8246
{
8247
reqcu = code[-1];
8248
reqcuflags = cb->req_varyopt;
8249
}
8250
}
8251
else firstcuflags = reqcuflags = REQ_NONE;
8252
}
8253
8254
/* firstcu was previously set; we can set reqcu only if the length is
8255
1 or the matching is caseful. */
8256
8257
else
8258
{
8259
zerofirstcu = firstcu;
8260
zerofirstcuflags = firstcuflags;
8261
zeroreqcu = reqcu;
8262
zeroreqcuflags = reqcuflags;
8263
if (mclength == 1 || req_caseopt == 0)
8264
{
8265
reqcu = code[-1];
8266
reqcuflags = req_caseopt | cb->req_varyopt;
8267
}
8268
}
8269
8270
/* If caselessness was temporarily instated, reset it. */
8271
8272
if (reset_caseful)
8273
{
8274
options &= ~PCRE2_CASELESS;
8275
req_caseopt = 0;
8276
reset_caseful = FALSE;
8277
}
8278
8279
break; /* End literal character handling */
8280
} /* End of big switch */
8281
} /* End of big loop */
8282
8283
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8284
return 0; /* Avoid compiler warnings */
8285
}
8286
8287
8288
8289
/*************************************************
8290
* Compile regex: a sequence of alternatives *
8291
*************************************************/
8292
8293
/* On entry, pptr is pointing past the bracket meta, but on return it points to
8294
the closing bracket or META_END. The code variable is pointing at the code unit
8295
into which the BRA operator has been stored. This function is used during the
8296
pre-compile phase when we are trying to find out the amount of memory needed,
8297
as well as during the real compile phase. The value of lengthptr distinguishes
8298
the two phases.
8299
8300
Arguments:
8301
options option bits, including any changes for this subpattern
8302
xoptions extra option bits, ditto
8303
codeptr -> the address of the current code pointer
8304
pptrptr -> the address of the current parsed pattern pointer
8305
errorcodeptr -> pointer to error code variable
8306
skipunits skip this many code units at start (for brackets and OP_COND)
8307
firstcuptr place to put the first required code unit
8308
firstcuflagsptr place to put the first code unit flags
8309
reqcuptr place to put the last required code unit
8310
reqcuflagsptr place to put the last required code unit flags
8311
bcptr pointer to the chain of currently open branches
8312
cb points to the data block with tables pointers etc.
8313
lengthptr NULL during the real compile phase
8314
points to length accumulator during pre-compile phase
8315
8316
Returns: 0 There has been an error
8317
+1 Success, this group must match at least one character
8318
-1 Success, this group may match an empty string
8319
*/
8320
8321
static int
8322
compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8323
uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8324
uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8325
uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8326
compile_block *cb, PCRE2_SIZE *lengthptr)
8327
{
8328
PCRE2_UCHAR *code = *codeptr;
8329
PCRE2_UCHAR *last_branch = code;
8330
PCRE2_UCHAR *start_bracket = code;
8331
BOOL lookbehind;
8332
open_capitem capitem;
8333
int capnumber = 0;
8334
int okreturn = 1;
8335
uint32_t *pptr = *pptrptr;
8336
uint32_t firstcu, reqcu;
8337
uint32_t lookbehindlength;
8338
uint32_t lookbehindminlength;
8339
uint32_t firstcuflags, reqcuflags;
8340
PCRE2_SIZE length;
8341
branch_chain bc;
8342
8343
/* If set, call the external function that checks for stack availability. */
8344
8345
if (cb->cx->stack_guard != NULL &&
8346
cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8347
{
8348
*errorcodeptr= ERR33;
8349
return 0;
8350
}
8351
8352
/* Miscellaneous initialization */
8353
8354
bc.outer = bcptr;
8355
bc.current_branch = code;
8356
8357
firstcu = reqcu = 0;
8358
firstcuflags = reqcuflags = REQ_UNSET;
8359
8360
/* Accumulate the length for use in the pre-compile phase. Start with the
8361
length of the BRA and KET and any extra code units that are required at the
8362
beginning. We accumulate in a local variable to save frequent testing of
8363
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8364
start and end of each alternative, because compiled items are discarded during
8365
the pre-compile phase so that the workspace is not exceeded. */
8366
8367
length = 2 + 2*LINK_SIZE + skipunits;
8368
8369
/* Remember if this is a lookbehind assertion, and if it is, save its length
8370
and skip over the pattern offset. */
8371
8372
lookbehind = *code == OP_ASSERTBACK ||
8373
*code == OP_ASSERTBACK_NOT ||
8374
*code == OP_ASSERTBACK_NA;
8375
8376
if (lookbehind)
8377
{
8378
lookbehindlength = META_DATA(pptr[-1]);
8379
lookbehindminlength = *pptr;
8380
pptr += SIZEOFFSET;
8381
}
8382
else lookbehindlength = lookbehindminlength = 0;
8383
8384
/* If this is a capturing subpattern, add to the chain of open capturing items
8385
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8386
need be tested here; changing this opcode to one of its variants, e.g.
8387
OP_SCBRAPOS, happens later, after the group has been compiled. */
8388
8389
if (*code == OP_CBRA)
8390
{
8391
capnumber = GET2(code, 1 + LINK_SIZE);
8392
capitem.number = capnumber;
8393
capitem.next = open_caps;
8394
capitem.assert_depth = cb->assert_depth;
8395
open_caps = &capitem;
8396
}
8397
8398
/* Offset is set zero to mark that this bracket is still open */
8399
8400
PUT(code, 1, 0);
8401
code += 1 + LINK_SIZE + skipunits;
8402
8403
/* Loop for each alternative branch */
8404
8405
for (;;)
8406
{
8407
int branch_return;
8408
uint32_t branchfirstcu = 0, branchreqcu = 0;
8409
uint32_t branchfirstcuflags = REQ_UNSET, branchreqcuflags = REQ_UNSET;
8410
8411
/* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8412
is only a single minimum length for the whole assertion. When the minimum
8413
length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8414
though not necessarily the same length. In this case, the original OP_REVERSE
8415
can be used. It can also be used if a branch in a variable length lookbehind
8416
has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8417
maximum and minimum values. */
8418
8419
if (lookbehind && lookbehindlength > 0)
8420
{
8421
if (lookbehindminlength == LOOKBEHIND_MAX ||
8422
lookbehindminlength == lookbehindlength)
8423
{
8424
*code++ = OP_REVERSE;
8425
PUT2INC(code, 0, lookbehindlength);
8426
length += 1 + IMM2_SIZE;
8427
}
8428
else
8429
{
8430
*code++ = OP_VREVERSE;
8431
PUT2INC(code, 0, lookbehindminlength);
8432
PUT2INC(code, 0, lookbehindlength);
8433
length += 1 + 2*IMM2_SIZE;
8434
}
8435
}
8436
8437
/* Now compile the branch; in the pre-compile phase its length gets added
8438
into the length. */
8439
8440
if ((branch_return =
8441
compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8442
&branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8443
&bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8444
return 0;
8445
8446
/* If a branch can match an empty string, so can the whole group. */
8447
8448
if (branch_return < 0) okreturn = -1;
8449
8450
/* In the real compile phase, there is some post-processing to be done. */
8451
8452
if (lengthptr == NULL)
8453
{
8454
/* If this is the first branch, the firstcu and reqcu values for the
8455
branch become the values for the regex. */
8456
8457
if (*last_branch != OP_ALT)
8458
{
8459
firstcu = branchfirstcu;
8460
firstcuflags = branchfirstcuflags;
8461
reqcu = branchreqcu;
8462
reqcuflags = branchreqcuflags;
8463
}
8464
8465
/* If this is not the first branch, the first char and reqcu have to
8466
match the values from all the previous branches, except that if the
8467
previous value for reqcu didn't have REQ_VARY set, it can still match,
8468
and we set REQ_VARY for the group from this branch's value. */
8469
8470
else
8471
{
8472
/* If we previously had a firstcu, but it doesn't match the new branch,
8473
we have to abandon the firstcu for the regex, but if there was
8474
previously no reqcu, it takes on the value of the old firstcu. */
8475
8476
if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8477
{
8478
if (firstcuflags < REQ_NONE)
8479
{
8480
if (reqcuflags >= REQ_NONE)
8481
{
8482
reqcu = firstcu;
8483
reqcuflags = firstcuflags;
8484
}
8485
}
8486
firstcuflags = REQ_NONE;
8487
}
8488
8489
/* If we (now or from before) have no firstcu, a firstcu from the
8490
branch becomes a reqcu if there isn't a branch reqcu. */
8491
8492
if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8493
branchreqcuflags >= REQ_NONE)
8494
{
8495
branchreqcu = branchfirstcu;
8496
branchreqcuflags = branchfirstcuflags;
8497
}
8498
8499
/* Now ensure that the reqcus match */
8500
8501
if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8502
reqcu != branchreqcu)
8503
reqcuflags = REQ_NONE;
8504
else
8505
{
8506
reqcu = branchreqcu;
8507
reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8508
}
8509
}
8510
}
8511
8512
/* Handle reaching the end of the expression, either ')' or end of pattern.
8513
In the real compile phase, go back through the alternative branches and
8514
reverse the chain of offsets, with the field in the BRA item now becoming an
8515
offset to the first alternative. If there are no alternatives, it points to
8516
the end of the group. The length in the terminating ket is always the length
8517
of the whole bracketed item. Return leaving the pointer at the terminating
8518
char. */
8519
8520
if (META_CODE(*pptr) != META_ALT)
8521
{
8522
if (lengthptr == NULL)
8523
{
8524
uint32_t branch_length = (uint32_t)(code - last_branch);
8525
do
8526
{
8527
uint32_t prev_length = GET(last_branch, 1);
8528
PUT(last_branch, 1, branch_length);
8529
branch_length = prev_length;
8530
last_branch -= branch_length;
8531
}
8532
while (branch_length > 0);
8533
}
8534
8535
/* Fill in the ket */
8536
8537
*code = OP_KET;
8538
PUT(code, 1, (uint32_t)(code - start_bracket));
8539
code += 1 + LINK_SIZE;
8540
8541
/* Set values to pass back */
8542
8543
*codeptr = code;
8544
*pptrptr = pptr;
8545
*firstcuptr = firstcu;
8546
*firstcuflagsptr = firstcuflags;
8547
*reqcuptr = reqcu;
8548
*reqcuflagsptr = reqcuflags;
8549
if (lengthptr != NULL)
8550
{
8551
if (OFLOW_MAX - *lengthptr < length)
8552
{
8553
*errorcodeptr = ERR20;
8554
return 0;
8555
}
8556
*lengthptr += length;
8557
}
8558
return okreturn;
8559
}
8560
8561
/* Another branch follows. In the pre-compile phase, we can move the code
8562
pointer back to where it was for the start of the first branch. (That is,
8563
pretend that each branch is the only one.)
8564
8565
In the real compile phase, insert an ALT node. Its length field points back
8566
to the previous branch while the bracket remains open. At the end the chain
8567
is reversed. It's done like this so that the start of the bracket has a
8568
zero offset until it is closed, making it possible to detect recursion. */
8569
8570
if (lengthptr != NULL)
8571
{
8572
code = *codeptr + 1 + LINK_SIZE + skipunits;
8573
length += 1 + LINK_SIZE;
8574
}
8575
else
8576
{
8577
*code = OP_ALT;
8578
PUT(code, 1, (int)(code - last_branch));
8579
bc.current_branch = last_branch = code;
8580
code += 1 + LINK_SIZE;
8581
}
8582
8583
/* Set the maximum lookbehind length for the next branch (if not in a
8584
lookbehind the value will be zero) and then advance past the vertical bar. */
8585
8586
lookbehindlength = META_DATA(*pptr);
8587
pptr++;
8588
}
8589
8590
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8591
return 0; /* Avoid compiler warnings */
8592
}
8593
8594
8595
8596
/*************************************************
8597
* Check for anchored pattern *
8598
*************************************************/
8599
8600
/* Try to find out if this is an anchored regular expression. Consider each
8601
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8602
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8603
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8604
be found, because ^ generates OP_CIRCM in that mode.
8605
8606
We can also consider a regex to be anchored if OP_SOM starts all its branches.
8607
This is the code for \G, which means "match at start of match position, taking
8608
into account the match offset".
8609
8610
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8611
because that will try the rest of the pattern at all possible matching points,
8612
so there is no point trying again.... er ....
8613
8614
.... except when the .* appears inside capturing parentheses, and there is a
8615
subsequent back reference to those parentheses. We haven't enough information
8616
to catch that case precisely.
8617
8618
At first, the best we could do was to detect when .* was in capturing brackets
8619
and the highest back reference was greater than or equal to that level.
8620
However, by keeping a bitmap of the first 31 back references, we can catch some
8621
of the more common cases more precisely.
8622
8623
... A second exception is when the .* appears inside an atomic group, because
8624
this prevents the number of characters it matches from being adjusted.
8625
8626
Arguments:
8627
code points to start of the compiled pattern
8628
bracket_map a bitmap of which brackets we are inside while testing; this
8629
handles up to substring 31; after that we just have to take
8630
the less precise approach
8631
cb points to the compile data block
8632
atomcount atomic group level
8633
inassert TRUE if in an assertion
8634
dotstar_anchor TRUE if automatic anchoring optimization is enabled
8635
8636
Returns: TRUE or FALSE
8637
*/
8638
8639
static BOOL
8640
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8641
int atomcount, BOOL inassert, BOOL dotstar_anchor)
8642
{
8643
do {
8644
PCRE2_SPTR scode = first_significant_code(
8645
code + PRIV(OP_lengths)[*code], FALSE);
8646
int op = *scode;
8647
8648
/* Non-capturing brackets */
8649
8650
if (op == OP_BRA || op == OP_BRAPOS ||
8651
op == OP_SBRA || op == OP_SBRAPOS)
8652
{
8653
if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8654
return FALSE;
8655
}
8656
8657
/* Capturing brackets */
8658
8659
else if (op == OP_CBRA || op == OP_CBRAPOS ||
8660
op == OP_SCBRA || op == OP_SCBRAPOS)
8661
{
8662
int n = GET2(scode, 1+LINK_SIZE);
8663
uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8664
if (!is_anchored(scode, new_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE;
8665
}
8666
8667
/* Positive forward assertion */
8668
8669
else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8670
{
8671
if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) return FALSE;
8672
}
8673
8674
/* Condition. If there is no second branch, it can't be anchored. */
8675
8676
else if (op == OP_COND || op == OP_SCOND)
8677
{
8678
if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8679
if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8680
return FALSE;
8681
}
8682
8683
/* Atomic groups */
8684
8685
else if (op == OP_ONCE)
8686
{
8687
if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
8688
return FALSE;
8689
}
8690
8691
/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8692
it isn't in brackets that are or may be referenced or inside an atomic
8693
group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8694
because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8695
with the subject "aab", which matches "b", i.e. not at the start of a line.
8696
There is also an option that disables auto-anchoring. */
8697
8698
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8699
op == OP_TYPEPOSSTAR))
8700
{
8701
if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8702
atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
8703
return FALSE;
8704
}
8705
8706
/* Check for explicit anchoring */
8707
8708
else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8709
8710
code += GET(code, 1);
8711
}
8712
while (*code == OP_ALT); /* Loop for each alternative */
8713
return TRUE;
8714
}
8715
8716
8717
8718
/*************************************************
8719
* Check for starting with ^ or .* *
8720
*************************************************/
8721
8722
/* This is called to find out if every branch starts with ^ or .* so that
8723
"first char" processing can be done to speed things up in multiline
8724
matching and for non-DOTALL patterns that start with .* (which must start at
8725
the beginning or after \n). As in the case of is_anchored() (see above), we
8726
have to take account of back references to capturing brackets that contain .*
8727
because in that case we can't make the assumption. Also, the appearance of .*
8728
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8729
or *SKIP does not count, because once again the assumption no longer holds.
8730
8731
Arguments:
8732
code points to start of the compiled pattern or a group
8733
bracket_map a bitmap of which brackets we are inside while testing; this
8734
handles up to substring 31; after that we just have to take
8735
the less precise approach
8736
cb points to the compile data
8737
atomcount atomic group level
8738
inassert TRUE if in an assertion
8739
dotstar_anchor TRUE if automatic anchoring optimization is enabled
8740
8741
Returns: TRUE or FALSE
8742
*/
8743
8744
static BOOL
8745
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8746
int atomcount, BOOL inassert, BOOL dotstar_anchor)
8747
{
8748
do {
8749
PCRE2_SPTR scode = first_significant_code(
8750
code + PRIV(OP_lengths)[*code], FALSE);
8751
int op = *scode;
8752
8753
/* If we are at the start of a conditional assertion group, *both* the
8754
conditional assertion *and* what follows the condition must satisfy the test
8755
for start of line. Other kinds of condition fail. Note that there may be an
8756
auto-callout at the start of a condition. */
8757
8758
if (op == OP_COND)
8759
{
8760
scode += 1 + LINK_SIZE;
8761
8762
if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8763
else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8764
8765
switch (*scode)
8766
{
8767
case OP_CREF:
8768
case OP_DNCREF:
8769
case OP_RREF:
8770
case OP_DNRREF:
8771
case OP_FAIL:
8772
case OP_FALSE:
8773
case OP_TRUE:
8774
return FALSE;
8775
8776
default: /* Assertion */
8777
if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
8778
return FALSE;
8779
do scode += GET(scode, 1); while (*scode == OP_ALT);
8780
scode += 1 + LINK_SIZE;
8781
break;
8782
}
8783
scode = first_significant_code(scode, FALSE);
8784
op = *scode;
8785
}
8786
8787
/* Non-capturing brackets */
8788
8789
if (op == OP_BRA || op == OP_BRAPOS ||
8790
op == OP_SBRA || op == OP_SBRAPOS)
8791
{
8792
if (!is_startline(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8793
return FALSE;
8794
}
8795
8796
/* Capturing brackets */
8797
8798
else if (op == OP_CBRA || op == OP_CBRAPOS ||
8799
op == OP_SCBRA || op == OP_SCBRAPOS)
8800
{
8801
int n = GET2(scode, 1+LINK_SIZE);
8802
unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8803
if (!is_startline(scode, new_map, cb, atomcount, inassert, dotstar_anchor))
8804
return FALSE;
8805
}
8806
8807
/* Positive forward assertions */
8808
8809
else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8810
{
8811
if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
8812
return FALSE;
8813
}
8814
8815
/* Atomic brackets */
8816
8817
else if (op == OP_ONCE)
8818
{
8819
if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
8820
return FALSE;
8821
}
8822
8823
/* .* means "start at start or after \n" if it isn't in atomic brackets or
8824
brackets that may be referenced or an assertion, and as long as the pattern
8825
does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8826
for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8827
i.e. not at the start of a line. There is also an option that disables this
8828
optimization. */
8829
8830
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8831
{
8832
if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8833
atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
8834
return FALSE;
8835
}
8836
8837
/* Check for explicit circumflex; anything else gives a FALSE result. Note
8838
in particular that this includes atomic brackets OP_ONCE because the number
8839
of characters matched by .* cannot be adjusted inside them. */
8840
8841
else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8842
8843
/* Move on to the next alternative */
8844
8845
code += GET(code, 1);
8846
}
8847
while (*code == OP_ALT); /* Loop for each alternative */
8848
return TRUE;
8849
}
8850
8851
8852
8853
/*************************************************
8854
* Scan compiled regex for recursion reference *
8855
*************************************************/
8856
8857
/* This function scans through a compiled pattern until it finds an instance of
8858
OP_RECURSE.
8859
8860
Arguments:
8861
code points to start of expression
8862
utf TRUE in UTF mode
8863
8864
Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
8865
*/
8866
8867
static PCRE2_UCHAR *
8868
find_recurse(PCRE2_UCHAR *code, BOOL utf)
8869
{
8870
for (;;)
8871
{
8872
PCRE2_UCHAR c = *code;
8873
if (c == OP_END) return NULL;
8874
if (c == OP_RECURSE) return code;
8875
8876
/* XCLASS is used for classes that cannot be represented just by a bit map.
8877
This includes negated single high-valued characters. ECLASS is used for
8878
classes that use set operations internally. CALLOUT_STR is used for
8879
callouts with string arguments. In each case the length in the table is
8880
zero; the actual length is stored in the compiled code. */
8881
8882
if (c == OP_XCLASS || c == OP_ECLASS) code += GET(code, 1);
8883
else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8884
8885
/* Otherwise, we can get the item's length from the table, except that for
8886
repeated character types, we have to test for \p and \P, which have an extra
8887
two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8888
we must add in its length. */
8889
8890
else
8891
{
8892
switch(c)
8893
{
8894
case OP_TYPESTAR:
8895
case OP_TYPEMINSTAR:
8896
case OP_TYPEPLUS:
8897
case OP_TYPEMINPLUS:
8898
case OP_TYPEQUERY:
8899
case OP_TYPEMINQUERY:
8900
case OP_TYPEPOSSTAR:
8901
case OP_TYPEPOSPLUS:
8902
case OP_TYPEPOSQUERY:
8903
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8904
break;
8905
8906
case OP_TYPEPOSUPTO:
8907
case OP_TYPEUPTO:
8908
case OP_TYPEMINUPTO:
8909
case OP_TYPEEXACT:
8910
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8911
code += 2;
8912
break;
8913
8914
case OP_MARK:
8915
case OP_COMMIT_ARG:
8916
case OP_PRUNE_ARG:
8917
case OP_SKIP_ARG:
8918
case OP_THEN_ARG:
8919
code += code[1];
8920
break;
8921
}
8922
8923
/* Add in the fixed length from the table */
8924
8925
code += PRIV(OP_lengths)[c];
8926
8927
/* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8928
be followed by a multi-unit character. The length in the table is a
8929
minimum, so we have to arrange to skip the extra units. */
8930
8931
#ifdef MAYBE_UTF_MULTI
8932
if (utf) switch(c)
8933
{
8934
case OP_CHAR:
8935
case OP_CHARI:
8936
case OP_NOT:
8937
case OP_NOTI:
8938
case OP_EXACT:
8939
case OP_EXACTI:
8940
case OP_NOTEXACT:
8941
case OP_NOTEXACTI:
8942
case OP_UPTO:
8943
case OP_UPTOI:
8944
case OP_NOTUPTO:
8945
case OP_NOTUPTOI:
8946
case OP_MINUPTO:
8947
case OP_MINUPTOI:
8948
case OP_NOTMINUPTO:
8949
case OP_NOTMINUPTOI:
8950
case OP_POSUPTO:
8951
case OP_POSUPTOI:
8952
case OP_NOTPOSUPTO:
8953
case OP_NOTPOSUPTOI:
8954
case OP_STAR:
8955
case OP_STARI:
8956
case OP_NOTSTAR:
8957
case OP_NOTSTARI:
8958
case OP_MINSTAR:
8959
case OP_MINSTARI:
8960
case OP_NOTMINSTAR:
8961
case OP_NOTMINSTARI:
8962
case OP_POSSTAR:
8963
case OP_POSSTARI:
8964
case OP_NOTPOSSTAR:
8965
case OP_NOTPOSSTARI:
8966
case OP_PLUS:
8967
case OP_PLUSI:
8968
case OP_NOTPLUS:
8969
case OP_NOTPLUSI:
8970
case OP_MINPLUS:
8971
case OP_MINPLUSI:
8972
case OP_NOTMINPLUS:
8973
case OP_NOTMINPLUSI:
8974
case OP_POSPLUS:
8975
case OP_POSPLUSI:
8976
case OP_NOTPOSPLUS:
8977
case OP_NOTPOSPLUSI:
8978
case OP_QUERY:
8979
case OP_QUERYI:
8980
case OP_NOTQUERY:
8981
case OP_NOTQUERYI:
8982
case OP_MINQUERY:
8983
case OP_MINQUERYI:
8984
case OP_NOTMINQUERY:
8985
case OP_NOTMINQUERYI:
8986
case OP_POSQUERY:
8987
case OP_POSQUERYI:
8988
case OP_NOTPOSQUERY:
8989
case OP_NOTPOSQUERYI:
8990
if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8991
break;
8992
}
8993
#else
8994
(void)(utf); /* Keep compiler happy by referencing function argument */
8995
#endif /* MAYBE_UTF_MULTI */
8996
}
8997
}
8998
}
8999
9000
9001
9002
/*************************************************
9003
* Check for asserted fixed first code unit *
9004
*************************************************/
9005
9006
/* During compilation, the "first code unit" settings from forward assertions
9007
are discarded, because they can cause conflicts with actual literals that
9008
follow. However, if we end up without a first code unit setting for an
9009
unanchored pattern, it is worth scanning the regex to see if there is an
9010
initial asserted first code unit. If all branches start with the same asserted
9011
code unit, or with a non-conditional bracket all of whose alternatives start
9012
with the same asserted code unit (recurse ad lib), then we return that code
9013
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9014
REQ_NONE in the flags.
9015
9016
Arguments:
9017
code points to start of compiled pattern
9018
flags points to the first code unit flags
9019
inassert non-zero if in an assertion
9020
9021
Returns: the fixed first code unit, or 0 with REQ_NONE in flags
9022
*/
9023
9024
static uint32_t
9025
find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9026
{
9027
uint32_t c = 0;
9028
uint32_t cflags = REQ_NONE;
9029
9030
*flags = REQ_NONE;
9031
do {
9032
uint32_t d;
9033
uint32_t dflags;
9034
int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9035
*code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9036
PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9037
PCRE2_UCHAR op = *scode;
9038
9039
switch(op)
9040
{
9041
default:
9042
return 0;
9043
9044
case OP_BRA:
9045
case OP_BRAPOS:
9046
case OP_CBRA:
9047
case OP_SCBRA:
9048
case OP_CBRAPOS:
9049
case OP_SCBRAPOS:
9050
case OP_ASSERT:
9051
case OP_ASSERT_NA:
9052
case OP_ONCE:
9053
case OP_SCRIPT_RUN:
9054
d = find_firstassertedcu(scode, &dflags, inassert +
9055
((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9056
if (dflags >= REQ_NONE) return 0;
9057
if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9058
else if (c != d || cflags != dflags) return 0;
9059
break;
9060
9061
case OP_EXACT:
9062
scode += IMM2_SIZE;
9063
/* Fall through */
9064
9065
case OP_CHAR:
9066
case OP_PLUS:
9067
case OP_MINPLUS:
9068
case OP_POSPLUS:
9069
if (inassert == 0) return 0;
9070
if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9071
else if (c != scode[1]) return 0;
9072
break;
9073
9074
case OP_EXACTI:
9075
scode += IMM2_SIZE;
9076
/* Fall through */
9077
9078
case OP_CHARI:
9079
case OP_PLUSI:
9080
case OP_MINPLUSI:
9081
case OP_POSPLUSI:
9082
if (inassert == 0) return 0;
9083
9084
/* If the character is more than one code unit long, we cannot set its
9085
first code unit when matching caselessly. Later scanning may pick up
9086
multiple code units. */
9087
9088
#ifdef SUPPORT_UNICODE
9089
#if PCRE2_CODE_UNIT_WIDTH == 8
9090
if (scode[1] >= 0x80) return 0;
9091
#elif PCRE2_CODE_UNIT_WIDTH == 16
9092
if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9093
#endif
9094
#endif
9095
9096
if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9097
else if (c != scode[1]) return 0;
9098
break;
9099
}
9100
9101
code += GET(code, 1);
9102
}
9103
while (*code == OP_ALT);
9104
9105
*flags = cflags;
9106
return c;
9107
}
9108
9109
9110
9111
/*************************************************
9112
* Add an entry to the name/number table *
9113
*************************************************/
9114
9115
/* This function is called between compiling passes to add an entry to the
9116
name/number table, maintaining alphabetical order. Checking for permitted
9117
and forbidden duplicates has already been done.
9118
9119
Arguments:
9120
cb the compile data block
9121
name the name to add
9122
length the length of the name
9123
groupno the group number
9124
tablecount the count of names in the table so far
9125
9126
Returns: nothing
9127
*/
9128
9129
static void
9130
add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
9131
unsigned int groupno, uint32_t tablecount)
9132
{
9133
uint32_t i;
9134
PCRE2_UCHAR *slot = cb->name_table;
9135
9136
for (i = 0; i < tablecount; i++)
9137
{
9138
int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
9139
if (crc == 0 && slot[IMM2_SIZE+length] != 0)
9140
crc = -1; /* Current name is a substring */
9141
9142
/* Make space in the table and break the loop for an earlier name. For a
9143
duplicate or later name, carry on. We do this for duplicates so that in the
9144
simple case (when ?(| is not used) they are in order of their numbers. In all
9145
cases they are in the order in which they appear in the pattern. */
9146
9147
if (crc < 0)
9148
{
9149
(void)memmove(slot + cb->name_entry_size, slot,
9150
CU2BYTES((tablecount - i) * cb->name_entry_size));
9151
break;
9152
}
9153
9154
/* Continue the loop for a later or duplicate name */
9155
9156
slot += cb->name_entry_size;
9157
}
9158
9159
PUT2(slot, 0, groupno);
9160
memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
9161
9162
/* Add a terminating zero and fill the rest of the slot with zeroes so that
9163
the memory is all initialized. Otherwise valgrind moans about uninitialized
9164
memory when saving serialized compiled patterns. */
9165
9166
memset(slot + IMM2_SIZE + length, 0,
9167
CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
9168
}
9169
9170
9171
9172
/*************************************************
9173
* Skip in parsed pattern *
9174
*************************************************/
9175
9176
/* This function is called to skip parts of the parsed pattern when finding the
9177
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9178
the end of the branch, it is called to skip over an internal lookaround or
9179
(DEFINE) group, and it is also called to skip to the end of a class, during
9180
which it will never encounter nested groups (but there's no need to have
9181
special code for that).
9182
9183
When called to find the end of a branch or group, pptr must point to the first
9184
meta code inside the branch, not the branch-starting code. In other cases it
9185
can point to the item that causes the function to be called.
9186
9187
Arguments:
9188
pptr current pointer to skip from
9189
skiptype PSKIP_CLASS when skipping to end of class
9190
PSKIP_ALT when META_ALT ends the skip
9191
PSKIP_KET when only META_KET ends the skip
9192
9193
Returns: new value of pptr
9194
NULL if META_END is reached - should never occur
9195
or for an unknown meta value - likewise
9196
*/
9197
9198
static uint32_t *
9199
parsed_skip(uint32_t *pptr, uint32_t skiptype)
9200
{
9201
uint32_t nestlevel = 0;
9202
9203
for (;; pptr++)
9204
{
9205
uint32_t meta = META_CODE(*pptr);
9206
9207
switch(meta)
9208
{
9209
default: /* Just skip over most items */
9210
if (meta < META_END) continue; /* Literal */
9211
break;
9212
9213
case META_END:
9214
9215
/* The parsed regex is malformed; we have reached the end and did
9216
not find the end of the construct which we are skipping over. */
9217
9218
PCRE2_DEBUG_UNREACHABLE();
9219
return NULL;
9220
9221
/* The data for these items is variable in length. */
9222
9223
case META_BACKREF: /* Offset is present only if group >= 10 */
9224
if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9225
break;
9226
9227
case META_ESCAPE:
9228
if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9229
pptr += 1; /* Skip prop data */
9230
break;
9231
9232
case META_MARK: /* Add the length of the name. */
9233
case META_COMMIT_ARG:
9234
case META_PRUNE_ARG:
9235
case META_SKIP_ARG:
9236
case META_THEN_ARG:
9237
pptr += pptr[1];
9238
break;
9239
9240
/* These are the "active" items in this loop. */
9241
9242
case META_CLASS_END:
9243
if (skiptype == PSKIP_CLASS) return pptr;
9244
break;
9245
9246
case META_ATOMIC:
9247
case META_CAPTURE:
9248
case META_COND_ASSERT:
9249
case META_COND_DEFINE:
9250
case META_COND_NAME:
9251
case META_COND_NUMBER:
9252
case META_COND_RNAME:
9253
case META_COND_RNUMBER:
9254
case META_COND_VERSION:
9255
case META_SCS:
9256
case META_LOOKAHEAD:
9257
case META_LOOKAHEADNOT:
9258
case META_LOOKAHEAD_NA:
9259
case META_LOOKBEHIND:
9260
case META_LOOKBEHINDNOT:
9261
case META_LOOKBEHIND_NA:
9262
case META_NOCAPTURE:
9263
case META_SCRIPT_RUN:
9264
nestlevel++;
9265
break;
9266
9267
case META_ALT:
9268
if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9269
break;
9270
9271
case META_KET:
9272
if (nestlevel == 0) return pptr;
9273
nestlevel--;
9274
break;
9275
}
9276
9277
/* The extra data item length for each meta is in a table. */
9278
9279
meta = (meta >> 16) & 0x7fff;
9280
if (meta >= sizeof(meta_extra_lengths)) return NULL;
9281
pptr += meta_extra_lengths[meta];
9282
}
9283
9284
PCRE2_UNREACHABLE(); /* Control never reaches here */
9285
}
9286
9287
9288
9289
/*************************************************
9290
* Find length of a parsed group *
9291
*************************************************/
9292
9293
/* This is called for nested groups within a branch of a lookbehind whose
9294
length is being computed. On entry, the pointer must be at the first element
9295
after the group initializing code. On exit it points to OP_KET. Caching is used
9296
to improve processing speed when the same capturing group occurs many times.
9297
9298
Arguments:
9299
pptrptr pointer to pointer in the parsed pattern
9300
minptr where to return the minimum length
9301
isinline FALSE if a reference or recursion; TRUE for inline group
9302
errcodeptr pointer to the errorcode
9303
lcptr pointer to the loop counter
9304
group number of captured group or -1 for a non-capturing group
9305
recurses chain of recurse_check to catch mutual recursion
9306
cb pointer to the compile data
9307
9308
Returns: the maximum group length or a negative number
9309
*/
9310
9311
static int
9312
get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9313
int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9314
{
9315
uint32_t *gi = cb->groupinfo + 2 * group;
9316
int branchlength, branchminlength;
9317
int grouplength = -1;
9318
int groupminlength = INT_MAX;
9319
9320
/* The cache can be used only if there is no possibility of there being two
9321
groups with the same number. We do not need to set the end pointer for a group
9322
that is being processed as a back reference or recursion, but we must do so for
9323
an inline group. */
9324
9325
if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9326
{
9327
uint32_t groupinfo = gi[0];
9328
if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9329
if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9330
{
9331
if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9332
*minptr = gi[1];
9333
return groupinfo & GI_FIXED_LENGTH_MASK;
9334
}
9335
}
9336
9337
/* Scan the group. In this case we find the end pointer of necessity. */
9338
9339
for(;;)
9340
{
9341
branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9342
recurses, cb);
9343
if (branchlength < 0) goto ISNOTFIXED;
9344
if (branchlength > grouplength) grouplength = branchlength;
9345
if (branchminlength < groupminlength) groupminlength = branchminlength;
9346
if (**pptrptr == META_KET) break;
9347
*pptrptr += 1; /* Skip META_ALT */
9348
}
9349
9350
if (group > 0)
9351
{
9352
gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9353
gi[1] = groupminlength;
9354
}
9355
9356
*minptr = groupminlength;
9357
return grouplength;
9358
9359
ISNOTFIXED:
9360
if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9361
return -1;
9362
}
9363
9364
9365
9366
/*************************************************
*        Find length of a parsed branch          *
*************************************************/

/* Return fixed maximum and minimum lengths for a branch in a lookbehind,
giving an error if the length is not limited. On entry, *pptrptr points to the
first element inside the branch. On exit it is set to point to the ALT or KET.

Arguments:
  pptrptr     pointer to pointer in the parsed pattern
  minptr      where to return the minimum length
  errcodeptr  pointer to error code
  lcptr       pointer to loop counter
  recurses    chain of recurse_check to catch mutual recursion
  cb          pointer to compile block

Returns:      the maximum length, or a negative value on error
*/
9384
9385
static int
9386
get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9387
parsed_recurse_check *recurses, compile_block *cb)
9388
{
9389
int branchlength = 0;
9390
int branchminlength = 0;
9391
int grouplength, groupminlength;
9392
uint32_t lastitemlength = 0;
9393
uint32_t lastitemminlength = 0;
9394
uint32_t *pptr = *pptrptr;
9395
PCRE2_SIZE offset;
9396
parsed_recurse_check this_recurse;
9397
9398
/* A large and/or complex regex can take too long to process. This can happen
9399
more often when (?| groups are present in the pattern because their length
9400
cannot be cached. */
9401
9402
if ((*lcptr)++ > 2000)
9403
{
9404
*errcodeptr = ERR35; /* Lookbehind is too complicated */
9405
return -1;
9406
}
9407
9408
/* Scan the branch, accumulating the length. */
9409
9410
for (;; pptr++)
9411
{
9412
parsed_recurse_check *r;
9413
uint32_t *gptr, *gptrend;
9414
uint32_t escape;
9415
uint32_t min, max;
9416
uint32_t group = 0;
9417
uint32_t itemlength = 0;
9418
uint32_t itemminlength = 0;
9419
9420
if (*pptr < META_END)
9421
{
9422
itemlength = itemminlength = 1;
9423
}
9424
9425
else switch (META_CODE(*pptr))
9426
{
9427
case META_KET:
9428
case META_ALT:
9429
goto EXIT;
9430
9431
/* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9432
actual termination. */
9433
9434
case META_ACCEPT:
9435
case META_FAIL:
9436
pptr = parsed_skip(pptr, PSKIP_ALT);
9437
if (pptr == NULL) goto PARSED_SKIP_FAILED;
9438
goto EXIT;
9439
9440
case META_MARK:
9441
case META_COMMIT_ARG:
9442
case META_PRUNE_ARG:
9443
case META_SKIP_ARG:
9444
case META_THEN_ARG:
9445
pptr += pptr[1] + 1;
9446
break;
9447
9448
case META_CIRCUMFLEX:
9449
case META_COMMIT:
9450
case META_DOLLAR:
9451
case META_PRUNE:
9452
case META_SKIP:
9453
case META_THEN:
9454
break;
9455
9456
case META_OPTIONS:
9457
pptr += 2;
9458
break;
9459
9460
case META_BIGVALUE:
9461
itemlength = itemminlength = 1;
9462
pptr += 1;
9463
break;
9464
9465
case META_CLASS:
9466
case META_CLASS_NOT:
9467
itemlength = itemminlength = 1;
9468
pptr = parsed_skip(pptr, PSKIP_CLASS);
9469
if (pptr == NULL) goto PARSED_SKIP_FAILED;
9470
break;
9471
9472
case META_CLASS_EMPTY_NOT:
9473
case META_DOT:
9474
itemlength = itemminlength = 1;
9475
break;
9476
9477
case META_CALLOUT_NUMBER:
9478
pptr += 3;
9479
break;
9480
9481
case META_CALLOUT_STRING:
9482
pptr += 3 + SIZEOFFSET;
9483
break;
9484
9485
/* Only some escapes consume a character. Of those, \R can match one or two
9486
characters, but \X is never allowed because it matches an unknown number of
9487
characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9488
9489
case META_ESCAPE:
9490
escape = META_DATA(*pptr);
9491
if (escape == ESC_X) return -1;
9492
if (escape == ESC_R)
9493
{
9494
itemminlength = 1;
9495
itemlength = 2;
9496
}
9497
else if (escape > ESC_b && escape < ESC_Z)
9498
{
9499
#if PCRE2_CODE_UNIT_WIDTH != 32
9500
if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9501
{
9502
*errcodeptr = ERR36;
9503
return -1;
9504
}
9505
#endif
9506
itemlength = itemminlength = 1;
9507
if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */
9508
}
9509
break;
9510
9511
/* Lookaheads do not contribute to the length of this branch, but they may
9512
contain lookbehinds within them whose lengths need to be set. */
9513
9514
case META_LOOKAHEAD:
9515
case META_LOOKAHEADNOT:
9516
case META_LOOKAHEAD_NA:
9517
case META_SCS:
9518
*errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9519
if (*errcodeptr != 0) return -1;
9520
9521
/* Ignore any qualifiers that follow a lookahead assertion. */
9522
9523
switch (pptr[1])
9524
{
9525
case META_ASTERISK:
9526
case META_ASTERISK_PLUS:
9527
case META_ASTERISK_QUERY:
9528
case META_PLUS:
9529
case META_PLUS_PLUS:
9530
case META_PLUS_QUERY:
9531
case META_QUERY:
9532
case META_QUERY_PLUS:
9533
case META_QUERY_QUERY:
9534
pptr++;
9535
break;
9536
9537
case META_MINMAX:
9538
case META_MINMAX_PLUS:
9539
case META_MINMAX_QUERY:
9540
pptr += 3;
9541
break;
9542
9543
default:
9544
break;
9545
}
9546
break;
9547
9548
/* A nested lookbehind does not contribute any length to this lookbehind,
9549
but must itself be checked and have its lengths set. Note that
9550
set_lookbehind_lengths() updates pptr, leaving it pointing to the final ket
9551
of the group, so no need to update it here. */
9552
9553
case META_LOOKBEHIND:
9554
case META_LOOKBEHINDNOT:
9555
case META_LOOKBEHIND_NA:
9556
if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9557
return -1;
9558
break;
9559
9560
/* Back references and recursions are handled by very similar code. At this
9561
stage, the names generated in the parsing pass are available, but the main
9562
name table has not yet been created. So for the named varieties, scan the
9563
list of names in order to get the number of the first one in the pattern,
9564
and whether or not this name is duplicated. */
9565
9566
case META_BACKREF_BYNAME:
9567
if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9568
goto ISNOTFIXED;
9569
/* Fall through */
9570
9571
case META_RECURSE_BYNAME:
9572
{
9573
int i;
9574
PCRE2_SPTR name;
9575
BOOL is_dupname = FALSE;
9576
named_group *ng = cb->named_groups;
9577
uint32_t meta_code = META_CODE(*pptr);
9578
uint32_t length = *(++pptr);
9579
9580
GETPLUSOFFSET(offset, pptr);
9581
name = cb->start_pattern + offset;
9582
for (i = 0; i < cb->names_found; i++, ng++)
9583
{
9584
if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9585
{
9586
group = ng->number;
9587
is_dupname = ng->isdup;
9588
break;
9589
}
9590
}
9591
9592
if (group == 0)
9593
{
9594
*errcodeptr = ERR15; /* Non-existent subpattern */
9595
cb->erroroffset = offset;
9596
return -1;
9597
}
9598
9599
/* A numerical back reference can be fixed length if duplicate capturing
9600
groups are not being used. A non-duplicate named back reference can also
9601
be handled. */
9602
9603
if (meta_code == META_RECURSE_BYNAME ||
9604
(!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9605
goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */
9606
}
9607
goto ISNOTFIXED; /* Duplicate name or number */
9608
9609
/* The offset values for back references < 10 are in a separate vector
9610
because otherwise they would use more than two parsed pattern elements on
9611
64-bit systems. */
9612
9613
case META_BACKREF:
9614
if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9615
(cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9616
goto ISNOTFIXED;
9617
group = META_DATA(*pptr);
9618
if (group < 10)
9619
{
9620
offset = cb->small_ref_offset[group];
9621
goto RECURSE_OR_BACKREF_LENGTH;
9622
}
9623
9624
/* Fall through */
9625
/* For groups >= 10 - picking up group twice does no harm. */
9626
9627
/* A true recursion implies not fixed length, but a subroutine call may
9628
be OK. Back reference "recursions" are also failed. */
9629
9630
case META_RECURSE:
9631
group = META_DATA(*pptr);
9632
GETPLUSOFFSET(offset, pptr);
9633
9634
RECURSE_OR_BACKREF_LENGTH:
9635
if (group > cb->bracount)
9636
{
9637
cb->erroroffset = offset;
9638
*errcodeptr = ERR15; /* Non-existent subpattern */
9639
return -1;
9640
}
9641
if (group == 0) goto ISNOTFIXED; /* Local recursion */
9642
for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9643
{
9644
if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9645
else if (*gptr == (META_CAPTURE | group)) break;
9646
}
9647
9648
/* We must start the search for the end of the group at the first meta code
9649
inside the group. Otherwise it will be treated as an enclosed group. */
9650
9651
gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9652
if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9653
if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */
9654
for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9655
if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
9656
this_recurse.prev = recurses;
9657
this_recurse.groupptr = gptr;
9658
9659
/* We do not need to know the position of the end of the group, that is,
9660
gptr is not used after the call to get_grouplength(). Setting the second
9661
argument FALSE stops it scanning for the end when the length can be found
9662
in the cache. */
9663
9664
gptr++;
9665
grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9666
lcptr, group, &this_recurse, cb);
9667
if (grouplength < 0)
9668
{
9669
if (*errcodeptr == 0) goto ISNOTFIXED;
9670
return -1; /* Error already set */
9671
}
9672
itemlength = grouplength;
9673
itemminlength = groupminlength;
9674
break;
9675
9676
/* A (DEFINE) group is never obeyed inline and so it does not contribute to
9677
the length of this branch. Skip from the following item to the next
9678
unpaired ket. */
9679
9680
case META_COND_DEFINE:
9681
pptr = parsed_skip(pptr + 1, PSKIP_KET);
9682
break;
9683
9684
/* Check other nested groups - advance past the initial data for each type
9685
and then seek a fixed length with get_grouplength(). */
9686
9687
case META_COND_NAME:
9688
case META_COND_NUMBER:
9689
case META_COND_RNAME:
9690
case META_COND_RNUMBER:
9691
pptr += 2 + SIZEOFFSET;
9692
goto CHECK_GROUP;
9693
9694
case META_COND_ASSERT:
9695
pptr += 1;
9696
goto CHECK_GROUP;
9697
9698
case META_COND_VERSION:
9699
pptr += 4;
9700
goto CHECK_GROUP;
9701
9702
case META_CAPTURE:
9703
group = META_DATA(*pptr);
9704
/* Fall through */
9705
9706
case META_ATOMIC:
9707
case META_NOCAPTURE:
9708
case META_SCRIPT_RUN:
9709
pptr++;
9710
CHECK_GROUP:
9711
grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9712
lcptr, group, recurses, cb);
9713
if (grouplength < 0) return -1;
9714
itemlength = grouplength;
9715
itemminlength = groupminlength;
9716
break;
9717
9718
case META_QUERY:
9719
case META_QUERY_PLUS:
9720
case META_QUERY_QUERY:
9721
min = 0;
9722
max = 1;
9723
goto REPETITION;
9724
9725
/* Exact repetition is OK; variable repetition is not. A repetition of zero
9726
must subtract the length that has already been added. */
9727
9728
case META_MINMAX:
9729
case META_MINMAX_PLUS:
9730
case META_MINMAX_QUERY:
9731
min = pptr[1];
9732
max = pptr[2];
9733
pptr += 2;
9734
9735
REPETITION:
9736
if (max != REPEAT_UNLIMITED)
9737
{
9738
if (lastitemlength != 0 && /* Should not occur, but just in case */
9739
max != 0 &&
9740
(INT_MAX - branchlength)/lastitemlength < max - 1)
9741
{
9742
*errcodeptr = ERR87; /* Integer overflow; lookbehind too big */
9743
return -1;
9744
}
9745
if (min == 0) branchminlength -= lastitemminlength;
9746
else itemminlength = (min - 1) * lastitemminlength;
9747
if (max == 0) branchlength -= lastitemlength;
9748
else itemlength = (max - 1) * lastitemlength;
9749
break;
9750
}
9751
/* Fall through */
9752
9753
/* Any other item means this branch does not have a fixed length. */
9754
9755
default:
9756
ISNOTFIXED:
9757
*errcodeptr = ERR25; /* Not fixed length */
9758
return -1;
9759
}
9760
9761
/* Add the item length to the branchlength, checking for integer overflow and
9762
for the branch length exceeding the overall limit. Later, if there is at
9763
least one variable-length branch in the group, there is a test for the
9764
(smaller) variable-length branch length limit. */
9765
9766
if (INT_MAX - branchlength < (int)itemlength ||
9767
(branchlength += itemlength) > LOOKBEHIND_MAX)
9768
{
9769
*errcodeptr = ERR87;
9770
return -1;
9771
}
9772
9773
branchminlength += itemminlength;
9774
9775
/* Save this item length for use if the next item is a quantifier. */
9776
9777
lastitemlength = itemlength;
9778
lastitemminlength = itemminlength;
9779
}
9780
9781
EXIT:
9782
*pptrptr = pptr;
9783
*minptr = branchminlength;
9784
return branchlength;
9785
9786
PARSED_SKIP_FAILED:
9787
PCRE2_DEBUG_UNREACHABLE();
9788
*errcodeptr = ERR90; /* Unhandled META code - internal error */
9789
return -1;
9790
}
9791
9792
9793
9794
/*************************************************
9795
* Set lengths in a lookbehind *
9796
*************************************************/
9797
9798
/* This function is called for each lookbehind, to set the lengths in its
9799
branches. An error occurs if any branch does not have a limited maximum length
9800
that is less than the limit (65535). On exit, the pointer must be left on the
9801
final ket.
9802
9803
The function also maintains the max_lookbehind value. Any lookbehind branch
9804
that contains a nested lookbehind may actually look further back than the
9805
length of the branch. The additional amount is passed back from
9806
get_branchlength() as an "extra" value.
9807
9808
Arguments:
9809
pptrptr pointer to pointer in the parsed pattern
9810
errcodeptr pointer to error code
9811
lcptr pointer to loop counter
9812
recurses chain of recurse_check to catch mutual recursion
9813
cb pointer to compile block
9814
9815
Returns: TRUE if all is well
9816
FALSE otherwise, with error code and offset set
9817
*/
9818
9819
static BOOL
9820
set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9821
parsed_recurse_check *recurses, compile_block *cb)
9822
{
9823
PCRE2_SIZE offset;
9824
uint32_t *bptr = *pptrptr;
9825
uint32_t *gbptr = bptr;
9826
int maxlength = 0;
9827
int minlength = INT_MAX;
9828
BOOL variable = FALSE;
9829
9830
READPLUSOFFSET(offset, bptr); /* Offset for error messages */
9831
*pptrptr += SIZEOFFSET;
9832
9833
/* Each branch can have a different maximum length, but we can keep only a
9834
single minimum for the whole group, because there's nowhere to save individual
9835
values in the META_ALT item. */
9836
9837
do
9838
{
9839
int branchlength, branchminlength;
9840
9841
*pptrptr += 1;
9842
branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9843
recurses, cb);
9844
9845
if (branchlength < 0)
9846
{
9847
/* The errorcode and offset may already be set from a nested lookbehind. */
9848
if (*errcodeptr == 0) *errcodeptr = ERR25;
9849
if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9850
return FALSE;
9851
}
9852
9853
if (branchlength != branchminlength) variable = TRUE;
9854
if (branchminlength < minlength) minlength = branchminlength;
9855
if (branchlength > maxlength) maxlength = branchlength;
9856
if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9857
*bptr |= branchlength; /* branchlength never more than 65535 */
9858
bptr = *pptrptr;
9859
}
9860
while (META_CODE(*bptr) == META_ALT);
9861
9862
/* If any branch is of variable length, the whole lookbehind is of variable
9863
length. If the maximum length of any branch exceeds the maximum for variable
9864
lookbehinds, give an error. Otherwise, the minimum length is set in the word
9865
that follows the original group META value. For a fixed-length lookbehind, this
9866
is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
9867
possibly different) length. */
9868
9869
if (variable)
9870
{
9871
gbptr[1] = minlength;
9872
if ((PCRE2_SIZE)maxlength > cb->max_varlookbehind)
9873
{
9874
*errcodeptr = ERR100;
9875
cb->erroroffset = offset;
9876
return FALSE;
9877
}
9878
}
9879
else gbptr[1] = LOOKBEHIND_MAX;
9880
9881
return TRUE;
9882
}
9883
9884
9885
9886
/*************************************************
9887
* Check parsed pattern lookbehinds *
9888
*************************************************/
9889
9890
/* This function is called at the end of parsing a pattern if any lookbehinds
9891
were encountered. It scans the parsed pattern for them, calling
9892
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9893
the error offset is marked unset. The enables the functions above not to
9894
override settings from deeper nestings.
9895
9896
This function is called recursively from get_branchlength() for lookaheads in
9897
order to process any lookbehinds that they may contain. It stops when it hits a
9898
non-nested closing parenthesis in this case, returning a pointer to it.
9899
9900
Arguments
9901
pptr points to where to start (start of pattern or start of lookahead)
9902
retptr if not NULL, return the ket pointer here
9903
recurses chain of recurse_check to catch mutual recursion
9904
cb points to the compile block
9905
lcptr points to loop counter
9906
9907
Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
9908
*/
9909
9910
static int
9911
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9912
parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9913
{
9914
int errorcode = 0;
9915
int nestlevel = 0;
9916
9917
cb->erroroffset = PCRE2_UNSET;
9918
9919
for (; *pptr != META_END; pptr++)
9920
{
9921
if (*pptr < META_END) continue; /* Literal */
9922
9923
switch (META_CODE(*pptr))
9924
{
9925
default:
9926
9927
/* The following erroroffset is a bogus but safe value. This branch should
9928
be avoided by providing a proper implementation for all supported cases
9929
below. */
9930
9931
PCRE2_DEBUG_UNREACHABLE();
9932
cb->erroroffset = 0;
9933
return ERR70; /* Unrecognized meta code */
9934
9935
case META_ESCAPE:
9936
if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9937
pptr += 1; /* Skip prop data */
9938
break;
9939
9940
case META_KET:
9941
if (--nestlevel < 0)
9942
{
9943
if (retptr != NULL) *retptr = pptr;
9944
return 0;
9945
}
9946
break;
9947
9948
case META_ATOMIC:
9949
case META_CAPTURE:
9950
case META_COND_ASSERT:
9951
case META_SCS:
9952
case META_LOOKAHEAD:
9953
case META_LOOKAHEADNOT:
9954
case META_LOOKAHEAD_NA:
9955
case META_NOCAPTURE:
9956
case META_SCRIPT_RUN:
9957
nestlevel++;
9958
break;
9959
9960
case META_ACCEPT:
9961
case META_ALT:
9962
case META_ASTERISK:
9963
case META_ASTERISK_PLUS:
9964
case META_ASTERISK_QUERY:
9965
case META_BACKREF:
9966
case META_CIRCUMFLEX:
9967
case META_CLASS:
9968
case META_CLASS_EMPTY:
9969
case META_CLASS_EMPTY_NOT:
9970
case META_CLASS_END:
9971
case META_CLASS_NOT:
9972
case META_COMMIT:
9973
case META_DOLLAR:
9974
case META_DOT:
9975
case META_FAIL:
9976
case META_PLUS:
9977
case META_PLUS_PLUS:
9978
case META_PLUS_QUERY:
9979
case META_PRUNE:
9980
case META_QUERY:
9981
case META_QUERY_PLUS:
9982
case META_QUERY_QUERY:
9983
case META_RANGE_ESCAPED:
9984
case META_RANGE_LITERAL:
9985
case META_SKIP:
9986
case META_THEN:
9987
break;
9988
9989
case META_OFFSET:
9990
case META_RECURSE:
9991
pptr += SIZEOFFSET;
9992
break;
9993
9994
case META_BACKREF_BYNAME:
9995
case META_RECURSE_BYNAME:
9996
pptr += 1 + SIZEOFFSET;
9997
break;
9998
9999
case META_COND_DEFINE:
10000
pptr += SIZEOFFSET;
10001
nestlevel++;
10002
break;
10003
10004
case META_COND_NAME:
10005
case META_COND_NUMBER:
10006
case META_COND_RNAME:
10007
case META_COND_RNUMBER:
10008
pptr += 1 + SIZEOFFSET;
10009
nestlevel++;
10010
break;
10011
10012
case META_COND_VERSION:
10013
pptr += 3;
10014
nestlevel++;
10015
break;
10016
10017
case META_CALLOUT_STRING:
10018
pptr += 3 + SIZEOFFSET;
10019
break;
10020
10021
case META_BIGVALUE:
10022
case META_POSIX:
10023
case META_POSIX_NEG:
10024
case META_SCS_NAME:
10025
case META_SCS_NUMBER:
10026
pptr += 1;
10027
break;
10028
10029
case META_MINMAX:
10030
case META_MINMAX_QUERY:
10031
case META_MINMAX_PLUS:
10032
case META_OPTIONS:
10033
pptr += 2;
10034
break;
10035
10036
case META_CALLOUT_NUMBER:
10037
pptr += 3;
10038
break;
10039
10040
case META_MARK:
10041
case META_COMMIT_ARG:
10042
case META_PRUNE_ARG:
10043
case META_SKIP_ARG:
10044
case META_THEN_ARG:
10045
pptr += 1 + pptr[1];
10046
break;
10047
10048
/* Note that set_lookbehind_lengths() updates pptr, leaving it pointing to
10049
the final ket of the group, so no need to update it here. */
10050
10051
case META_LOOKBEHIND:
10052
case META_LOOKBEHINDNOT:
10053
case META_LOOKBEHIND_NA:
10054
if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10055
return errorcode;
10056
break;
10057
}
10058
}
10059
10060
return 0;
10061
}
10062
10063
10064
10065
/*************************************************
10066
* External function to compile a pattern *
10067
*************************************************/
10068
10069
/* This function reads a regular expression in the form of a string and returns
10070
a pointer to a block of store holding a compiled version of the expression.
10071
10072
Arguments:
10073
pattern the regular expression
10074
patlen the length of the pattern, or PCRE2_ZERO_TERMINATED
10075
options option bits
10076
errorptr pointer to errorcode
10077
erroroffset pointer to error offset
10078
ccontext points to a compile context or is NULL
10079
10080
Returns: pointer to compiled data block, or NULL on error,
10081
with errorcode and erroroffset set
10082
*/
10083
10084
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
10085
pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10086
int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10087
{
10088
BOOL utf; /* Set TRUE for UTF mode */
10089
BOOL ucp; /* Set TRUE for UCP mode */
10090
BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
10091
BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
10092
pcre2_real_code *re = NULL; /* What we will return */
10093
compile_block cb; /* "Static" compile-time data */
10094
const uint8_t *tables; /* Char tables base pointer */
10095
10096
PCRE2_UCHAR *code; /* Current pointer in compiled code */
10097
PCRE2_UCHAR * codestart; /* Start of compiled code */
10098
PCRE2_SPTR ptr; /* Current pointer in pattern */
10099
uint32_t *pptr; /* Current pointer in parsed pattern */
10100
10101
PCRE2_SIZE length = 1; /* Allow for final END opcode */
10102
PCRE2_SIZE usedlength; /* Actual length used */
10103
PCRE2_SIZE re_blocksize; /* Size of memory block */
10104
PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
10105
10106
uint32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
10107
uint32_t firstcu, reqcu; /* Value of first/req code unit */
10108
uint32_t setflags = 0; /* NL and BSR set flags */
10109
uint32_t xoptions; /* Flags from context, modified */
10110
10111
uint32_t skipatstart; /* When checking (*UTF) etc */
10112
uint32_t limit_heap = UINT32_MAX;
10113
uint32_t limit_match = UINT32_MAX; /* Unset match limits */
10114
uint32_t limit_depth = UINT32_MAX;
10115
10116
int newline = 0; /* Unset; can be set by the pattern */
10117
int bsr = 0; /* Unset; can be set by the pattern */
10118
int errorcode = 0; /* Initialize to avoid compiler warn */
10119
int regexrc; /* Return from compile */
10120
10121
uint32_t i; /* Local loop counter */
10122
10123
/* Enable all optimizations by default. */
10124
uint32_t optim_flags = ccontext != NULL ? ccontext->optimization_flags :
10125
PCRE2_OPTIMIZATION_ALL;
10126
10127
/* Comments at the head of this file explain about these variables. */
10128
10129
uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10130
uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10131
named_group named_groups[NAMED_GROUP_LIST_SIZE];
10132
10133
/* The workspace is used in different ways in the different compiling phases.
10134
It needs to be 16-bit aligned for the preliminary parsing scan. */
10135
10136
uint32_t c16workspace[C16_WORK_SIZE];
10137
PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10138
10139
10140
/* -------------- Check arguments and set up the pattern ----------------- */
10141
10142
/* There must be error code and offset pointers. */
10143
10144
if (errorptr == NULL || erroroffset == NULL) return NULL;
10145
*errorptr = ERR0;
10146
*erroroffset = 0;
10147
10148
/* There must be a pattern, but NULL is allowed with zero length. */
10149
10150
if (pattern == NULL)
10151
{
10152
if (patlen == 0) pattern = (PCRE2_SPTR)""; else
10153
{
10154
*errorptr = ERR16;
10155
return NULL;
10156
}
10157
}
10158
10159
/* A NULL compile context means "use a default context" */
10160
10161
if (ccontext == NULL)
10162
ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10163
10164
/* PCRE2_MATCH_INVALID_UTF implies UTF */
10165
10166
if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10167
10168
/* Check that all undefined public option bits are zero. */
10169
10170
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10171
(ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10172
{
10173
*errorptr = ERR17;
10174
return NULL;
10175
}
10176
10177
if ((options & PCRE2_LITERAL) != 0 &&
10178
((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10179
(ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10180
{
10181
*errorptr = ERR92;
10182
return NULL;
10183
}
10184
10185
/* A zero-terminated pattern is indicated by the special length value
10186
PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10187
10188
if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10189
patlen = PRIV(strlen)(pattern);
10190
(void)zero_terminated; /* Silence compiler; only used if Valgrind enabled */
10191
10192
if (patlen > ccontext->max_pattern_length)
10193
{
10194
*errorptr = ERR88;
10195
return NULL;
10196
}
10197
10198
/* Optimization flags in 'options' can override those in the compile context.
10199
This is because some options to disable optimizations were added before the
10200
optimization flags word existed, and we need to continue supporting them
10201
for backwards compatibility. */
10202
10203
if ((options & PCRE2_NO_AUTO_POSSESS) != 0)
10204
optim_flags &= ~PCRE2_OPTIM_AUTO_POSSESS;
10205
if ((options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
10206
optim_flags &= ~PCRE2_OPTIM_DOTSTAR_ANCHOR;
10207
if ((options & PCRE2_NO_START_OPTIMIZE) != 0)
10208
optim_flags &= ~PCRE2_OPTIM_START_OPTIMIZE;
10209
10210
/* From here on, all returns from this function should end up going via the
10211
EXIT label. */
10212
10213
10214
/* ------------ Initialize the "static" compile data -------------- */
10215
10216
tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10217
10218
cb.lcc = tables + lcc_offset; /* Individual */
10219
cb.fcc = tables + fcc_offset; /* character */
10220
cb.cbits = tables + cbits_offset; /* tables */
10221
cb.ctypes = tables + ctypes_offset;
10222
10223
cb.assert_depth = 0;
10224
cb.bracount = 0;
10225
cb.cx = ccontext;
10226
cb.dupnames = FALSE;
10227
cb.end_pattern = pattern + patlen;
10228
cb.erroroffset = 0;
10229
cb.external_flags = 0;
10230
cb.external_options = options;
10231
cb.groupinfo = stack_groupinfo;
10232
cb.had_recurse = FALSE;
10233
cb.lastcapture = 0;
10234
cb.max_lookbehind = 0; /* Max encountered */
10235
cb.max_varlookbehind = ccontext->max_varlookbehind; /* Limit */
10236
cb.name_entry_size = 0;
10237
cb.name_table = NULL;
10238
cb.named_groups = named_groups;
10239
cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10240
cb.names_found = 0;
10241
cb.parens_depth = 0;
10242
cb.parsed_pattern = stack_parsed_pattern;
10243
cb.req_varyopt = 0;
10244
cb.start_code = cworkspace;
10245
cb.start_pattern = pattern;
10246
cb.start_workspace = cworkspace;
10247
cb.workspace_size = COMPILE_WORK_SIZE;
10248
#ifdef SUPPORT_WIDE_CHARS
10249
cb.cranges = NULL;
10250
cb.next_cranges = NULL;
10251
cb.char_lists_size = 0;
10252
#endif
10253
10254
/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10255
references to help in deciding whether (.*) can be treated as anchored or not.
10256
*/
10257
10258
cb.top_backref = 0;
10259
cb.backref_map = 0;
10260
10261
/* Escape sequences \1 to \9 are always back references, but as they are only
10262
two characters long, only two elements can be used in the parsed_pattern
10263
vector. The first contains the reference, and we'd like to use the second to
10264
record the offset in the pattern, so that forward references to non-existent
10265
groups can be diagnosed later with an offset. However, on 64-bit systems,
10266
PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10267
occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10268
references have enough space for the offset to be put into the parsed pattern.
10269
*/
10270
10271
for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10272
10273
10274
/* --------------- Start looking at the pattern --------------- */
10275
10276
/* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10277
the start of the pattern, and remember the offset to the actual regex. With
10278
valgrind support, make the terminator of a zero-terminated pattern
10279
inaccessible. This catches bugs that would otherwise only show up for
10280
non-zero-terminated patterns. */
10281
10282
#ifdef SUPPORT_VALGRIND
10283
if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10284
#endif
10285
10286
xoptions = ccontext->extra_options;
10287
ptr = pattern;
10288
skipatstart = 0;
10289
10290
if ((options & PCRE2_LITERAL) == 0)
10291
{
10292
while (patlen - skipatstart >= 2 &&
10293
ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10294
ptr[skipatstart+1] == CHAR_ASTERISK)
10295
{
10296
for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10297
{
10298
const pso *p = pso_list + i;
10299
10300
if (patlen - skipatstart - 2 >= p->length &&
10301
PRIV(strncmp_c8)(ptr + skipatstart + 2, p->name, p->length) == 0)
10302
{
10303
uint32_t c, pp;
10304
10305
skipatstart += p->length + 2;
10306
switch(p->type)
10307
{
10308
case PSO_OPT:
10309
cb.external_options |= p->value;
10310
break;
10311
10312
case PSO_XOPT:
10313
xoptions |= p->value;
10314
break;
10315
10316
case PSO_FLG:
10317
setflags |= p->value;
10318
break;
10319
10320
case PSO_NL:
10321
newline = p->value;
10322
setflags |= PCRE2_NL_SET;
10323
break;
10324
10325
case PSO_BSR:
10326
bsr = p->value;
10327
setflags |= PCRE2_BSR_SET;
10328
break;
10329
10330
case PSO_LIMM:
10331
case PSO_LIMD:
10332
case PSO_LIMH:
10333
c = 0;
10334
pp = skipatstart;
10335
while (pp < patlen && IS_DIGIT(ptr[pp]))
10336
{
10337
if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
10338
c = c*10 + (ptr[pp++] - CHAR_0);
10339
}
10340
if (pp >= patlen || pp == skipatstart || ptr[pp] != CHAR_RIGHT_PARENTHESIS)
10341
{
10342
errorcode = ERR60;
10343
ptr += pp;
10344
goto HAD_EARLY_ERROR;
10345
}
10346
if (p->type == PSO_LIMH) limit_heap = c;
10347
else if (p->type == PSO_LIMM) limit_match = c;
10348
else limit_depth = c;
10349
skipatstart = ++pp;
10350
break;
10351
10352
case PSO_OPTMZ:
10353
optim_flags &= ~(p->value);
10354
10355
/* For backward compatibility the three original VERBs to disable
10356
optimizations need to also update the corresponding bit in the
10357
external options. */
10358
10359
switch(p->value)
10360
{
10361
case PCRE2_OPTIM_AUTO_POSSESS:
10362
cb.external_options |= PCRE2_NO_AUTO_POSSESS;
10363
break;
10364
10365
case PCRE2_OPTIM_DOTSTAR_ANCHOR:
10366
cb.external_options |= PCRE2_NO_DOTSTAR_ANCHOR;
10367
break;
10368
10369
case PCRE2_OPTIM_START_OPTIMIZE:
10370
cb.external_options |= PCRE2_NO_START_OPTIMIZE;
10371
break;
10372
}
10373
10374
break;
10375
10376
default:
10377
/* All values in the enum need an explicit entry for this switch
10378
but until a better way to prevent coding mistakes is invented keep
10379
a catch all that triggers a debug build assert as a failsafe */
10380
PCRE2_DEBUG_UNREACHABLE();
10381
}
10382
break; /* Out of the table scan loop */
10383
}
10384
}
10385
if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
10386
}
10387
PCRE2_ASSERT(skipatstart <= patlen);
10388
}
10389
10390
/* End of pattern-start options; advance to start of real regex. */
10391
10392
ptr += skipatstart;
10393
10394
/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10395
10396
#ifndef SUPPORT_UNICODE
10397
if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10398
{
10399
errorcode = ERR32;
10400
goto HAD_EARLY_ERROR;
10401
}
10402
#endif
10403
10404
/* Check UTF. We have the original options in 'options', with that value as
10405
modified by (*UTF) etc in cb->external_options. The extra option
10406
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10407
surrogate code points cannot be represented in UTF-16. */
10408
10409
utf = (cb.external_options & PCRE2_UTF) != 0;
10410
if (utf)
10411
{
10412
if ((options & PCRE2_NEVER_UTF) != 0)
10413
{
10414
errorcode = ERR74;
10415
goto HAD_EARLY_ERROR;
10416
}
10417
if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10418
(errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10419
goto HAD_ERROR; /* Offset was set by valid_utf() */
10420
10421
#if PCRE2_CODE_UNIT_WIDTH == 16
10422
if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10423
{
10424
errorcode = ERR91;
10425
goto HAD_EARLY_ERROR;
10426
}
10427
#endif
10428
}
10429
10430
/* Check UCP lockout. */
10431
10432
ucp = (cb.external_options & PCRE2_UCP) != 0;
10433
if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10434
{
10435
errorcode = ERR75;
10436
goto HAD_EARLY_ERROR;
10437
}
10438
10439
/* PCRE2_EXTRA_TURKISH_CASING checks */
10440
10441
if ((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)
10442
{
10443
if (!utf && !ucp)
10444
{
10445
errorcode = ERR104;
10446
goto HAD_EARLY_ERROR;
10447
}
10448
10449
#if PCRE2_CODE_UNIT_WIDTH == 8
10450
if (!utf)
10451
{
10452
errorcode = ERR105;
10453
goto HAD_EARLY_ERROR;
10454
}
10455
#endif
10456
10457
if ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)
10458
{
10459
errorcode = ERR106;
10460
goto HAD_EARLY_ERROR;
10461
}
10462
}
10463
10464
/* Process the BSR setting. */
10465
10466
if (bsr == 0) bsr = ccontext->bsr_convention;
10467
10468
/* Process the newline setting. */
10469
10470
if (newline == 0) newline = ccontext->newline_convention;
10471
cb.nltype = NLTYPE_FIXED;
10472
switch(newline)
10473
{
10474
case PCRE2_NEWLINE_CR:
10475
cb.nllen = 1;
10476
cb.nl[0] = CHAR_CR;
10477
break;
10478
10479
case PCRE2_NEWLINE_LF:
10480
cb.nllen = 1;
10481
cb.nl[0] = CHAR_NL;
10482
break;
10483
10484
case PCRE2_NEWLINE_NUL:
10485
cb.nllen = 1;
10486
cb.nl[0] = CHAR_NUL;
10487
break;
10488
10489
case PCRE2_NEWLINE_CRLF:
10490
cb.nllen = 2;
10491
cb.nl[0] = CHAR_CR;
10492
cb.nl[1] = CHAR_NL;
10493
break;
10494
10495
case PCRE2_NEWLINE_ANY:
10496
cb.nltype = NLTYPE_ANY;
10497
break;
10498
10499
case PCRE2_NEWLINE_ANYCRLF:
10500
cb.nltype = NLTYPE_ANYCRLF;
10501
break;
10502
10503
default:
10504
PCRE2_DEBUG_UNREACHABLE();
10505
errorcode = ERR56;
10506
goto HAD_EARLY_ERROR;
10507
}
10508
10509
/* Pre-scan the pattern to do two things: (1) Discover the named groups and
10510
their numerical equivalents, so that this information is always available for
10511
the remaining processing. (2) At the same time, parse the pattern and put a
10512
processed version into the parsed_pattern vector. This has escapes interpreted
10513
and comments removed (amongst other things). */
10514
10515
/* Ensure that the parsed pattern buffer is big enough. For many smaller
10516
patterns the vector on the stack (which was set up above) can be used. */
10517
10518
parsed_size_needed = max_parsed_pattern(ptr, cb.end_pattern, utf, options);
10519
10520
/* Allow for 2x uint32_t at the start and 2 at the end, for
10521
PCRE2_EXTRA_MATCH_WORD or PCRE2_EXTRA_MATCH_LINE (which are exclusive). */
10522
10523
if ((ccontext->extra_options &
10524
(PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10525
parsed_size_needed += 4;
10526
10527
/* When PCRE2_AUTO_CALLOUT is set we allow for one callout at the end. */
10528
10529
if ((options & PCRE2_AUTO_CALLOUT) != 0)
10530
parsed_size_needed += 4;
10531
10532
parsed_size_needed += 1; /* For the final META_END */
10533
10534
if (parsed_size_needed > PARSED_PATTERN_DEFAULT_SIZE)
10535
{
10536
uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10537
parsed_size_needed * sizeof(uint32_t), ccontext->memctl.memory_data);
10538
if (heap_parsed_pattern == NULL)
10539
{
10540
*errorptr = ERR21;
10541
goto EXIT;
10542
}
10543
cb.parsed_pattern = heap_parsed_pattern;
10544
}
10545
cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed;
10546
10547
/* Do the parsing scan. */
10548
10549
errorcode = parse_regex(ptr, cb.external_options, xoptions, &has_lookbehind, &cb);
10550
if (errorcode != 0) goto HAD_CB_ERROR;
10551
10552
/* If there are any lookbehinds, scan the parsed pattern to figure out their
10553
lengths. Workspace is needed to remember whether numbered groups are or are not
10554
of limited length, and if limited, what the minimum and maximum lengths are.
10555
This caching saves re-computing the length of any group that is referenced more
10556
than once, which is particularly relevant when recursion is involved.
10557
Unnumbered groups do not have this exposure because they cannot be referenced.
10558
If there are sufficiently few groups, the default index vector on the stack, as
10559
set up above, can be used. Otherwise we have to get/free some heap memory. The
10560
vector must be initialized to zero. */
10561
10562
if (has_lookbehind)
10563
{
10564
int loopcount = 0;
10565
if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10566
{
10567
cb.groupinfo = ccontext->memctl.malloc(
10568
(2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10569
if (cb.groupinfo == NULL)
10570
{
10571
errorcode = ERR21;
10572
cb.erroroffset = 0;
10573
goto HAD_CB_ERROR;
10574
}
10575
}
10576
memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10577
errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10578
if (errorcode != 0) goto HAD_CB_ERROR;
10579
}
10580
10581
/* For debugging, there is a function that shows the parsed pattern vector. */
10582
10583
#ifdef DEBUG_SHOW_PARSED
10584
fprintf(stderr, "+++ Pre-scan complete:\n");
10585
show_parsed(&cb);
10586
#endif
10587
10588
/* For debugging capturing information this code can be enabled. */
10589
10590
#ifdef DEBUG_SHOW_CAPTURES
10591
{
10592
named_group *ng = cb.named_groups;
10593
fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10594
for (i = 0; i < cb.names_found; i++, ng++)
10595
{
10596
fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10597
}
10598
}
10599
#endif
10600
10601
/* Pretend to compile the pattern while actually just accumulating the amount
10602
of memory required in the 'length' variable. This behaviour is triggered by
10603
passing a non-NULL final argument to compile_regex(). We pass a block of
10604
workspace (cworkspace) for it to compile parts of the pattern into; the
10605
compiled code is discarded when it is no longer needed, so hopefully this
10606
workspace will never overflow, though there is a test for its doing so.
10607
10608
On error, errorcode will be set non-zero, so we don't need to look at the
10609
result of the function. The initial options have been put into the cb block,
10610
but we still have to pass a separate options variable (the first argument)
10611
because the options may change as the pattern is processed. */
10612
10613
cb.erroroffset = patlen; /* For any subsequent errors that do not set it */
10614
pptr = cb.parsed_pattern;
10615
code = cworkspace;
10616
*code = OP_BRA;
10617
10618
(void)compile_regex(cb.external_options, xoptions, &code, &pptr,
10619
&errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10620
&cb, &length);
10621
10622
if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */
10623
10624
/* This should be caught in compile_regex(), but just in case... */
10625
10626
#if defined SUPPORT_WIDE_CHARS
10627
PCRE2_ASSERT((cb.char_lists_size & 0x3) == 0);
10628
if (length > MAX_PATTERN_SIZE ||
10629
MAX_PATTERN_SIZE - length < (cb.char_lists_size / sizeof(PCRE2_UCHAR)))
10630
#else
10631
if (length > MAX_PATTERN_SIZE)
10632
#endif
10633
{
10634
errorcode = ERR20;
10635
goto HAD_CB_ERROR;
10636
}
10637
10638
/* Compute the size of, then, if not too large, get and initialize the data
10639
block for storing the compiled pattern and names table. Integer overflow should
10640
no longer be possible because nowadays we limit the maximum value of
10641
cb.names_found and cb.name_entry_size. */
10642
10643
re_blocksize =
10644
CU2BYTES((PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10645
10646
#if defined SUPPORT_WIDE_CHARS
10647
if (cb.char_lists_size != 0)
10648
{
10649
#if PCRE2_CODE_UNIT_WIDTH != 32
10650
/* Align to 32 bit first. This ensures the
10651
allocated area will also be 32 bit aligned. */
10652
re_blocksize = (PCRE2_SIZE)CLIST_ALIGN_TO(re_blocksize, sizeof(uint32_t));
10653
#endif
10654
re_blocksize += cb.char_lists_size;
10655
}
10656
#endif
10657
10658
re_blocksize += CU2BYTES(length);
10659
10660
if (re_blocksize > ccontext->max_pattern_compiled_length)
10661
{
10662
errorcode = ERR101;
10663
goto HAD_CB_ERROR;
10664
}
10665
10666
re_blocksize += sizeof(pcre2_real_code);
10667
re = (pcre2_real_code *)
10668
ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10669
if (re == NULL)
10670
{
10671
errorcode = ERR21;
10672
goto HAD_CB_ERROR;
10673
}
10674
10675
/* The compiler may put padding at the end of the pcre2_real_code structure in
10676
order to round it up to a multiple of 4 or 8 bytes. This means that when a
10677
compiled pattern is copied (for example, when serialized) undefined bytes are
10678
read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10679
write to the last 8 bytes of the structure before setting the fields. */
10680
10681
memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10682
re->memctl = ccontext->memctl;
10683
re->tables = tables;
10684
re->executable_jit = NULL;
10685
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10686
re->blocksize = re_blocksize;
10687
re->code_start = re_blocksize - CU2BYTES(length);
10688
re->magic_number = MAGIC_NUMBER;
10689
re->compile_options = options;
10690
re->overall_options = cb.external_options;
10691
re->extra_options = xoptions;
10692
re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10693
re->limit_heap = limit_heap;
10694
re->limit_match = limit_match;
10695
re->limit_depth = limit_depth;
10696
re->first_codeunit = 0;
10697
re->last_codeunit = 0;
10698
re->bsr_convention = bsr;
10699
re->newline_convention = newline;
10700
re->max_lookbehind = 0;
10701
re->minlength = 0;
10702
re->top_bracket = 0;
10703
re->top_backref = 0;
10704
re->name_entry_size = cb.name_entry_size;
10705
re->name_count = cb.names_found;
10706
re->optimization_flags = optim_flags;
10707
10708
/* The basic block is immediately followed by the name table, and the compiled
10709
code follows after that. */
10710
10711
codestart = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
10712
10713
/* Update the compile data block for the actual compile. The starting points of
10714
the name/number translation table and of the code are passed around in the
10715
compile data block. The start/end pattern and initial options are already set
10716
from the pre-compile phase, as is the name_entry_size field. */
10717
10718
cb.parens_depth = 0;
10719
cb.assert_depth = 0;
10720
cb.lastcapture = 0;
10721
cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10722
cb.start_code = codestart;
10723
cb.req_varyopt = 0;
10724
cb.had_accept = FALSE;
10725
cb.had_pruneorskip = FALSE;
10726
#ifdef SUPPORT_WIDE_CHARS
10727
cb.char_lists_size = 0;
10728
#endif
10729
10730
10731
/* If any named groups were found, create the name/number table from the list
10732
created in the pre-pass. */
10733
10734
if (cb.names_found > 0)
10735
{
10736
named_group *ng = cb.named_groups;
10737
for (i = 0; i < cb.names_found; i++, ng++)
10738
add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10739
}
10740
10741
/* Set up a starting, non-extracting bracket, then compile the expression. On
10742
error, errorcode will be set non-zero, so we don't need to look at the result
10743
of the function here. */
10744
10745
pptr = cb.parsed_pattern;
10746
code = (PCRE2_UCHAR *)codestart;
10747
*code = OP_BRA;
10748
regexrc = compile_regex(re->overall_options, re->extra_options, &code,
10749
&pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10750
NULL, &cb, NULL);
10751
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10752
re->top_bracket = cb.bracount;
10753
re->top_backref = cb.top_backref;
10754
re->max_lookbehind = cb.max_lookbehind;
10755
10756
if (cb.had_accept)
10757
{
10758
reqcu = 0; /* Must disable after (*ACCEPT) */
10759
reqcuflags = REQ_NONE;
10760
re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */
10761
}
10762
10763
/* Fill in the final opcode and check for disastrous overflow. If no overflow,
10764
but the estimated length exceeds the really used length, adjust the value of
10765
re->blocksize, and if valgrind support is configured, mark the extra allocated
10766
memory as unaddressable, so that any out-of-bound reads can be detected. */
10767
10768
*code++ = OP_END;
10769
usedlength = code - codestart;
10770
if (usedlength > length)
10771
{
10772
PCRE2_DEBUG_UNREACHABLE();
10773
errorcode = ERR23; /* Overflow of code block - internal error */
10774
}
10775
else
10776
{
10777
re->blocksize -= CU2BYTES(length - usedlength);
10778
#ifdef SUPPORT_VALGRIND
10779
VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10780
#endif
10781
}
10782
10783
/* Scan the pattern for recursion/subroutine calls and convert the group
10784
numbers into offsets. Maintain a small cache so that repeated groups containing
10785
recursions are efficiently handled. */
10786
10787
#define RSCAN_CACHE_SIZE 8
10788
10789
if (errorcode == 0 && cb.had_recurse)
10790
{
10791
PCRE2_UCHAR *rcode;
10792
PCRE2_SPTR rgroup;
10793
unsigned int ccount = 0;
10794
int start = RSCAN_CACHE_SIZE;
10795
recurse_cache rc[RSCAN_CACHE_SIZE];
10796
10797
for (rcode = find_recurse(codestart, utf);
10798
rcode != NULL;
10799
rcode = find_recurse(rcode + 1 + LINK_SIZE, utf))
10800
{
10801
int p, groupnumber;
10802
10803
groupnumber = (int)GET(rcode, 1);
10804
if (groupnumber == 0) rgroup = codestart; else
10805
{
10806
PCRE2_SPTR search_from = codestart;
10807
rgroup = NULL;
10808
for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10809
{
10810
if (groupnumber == rc[p].groupnumber)
10811
{
10812
rgroup = rc[p].group;
10813
break;
10814
}
10815
10816
/* Group n+1 must always start to the right of group n, so we can save
10817
search time below when the new group number is greater than any of the
10818
previously found groups. */
10819
10820
if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10821
}
10822
10823
if (rgroup == NULL)
10824
{
10825
rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10826
if (rgroup == NULL)
10827
{
10828
PCRE2_DEBUG_UNREACHABLE();
10829
errorcode = ERR53;
10830
break;
10831
}
10832
if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10833
rc[start].groupnumber = groupnumber;
10834
rc[start].group = rgroup;
10835
if (ccount < RSCAN_CACHE_SIZE) ccount++;
10836
}
10837
}
10838
10839
PUT(rcode, 1, (uint32_t)(rgroup - codestart));
10840
}
10841
}
10842
10843
/* In rare debugging situations we sometimes need to look at the compiled code
10844
at this stage. */
10845
10846
#ifdef DEBUG_CALL_PRINTINT
10847
pcre2_printint(re, stderr, TRUE);
10848
fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10849
#endif
10850
10851
/* Unless disabled, check whether any single character iterators can be
10852
auto-possessified. The function overwrites the appropriate opcode values, so
10853
the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10854
used in this code because at least one compiler gives a warning about loss of
10855
"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10856
function call. */
10857
10858
if (errorcode == 0 && (optim_flags & PCRE2_OPTIM_AUTO_POSSESS) != 0)
10859
{
10860
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10861
if (PRIV(auto_possessify)(temp, &cb) != 0)
10862
{
10863
PCRE2_DEBUG_UNREACHABLE();
10864
errorcode = ERR80;
10865
}
10866
}
10867
10868
/* Failed to compile, or error while post-processing. */
10869
10870
if (errorcode != 0) goto HAD_CB_ERROR;
10871
10872
/* Successful compile. If the anchored option was not passed, set it if
10873
we can determine that the pattern is anchored by virtue of ^ characters or \A
10874
or anything else, such as starting with non-atomic .* when DOTALL is set and
10875
there are no occurrences of *PRUNE or *SKIP (though there is an option to
10876
disable this case). */
10877
10878
if ((re->overall_options & PCRE2_ANCHORED) == 0)
10879
{
10880
BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
10881
if (is_anchored(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
10882
re->overall_options |= PCRE2_ANCHORED;
10883
}
10884
10885
/* Set up the first code unit or startline flag, the required code unit, and
10886
then study the pattern. This code need not be obeyed if PCRE2_OPTIM_START_OPTIMIZE
10887
is disabled, as the data it would create will not be used. Note that a first code
10888
unit (but not the startline flag) is useful for anchored patterns because it
10889
can still give a quick "no match" and also avoid searching for a last code
10890
unit. */
10891
10892
if ((optim_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)
10893
{
10894
int minminlength = 0; /* For minimal minlength from first/required CU */
10895
10896
/* If we do not have a first code unit, see if there is one that is asserted
10897
(these are not saved during the compile because they can cause conflicts with
10898
actual literals that follow). */
10899
10900
if (firstcuflags >= REQ_NONE) {
10901
uint32_t assertedcuflags = 0;
10902
uint32_t assertedcu = find_firstassertedcu(codestart, &assertedcuflags, 0);
10903
/* It would be wrong to use the asserted first code unit as `firstcu` for
10904
* regexes which are able to match a 1-character string (e.g. /(?=a)b?a/)
10905
* For that example, if we set both firstcu and reqcu to 'a', it would mean
10906
* the subject string needs to be at least 2 characters long, which is wrong.
10907
* With more analysis, we would be able to set firstcu in more cases. */
10908
if (assertedcuflags < REQ_NONE && assertedcu != reqcu) {
10909
firstcu = assertedcu;
10910
firstcuflags = assertedcuflags;
10911
}
10912
}
10913
10914
/* Save the data for a first code unit. The existence of one means the
10915
minimum length must be at least 1. */
10916
10917
if (firstcuflags < REQ_NONE)
10918
{
10919
re->first_codeunit = firstcu;
10920
re->flags |= PCRE2_FIRSTSET;
10921
minminlength++;
10922
10923
/* Handle caseless first code units. */
10924
10925
if ((firstcuflags & REQ_CASELESS) != 0)
10926
{
10927
if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10928
{
10929
if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10930
}
10931
10932
/* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
10933
In 8-bit UTF mode, code units in the range 128-255 are introductory code
10934
units and cannot have another case, but if UCP is set they may do. */
10935
10936
#ifdef SUPPORT_UNICODE
10937
#if PCRE2_CODE_UNIT_WIDTH == 8
10938
else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10939
re->flags |= PCRE2_FIRSTCASELESS;
10940
#else
10941
else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10942
UCD_OTHERCASE(firstcu) != firstcu)
10943
re->flags |= PCRE2_FIRSTCASELESS;
10944
#endif
10945
#endif /* SUPPORT_UNICODE */
10946
}
10947
}
10948
10949
/* When there is no first code unit, for non-anchored patterns, see if we can
10950
set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10951
branches start with ^ and also when all branches start with non-atomic .* for
10952
non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
10953
that disables this case.) */
10954
10955
else if ((re->overall_options & PCRE2_ANCHORED) == 0)
10956
{
10957
BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
10958
if (is_startline(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
10959
re->flags |= PCRE2_STARTLINE;
10960
}
10961
10962
/* Handle the "required code unit", if one is set. In the UTF case we can
10963
increment the minimum minimum length only if we are sure this really is a
10964
different character and not a non-starting code unit of the first character,
10965
because the minimum length count is in characters, not code units. */
10966
10967
if (reqcuflags < REQ_NONE)
10968
{
10969
#if PCRE2_CODE_UNIT_WIDTH == 16
10970
if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
10971
firstcuflags >= REQ_NONE || /* First not set */
10972
(firstcu & 0xf800) != 0xd800 || /* First not surrogate */
10973
(reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */
10974
#elif PCRE2_CODE_UNIT_WIDTH == 8
10975
if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
10976
firstcuflags >= REQ_NONE || /* First not set */
10977
(firstcu & 0x80) == 0 || /* First is ASCII */
10978
(reqcu & 0x80) == 0) /* Req is ASCII */
10979
#endif
10980
{
10981
minminlength++;
10982
}
10983
10984
/* In the case of an anchored pattern, set up the value only if it follows
10985
a variable length item in the pattern. */
10986
10987
if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10988
(reqcuflags & REQ_VARY) != 0)
10989
{
10990
re->last_codeunit = reqcu;
10991
re->flags |= PCRE2_LASTSET;
10992
10993
/* Handle caseless required code units as for first code units (above). */
10994
10995
if ((reqcuflags & REQ_CASELESS) != 0)
10996
{
10997
if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10998
{
10999
if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
11000
}
11001
#ifdef SUPPORT_UNICODE
11002
#if PCRE2_CODE_UNIT_WIDTH == 8
11003
else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
11004
re->flags |= PCRE2_LASTCASELESS;
11005
#else
11006
else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
11007
UCD_OTHERCASE(reqcu) != reqcu)
11008
re->flags |= PCRE2_LASTCASELESS;
11009
#endif
11010
#endif /* SUPPORT_UNICODE */
11011
}
11012
}
11013
}
11014
11015
/* Study the compiled pattern to set up information such as a bitmap of
11016
starting code units and a minimum matching length. */
11017
11018
if (PRIV(study)(re) != 0)
11019
{
11020
PCRE2_DEBUG_UNREACHABLE();
11021
errorcode = ERR31;
11022
goto HAD_CB_ERROR;
11023
}
11024
11025
/* If study() set a bitmap of starting code units, it implies a minimum
11026
length of at least one. */
11027
11028
if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
11029
minminlength = 1;
11030
11031
/* If the minimum length set (or not set) by study() is less than the minimum
11032
implied by required code units, override it. */
11033
11034
if (re->minlength < minminlength) re->minlength = minminlength;
11035
} /* End of start-of-match optimizations. */
11036
11037
/* Control ends up here in all cases. When running under valgrind, make a
11038
pattern's terminating zero defined again. If memory was obtained for the parsed
11039
version of the pattern, free it before returning. Also free the list of named
11040
groups if a larger one had to be obtained, and likewise the group information
11041
vector. */
11042
11043
#ifdef SUPPORT_UNICODE
11044
PCRE2_ASSERT(cb.cranges == NULL);
11045
#endif
11046
11047
EXIT:
11048
#ifdef SUPPORT_VALGRIND
11049
if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
11050
#endif
11051
if (cb.parsed_pattern != stack_parsed_pattern)
11052
ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
11053
if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
11054
ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
11055
if (cb.groupinfo != stack_groupinfo)
11056
ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
11057
11058
return re; /* Will be NULL after an error */
11059
11060
/* Errors discovered in parse_regex() set the offset value in the compile
11061
block. Errors discovered before it is called must compute it from the ptr
11062
value. After parse_regex() is called, the offset in the compile block is set to
11063
the end of the pattern, but certain errors in compile_regex() may reset it if
11064
an offset is available in the parsed pattern. */
11065
11066
HAD_CB_ERROR:
11067
ptr = pattern + cb.erroroffset;
11068
11069
HAD_EARLY_ERROR:
11070
PCRE2_ASSERT(ptr >= pattern); /* Ensure we don't return invalid erroroffset */
11071
PCRE2_ASSERT(ptr <= (pattern + patlen));
11072
*erroroffset = ptr - pattern;
11073
11074
HAD_ERROR:
11075
*errorptr = errorcode;
11076
pcre2_code_free(re);
11077
re = NULL;
11078
11079
#ifdef SUPPORT_WIDE_CHARS
11080
if (cb.cranges != NULL)
11081
{
11082
class_ranges* cranges = cb.cranges;
11083
do
11084
{
11085
class_ranges* next_cranges = cranges->next;
11086
cb.cx->memctl.free(cranges, cb.cx->memctl.memory_data);
11087
cranges = next_cranges;
11088
}
11089
while (cranges != NULL);
11090
}
11091
#endif
11092
goto EXIT;
11093
}
11094
11095
/* These #undefs are here to enable unity builds with CMake: the macros are
removed at the end of this file so that they are no longer defined for any
source that follows it in the same translation unit. NOTE(review): presumably
all three macros are defined earlier in this file - confirm. */
11096
11097
#undef NLBLOCK /* Block containing newline information */
11098
#undef PSSTART /* Field containing processed string start */
11099
#undef PSEND /* Field containing processed string end */
11100
11101
/* End of pcre2_compile.c */
11102
11103