Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_compile.c
21658 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#include "pcre2_compile.h"
43
44
45
46
#define NLBLOCK  cb             /* Block that holds the newline information */
#define PSSTART  start_pattern  /* Field holding the processed string start */
#define PSEND    end_pattern    /* Field holding the processed string end */
49
50
/* In rare error cases debugging might require calling pcre2_printint(). */
51
52
/* Disabled by default. Changing "#if 0" to "#if 1" includes the
pcre2_printint debugging code and defines DEBUG_CALL_PRINTINT. The helper
macros defined here are presumably consumed only by the included file, so all
of them (including CHAR_INPUT_HEX) are undefined again straight after it. */

#if 0
#ifdef EBCDIC
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
#else
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
#endif
#define CHAR_OUTPUT(c)      (c)
#define CHAR_OUTPUT_HEX(c)  (c)
#define CHAR_INPUT(c)       (c)
#define CHAR_INPUT_HEX(c)   (c)
#include "pcre2_printint_inc.h"
#undef PRINTABLE
#undef CHAR_OUTPUT
#undef CHAR_OUTPUT_HEX
#undef CHAR_INPUT
#undef CHAR_INPUT_HEX
#define DEBUG_CALL_PRINTINT
#endif
69
70
/* Other debugging code can be enabled by these defines. */
71
72
/* #define DEBUG_SHOW_CAPTURES */
73
/* #define DEBUG_SHOW_PARSED */
74
75
/* There are a few things that vary with different code unit sizes. Handle them
76
by defining macros in order to minimize #if usage. */
77
78
/* Hex digit lookup. With 8-bit code units every value indexes the table
directly; wider code units must be range-checked first. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define XDIGIT(c) xdigitab[c]
#else
#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
#endif

/* The width-specific "UTFn)" string, followed by its length. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
#else  /* 32-bit */
#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
#endif
92
93
/* Function declarations to allow mutual recursion */
94
95
static int
96
compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
97
uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
98
open_capitem *, compile_block *, PCRE2_SIZE *);
99
100
static int
101
get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
102
compile_block *);
103
104
static BOOL
105
set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
106
compile_block *);
107
108
static int
109
check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
110
compile_block *, int *);
111
112
113
/*************************************************
114
* Code parameters and static tables *
115
*************************************************/
116
117
#define MAX_GROUP_NUMBER   65535u
#define MAX_REPEAT_COUNT   65535u

/* One greater than MAX_REPEAT_COUNT. */
#define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
120
121
/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
122
different ways in the different pattern scans. The parsing and group-
123
identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
124
aligned for this. Having defined the size in code units, we set up
125
C16_WORK_SIZE as the number of elements in the 16-bit vector.
126
127
During the first compiling phase, when determining how much memory is required,
128
the regex is partly compiled into this space, but the compiled parts are
129
discarded as soon as they can be, so that hopefully there will never be an
130
overrun. The code does, however, check for an overrun, which can occur for
131
pathological patterns. The size of the workspace depends on LINK_SIZE because
132
the length of compiled items varies with this.
133
134
In the real compile phase, this workspace is not currently used. */
135
136
/* Workspace size, measured in code units; it scales with LINK_SIZE. */

#define COMPILE_WORK_SIZE   (3000*LINK_SIZE)

/* The same workspace, measured in uint16_t elements. */

#define C16_WORK_SIZE \
  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
140
141
/* A uint32_t vector is used for caching information about the size of
142
capturing groups, to improve performance. A default is created on the stack of
143
this size. */
144
145
#define GROUPINFO_DEFAULT_SIZE        256

/* Overrun tests compare against a size reduced by this margin, so that an
overrun is noticed before the end of the data block is actually passed. */

#define WORK_SIZE_SAFETY_MARGIN       (100)

/* Number of slots in the initial list used for remembering named groups during
the pre-compile. The list starts on the stack and, like the workspace, is
expanded if it turns out to be too small. */

#define NAMED_GROUP_LIST_SIZE         20

/* The pre-compiling pass turns the pattern into a parsed pattern held in a
vector of uint32_t. Short patterns use a stack vector of this size; longer
ones use heap memory. */

#define PARSED_PATTERN_DEFAULT_SIZE   1024

/* Ceiling used when checking that the variable holding the compiled pattern
length does not overflow. It sits a little below INT_MAX so that group
terminating code units can be added without a check every time. */

#define OFLOW_MAX                     (INT_MAX - 20)
171
172
/* Table of extra lengths for each of the meta codes. Must be kept in step with
173
the definitions above. For some items these values are a basic length to which
174
a variable amount has to be added. */
175
176
static unsigned char meta_extra_lengths[] = {
177
0, /* META_END */
178
0, /* META_ALT */
179
0, /* META_ATOMIC */
180
0, /* META_BACKREF - more if group is >= 10 */
181
1+SIZEOFFSET, /* META_BACKREF_BYNAME */
182
1, /* META_BIGVALUE */
183
3, /* META_CALLOUT_NUMBER */
184
3+SIZEOFFSET, /* META_CALLOUT_STRING */
185
0, /* META_CAPTURE */
186
0, /* META_CIRCUMFLEX */
187
0, /* META_CLASS */
188
0, /* META_CLASS_EMPTY */
189
0, /* META_CLASS_EMPTY_NOT */
190
0, /* META_CLASS_END */
191
0, /* META_CLASS_NOT */
192
0, /* META_COND_ASSERT */
193
SIZEOFFSET, /* META_COND_DEFINE */
194
1+SIZEOFFSET, /* META_COND_NAME */
195
1+SIZEOFFSET, /* META_COND_NUMBER */
196
1+SIZEOFFSET, /* META_COND_RNAME */
197
1+SIZEOFFSET, /* META_COND_RNUMBER */
198
3, /* META_COND_VERSION */
199
SIZEOFFSET, /* META_OFFSET */
200
0, /* META_SCS */
201
1, /* META_CAPTURE_NAME */
202
1, /* META_CAPTURE_NUMBER */
203
0, /* META_DOLLAR */
204
0, /* META_DOT */
205
0, /* META_ESCAPE - one more for ESC_P and ESC_p */
206
0, /* META_KET */
207
0, /* META_NOCAPTURE */
208
2, /* META_OPTIONS */
209
1, /* META_POSIX */
210
1, /* META_POSIX_NEG */
211
0, /* META_RANGE_ESCAPED */
212
0, /* META_RANGE_LITERAL */
213
SIZEOFFSET, /* META_RECURSE */
214
1+SIZEOFFSET, /* META_RECURSE_BYNAME */
215
0, /* META_SCRIPT_RUN */
216
0, /* META_LOOKAHEAD */
217
0, /* META_LOOKAHEADNOT */
218
SIZEOFFSET, /* META_LOOKBEHIND */
219
SIZEOFFSET, /* META_LOOKBEHINDNOT */
220
0, /* META_LOOKAHEAD_NA */
221
SIZEOFFSET, /* META_LOOKBEHIND_NA */
222
1, /* META_MARK - plus the string length */
223
0, /* META_ACCEPT */
224
0, /* META_FAIL */
225
0, /* META_COMMIT */
226
1, /* META_COMMIT_ARG - plus the string length */
227
0, /* META_PRUNE */
228
1, /* META_PRUNE_ARG - plus the string length */
229
0, /* META_SKIP */
230
1, /* META_SKIP_ARG - plus the string length */
231
0, /* META_THEN */
232
1, /* META_THEN_ARG - plus the string length */
233
0, /* META_ASTERISK */
234
0, /* META_ASTERISK_PLUS */
235
0, /* META_ASTERISK_QUERY */
236
0, /* META_PLUS */
237
0, /* META_PLUS_PLUS */
238
0, /* META_PLUS_QUERY */
239
0, /* META_QUERY */
240
0, /* META_QUERY_PLUS */
241
0, /* META_QUERY_QUERY */
242
2, /* META_MINMAX */
243
2, /* META_MINMAX_PLUS */
244
2, /* META_MINMAX_QUERY */
245
0, /* META_ECLASS_AND */
246
0, /* META_ECLASS_OR */
247
0, /* META_ECLASS_SUB */
248
0, /* META_ECLASS_XOR */
249
0 /* META_ECLASS_NOT */
250
};
251
252
/* Types for skipping parts of a parsed pattern. */
253
254
enum {
  PSKIP_ALT,
  PSKIP_CLASS,
  PSKIP_KET
};
255
256
/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
257
variables, which are concerned with first and required code units. A value
258
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
259
matching xxcu variable is set, and the low valued bits are relevant. */
260
261
/* Sentinels: anything >= REQ_NONE means "no code unit set". */
#define REQ_UNSET     0xffffffffu  /* Not yet found anything */
#define REQ_NONE      0xfffffffeu  /* Found not fixed character */

/* Flag bits that qualify a code unit that has been set. */
#define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
#define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */
265
266
/* These flags are used in the groupinfo vector. */
267
268
#define GI_SET_FIXED_LENGTH   0x80000000u  /* Top bit */
#define GI_NOT_FIXED_LENGTH   0x40000000u  /* Next bit down */
#define GI_FIXED_LENGTH_MASK  0x0000ffffu  /* Low 16 bits */
271
272
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
273
and is fast (a good compiler can turn it into a subtraction and unsigned
274
comparison). */
275
276
/* Note: the argument is evaluated twice when it is not below CHAR_0. */
#define IS_DIGIT(x) (!((x) < CHAR_0) && !((x) > CHAR_9))
277
278
/* Table to identify hex digits. The tables in chartables are dependent on the
279
locale, and may mark arbitrary characters as digits. We want to recognize only
280
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
281
costs 256 bytes, but it is a lot faster than doing character value tests (at
282
least in some simple cases I timed), and in some applications one wants PCRE2
283
to compile efficiently as well as match efficiently. The value in the table is
284
the binary hex digit value, or 0xff for non-hex digits. */
285
286
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
287
UTF-8 mode. */
288
289
#ifndef EBCDIC
290
/* 256 entries, sixteen per row; the row label is the code of its first entry.
Only '0'-'9', 'A'-'F' and 'a'-'f' carry a digit value; all others are 0xff. */

static const uint8_t xdigitab[] =
  {
  /* 0x00 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30 */ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff,  /* 0 - ? */
  /* 0x40 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  /* @ - O */
  /* 0x50 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  /* ` - o */
  /* 0x70 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  };
324
325
#else
326
327
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
328
329
/* 256 entries, sixteen per row; the row label is the code of its first entry.
Digit values sit at 0x81-0x86 (a-f), 0xc1-0xc6 (A-F) and 0xf0-0xf9 (0-9); all
other entries are 0xff. */

static const uint8_t xdigitab[] =
  {
  /* 0x00 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x40 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  /* a - f */
  /* 0x90 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  /* A - F */
  /* 0xd0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0 */ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff   /* 0 - 9 */
  };
363
#endif /* EBCDIC */
364
365
366
/* Table for handling alphanumeric escaped characters. Positive returns are
367
simple data values; negative values are for special things like \d and so on.
368
Zero means further processing is needed (for things like \x), or the escape is
369
invalid. */
370
371
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
372
in UTF-8 mode. It runs from '0' to 'z'. */
373
374
#ifndef EBCDIC
375
/* First and last characters covered by the escapes[] table. */
#define ESCAPES_FIRST       CHAR_0
#define ESCAPES_LAST        CHAR_z

/* Lower case to upper case: subtract 32. The argument is parenthesized so
that an expression argument (for example a conditional) converts as a whole. */
#define UPPER_CASE(c)       ((c)-32)
378
379
static const short int escapes[] = {
380
/* 0 */ 0, /* 1 */ 0,
381
/* 2 */ 0, /* 3 */ 0,
382
/* 4 */ 0, /* 5 */ 0,
383
/* 6 */ 0, /* 7 */ 0,
384
/* 8 */ 0, /* 9 */ 0,
385
/* : */ ESCAPES_FIRST+0x0a, /* ; */ ESCAPES_FIRST+0x0b,
386
/* < */ ESCAPES_FIRST+0x0c, /* = */ ESCAPES_FIRST+0x0d,
387
/* > */ ESCAPES_FIRST+0x0e, /* ? */ ESCAPES_FIRST+0x0f,
388
/* @ */ ESCAPES_FIRST+0x10, /* A */ -ESC_A,
389
/* B */ -ESC_B, /* C */ -ESC_C,
390
/* D */ -ESC_D, /* E */ -ESC_E,
391
/* F */ 0, /* G */ -ESC_G,
392
/* H */ -ESC_H, /* I */ 0,
393
/* J */ 0, /* K */ -ESC_K,
394
/* L */ 0, /* M */ 0,
395
/* N */ -ESC_N, /* O */ 0,
396
/* P */ -ESC_P, /* Q */ -ESC_Q,
397
/* R */ -ESC_R, /* S */ -ESC_S,
398
/* T */ 0, /* U */ 0,
399
/* V */ -ESC_V, /* W */ -ESC_W,
400
/* X */ -ESC_X, /* Y */ 0,
401
/* Z */ -ESC_Z, /* [ */ ESCAPES_FIRST+0x2b,
402
/* \ */ ESCAPES_FIRST+0x2c, /* ] */ ESCAPES_FIRST+0x2d,
403
/* ^ */ ESCAPES_FIRST+0x2e, /* _ */ ESCAPES_FIRST+0x2f,
404
/* ` */ ESCAPES_FIRST+0x30, /* a */ CHAR_BEL,
405
/* b */ -ESC_b, /* c */ 0,
406
/* d */ -ESC_d, /* e */ CHAR_ESC,
407
/* f */ CHAR_FF, /* g */ 0,
408
/* h */ -ESC_h, /* i */ 0,
409
/* j */ 0, /* k */ -ESC_k,
410
/* l */ 0, /* m */ 0,
411
/* n */ CHAR_LF, /* o */ 0,
412
/* p */ -ESC_p, /* q */ 0,
413
/* r */ CHAR_CR, /* s */ -ESC_s,
414
/* t */ CHAR_HT, /* u */ 0,
415
/* v */ -ESC_v, /* w */ -ESC_w,
416
/* x */ 0, /* y */ 0,
417
/* z */ -ESC_z
418
};
419
420
#else
421
422
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
423
It runs from 'a' to '9'. Our EBCDIC support can be provided via the compiler,
424
which can interpret character literals like 'a' or '[' in an EBCDIC codepage;
425
in this case, there is wide variance between codepages on the interpretation of
426
characters between the letters ('[' and '{' and so on are placed in all sorts of
427
different positions in the table). Thankfully however, all EBCDIC codepages
428
place the letters and digits in the same location, so we hardcode that here.
429
Our EBCDIC support can also be provided via numeric literals instead of
430
character literals, so either way, 'CHAR_a' will be 0x81 when PCRE2 is compiled
431
in EBCDIC mode. */
432
433
/* First and last characters covered by the escapes[] table. */
#define ESCAPES_FIRST       CHAR_a
#define ESCAPES_LAST        CHAR_9

/* Lower case to upper case: add 64. The argument is parenthesized so that an
expression argument (for example a conditional) converts as a whole. */
#define UPPER_CASE(c)       ((c)+64)
436
437
static const short int escapes[] = {
438
/* 0x81 a */ CHAR_BEL, /* 0x82 b */ -ESC_b,
439
/* 0x83 c */ 0, /* 0x84 d */ -ESC_d,
440
/* 0x85 e */ CHAR_ESC, /* 0x86 f */ CHAR_FF,
441
/* 0x87 g */ 0, /* 0x88 h */ -ESC_h,
442
/* 0x89 i */ 0, /* 0x8a */ ESCAPES_FIRST+0x09,
443
/* 0x8b */ ESCAPES_FIRST+0x0a, /* 0x8c */ ESCAPES_FIRST+0x0b,
444
/* 0x8d */ ESCAPES_FIRST+0x0c, /* 0x8e */ ESCAPES_FIRST+0x0d,
445
/* 0x8f */ ESCAPES_FIRST+0x0e, /* 0x90 */ ESCAPES_FIRST+0x0f,
446
/* 0x91 j */ 0, /* 0x92 k */ -ESC_k,
447
/* 0x93 l */ 0, /* 0x94 m */ 0,
448
/* 0x95 n */ CHAR_LF, /* 0x96 o */ 0,
449
/* 0x97 p */ -ESC_p, /* 0x98 q */ 0,
450
/* 0x99 r */ CHAR_CR, /* 0x9a */ ESCAPES_FIRST+0x19,
451
/* 0x9b */ ESCAPES_FIRST+0x1a, /* 0x9c */ ESCAPES_FIRST+0x1b,
452
/* 0x9d */ ESCAPES_FIRST+0x1c, /* 0x9e */ ESCAPES_FIRST+0x1d,
453
/* 0x9f */ ESCAPES_FIRST+0x1e, /* 0xa0 */ ESCAPES_FIRST+0x1f,
454
/* 0xa1 */ ESCAPES_FIRST+0x20, /* 0xa2 s */ -ESC_s,
455
/* 0xa3 t */ CHAR_HT, /* 0xa4 u */ 0,
456
/* 0xa5 v */ -ESC_v, /* 0xa6 w */ -ESC_w,
457
/* 0xa7 x */ 0, /* 0xa8 y */ 0,
458
/* 0xa9 z */ -ESC_z, /* 0xaa */ ESCAPES_FIRST+0x29,
459
/* 0xab */ ESCAPES_FIRST+0x2a, /* 0xac */ ESCAPES_FIRST+0x2b,
460
/* 0xad */ ESCAPES_FIRST+0x2c, /* 0xae */ ESCAPES_FIRST+0x2d,
461
/* 0xaf */ ESCAPES_FIRST+0x2e, /* 0xb0 */ ESCAPES_FIRST+0x2f,
462
/* 0xb1 */ ESCAPES_FIRST+0x30, /* 0xb2 */ ESCAPES_FIRST+0x31,
463
/* 0xb3 */ ESCAPES_FIRST+0x32, /* 0xb4 */ ESCAPES_FIRST+0x33,
464
/* 0xb5 */ ESCAPES_FIRST+0x34, /* 0xb6 */ ESCAPES_FIRST+0x35,
465
/* 0xb7 */ ESCAPES_FIRST+0x36, /* 0xb8 */ ESCAPES_FIRST+0x37,
466
/* 0xb9 */ ESCAPES_FIRST+0x38, /* 0xba */ ESCAPES_FIRST+0x39,
467
/* 0xbb */ ESCAPES_FIRST+0x3a, /* 0xbc */ ESCAPES_FIRST+0x3b,
468
/* 0xbd */ ESCAPES_FIRST+0x3c, /* 0xbe */ ESCAPES_FIRST+0x3d,
469
/* 0xbf */ ESCAPES_FIRST+0x3e, /* 0xc0 */ ESCAPES_FIRST+0x3f,
470
/* 0xc1 A */ -ESC_A, /* 0xc2 B */ -ESC_B,
471
/* 0xc3 C */ -ESC_C, /* 0xc4 D */ -ESC_D,
472
/* 0xc5 E */ -ESC_E, /* 0xc6 F */ 0,
473
/* 0xc7 G */ -ESC_G, /* 0xc8 H */ -ESC_H,
474
/* 0xc9 I */ 0, /* 0xca */ ESCAPES_FIRST+0x49,
475
/* 0xcb */ ESCAPES_FIRST+0x4a, /* 0xcc */ ESCAPES_FIRST+0x4b,
476
/* 0xcd */ ESCAPES_FIRST+0x4c, /* 0xce */ ESCAPES_FIRST+0x4d,
477
/* 0xcf */ ESCAPES_FIRST+0x4e, /* 0xd0 */ ESCAPES_FIRST+0x4f,
478
/* 0xd1 J */ 0, /* 0xd2 K */ -ESC_K,
479
/* 0xd3 L */ 0, /* 0xd4 M */ 0,
480
/* 0xd5 N */ -ESC_N, /* 0xd6 O */ 0,
481
/* 0xd7 P */ -ESC_P, /* 0xd8 Q */ -ESC_Q,
482
/* 0xd9 R */ -ESC_R, /* 0xda */ ESCAPES_FIRST+0x59,
483
/* 0xdb */ ESCAPES_FIRST+0x5a, /* 0xdc */ ESCAPES_FIRST+0x5b,
484
/* 0xdd */ ESCAPES_FIRST+0x5c, /* 0xde */ ESCAPES_FIRST+0x5d,
485
/* 0xdf */ ESCAPES_FIRST+0x5e, /* 0xe0 */ ESCAPES_FIRST+0x5f,
486
/* 0xe1 */ ESCAPES_FIRST+0x60, /* 0xe2 S */ -ESC_S,
487
/* 0xe3 T */ 0, /* 0xe4 U */ 0,
488
/* 0xe5 V */ -ESC_V, /* 0xe6 W */ -ESC_W,
489
/* 0xe7 X */ -ESC_X, /* 0xe8 Y */ 0,
490
/* 0xe9 Z */ -ESC_Z, /* 0xea */ ESCAPES_FIRST+0x69,
491
/* 0xeb */ ESCAPES_FIRST+0x6a, /* 0xec */ ESCAPES_FIRST+0x6b,
492
/* 0xed */ ESCAPES_FIRST+0x6c, /* 0xee */ ESCAPES_FIRST+0x6d,
493
/* 0xef */ ESCAPES_FIRST+0x6e, /* 0xf0 0 */ 0,
494
/* 0xf1 1 */ 0, /* 0xf2 2 */ 0,
495
/* 0xf3 3 */ 0, /* 0xf4 4 */ 0,
496
/* 0xf5 5 */ 0, /* 0xf6 6 */ 0,
497
/* 0xf7 7 */ 0, /* 0xf8 8 */ 0,
498
/* 0xf9 9 */ 0,
499
};
500
501
/* We also need a table of characters that may follow \c in an EBCDIC
502
environment for characters 0-31. */
503
504
static unsigned char ebcdic_escape_c[] = {
505
CHAR_COMMERCIAL_AT, CHAR_A, CHAR_B, CHAR_C, CHAR_D, CHAR_E, CHAR_F, CHAR_G,
506
CHAR_H, CHAR_I, CHAR_J, CHAR_K, CHAR_L, CHAR_M, CHAR_N, CHAR_O, CHAR_P,
507
CHAR_Q, CHAR_R, CHAR_S, CHAR_T, CHAR_U, CHAR_V, CHAR_W, CHAR_X, CHAR_Y,
508
CHAR_Z, CHAR_LEFT_SQUARE_BRACKET, CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
509
CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE
510
};
511
512
#endif /* EBCDIC */
513
514
515
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
516
searched linearly. Put all the names into a single string, in order to reduce
517
the number of relocations when a shared library is dynamically linked. The
518
string is built from string macros so that it works in UTF-8 mode on EBCDIC
519
platforms. */
520
521
/* Describes one verb: its name length, META code and argument rule. */

typedef struct verbitem {
  unsigned int len;   /* Number of characters in the verb name */
  uint32_t     meta;  /* Base META_ code for the verb */
  int          has_arg; /* Argument requirement (sign-coded; see verbs[]) */
} verbitem;
526
527
static const char verbnames[] =
528
"\0" /* Empty name is a shorthand for MARK */
529
STRING_MARK0
530
STRING_ACCEPT0
531
STRING_F0
532
STRING_FAIL0
533
STRING_COMMIT0
534
STRING_PRUNE0
535
STRING_SKIP0
536
STRING_THEN;
537
538
static const verbitem verbs[] = {
539
{ 0, META_MARK, +1 }, /* > 0 => must have an argument */
540
{ 4, META_MARK, +1 },
541
{ 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */
542
{ 1, META_FAIL, -1 },
543
{ 4, META_FAIL, -1 },
544
{ 6, META_COMMIT, 0 },
545
{ 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */
546
{ 4, META_SKIP, 0 },
547
{ 4, META_THEN, 0 }
548
};
549
550
static const int verbcount = sizeof(verbs)/sizeof(verbitem);
551
552
/* Verb opcodes, indexed by their META code offset from META_MARK. */
553
554
static const uint32_t verbops[] = {
555
OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
556
OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
557
558
/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
559
560
/* Describes one alpha assertion: its name length and META code. */

typedef struct alasitem {
  unsigned int len;   /* Number of characters in the name */
  uint32_t     meta;  /* Base META_ code for the assertion */
} alasitem;
564
565
static const char alasnames[] =
566
STRING_pla0
567
STRING_plb0
568
STRING_napla0
569
STRING_naplb0
570
STRING_nla0
571
STRING_nlb0
572
STRING_positive_lookahead0
573
STRING_positive_lookbehind0
574
STRING_non_atomic_positive_lookahead0
575
STRING_non_atomic_positive_lookbehind0
576
STRING_negative_lookahead0
577
STRING_negative_lookbehind0
578
STRING_scs0
579
STRING_scan_substring0
580
STRING_atomic0
581
STRING_sr0
582
STRING_asr0
583
STRING_script_run0
584
STRING_atomic_script_run;
585
586
static const alasitem alasmeta[] = {
587
{ 3, META_LOOKAHEAD },
588
{ 3, META_LOOKBEHIND },
589
{ 5, META_LOOKAHEAD_NA },
590
{ 5, META_LOOKBEHIND_NA },
591
{ 3, META_LOOKAHEADNOT },
592
{ 3, META_LOOKBEHINDNOT },
593
{ 18, META_LOOKAHEAD },
594
{ 19, META_LOOKBEHIND },
595
{ 29, META_LOOKAHEAD_NA },
596
{ 30, META_LOOKBEHIND_NA },
597
{ 18, META_LOOKAHEADNOT },
598
{ 19, META_LOOKBEHINDNOT },
599
{ 3, META_SCS },
600
{ 14, META_SCS },
601
{ 6, META_ATOMIC },
602
{ 2, META_SCRIPT_RUN }, /* sr = script run */
603
{ 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
604
{ 10, META_SCRIPT_RUN }, /* script run */
605
{ 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */
606
};
607
608
static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
609
610
/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
611
612
static uint32_t chartypeoffset[] = {
613
OP_STAR - OP_STAR, OP_STARI - OP_STAR,
614
OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
615
616
/* Tables of names of POSIX character classes and their lengths. The names are
617
now all in a single string, to reduce the number of relocations when a shared
618
library is dynamically loaded. The list of lengths is terminated by a zero
619
length entry. The first three must be alpha, lower, upper, as this is assumed
620
for handling case independence.
621
622
The indices for several classes are stored in pcre2_compile.h - these must
623
be kept in sync with posix_names, posix_name_lengths, posix_class_maps,
624
and posix_substitutes. */
625
626
static const char posix_names[] =
627
STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
628
STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
629
STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
630
STRING_word0 STRING_xdigit;
631
632
static const uint8_t posix_name_lengths[] = {
  5, 5, 5,   /* alpha, lower, upper */
  5, 5, 5,   /* alnum, ascii, blank */
  5, 5, 5,   /* cntrl, digit, graph */
  5, 5, 5,   /* print, punct, space */
  4, 6,      /* word, xdigit */
  0          /* terminator */
};
634
635
/* Table of class bit maps for each POSIX class. Each class is formed from a
636
base map, with an optional addition or removal of another map. Then, for some
637
classes, there is some additional tweaking: for [:blank:] the vertical space
638
characters are removed, and for [:alpha:] and [:alnum:] the underscore
639
character is removed. The triples in the table consist of the base map offset,
640
second map offset or -1 if no second map, and a non-negative value for map
641
addition or a negative value for map subtraction (if there are two maps). The
642
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
643
remove vertical space characters, 2 => remove underscore. */
644
645
const int PRIV(posix_class_maps)[] = {
646
cbit_word, cbit_digit, -2, /* alpha */
647
cbit_lower, -1, 0, /* lower */
648
cbit_upper, -1, 0, /* upper */
649
cbit_word, -1, 2, /* alnum - word without underscore */
650
cbit_print, cbit_cntrl, 0, /* ascii */
651
cbit_space, -1, 1, /* blank - a GNU extension */
652
cbit_cntrl, -1, 0, /* cntrl */
653
cbit_digit, -1, 0, /* digit */
654
cbit_graph, -1, 0, /* graph */
655
cbit_print, -1, 0, /* print */
656
cbit_punct, -1, 0, /* punct */
657
cbit_space, -1, 0, /* space */
658
cbit_word, -1, 0, /* word - a Perl extension */
659
cbit_xdigit, -1, 0 /* xdigit */
660
};
661
662
#ifdef SUPPORT_UNICODE
663
664
/* The POSIX class Unicode property substitutes that are used in UCP mode must
665
be in the order of the POSIX class names, defined above. */
666
667
static int posix_substitutes[] = {
668
PT_GC, ucp_L, /* alpha */
669
PT_PC, ucp_Ll, /* lower */
670
PT_PC, ucp_Lu, /* upper */
671
PT_ALNUM, 0, /* alnum */
672
-1, 0, /* ascii, treat as non-UCP */
673
-1, 1, /* blank, treat as \h */
674
PT_PC, ucp_Cc, /* cntrl */
675
PT_PC, ucp_Nd, /* digit */
676
PT_PXGRAPH, 0, /* graph */
677
PT_PXPRINT, 0, /* print */
678
PT_PXPUNCT, 0, /* punct */
679
PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */
680
PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */
681
PT_PXXDIGIT, 0 /* xdigit */ /* Perl has additional hex digits */
682
};
683
#endif /* SUPPORT_UNICODE */
684
685
/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
686
are allowed. */
687
688
/* Main options that remain valid together with PCRE2_LITERAL. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED          | PCRE2_AUTO_CALLOUT   | PCRE2_CASELESS         | \
   PCRE2_ENDANCHORED       | PCRE2_FIRSTLINE      | PCRE2_LITERAL          | \
   PCRE2_MATCH_INVALID_UTF | PCRE2_NO_START_OPTIMIZE                       | \
   PCRE2_NO_UTF_CHECK      | PCRE2_USE_OFFSET_LIMIT | PCRE2_UTF)

/* Every valid main option: the literal subset plus the rest. */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS | \
   PCRE2_ALLOW_EMPTY_CLASS | PCRE2_ALT_BSUX         | \
   PCRE2_ALT_CIRCUMFLEX    | PCRE2_ALT_VERBNAMES    | \
   PCRE2_DOLLAR_ENDONLY    | PCRE2_DOTALL           | \
   PCRE2_DUPNAMES          | PCRE2_EXTENDED         | \
   PCRE2_EXTENDED_MORE     | PCRE2_MATCH_UNSET_BACKREF | \
   PCRE2_MULTILINE         | PCRE2_NEVER_BACKSLASH_C | \
   PCRE2_NEVER_UCP         | PCRE2_NEVER_UTF        | \
   PCRE2_NO_AUTO_CAPTURE   | PCRE2_NO_AUTO_POSSESS  | \
   PCRE2_NO_DOTSTAR_ANCHOR | PCRE2_UCP              | \
   PCRE2_UNGREEDY          | PCRE2_ALT_EXTENDED_CLASS)

/* Extra options that remain valid together with PCRE2_LITERAL. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
  (PCRE2_EXTRA_MATCH_LINE        | PCRE2_EXTRA_MATCH_WORD | \
   PCRE2_EXTRA_CASELESS_RESTRICT | PCRE2_EXTRA_TURKISH_CASING)

/* Every valid extra option: the literal subset plus the rest. */

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS | \
   PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES | \
   PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL   | \
   PCRE2_EXTRA_ESCAPED_CR_IS_LF        | \
   PCRE2_EXTRA_ALT_BSUX                | \
   PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK    | \
   PCRE2_EXTRA_ASCII_BSD               | \
   PCRE2_EXTRA_ASCII_BSS               | \
   PCRE2_EXTRA_ASCII_BSW               | \
   PCRE2_EXTRA_ASCII_POSIX             | \
   PCRE2_EXTRA_ASCII_DIGIT             | \
   PCRE2_EXTRA_PYTHON_OCTAL            | \
   PCRE2_EXTRA_NO_BS0                  | \
   PCRE2_EXTRA_NEVER_CALLOUT)
714
715
/* This is a table of start-of-pattern options such as (*UTF) and settings such
716
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
717
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
718
generic and always supported. */
719
720
/* Kinds of start-of-pattern item; stored in the "type" field of a pso entry. */

enum {
  PSO_OPT,     /* Value is an option bit */
  PSO_XOPT,    /* Value is an xoption bit */
  PSO_FLG,     /* Value is a flag bit */
  PSO_NL,      /* Value is a newline type */
  PSO_BSR,     /* Value is a \R type */
  PSO_LIMH,    /* Read integer value for heap limit */
  PSO_LIMM,    /* Read integer value for match limit */
  PSO_LIMD,    /* Read integer value for depth limit */
  PSO_OPTMZ    /* Value is an optimization bit */
};
730
731
/* One start-of-pattern item. */

typedef struct pso {
  const char *name;    /* Item text */
  uint16_t    length;  /* Length of the item text */
  uint16_t    type;    /* One of the PSO_ values */
  uint32_t    value;   /* Meaning depends on type */
} pso;
737
738
/* NB: STRING_UTFn_RIGHTPAR contains the length as well */
739
740
static const pso pso_list[] = {
741
{ STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF },
742
{ STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF },
743
{ STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP },
744
{ STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET },
745
{ STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET },
746
{ STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPTMZ, PCRE2_OPTIM_AUTO_POSSESS },
747
{ STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPTMZ, PCRE2_OPTIM_DOTSTAR_ANCHOR },
748
{ STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },
749
{ STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPTMZ, PCRE2_OPTIM_START_OPTIMIZE },
750
{ STRING_CASELESS_RESTRICT_RIGHTPAR, 18, PSO_XOPT, PCRE2_EXTRA_CASELESS_RESTRICT },
751
{ STRING_TURKISH_CASING_RIGHTPAR, 15, PSO_XOPT, PCRE2_EXTRA_TURKISH_CASING },
752
{ STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 },
753
{ STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },
754
{ STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 },
755
{ STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 },
756
{ STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR },
757
{ STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF },
758
{ STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF },
759
{ STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY },
760
{ STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL },
761
{ STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF },
762
{ STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF },
763
{ STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE }
764
};
765
766
/* This table is used when converting repeating opcodes into possessified
767
versions as a result of an explicit possessive quantifier such as ++. A zero
768
value means there is no possessified version - in those cases the item in
769
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
770
because all relevant opcodes are less than that. */
771
772
static const uint8_t opcode_possessify[] = {
773
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
774
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
775
776
0, /* NOTI */
777
OP_POSSTAR, 0, /* STAR, MINSTAR */
778
OP_POSPLUS, 0, /* PLUS, MINPLUS */
779
OP_POSQUERY, 0, /* QUERY, MINQUERY */
780
OP_POSUPTO, 0, /* UPTO, MINUPTO */
781
0, /* EXACT */
782
0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
783
784
OP_POSSTARI, 0, /* STARI, MINSTARI */
785
OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
786
OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
787
OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
788
0, /* EXACTI */
789
0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
790
791
OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
792
OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
793
OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
794
OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
795
0, /* NOTEXACT */
796
0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
797
798
OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
799
OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
800
OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
801
OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
802
0, /* NOTEXACTI */
803
0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
804
805
OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
806
OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
807
OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
808
OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
809
0, /* TYPEEXACT */
810
0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
811
812
OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
813
OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
814
OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
815
OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
816
0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
817
818
0, 0, 0, 0, /* CLASS, NCLASS, XCLASS, ECLASS */
819
0, 0, /* REF, REFI */
820
0, 0, /* DNREF, DNREFI */
821
0, 0, /* RECURSE, CALLOUT */
822
};
823
824
/* Compile-time check that the table has the correct size. */
825
STATIC_ASSERT(sizeof(opcode_possessify) == OP_CALLOUT+1, opcode_possessify);
826
827
828
#ifdef DEBUG_SHOW_PARSED
/*************************************************
*     Show the parsed pattern for debugging      *
*************************************************/

/* For debugging the pre-scan, this code, which outputs the parsed data vector,
can be enabled. It walks cb->parsed_pattern from the start and writes one line
per item to stderr, stopping at META_END or at the first META code it does not
recognize. Each case consumes exactly the extra words that belong to its item
so that the next iteration starts on the following item.

The conversion specifications match the argument types: uint32_t values are
printed with %u or %x, PCRE2_SIZE offsets with %zu, and values printed with %c
are cast to int.

Argument:
  cb          the compile block whose parsed_pattern vector is to be shown

Returns:      nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  /* Index of this item within the vector, followed by its raw value. */

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Values below META_END are not META items; show them as a character when
  they are in the range 33-127. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* For group numbers below 10 the offset is taken from the compile block
    rather than from the parsed vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? CHAR_P:CHAR_p,
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    /* The three lookbehind items show only the first of the two words that
    follow them, then step over both. */

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The number is stored after the offset, so it is read by index before
    the offset is consumed, and stepped over afterwards. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_OFFSET:
    fprintf(stderr, "META_OFFSET offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_SCS:
    fprintf(stderr, "META (*scan_substring:");
    break;

    case META_CAPTURE_NAME:
    fprintf(stderr, "META_CAPTURE_NAME length=%u relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    case META_CAPTURE_NUMBER:
    fprintf(stderr, "META_CAPTURE_NUMBER %u relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    /* The verbs that carry an argument share the code at SHOWARG, which reads
    a length word followed by that many code units. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;

    case META_ECLASS_AND: fprintf(stderr, "META_ECLASS_AND"); break;
    case META_ECLASS_OR: fprintf(stderr, "META_ECLASS_OR"); break;
    case META_ECLASS_SUB: fprintf(stderr, "META_ECLASS_SUB"); break;
    case META_ECLASS_XOR: fprintf(stderr, "META_ECLASS_XOR"); break;
    case META_ECLASS_NOT: fprintf(stderr, "META_ECLASS_NOT"); break;
    }

  fprintf(stderr, "\n");
  }
}
#endif  /* DEBUG_SHOW_PARSED */
1121
1122
1123
1124
/*************************************************
1125
* Copy compiled code *
1126
*************************************************/
1127
1128
/* Compiled JIT code cannot be copied, so the new compiled block has no
1129
associated JIT data. */
1130
1131
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1132
pcre2_code_copy(const pcre2_code *code)
1133
{
1134
PCRE2_SIZE *ref_count;
1135
pcre2_code *newcode;
1136
1137
if (code == NULL) return NULL;
1138
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1139
if (newcode == NULL) return NULL;
1140
memcpy(newcode, code, code->blocksize);
1141
newcode->executable_jit = NULL;
1142
1143
/* If the code is one that has been deserialized, increment the reference count
1144
in the decoded tables. */
1145
1146
if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1147
{
1148
ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1149
(*ref_count)++;
1150
}
1151
1152
return newcode;
1153
}
1154
1155
1156
1157
/*************************************************
1158
* Copy compiled code and character tables *
1159
*************************************************/
1160
1161
/* Compiled JIT code cannot be copied, so the new compiled block has no
1162
associated JIT data. This version of code_copy also makes a separate copy of
1163
the character tables. */
1164
1165
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1166
pcre2_code_copy_with_tables(const pcre2_code *code)
1167
{
1168
PCRE2_SIZE* ref_count;
1169
pcre2_code *newcode;
1170
uint8_t *newtables;
1171
1172
if (code == NULL) return NULL;
1173
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1174
if (newcode == NULL) return NULL;
1175
memcpy(newcode, code, code->blocksize);
1176
newcode->executable_jit = NULL;
1177
1178
newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1179
code->memctl.memory_data);
1180
if (newtables == NULL)
1181
{
1182
code->memctl.free((void *)newcode, code->memctl.memory_data);
1183
return NULL;
1184
}
1185
memcpy(newtables, code->tables, TABLES_LENGTH);
1186
ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1187
*ref_count = 1;
1188
1189
newcode->tables = newtables;
1190
newcode->flags |= PCRE2_DEREF_TABLES;
1191
return newcode;
1192
}
1193
1194
1195
1196
/*************************************************
1197
* Free compiled code *
1198
*************************************************/
1199
1200
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1201
pcre2_code_free(pcre2_code *code)
1202
{
1203
PCRE2_SIZE* ref_count;
1204
1205
if (code != NULL)
1206
{
1207
#ifdef SUPPORT_JIT
1208
if (code->executable_jit != NULL)
1209
PRIV(jit_free)(code->executable_jit, &code->memctl);
1210
#endif
1211
1212
if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1213
{
1214
/* Decoded tables belong to the codes after deserialization, and they must
1215
be freed when there are no more references to them. The *ref_count should
1216
always be > 0. */
1217
1218
ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1219
if (*ref_count > 0)
1220
{
1221
(*ref_count)--;
1222
if (*ref_count == 0)
1223
code->memctl.free((void *)code->tables, code->memctl.memory_data);
1224
}
1225
}
1226
1227
code->memctl.free(code, code->memctl.memory_data);
1228
}
1229
}
1230
1231
1232
1233
/*************************************************
1234
* Read a number, possibly signed *
1235
*************************************************/
1236
1237
/* This function is used to read numbers in the pattern. The initial pointer
1238
must be at the sign or first digit of the number. When relative values
1239
(introduced by + or -) are allowed, they are relative group numbers, and the
1240
result must be greater than zero.
1241
1242
Arguments:
1243
ptrptr points to the character pointer variable
1244
ptrend points to the end of the input string
1245
allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this
1246
max_value the largest number allowed;
1247
you must not pass a value for max_value larger than
1248
INT_MAX/10 - 1 because this function relies on max_value to
1249
avoid integer overflow
1250
max_error the error to give for an over-large number
1251
intptr where to put the result
1252
errcodeptr where to put an error code
1253
1254
Returns: TRUE - a number was read
1255
FALSE - errorcode == 0 => no number was found
1256
errorcode != 0 => an error occurred
1257
*/
1258
1259
static BOOL
1260
read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1261
uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1262
{
1263
int sign = 0;
1264
uint32_t n = 0;
1265
PCRE2_SPTR ptr = *ptrptr;
1266
BOOL yield = FALSE;
1267
1268
PCRE2_ASSERT(max_value <= INT_MAX/10 - 1);
1269
1270
*errorcodeptr = 0;
1271
1272
if (allow_sign >= 0 && ptr < ptrend)
1273
{
1274
if (*ptr == CHAR_PLUS)
1275
{
1276
sign = +1;
1277
max_value -= allow_sign;
1278
ptr++;
1279
}
1280
else if (*ptr == CHAR_MINUS)
1281
{
1282
sign = -1;
1283
ptr++;
1284
}
1285
}
1286
1287
if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1288
while (ptr < ptrend && IS_DIGIT(*ptr))
1289
{
1290
n = n * 10 + (*ptr++ - CHAR_0);
1291
if (n > max_value)
1292
{
1293
*errorcodeptr = max_error;
1294
while (ptr < ptrend && IS_DIGIT(*ptr)) ptr++;
1295
goto EXIT;
1296
}
1297
}
1298
1299
if (allow_sign >= 0 && sign != 0)
1300
{
1301
if (n == 0)
1302
{
1303
*errorcodeptr = ERR26; /* +0 and -0 are not allowed */
1304
goto EXIT;
1305
}
1306
1307
if (sign > 0) n += allow_sign;
1308
else if (n > (uint32_t)allow_sign)
1309
{
1310
*errorcodeptr = ERR15; /* Non-existent subpattern */
1311
goto EXIT;
1312
}
1313
else n = allow_sign + 1 - n;
1314
}
1315
1316
yield = TRUE;
1317
1318
EXIT:
1319
*intptr = n;
1320
*ptrptr = ptr;
1321
return yield;
1322
}
1323
1324
1325
1326
/*************************************************
1327
* Read repeat counts *
1328
*************************************************/
1329
1330
/* Read an item of the form {n,m} and return the values when non-NULL pointers
1331
are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1332
larger value is used for "unlimited". We have to use signed arguments for
1333
read_number() because it is capable of returning a signed value. As of Perl
1334
5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1335
tabs after { and before } and between the numbers and the comma, so we do too.
1336
1337
Arguments:
1338
ptrptr points to pointer to character after '{'
1339
ptrend pointer to end of input
1340
minp if not NULL, pointer to int for min
1341
maxp if not NULL, pointer to int for max
1342
errorcodeptr points to error code variable
1343
1344
Returns: FALSE if not a repeat quantifier, errorcode set zero
1345
FALSE on error, with errorcode set non-zero
1346
TRUE on success, with pointer updated to point after '}'
1347
*/
1348
1349
static BOOL
1350
read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1351
uint32_t *maxp, int *errorcodeptr)
1352
{
1353
PCRE2_SPTR p = *ptrptr;
1354
PCRE2_SPTR pp;
1355
BOOL yield = FALSE;
1356
BOOL had_minimum = FALSE;
1357
int32_t min = 0;
1358
int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1359
1360
*errorcodeptr = 0;
1361
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1362
1363
/* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1364
such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1365
error. */
1366
1367
pp = p;
1368
if (pp < ptrend && IS_DIGIT(*pp))
1369
{
1370
had_minimum = TRUE;
1371
while (++pp < ptrend && IS_DIGIT(*pp)) {}
1372
}
1373
1374
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1375
if (pp >= ptrend) return FALSE;
1376
1377
if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1378
{
1379
if (!had_minimum) return FALSE;
1380
}
1381
else
1382
{
1383
if (*pp++ != CHAR_COMMA) return FALSE;
1384
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1385
if (pp >= ptrend) return FALSE;
1386
if (IS_DIGIT(*pp))
1387
{
1388
while (++pp < ptrend && IS_DIGIT(*pp)) {}
1389
}
1390
else if (!had_minimum) return FALSE;
1391
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1392
if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1393
}
1394
1395
/* Now process the quantifier for real. We know it must be {n} or {n,} or {,m}
1396
or {n,m}. The only error that read_number() can return is for a number that is
1397
too big. If *errorcodeptr is returned as zero it means no number was found. */
1398
1399
/* Deal with {,m} or n too big. If we successfully read m there is no need to
1400
check m >= n because n defaults to zero. */
1401
1402
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1403
{
1404
if (*errorcodeptr != 0) goto EXIT; /* n too big */
1405
p++; /* Skip comma and subsequent spaces */
1406
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1407
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1408
{
1409
if (*errorcodeptr != 0) goto EXIT; /* m too big */
1410
}
1411
}
1412
1413
/* Have read one number. Deal with {n} or {n,} or {n,m} */
1414
1415
else
1416
{
1417
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1418
if (*p == CHAR_RIGHT_CURLY_BRACKET)
1419
{
1420
max = min;
1421
}
1422
else /* Handle {n,} or {n,m} */
1423
{
1424
p++; /* Skip comma and subsequent spaces */
1425
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1426
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1427
{
1428
if (*errorcodeptr != 0) goto EXIT; /* m too big */
1429
}
1430
1431
if (max < min)
1432
{
1433
*errorcodeptr = ERR4;
1434
goto EXIT;
1435
}
1436
}
1437
}
1438
1439
/* Valid quantifier exists */
1440
1441
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1442
p++;
1443
yield = TRUE;
1444
if (minp != NULL) *minp = (uint32_t)min;
1445
if (maxp != NULL) *maxp = (uint32_t)max;
1446
1447
/* Update the pattern pointer */
1448
1449
EXIT:
1450
*ptrptr = p;
1451
return yield;
1452
}
1453
1454
1455
1456
/*************************************************
1457
* Handle escapes *
1458
*************************************************/
1459
1460
/* This function is called when a \ has been encountered. It either returns a
1461
positive value for a simple escape such as \d, or 0 for a data character, which
1462
is placed in chptr. A backreference to group n is returned as -(n+1). On
1463
entry, ptr is pointing at the character after \. On exit, it points after the
1464
final code unit of the escape sequence.
1465
1466
This function is also called from pcre2_substitute() to handle escape sequences
1467
in replacement strings. In this case, the cb argument is NULL, and in the case
1468
of escapes that have further processing, only sequences that define a data
1469
character are recognised. The options argument is the final value of the
1470
compiled pattern's options.
1471
1472
Arguments:
1473
ptrptr points to the input position pointer
1474
ptrend points to the end of the input
1475
chptr points to a returned data character
1476
errorcodeptr points to the errorcode variable (containing zero)
1477
options the current options bits
1478
xoptions the current extra options bits
1479
bracount the number of capturing parentheses encountered so far
1480
isclass TRUE if in a character class
1481
cb compile data block or NULL when called from pcre2_substitute()
1482
1483
Returns: zero => a data character
1484
positive => a special escape sequence
1485
negative => a numerical back reference
1486
on error, errorcodeptr is set non-zero
1487
*/
1488
1489
int
1490
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1491
int *errorcodeptr, uint32_t options, uint32_t xoptions, uint32_t bracount,
1492
BOOL isclass, compile_block *cb)
1493
{
1494
BOOL utf = (options & PCRE2_UTF) != 0;
1495
BOOL alt_bsux =
1496
((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1497
PCRE2_SPTR ptr = *ptrptr;
1498
uint32_t c, cc;
1499
int escape = 0;
1500
int i;
1501
1502
/* If backslash is at the end of the string, it's an error. */
1503
1504
if (ptr >= ptrend)
1505
{
1506
*errorcodeptr = ERR1;
1507
return 0;
1508
}
1509
1510
GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1511
*errorcodeptr = 0; /* Be optimistic */
1512
1513
/* Non-alphanumerics are literals, so we just leave the value in c. An initial
1514
value test saves a memory lookup for code points outside the alphanumeric
1515
range. */
1516
1517
if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
1518
1519
/* Otherwise, do a table lookup. Non-zero values need little processing here. A
1520
positive value is a literal value for something like \n. A negative value is
1521
the negation of one of the ESC_ macros that is passed back for handling by the
1522
calling function. Some extra checking is needed for \N because only \N{U+dddd}
1523
is supported. If the value is zero, further processing is handled below. */
1524
1525
else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1526
{
1527
if (i > 0)
1528
{
1529
c = (uint32_t)i;
1530
if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1531
c = CHAR_LF;
1532
}
1533
else /* Negative table entry */
1534
{
1535
escape = -i; /* Else return a special escape */
1536
if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1537
cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
1538
1539
/* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1540
Unicode code points, as well as plain \N for "not newline". PCRE does not
1541
support \N{name}. However, it does support quantification such as \N{2,3},
1542
so if \N{ is not followed by U+dddd we check for a quantifier. */
1543
1544
if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1545
{
1546
PCRE2_SPTR p = ptr + 1;
1547
1548
/* Perl ignores spaces and tabs after { */
1549
1550
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1551
1552
/* \N{U+ can be handled by the \x{ code. However, this construction is
1553
not valid in EBCDIC environments because it specifies a Unicode
1554
character, not a codepoint in the local code. For example \N{U+0041}
1555
must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1556
casing semantics for the entire pattern, so allow it only in UTF (i.e.
1557
Unicode) mode. */
1558
1559
if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1560
{
1561
#ifndef EBCDIC
1562
if (utf)
1563
{
1564
ptr = p + 2;
1565
escape = 0; /* Not a fancy escape after all */
1566
goto COME_FROM_NU;
1567
}
1568
#endif
1569
1570
/* Improve error offset. */
1571
ptr = p + 2;
1572
while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1573
while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1574
if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET) ptr++;
1575
1576
*errorcodeptr = ERR93;
1577
}
1578
1579
/* Give an error in contexts where quantifiers are not allowed
1580
(character classes; substitution strings). */
1581
1582
else if (isclass || cb == NULL)
1583
{
1584
ptr++; /* Skip over the opening brace */
1585
*errorcodeptr = ERR37;
1586
}
1587
1588
/* Give an error if what follows is not a quantifier, but don't override
1589
an error set by the quantifier reader (e.g. number overflow). */
1590
1591
else
1592
{
1593
if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1594
*errorcodeptr == 0)
1595
{
1596
ptr++; /* Skip over the opening brace */
1597
*errorcodeptr = ERR37;
1598
}
1599
}
1600
}
1601
}
1602
}
1603
1604
/* Escapes that need further processing, including those that are unknown, have
1605
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1606
\o, and \x are recognized (\u and \U can never appear as they are used for case
1607
forcing). */
1608
1609
else
1610
{
1611
int s;
1612
PCRE2_SPTR oldptr;
1613
BOOL overflow;
1614
1615
/* Filter calls from pcre2_substitute(). */
1616
1617
if (cb == NULL)
1618
{
1619
if (!(c >= CHAR_0 && c <= CHAR_9) && c != CHAR_c && c != CHAR_o &&
1620
c != CHAR_x && c != CHAR_g)
1621
{
1622
*errorcodeptr = ERR3;
1623
goto EXIT;
1624
}
1625
alt_bsux = FALSE; /* Do not modify \x handling */
1626
}
1627
1628
switch (c)
1629
{
1630
/* A number of Perl escapes are not handled by PCRE. We give an explicit
1631
error. */
1632
1633
case CHAR_F:
1634
case CHAR_l:
1635
case CHAR_L:
1636
*errorcodeptr = ERR37;
1637
break;
1638
1639
/* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1640
is set. Otherwise, \u must be followed by exactly four hex digits or, if
1641
PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1642
Otherwise it is a lowercase u letter. This gives some compatibility with
1643
ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1644
allowed. When \u{ is not followed by hex digits, a special return is given
1645
because otherwise \u{ 12} (for example) would be treated as u{12}. */
1646
1647
case CHAR_u:
1648
if (!alt_bsux)
1649
*errorcodeptr = ERR37;
1650
else
1651
{
1652
uint32_t xc;
1653
1654
if (ptr >= ptrend) break;
1655
if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1656
(xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1657
{
1658
PCRE2_SPTR hptr = ptr + 1;
1659
1660
cc = 0;
1661
while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1662
{
1663
if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
1664
{
1665
*errorcodeptr = ERR77;
1666
ptr = hptr; /* Show where */
1667
break; /* *hptr != } will cause another break below */
1668
}
1669
cc = (cc << 4) | xc;
1670
hptr++;
1671
}
1672
1673
if (hptr == ptr + 1 || /* No hex digits */
1674
hptr >= ptrend || /* Hit end of input */
1675
*hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
1676
{
1677
if (isclass) break; /* In a class, just treat as '\u' literal */
1678
escape = ESC_ub; /* Special return */
1679
ptr++; /* Skip { */
1680
break; /* Hex escape not recognized */
1681
}
1682
1683
c = cc; /* Accept the code point */
1684
ptr = hptr + 1;
1685
}
1686
1687
else /* Must be exactly 4 hex digits */
1688
{
1689
if (ptrend - ptr < 4) break; /* Less than 4 chars */
1690
if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1691
if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1692
cc = (cc << 4) | xc;
1693
if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
1694
cc = (cc << 4) | xc;
1695
if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
1696
c = (cc << 4) | xc;
1697
ptr += 4;
1698
}
1699
1700
if (utf)
1701
{
1702
if (c > 0x10ffffU) *errorcodeptr = ERR77;
1703
else
1704
if (c >= 0xd800 && c <= 0xdfff &&
1705
(xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1706
*errorcodeptr = ERR73;
1707
}
1708
else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1709
}
1710
break;
1711
1712
/* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1713
in which case it is an upper case letter. */
1714
1715
case CHAR_U:
1716
if (!alt_bsux) *errorcodeptr = ERR37;
1717
break;
1718
1719
/* In a character class, \g is just a literal "g". Outside a character
1720
class, \g must be followed by one of a number of specific things:
1721
1722
(1) A number, either plain or braced. If positive, it is an absolute
1723
backreference. If negative, it is a relative backreference. This is a Perl
1724
5.10 feature.
1725
1726
(2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1727
is part of Perl's movement towards a unified syntax for back references. As
1728
this is synonymous with \k{name}, we fudge it up by pretending it really
1729
was \k{name}.
1730
1731
(3) For Oniguruma compatibility we also support \g followed by a name or a
1732
number either in angle brackets or in single quotes. However, these are
1733
(possibly recursive) subroutine calls, _not_ backreferences. We return
1734
the ESC_g code.
1735
1736
Summary: Return a negative number for a numerical back reference (offset
1737
by 1), ESC_k for a named back reference, and ESC_g for a named or
1738
numbered subroutine call.
1739
1740
The above describes the \g behaviour inside patterns. Inside replacement
1741
strings (pcre2_substitute) we support only \g<nameornum> for Python
1742
compatibility. Return ESC_g for the named case, and -(num+1) for the
1743
numbered case.
1744
*/
1745
1746
case CHAR_g:
1747
if (isclass) break;
1748
1749
if (ptr >= ptrend)
1750
{
1751
*errorcodeptr = ERR57;
1752
break;
1753
}
1754
1755
if (cb == NULL)
1756
{
1757
PCRE2_SPTR p;
1758
/* Substitution strings */
1759
if (*ptr != CHAR_LESS_THAN_SIGN)
1760
{
1761
*errorcodeptr = ERR57;
1762
break;
1763
}
1764
1765
p = ptr + 1;
1766
1767
if (!read_number(&p, ptrend, -1, MAX_GROUP_NUMBER, ERR61, &s,
1768
errorcodeptr))
1769
{
1770
if (*errorcodeptr == 0) escape = ESC_g; /* No number found */
1771
break;
1772
}
1773
1774
if (p >= ptrend || *p != CHAR_GREATER_THAN_SIGN)
1775
{
1776
ptr = p;
1777
*errorcodeptr = ERR119; /* Missing terminator for number */
1778
break;
1779
}
1780
1781
/* This is the reason that back references are returned as -(s+1) rather
1782
than just -s. In a pattern, \0 is not a back reference, but \g<0> is
1783
valid in a substitution string, so this must be representable. */
1784
ptr = p + 1;
1785
escape = -(s+1);
1786
break;
1787
}
1788
1789
if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1790
{
1791
escape = ESC_g;
1792
break;
1793
}
1794
1795
/* If there is a brace delimiter, try to read a numerical reference. If
1796
there isn't one, assume we have a name and treat it as \k. */
1797
1798
if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1799
{
1800
PCRE2_SPTR p = ptr + 1;
1801
1802
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1803
if (!read_number(&p, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1804
errorcodeptr))
1805
{
1806
if (*errorcodeptr == 0) escape = ESC_k; /* No number found */
1807
break;
1808
}
1809
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1810
1811
if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1812
{
1813
ptr = p;
1814
*errorcodeptr = ERR119; /* Missing terminator for number */
1815
break;
1816
}
1817
ptr = p + 1;
1818
}
1819
1820
/* Read an undelimited number */
1821
1822
else
1823
{
1824
if (!read_number(&ptr, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1825
errorcodeptr))
1826
{
1827
if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */
1828
break;
1829
}
1830
}
1831
1832
if (s <= 0)
1833
{
1834
*errorcodeptr = ERR15;
1835
break;
1836
}
1837
1838
escape = -(s+1);
1839
break;
1840
1841
/* The handling of escape sequences consisting of a string of digits
1842
starting with one that is not zero is not straightforward. Perl has changed
1843
over the years. Nowadays \g{} for backreferences and \o{} for octal are
1844
recommended to avoid the ambiguities in the old syntax.
1845
1846
Outside a character class, the digits are read as a decimal number. If the
1847
number is less than 10, or if there are that many previous extracting left
1848
brackets, it is a back reference. Otherwise, up to three octal digits are
1849
read to form an escaped character code. Thus \123 is likely to be octal 123
1850
(cf \0123, which is octal 012 followed by the literal 3). This is the "Perl
1851
style" of handling ambiguous octal/backreferences such as \12.
1852
1853
There is an alternative disambiguation strategy, selected by
1854
PCRE2_EXTRA_PYTHON_OCTAL, which follows Python's behaviour. An octal must
1855
have either a leading zero, or exactly three octal digits; otherwise it's
1856
a backreference. The disambiguation is stable, and does not depend on how
1857
many capture groups are defined (it's simply an invalid backreference if
1858
there is no corresponding capture group). Additionally, octal values above
1859
\377 (\xff) are rejected.
1860
1861
Inside a character class, \ followed by a digit is always either a literal
1862
8 or 9 or an octal number. */
1863
1864
case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1865
case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1866
1867
if (isclass)
1868
{
1869
/* Fall through to octal handling; never a backreference inside a class. */
1870
}
1871
else if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0)
1872
{
1873
/* Python-style disambiguation. */
1874
if (ptr[-1] <= CHAR_7 && ptr + 1 < ptrend && ptr[0] >= CHAR_0 &&
1875
ptr[0] <= CHAR_7 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1876
{
1877
/* We peeked a three-digit octal, so fall through */
1878
}
1879
else
1880
{
1881
/* We are at a digit, so the only possible error from read_number() is
1882
a number that is too large. */
1883
ptr--; /* Back to the digit */
1884
1885
if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1886
{
1887
*errorcodeptr = ERR61;
1888
break;
1889
}
1890
1891
escape = -(s+1);
1892
break;
1893
}
1894
}
1895
else
1896
{
1897
/* Perl-style disambiguation. */
1898
oldptr = ptr;
1899
ptr--; /* Back to the digit */
1900
1901
/* As we know we are at a digit, the only possible error from
1902
read_number() is a number that is too large to be a group number. Because
1903
that number might be still valid if read as an octal, errorcodeptr is not
1904
set on failure and therefore a sentinel value of INT_MAX is used instead
1905
of the original value, and will be used later to properly set the error,
1906
if not falling through. */
1907
1908
if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1909
s = INT_MAX;
1910
1911
/* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1912
are octal escapes if there are not that many previous captures. */
1913
1914
if (s < 10 || c >= CHAR_8 || (unsigned)s <= bracount)
1915
{
1916
/* s > MAX_GROUP_NUMBER should not be possible because of read_number(),
1917
but we keep it just to be safe and because it will also catch the
1918
sentinel value that was set on failure by that function. */
1919
1920
if ((unsigned)s > MAX_GROUP_NUMBER)
1921
{
1922
PCRE2_ASSERT(s == INT_MAX);
1923
*errorcodeptr = ERR61;
1924
}
1925
else escape = -(s+1); /* Indicates a back reference */
1926
break;
1927
}
1928
1929
ptr = oldptr; /* Put the pointer back and fall through */
1930
}
1931
1932
/* Handle a digit following \ when the number is not a back reference, or
1933
we are within a character class. If the first digit is 8 or 9, Perl used to
1934
generate a binary zero and then treat the digit as a following literal. At
1935
least by Perl 5.18 this changed so as not to insert the binary zero. */
1936
1937
if (c >= CHAR_8) break;
1938
1939
PCRE2_FALLTHROUGH /* Fall through */
1940
1941
/* \0 always starts an octal number, but we may drop through to here with a
1942
larger first octal digit. The original code used just to take the least
1943
significant 8 bits of octal numbers (I think this is what early Perls used
1944
to do). Nowadays we allow for larger numbers in UTF-8 mode and 16/32-bit mode,
1945
but no more than 3 octal digits. */
1946
1947
case CHAR_0:
1948
c -= CHAR_0;
1949
while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1950
c = c * 8 + *ptr++ - CHAR_0;
1951
if (c > 0xff)
1952
{
1953
if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0) *errorcodeptr = ERR102;
1954
#if PCRE2_CODE_UNIT_WIDTH == 8
1955
else if (!utf) *errorcodeptr = ERR51;
1956
#endif
1957
}
1958
1959
/* PCRE2_EXTRA_NO_BS0 disables the NUL escape '\0' but doesn't affect
1960
two- or three-character octal escapes \00 and \000, nor \x00. */
1961
1962
if ((xoptions & PCRE2_EXTRA_NO_BS0) != 0 && c == 0 && i == 1)
1963
*errorcodeptr = ERR98;
1964
break;
1965
1966
/* \o is a relatively new Perl feature, supporting a more general way of
1967
specifying character codes in octal. The only supported form is \o{ddd},
1968
with optional spaces or tabs after { and before }. */
1969
1970
case CHAR_o:
1971
if (ptr >= ptrend || *ptr != CHAR_LEFT_CURLY_BRACKET)
1972
{
1973
*errorcodeptr = ERR55;
1974
break;
1975
}
1976
ptr++;
1977
1978
while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1979
if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1980
{
1981
*errorcodeptr = ERR78;
1982
break;
1983
}
1984
1985
c = 0;
1986
overflow = FALSE;
1987
while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1988
{
1989
cc = *ptr++;
1990
if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1991
#if PCRE2_CODE_UNIT_WIDTH == 32
1992
if (c >= 0x20000000u) { overflow = TRUE; break; }
1993
#endif
1994
c = (c << 3) + (cc - CHAR_0);
1995
#if PCRE2_CODE_UNIT_WIDTH == 8
1996
if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1997
#elif PCRE2_CODE_UNIT_WIDTH == 16
1998
if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1999
#elif PCRE2_CODE_UNIT_WIDTH == 32
2000
if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
2001
#endif
2002
}
2003
2004
while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2005
2006
if (overflow)
2007
{
2008
while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
2009
*errorcodeptr = ERR34;
2010
}
2011
else if (utf && c >= 0xd800 && c <= 0xdfff &&
2012
(xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2013
{
2014
*errorcodeptr = ERR73;
2015
}
2016
else if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET)
2017
{
2018
ptr++;
2019
}
2020
else
2021
{
2022
*errorcodeptr = ERR64;
2023
goto ESCAPE_FAILED_FORWARD;
2024
}
2025
break;
2026
2027
/* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
2028
by two hexadecimal digits. Otherwise it is a lowercase x letter. */
2029
2030
case CHAR_x:
2031
if (alt_bsux)
2032
{
2033
uint32_t xc;
2034
if (ptrend - ptr < 2) break; /* Less than 2 characters */
2035
if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
2036
if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
2037
c = (cc << 4) | xc;
2038
ptr += 2;
2039
}
2040
2041
/* Handle \x in Perl's style. \x{ddd} is a character code which can be
2042
greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
2043
digits. If not, { used to be treated as a data character. However, Perl
2044
seems to read hex digits up to the first non-such, and ignore the rest, so
2045
that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
2046
now gives an error. */
2047
2048
else
2049
{
2050
if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
2051
{
2052
ptr++;
2053
while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2054
2055
#ifndef EBCDIC
2056
COME_FROM_NU:
2057
#endif
2058
if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
2059
{
2060
*errorcodeptr = ERR78;
2061
break;
2062
}
2063
c = 0;
2064
overflow = FALSE;
2065
2066
while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2067
{
2068
ptr++;
2069
if (c == 0 && cc == 0) continue; /* Leading zeroes */
2070
#if PCRE2_CODE_UNIT_WIDTH == 32
2071
if (c >= 0x10000000l) { overflow = TRUE; break; }
2072
#endif
2073
c = (c << 4) | cc;
2074
if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2075
{
2076
overflow = TRUE;
2077
break;
2078
}
2079
}
2080
2081
/* Perl ignores spaces and tabs before } */
2082
2083
while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2084
2085
/* On overflow, skip remaining hex digits */
2086
2087
if (overflow)
2088
{
2089
while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2090
*errorcodeptr = ERR34;
2091
}
2092
else if (utf && c >= 0xd800 && c <= 0xdfff &&
2093
(xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2094
{
2095
*errorcodeptr = ERR73;
2096
}
2097
else if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET)
2098
{
2099
ptr++;
2100
}
2101
2102
/* If the sequence of hex digits (followed by optional space) does not
2103
end with '}', give an error. We used just to recognize this construct
2104
and fall through to the normal \x handling, but nowadays Perl gives an
2105
error, which seems much more sensible, so we do too. */
2106
2107
else
2108
{
2109
*errorcodeptr = ERR67;
2110
goto ESCAPE_FAILED_FORWARD;
2111
}
2112
} /* End of \x{} processing */
2113
2114
/* Read up to two hex digits after \x */
2115
2116
else
2117
{
2118
/* Perl has the surprising/broken behaviour that \x without following
2119
hex digits is treated as an escape for NUL. Their source code laments
2120
this but keeps it for backwards compatibility. A warning is printed
2121
when "use warnings" is enabled. Because we don't have warnings, we
2122
simply forbid it. */
2123
if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff)
2124
{
2125
/* Not a hex digit */
2126
*errorcodeptr = ERR78;
2127
break;
2128
}
2129
ptr++;
2130
c = cc;
2131
2132
/* With "use re 'strict'" Perl actually requires exactly two digits (error
2133
for \x, \xA and \xAAA). While \x was already rejected, this seems overly
2134
strict, and there seems little incentive to align with that, given the
2135
backwards-compatibility cost.
2136
2137
For comparison, note that other engines disagree. For example:
2138
- Java allows 1 or 2 hex digits. Error if 0 digits. No error if >2 digits
2139
- .NET requires 2 hex digits. Error if 0, 1 digits. No error if >2 digits.
2140
*/
2141
if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
2142
ptr++;
2143
c = (c << 4) | cc;
2144
} /* End of \xdd handling */
2145
} /* End of Perl-style \x handling */
2146
break;
2147
2148
/* The handling of \c is different in ASCII and EBCDIC environments. In an
2149
ASCII (or Unicode) environment, an error is given if the character
2150
following \c is not a printable ASCII character. Otherwise, the following
2151
character is upper-cased if it is a letter, and after that the 0x40 bit is
2152
flipped. The result is the value of the escape.
2153
2154
In an EBCDIC environment the handling of \c is compatible with the
2155
specification in the perlebcdic document. The following character must be
2156
a letter or one of small number of special characters. These provide a
2157
means of defining the character values 0-31.
2158
2159
For testing the EBCDIC handling of \c in an ASCII environment, recognize
2160
the EBCDIC value of 'c' explicitly. */
2161
2162
case CHAR_c:
2163
if (ptr >= ptrend)
2164
{
2165
*errorcodeptr = ERR2;
2166
break;
2167
}
2168
c = *ptr;
2169
if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2170
2171
/* Handle \c in an ASCII/Unicode environment. */
2172
2173
#ifndef EBCDIC /* ASCII/UTF-8 coding */
2174
if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
2175
{
2176
*errorcodeptr = ERR68;
2177
goto ESCAPE_FAILED_FORWARD;
2178
}
2179
c ^= 0x40;
2180
2181
/* Handle \c in an EBCDIC environment. The special case \c? is converted to
2182
255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2183
POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2184
The other valid sequences correspond to a list of specific characters. */
2185
2186
#else
2187
if (c == CHAR_QUESTION_MARK)
2188
c = (CHAR_BACKSLASH == 188 && CHAR_GRAVE_ACCENT == 74)? 0x5f : 0xff;
2189
else
2190
{
2191
for (i = 0; i < 32; i++)
2192
{
2193
if (c == ebcdic_escape_c[i]) break;
2194
}
2195
if (i < 32)
2196
c = i;
2197
else
2198
{
2199
*errorcodeptr = ERR68;
2200
goto ESCAPE_FAILED_FORWARD;
2201
}
2202
}
2203
#endif /* EBCDIC */
2204
2205
ptr++;
2206
break;
2207
2208
/* Any other alphanumeric following \ is an error. Perl gives an error only
2209
if in warning mode, but PCRE doesn't have a warning mode. */
2210
2211
default:
2212
*errorcodeptr = ERR3;
2213
break;
2214
}
2215
}
2216
2217
/* Set the pointer to the next character before returning. */
2218
2219
EXIT:
2220
*ptrptr = ptr;
2221
*chptr = c;
2222
return escape;
2223
2224
/* Some errors need to indicate the next character. */
2225
2226
ESCAPE_FAILED_FORWARD:
2227
ptr++;
2228
#ifdef SUPPORT_UNICODE
2229
if (utf) FORWARDCHARTEST(ptr, ptrend);
2230
#endif
2231
goto EXIT;
2232
}
2233
2234
2235
2236
#ifdef SUPPORT_UNICODE
2237
/*************************************************
2238
* Handle \P and \p *
2239
*************************************************/
2240
2241
/* This function is called after \P or \p has been encountered, provided that
2242
PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2243
contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2244
after the final code unit of the escape sequence.
2245
2246
Arguments:
2247
ptrptr the pattern position pointer
2248
utf true if the input is UTF-encoded
2249
negptr a boolean that is set TRUE for negation else FALSE
2250
ptypeptr an unsigned int that is set to the type value
2251
pdataptr an unsigned int that is set to the detailed property value
2252
errorcodeptr the error code variable
2253
cb the compile data
2254
2255
Returns: TRUE if the type value was found, or FALSE for an invalid type
2256
*/
2257
2258
static BOOL
2259
get_ucp(PCRE2_SPTR *ptrptr, BOOL utf, BOOL *negptr, uint16_t *ptypeptr,
2260
uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2261
{
2262
uint32_t c;
2263
ptrdiff_t i;
2264
PCRE2_SIZE bot, top;
2265
PCRE2_SPTR ptr = *ptrptr;
2266
PCRE2_UCHAR name[50];
2267
PCRE2_UCHAR *vptr = NULL;
2268
uint16_t ptscript = PT_NOTSCRIPT;
2269
2270
#ifndef MAYBE_UTF_MULTI
2271
(void)utf; /* Avoid compiler warning */
2272
#endif
2273
2274
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2275
GETCHARINCTEST(c, ptr);
2276
*negptr = FALSE;
2277
2278
/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2279
negation. We must be handling Unicode encoding here, though we may be compiling
2280
for UTF-8 input in an EBCDIC environment. (PCRE2 does not support both EBCDIC
2281
input and Unicode input in the same build.) In accordance with Unicode's "loose
2282
matching" rules, ASCII white space, hyphens, and underscores are ignored. We
2283
don't use isspace() or tolower() because (a) code points may be greater than
2284
255, and (b) they wouldn't work when compiling for Unicode in an EBCDIC
2285
environment. */
2286
2287
if (c == CHAR_LEFT_CURLY_BRACKET)
2288
{
2289
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2290
2291
for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2292
{
2293
REDO:
2294
2295
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2296
GETCHARINCTEST(c, ptr);
2297
2298
/* Skip ignorable Unicode characters. */
2299
2300
if (c == CHAR_UNDERSCORE || c == CHAR_MINUS || c == CHAR_SPACE ||
2301
(c >= CHAR_HT && c <= CHAR_CR))
2302
{
2303
goto REDO;
2304
}
2305
2306
/* The first significant character being circumflex negates the meaning of
2307
the item. */
2308
2309
if (i == 0 && !*negptr && c == CHAR_CIRCUMFLEX_ACCENT)
2310
{
2311
*negptr = TRUE;
2312
goto REDO;
2313
}
2314
2315
if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2316
2317
/* Names consist of ASCII letters and digits, but equals and colon may also
2318
occur as a name/value separator. We must also allow for \p{L&}. A simple
2319
check for a value between '&' and 'z' suffices because anything else in a
2320
name or value will cause an "unknown property" error anyway. */
2321
2322
if (c < CHAR_AMPERSAND || c > CHAR_z) goto ERROR_RETURN;
2323
2324
/* Lower case a capital letter or remember where the name/value separator
2325
is. */
2326
2327
if (c >= CHAR_A && c <= CHAR_Z) c |= 0x20;
2328
else if ((c == CHAR_COLON || c == CHAR_EQUALS_SIGN) && vptr == NULL)
2329
vptr = name + i;
2330
2331
name[i] = c;
2332
}
2333
2334
/* Error if the loop didn't end with '}' - either we hit the end of the
2335
pattern or the name was longer than any legal property name. */
2336
2337
if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2338
name[i] = 0;
2339
}
2340
2341
/* If { doesn't follow \p or \P there is just one following character, which
2342
must be an ASCII letter. */
2343
2344
else if (c >= CHAR_A && c <= CHAR_Z)
2345
{
2346
name[0] = c | 0x20; /* Lower case */
2347
name[1] = 0;
2348
}
2349
else if (c >= CHAR_a && c <= CHAR_z)
2350
{
2351
name[0] = c;
2352
name[1] = 0;
2353
}
2354
else goto ERROR_RETURN;
2355
2356
*ptrptr = ptr; /* Update pattern pointer */
2357
2358
/* If the property contains ':' or '=' we have class name and value separately
2359
specified. The following are supported:
2360
2361
. Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2362
. Script (synonym sc) for which the property name is the script name
2363
. Script_Extensions (synonym scx), ditto
2364
2365
As this is a small number, we currently just check the names directly. If this
2366
grows, a sorted table and a switch will be neater.
2367
2368
For both the script properties, set a PT_xxx value so that (1) they can be
2369
distinguished and (2) invalid script names that happen to be the name of
2370
another property can be diagnosed. */
2371
2372
if (vptr != NULL)
2373
{
2374
int offset = 0;
2375
PCRE2_UCHAR sname[8];
2376
2377
*vptr = 0; /* Terminate property name */
2378
if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2379
PRIV(strcmp_c8)(name, STRING_bc) == 0)
2380
{
2381
offset = 4;
2382
sname[0] = CHAR_b;
2383
sname[1] = CHAR_i; /* There is no strcpy_c8 function */
2384
sname[2] = CHAR_d;
2385
sname[3] = CHAR_i;
2386
}
2387
2388
else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2389
PRIV(strcmp_c8)(name, STRING_sc) == 0)
2390
ptscript = PT_SC;
2391
2392
else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2393
PRIV(strcmp_c8)(name, STRING_scx) == 0)
2394
ptscript = PT_SCX;
2395
2396
else
2397
{
2398
*errorcodeptr = ERR47;
2399
return FALSE;
2400
}
2401
2402
/* Adjust the string in name[] as needed */
2403
2404
memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2405
if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2406
}
2407
2408
/* Search for a recognized property using binary chop. */
2409
2410
bot = 0;
2411
top = PRIV(utt_size);
2412
2413
while (bot < top)
2414
{
2415
int r;
2416
i = (bot + top) >> 1;
2417
r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2418
2419
/* When a matching property is found, some extra checking is needed when the
2420
\p{xx:yy} syntax is used and xx is either sc or scx. */
2421
2422
if (r == 0)
2423
{
2424
*pdataptr = PRIV(utt)[i].value;
2425
if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2426
{
2427
*ptypeptr = PRIV(utt)[i].type;
2428
return TRUE;
2429
}
2430
2431
switch (PRIV(utt)[i].type)
2432
{
2433
case PT_SC:
2434
*ptypeptr = PT_SC;
2435
return TRUE;
2436
2437
case PT_SCX:
2438
*ptypeptr = ptscript;
2439
return TRUE;
2440
}
2441
2442
break; /* Non-script found */
2443
}
2444
2445
if (r > 0) bot = i + 1; else top = i;
2446
}
2447
2448
*errorcodeptr = ERR47; /* Unrecognized property */
2449
return FALSE;
2450
2451
ERROR_RETURN: /* Malformed \P or \p */
2452
*errorcodeptr = ERR46;
2453
*ptrptr = ptr;
2454
return FALSE;
2455
}
2456
#endif
2457
2458
2459
2460
/*************************************************
2461
* Check for POSIX class syntax *
2462
*************************************************/
2463
2464
/* This function is called when the sequence "[:" or "[." or "[=" is
2465
encountered in a character class. It checks whether this is followed by a
2466
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2467
reach an unescaped ']' without the special preceding character, return FALSE.
2468
2469
Originally, this function only recognized a sequence of letters between the
2470
terminators, but it seems that Perl recognizes any sequence of characters,
2471
though of course unknown POSIX names are subsequently rejected. Perl gives an
2472
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2473
didn't consider this to be a POSIX class. Likewise for [:1234:].
2474
2475
The problem in trying to be exactly like Perl is in the handling of escapes. We
2476
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2477
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2478
below handles the special cases \\ and \], but does not try to do any other
2479
escape processing. This makes it different from Perl for cases such as
2480
[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2481
not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2482
when Perl does, I think.
2483
2484
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2485
It seems that the appearance of a nested POSIX class supersedes an apparent
2486
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2487
a digit. This is handled by returning FALSE if the start of a new group with
2488
the same terminator is encountered, since the next closing sequence must close
2489
the nested group, not the outer one.
2490
2491
In Perl, unescaped square brackets may also appear as part of class names. For
2492
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2493
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2494
seem right at all. PCRE does not allow closing square brackets in POSIX class
2495
names.
2496
2497
Arguments:
2498
ptr pointer to the character after the initial [ (colon, dot, equals)
2499
ptrend pointer to the end of the pattern
2500
endptr where to return a pointer to the terminating ':', '.', or '='
2501
2502
Returns: TRUE or FALSE
2503
*/
2504
2505
static BOOL
2506
check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2507
{
2508
PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */
2509
terminator = *ptr++; /* compiler warns about "non-constant" initializer. */
2510
2511
for (; ptrend - ptr >= 2; ptr++)
2512
{
2513
if (*ptr == CHAR_BACKSLASH &&
2514
(ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2515
ptr++;
2516
2517
else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2518
*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2519
2520
else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2521
{
2522
*endptr = ptr;
2523
return TRUE;
2524
}
2525
}
2526
2527
return FALSE;
2528
}
2529
2530
2531
2532
/*************************************************
2533
* Check POSIX class name *
2534
*************************************************/
2535
2536
/* This function is called to check the name given in a POSIX-style class entry
2537
such as [:alnum:].
2538
2539
Arguments:
2540
ptr points to the first letter
2541
len the length of the name
2542
2543
Returns: a value representing the name, or -1 if unknown
2544
*/
2545
2546
static int
2547
check_posix_name(PCRE2_SPTR ptr, int len)
2548
{
2549
const char *pn = posix_names;
2550
int yield = 0;
2551
while (posix_name_lengths[yield] != 0)
2552
{
2553
if (len == posix_name_lengths[yield] &&
2554
PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2555
pn += posix_name_lengths[yield] + 1;
2556
yield++;
2557
}
2558
return -1;
2559
}
2560
2561
2562
2563
/*************************************************
2564
* Read a subpattern or VERB name *
2565
*************************************************/
2566
2567
/* This function is called from parse_regex() below whenever it needs to read
2568
the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2569
pointer must be to the preceding character. If that character is '*' we are
2570
reading a verb or alpha assertion name. The pointer is updated to point after
2571
the name, for a VERB or alpha assertion name, or after the name's terminator
2572
for a subpattern name. Returning both the offset and the name pointer is
2573
redundant information, but some callers use one and some the other, so it is
2574
simplest just to return both. When the name is in braces, spaces and tabs are
2575
allowed (and ignored) at either end.
2576
2577
Arguments:
2578
ptrptr points to the character pointer variable
2579
ptrend points to the end of the input string
2580
utf true if the input is UTF-encoded
2581
terminator the terminator of a subpattern name must be this
2582
offsetptr where to put the offset from the start of the pattern
2583
nameptr where to put a pointer to the name in the input
2584
namelenptr where to put the length of the name
2585
errcodeptr where to put an error code
2586
cb pointer to the compile data block
2587
2588
Returns: TRUE if a name was read
2589
FALSE otherwise, with error code set
2590
*/
2591
2592
static BOOL
2593
read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2594
PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2595
int *errorcodeptr, compile_block *cb)
2596
{
2597
PCRE2_SPTR ptr = *ptrptr;
2598
BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2599
BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2600
2601
if (is_braced)
2602
while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2603
2604
if (ptr >= ptrend) /* No characters in name */
2605
{
2606
*errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2607
ERR60; /* Verb not recognized or malformed */
2608
goto FAILED;
2609
}
2610
2611
*nameptr = ptr;
2612
*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2613
2614
/* If this logic were ever to change, the matching function in pcre2_substitute.c
2615
ought to be updated to match. */
2616
2617
/* In UTF mode, a group name may contain letters and decimal digits as defined
2618
by Unicode properties, and underscores, but must not start with a digit. */
2619
2620
#ifdef SUPPORT_UNICODE
2621
if (utf && is_group)
2622
{
2623
uint32_t c, type;
2624
PCRE2_SPTR p = ptr;
2625
2626
GETCHARINC(c, p); /* Peek at next character */
2627
type = UCD_CHARTYPE(c);
2628
2629
if (type == ucp_Nd)
2630
{
2631
ptr = p;
2632
*errorcodeptr = ERR44;
2633
goto FAILED;
2634
}
2635
2636
for(;;)
2637
{
2638
if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2639
c != CHAR_UNDERSCORE) break;
2640
ptr = p; /* Accept character and peek again */
2641
if (p >= ptrend) break;
2642
GETCHARINC(c, p);
2643
type = UCD_CHARTYPE(c);
2644
}
2645
}
2646
else
2647
#else
2648
(void)utf; /* Avoid compiler warning */
2649
#endif /* SUPPORT_UNICODE */
2650
2651
/* Handle non-group names and group names in non-UTF modes. A group name must
2652
not start with a digit. If either of the others start with a digit it just
2653
won't be recognized. */
2654
2655
{
2656
if (is_group && IS_DIGIT(*ptr))
2657
{
2658
++ptr;
2659
*errorcodeptr = ERR44;
2660
goto FAILED;
2661
}
2662
2663
while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2664
{
2665
ptr++;
2666
}
2667
}
2668
2669
/* Check name length */
2670
2671
if (ptr - *nameptr > MAX_NAME_SIZE)
2672
{
2673
*errorcodeptr = ERR48;
2674
goto FAILED;
2675
}
2676
*namelenptr = (uint32_t)(ptr - *nameptr);
2677
2678
/* Subpattern names must not be empty, and their terminator is checked here.
2679
(What follows a verb or alpha assertion name is checked separately.) */
2680
2681
if (is_group)
2682
{
2683
if (ptr == *nameptr)
2684
{
2685
*errorcodeptr = ERR62; /* Subpattern name expected */
2686
goto FAILED;
2687
}
2688
if (is_braced)
2689
while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2690
if (terminator != 0)
2691
{
2692
if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2693
{
2694
*errorcodeptr = ERR42;
2695
goto FAILED;
2696
}
2697
ptr++;
2698
}
2699
}
2700
2701
*ptrptr = ptr;
2702
return TRUE;
2703
2704
FAILED:
2705
*ptrptr = ptr;
2706
return FALSE;
2707
}
2708
2709
2710
2711
/**************************************************
2712
* Parse capturing bracket argument list *
2713
**************************************************/
2714
2715
/* Reads a list of capture references. The references
2716
can be numbers or names.
2717
2718
Arguments:
2719
ptrptr points to the character pointer variable
2720
ptrend points to the end of the input string
2721
utf true if the input is UTF-encoded
2722
parsed_pattern the parsed pattern pointer
2723
offset last known offset
2724
errcodeptr where to put an error code
2725
cb pointer to the compile data block
2726
2727
Returns: updated parsed_pattern pointer on success
2728
NULL otherwise
2729
*/
2730
2731
static uint32_t *
2732
parse_capture_list(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
2733
BOOL utf, uint32_t *parsed_pattern, PCRE2_SIZE offset,
2734
int *errorcodeptr, compile_block *cb)
2735
{
2736
PCRE2_SIZE next_offset;
2737
PCRE2_SPTR ptr = *ptrptr;
2738
PCRE2_SPTR name;
2739
PCRE2_UCHAR terminator;
2740
uint32_t meta, namelen;
2741
int i;
2742
2743
if (ptr >= ptrend || *ptr != CHAR_LEFT_PARENTHESIS)
2744
{
2745
*errorcodeptr = ERR118;
2746
goto FAILED;
2747
}
2748
2749
for (;;)
2750
{
2751
ptr++;
2752
next_offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
2753
2754
if (ptr >= ptrend)
2755
{
2756
*errorcodeptr = ERR117;
2757
goto FAILED;
2758
}
2759
2760
/* Handle [+-]number cases */
2761
if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,
2762
&i, errorcodeptr))
2763
{
2764
PCRE2_ASSERT(i >= 0);
2765
if (i <= 0)
2766
{
2767
*errorcodeptr = ERR15;
2768
goto FAILED;
2769
}
2770
meta = META_CAPTURE_NUMBER;
2771
namelen = (uint32_t)i;
2772
}
2773
else if (*errorcodeptr != 0) goto FAILED; /* Number too big */
2774
else
2775
{
2776
/* Handle 'name' or <name> cases. */
2777
if (*ptr == CHAR_LESS_THAN_SIGN)
2778
terminator = CHAR_GREATER_THAN_SIGN;
2779
else if (*ptr == CHAR_APOSTROPHE)
2780
terminator = CHAR_APOSTROPHE;
2781
else
2782
{
2783
*errorcodeptr = ERR117;
2784
goto FAILED;
2785
}
2786
2787
if (!read_name(&ptr, ptrend, utf, terminator, &next_offset,
2788
&name, &namelen, errorcodeptr, cb)) goto FAILED;
2789
2790
meta = META_CAPTURE_NAME;
2791
}
2792
2793
PCRE2_ASSERT(next_offset > 0);
2794
if (offset == 0 || (next_offset - offset) >= 0x10000)
2795
{
2796
*parsed_pattern++ = META_OFFSET;
2797
PUTOFFSET(next_offset, parsed_pattern);
2798
offset = next_offset;
2799
}
2800
2801
/* The offset is encoded as a relative offset, because for some
2802
inputs such as ",2" in (1,2,3), we only have space for two uint32_t
2803
values, and an opcode and absolute offset may require three uint32_t
2804
values. */
2805
*parsed_pattern++ = meta | (uint32_t)(next_offset - offset);
2806
*parsed_pattern++ = namelen;
2807
offset = next_offset;
2808
2809
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
2810
2811
if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
2812
2813
if (*ptr != CHAR_COMMA)
2814
{
2815
*errorcodeptr = ERR24;
2816
goto FAILED;
2817
}
2818
}
2819
2820
*ptrptr = ptr + 1;
2821
return parsed_pattern;
2822
2823
UNCLOSED_PARENTHESIS:
2824
*errorcodeptr = ERR14;
2825
2826
FAILED:
2827
*ptrptr = ptr;
2828
return NULL;
2829
}
2830
2831
2832
2833
/*************************************************
2834
* Manage callouts at start of cycle *
2835
*************************************************/
2836
2837
/* At the start of a new item in parse_regex() we are able to record the
2838
details of the previous item in a prior callout, and also to set up an
2839
automatic callout if enabled. Avoid having two adjacent automatic callouts,
2840
which would otherwise happen for items such as \Q that contribute nothing to
2841
the parsed pattern.
2842
2843
Arguments:
2844
ptr current pattern pointer
2845
pcalloutptr points to a pointer to previous callout, or NULL
2846
auto_callout TRUE if auto_callouts are enabled
2847
parsed_pattern the parsed pattern pointer
2848
cb compile block
2849
2850
Returns: possibly updated parsed_pattern pointer.
2851
*/
2852
2853
static uint32_t *
2854
manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2855
uint32_t *parsed_pattern, compile_block *cb)
2856
{
2857
uint32_t *previous_callout = *pcalloutptr;
2858
2859
if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2860
cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2861
2862
if (!auto_callout) previous_callout = NULL; else
2863
{
2864
if (previous_callout == NULL ||
2865
previous_callout != parsed_pattern - 4 ||
2866
previous_callout[3] != 255)
2867
{
2868
previous_callout = parsed_pattern; /* Set up new automatic callout */
2869
parsed_pattern += 4;
2870
previous_callout[0] = META_CALLOUT_NUMBER;
2871
previous_callout[2] = 0;
2872
previous_callout[3] = 255;
2873
}
2874
previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2875
}
2876
2877
*pcalloutptr = previous_callout;
2878
return parsed_pattern;
2879
}
2880
2881
2882
2883
/*************************************************
2884
* Handle \d, \D, \s, \S, \w, \W *
2885
*************************************************/
2886
2887
/* This function is called from parse_regex() below, both for freestanding
2888
escapes, and those within classes, to handle those escapes that may change when
2889
Unicode property support is requested. Note that PCRE2_UCP will never be set
2890
without Unicode support because that is checked when pcre2_compile() is called.
2891
2892
Arguments:
2893
escape the ESC_... value
2894
parsed_pattern where to add the code
2895
options options bits
2896
xoptions extra options bits
2897
2898
Returns: updated value of parsed_pattern
2899
*/
2900
static uint32_t *
2901
handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2902
uint32_t xoptions)
2903
{
2904
uint32_t ascii_option = 0;
2905
uint32_t prop = ESC_p;
2906
2907
switch(escape)
2908
{
2909
case ESC_D:
2910
prop = ESC_P;
2911
PCRE2_FALLTHROUGH /* Fall through */
2912
case ESC_d:
2913
ascii_option = PCRE2_EXTRA_ASCII_BSD;
2914
break;
2915
2916
case ESC_S:
2917
prop = ESC_P;
2918
PCRE2_FALLTHROUGH /* Fall through */
2919
case ESC_s:
2920
ascii_option = PCRE2_EXTRA_ASCII_BSS;
2921
break;
2922
2923
case ESC_W:
2924
prop = ESC_P;
2925
PCRE2_FALLTHROUGH /* Fall through */
2926
case ESC_w:
2927
ascii_option = PCRE2_EXTRA_ASCII_BSW;
2928
break;
2929
}
2930
2931
if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2932
{
2933
*parsed_pattern++ = META_ESCAPE + escape;
2934
}
2935
else
2936
{
2937
*parsed_pattern++ = META_ESCAPE + prop;
2938
switch(escape)
2939
{
2940
case ESC_d:
2941
case ESC_D:
2942
*parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2943
break;
2944
2945
case ESC_s:
2946
case ESC_S:
2947
*parsed_pattern++ = PT_SPACE << 16;
2948
break;
2949
2950
case ESC_w:
2951
case ESC_W:
2952
*parsed_pattern++ = PT_WORD << 16;
2953
break;
2954
}
2955
}
2956
2957
return parsed_pattern;
2958
}
2959
2960
2961
2962
/*************************************************
2963
* Maximum size of parsed_pattern for given input *
2964
*************************************************/
2965
2966
/* This function is called from parse_regex() below, to determine the amount
2967
of memory to allocate for parsed_pattern. It is also called to check whether
2968
the amount of data written respects the amount of memory allocated.
2969
2970
Arguments:
2971
ptr points to the start of the pattern
2972
ptrend points to the end of the pattern
2973
utf TRUE in UTF mode
2974
options the options bits
2975
2976
Returns: the number of uint32_t units for parsed_pattern
2977
*/
2978
static ptrdiff_t
2979
max_parsed_pattern(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, BOOL utf,
2980
uint32_t options)
2981
{
2982
PCRE2_SIZE big32count = 0;
2983
ptrdiff_t parsed_size_needed;
2984
2985
/* When PCRE2_AUTO_CALLOUT is not set, in all but one case the number of
2986
unsigned 32-bit ints written out to the parsed pattern is bounded by the length
2987
of the pattern. The exceptional case is when running in 32-bit, non-UTF mode,
2988
when literal characters greater than META_END (0x80000000) have to be coded as
2989
two units. In this case, therefore, we scan the pattern to check for such
2990
values. */
2991
2992
#if PCRE2_CODE_UNIT_WIDTH == 32
2993
if (!utf)
2994
{
2995
PCRE2_SPTR p;
2996
for (p = ptr; p < ptrend; p++) if (*p >= META_END) big32count++;
2997
}
2998
#else
2999
(void)utf; /* Avoid compiler warning */
3000
#endif
3001
3002
parsed_size_needed = (ptrend - ptr) + big32count;
3003
3004
/* When PCRE2_AUTO_CALLOUT is set we have to assume a numerical callout (4
3005
elements) for each character. This is overkill, but memory is plentiful these
3006
days. */
3007
3008
if ((options & PCRE2_AUTO_CALLOUT) != 0)
3009
parsed_size_needed += (ptrend - ptr) * 4;
3010
3011
return parsed_size_needed;
3012
}
3013
3014
3015
3016
/*************************************************
3017
* Parse regex and identify named groups *
3018
*************************************************/
3019
3020
/* This function is called first of all. It scans the pattern and does two
3021
things: (1) It identifies capturing groups and makes a table of named capturing
3022
groups so that information about them is fully available to both the compiling
3023
scans. (2) It writes a parsed version of the pattern with comments omitted and
3024
escapes processed into the parsed_pattern vector.
3025
3026
Arguments:
3027
ptr points to the start of the pattern
3028
options compiling dynamic options (may change during the scan)
xoptions compiling dynamic extra options (PCRE2_EXTRA_xxx bits)
3029
has_lookbehind points to a boolean, set TRUE if a lookbehind is found
3030
cb pointer to the compile data block
3031
3032
Returns: zero on success or a non-zero error code, with the
3033
error offset placed in the cb field
3034
*/
3035
3036
/* A structure and some flags for dealing with nested groups. */

typedef struct nest_save {
  uint16_t  nest_depth;
  uint16_t  reset_group;
  uint16_t  max_group;
  uint16_t  flags;        /* Combination of the NSF_xxx bits below */
  uint32_t  options;
  uint32_t  xoptions;
} nest_save;

#define NSF_RESET          0x0001u
#define NSF_CONDASSERT     0x0002u
#define NSF_ATOMICSR       0x0004u
3050
3051
/* Options that are changeable within the pattern must be tracked during
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
but all must be tracked so that META_OPTIONS items set the correct values for
the main compiling phase. */

#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
  PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
  PCRE2_UNGREEDY)

#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \
  PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \
  PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX)
3063
3064
/* States used for analyzing ranges in character classes. The two OK values
must be last. */

enum {
  RANGE_NO,             /* State after '[' (initial), or '[a-z'; hyphen is literal */
  RANGE_STARTED,        /* State after '[1-'; last-emitted code is META_RANGE_XYZ */
  RANGE_FORBID_NO,      /* State after '[\d'; '-]' is allowed but not '-1]' */
  RANGE_FORBID_STARTED, /* State after '[\d-'*/
  RANGE_OK_ESCAPED,     /* State after '[\1'; hyphen may be a range */
  RANGE_OK_LITERAL      /* State after '[1'; hyphen may be a range */
};
3075
3076
/* States used for analyzing operators and operands in extended character
classes. */

enum {
  CLASS_OP_EMPTY,    /* At start of an expression; empty previous contents */
  CLASS_OP_OPERAND,  /* Have preceding operand; after "z" a "--" can follow */
  CLASS_OP_OPERATOR  /* Have preceding operator; after "--" operand must follow */
};
3084
3085
/* States used for determining the parse mode in character classes. The two
PERL_EXT values must be last. */

enum {
  CLASS_MODE_NORMAL,        /* Ordinary PCRE2 '[...]' class. */
  CLASS_MODE_ALT_EXT,       /* UTS#18-style extended '[...]' class. */
  CLASS_MODE_PERL_EXT,      /* Perl extended '(?[...])' class. */
  CLASS_MODE_PERL_EXT_LEAF  /* Leaf within extended '(?[ [...] ])' class. */
};
3094
3095
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
be quantified. The macro assigns to a variable called okquantifier, which must
therefore be in scope wherever it is used. Both forms expand to more than a
single expression statement, so a call must not be used as the unbraced body
of an "if" or a loop. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if (c >= META_END) *p++ = META_BIGVALUE; \
  *p++ = c; \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;
#endif
3109
3110
/* Here's the actual function. */
3111
3112
static int parse_regex(PCRE2_SPTR ptr, uint32_t options, uint32_t xoptions,
3113
BOOL *has_lookbehind, compile_block *cb)
3114
{
3115
uint32_t c;
3116
uint32_t delimiter;
3117
uint32_t namelen;
3118
uint32_t class_range_state;
3119
uint32_t class_op_state;
3120
uint32_t class_mode_state;
3121
uint32_t *class_start;
3122
uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */
3123
uint32_t *verbstartptr = NULL;
3124
uint32_t *previous_callout = NULL;
3125
uint32_t *parsed_pattern = cb->parsed_pattern;
3126
uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
3127
uint32_t *this_parsed_item = NULL;
3128
uint32_t *prev_parsed_item = NULL;
3129
uint32_t meta_quantifier = 0;
3130
uint32_t add_after_mark = 0;
3131
uint16_t nest_depth = 0;
3132
int16_t class_depth_m1 = -1; /* The m1 means minus 1. */
3133
int16_t class_maxdepth_m1 = -1;
3134
uint16_t hash;
3135
int after_manual_callout = 0;
3136
int expect_cond_assert = 0;
3137
int errorcode = 0;
3138
int escape;
3139
int i;
3140
BOOL inescq = FALSE;
3141
BOOL inverbname = FALSE;
3142
BOOL utf = (options & PCRE2_UTF) != 0;
3143
BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
3144
BOOL is_dupname;
3145
BOOL negate_class;
3146
BOOL okquantifier = FALSE;
3147
PCRE2_SPTR thisptr;
3148
PCRE2_SPTR name;
3149
PCRE2_SPTR ptrend = cb->end_pattern;
3150
PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
3151
PCRE2_SPTR class_range_forbid_ptr = NULL;
3152
named_group *ng;
3153
nest_save *top_nest, *end_nests;
3154
#ifdef PCRE2_DEBUG
3155
uint32_t *parsed_pattern_check;
3156
ptrdiff_t parsed_pattern_extra = 0;
3157
ptrdiff_t parsed_pattern_extra_check = 0;
3158
PCRE2_SPTR ptr_check;
3159
#endif
3160
3161
PCRE2_ASSERT(parsed_pattern != NULL);
3162
3163
/* Insert leading items for word and line matching (features provided for the
3164
benefit of pcre2grep). */
3165
3166
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
3167
{
3168
*parsed_pattern++ = META_CIRCUMFLEX;
3169
*parsed_pattern++ = META_NOCAPTURE;
3170
}
3171
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
3172
{
3173
*parsed_pattern++ = META_ESCAPE + ESC_b;
3174
*parsed_pattern++ = META_NOCAPTURE;
3175
}
3176
3177
#ifdef PCRE2_DEBUG
3178
parsed_pattern_check = parsed_pattern;
3179
ptr_check = ptr;
3180
#endif
3181
3182
/* If the pattern is actually a literal string, process it separately to avoid
3183
cluttering up the main loop. */
3184
3185
if ((options & PCRE2_LITERAL) != 0)
3186
{
3187
while (ptr < ptrend)
3188
{
3189
/* LCOV_EXCL_START */
3190
if (parsed_pattern >= parsed_pattern_end)
3191
{
3192
PCRE2_DEBUG_UNREACHABLE();
3193
errorcode = ERR63; /* Internal error (parsed pattern overflow) */
3194
goto FAILED;
3195
}
3196
/* LCOV_EXCL_STOP */
3197
3198
thisptr = ptr;
3199
GETCHARINCTEST(c, ptr);
3200
if (auto_callout)
3201
parsed_pattern = manage_callouts(thisptr, &previous_callout,
3202
auto_callout, parsed_pattern, cb);
3203
PARSED_LITERAL(c, parsed_pattern);
3204
}
3205
goto PARSED_END;
3206
}
3207
3208
/* Process a real regex which may contain meta-characters. */
3209
3210
top_nest = NULL;
3211
end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
3212
3213
/* The size of the nest_save structure might not be a factor of the size of the
3214
workspace. Therefore we must round down end_nests so as to correctly avoid
3215
creating a nest_save that spans the end of the workspace. */
3216
3217
end_nests = (nest_save *)((char *)end_nests -
3218
((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
3219
3220
/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
3221
3222
if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
3223
3224
/* Now scan the pattern */
3225
3226
while (ptr < ptrend)
3227
{
3228
int prev_expect_cond_assert;
3229
uint32_t min_repeat = 0, max_repeat = 0;
3230
uint32_t set, unset, *optset;
3231
uint32_t xset, xunset, *xoptset;
3232
uint32_t terminator;
3233
uint32_t prev_meta_quantifier;
3234
BOOL prev_okquantifier;
3235
PCRE2_SPTR tempptr;
3236
PCRE2_SIZE offset;
3237
3238
if (nest_depth > cb->cx->parens_nest_limit)
3239
{
3240
errorcode = ERR19;
3241
goto FAILED; /* Parentheses too deeply nested */
3242
}
3243
3244
/* Check that we haven't emitted too much into parsed_pattern. We allocate
3245
a suitably-sized buffer upfront, then do unchecked writes to it. If we only
3246
write a little bit too much, everything will appear to be OK, because the
3247
upfront size is an overestimate... but a malicious pattern could end up
3248
forcing a write past the buffer end. We must catch this during
3249
development. */
3250
3251
#ifdef PCRE2_DEBUG
3252
/* Strong post-write check. Won't help in release builds - at this point
3253
the write has already occurred so it's too late. However, should stop us
3254
committing unsafe code. */
3255
PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
3256
(parsed_pattern_extra - parsed_pattern_extra_check) <=
3257
max_parsed_pattern(ptr_check, ptr, utf, options));
3258
parsed_pattern_check = parsed_pattern;
3259
parsed_pattern_extra_check = parsed_pattern_extra;
3260
ptr_check = ptr;
3261
#endif
3262
3263
/* LCOV_EXCL_START */
3264
if (parsed_pattern >= parsed_pattern_end)
3265
{
3266
/* Weak pre-write check; only ensures parsed_pattern[0] is writeable
3267
(but the code below can write many chars). Better than nothing. */
3268
PCRE2_DEBUG_UNREACHABLE();
3269
errorcode = ERR63; /* Internal error (parsed pattern overflow) */
3270
goto FAILED;
3271
}
3272
/* LCOV_EXCL_STOP */
3273
3274
/* If the last time round this loop something was added, parsed_pattern will
3275
no longer be equal to this_parsed_item. Remember where the previous item
3276
started and reset for the next item. Note that sometimes round the loop,
3277
nothing gets added (e.g. for ignored white space). */
3278
3279
if (this_parsed_item != parsed_pattern)
3280
{
3281
prev_parsed_item = this_parsed_item;
3282
this_parsed_item = parsed_pattern;
3283
}
3284
3285
/* Get next input character, save its position for callout handling. */
3286
3287
thisptr = ptr;
3288
GETCHARINCTEST(c, ptr);
3289
3290
/* Copy quoted literals until \E, allowing for the possibility of automatic
3291
callouts, except when processing a (*VERB) "name". */
3292
3293
if (inescq)
3294
{
3295
if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3296
{
3297
inescq = FALSE;
3298
ptr++; /* Skip E */
3299
}
3300
else
3301
{
3302
if (inverbname)
3303
{ /* Don't use PARSED_LITERAL() because it */
3304
#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
3305
if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3306
#endif
3307
*parsed_pattern++ = c;
3308
}
3309
else
3310
{
3311
if (after_manual_callout-- <= 0)
3312
parsed_pattern = manage_callouts(thisptr, &previous_callout,
3313
auto_callout, parsed_pattern, cb);
3314
PARSED_LITERAL(c, parsed_pattern);
3315
}
3316
meta_quantifier = 0;
3317
}
3318
continue; /* Next character */
3319
}
3320
3321
/* If we are processing the "name" part of a (*VERB:NAME) item, all
3322
characters up to the closing parenthesis are literals except when
3323
PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
3324
and \E and escaped characters are allowed (no character types such as \d). If
3325
PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
3326
this by not entering the special (*VERB:NAME) processing - they are then
3327
picked up below. Note that c is a character, not a code unit, so we must not
3328
use MAX_255 to test its size because MAX_255 tests code units and is assumed
3329
TRUE in 8-bit mode. */
3330
3331
if (inverbname &&
3332
(
3333
/* EITHER: not both options set */
3334
((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
3335
(PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
3336
#ifdef SUPPORT_UNICODE
3337
/* OR: character > 255 AND not Unicode Pattern White Space */
3338
(c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
3339
#endif
3340
/* OR: not a # comment or isspace() white space */
3341
(c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
3342
#ifdef SUPPORT_UNICODE
3343
/* and not CHAR_NEL when Unicode is supported */
3344
&& c != CHAR_NEL
3345
#endif
3346
)))
3347
{
3348
PCRE2_SIZE verbnamelength;
3349
3350
switch(c)
3351
{
3352
default: /* Don't use PARSED_LITERAL() because it */
3353
#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
3354
if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3355
#endif
3356
*parsed_pattern++ = c;
3357
break;
3358
3359
case CHAR_RIGHT_PARENTHESIS:
3360
inverbname = FALSE;
3361
/* This is the length in characters */
3362
verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
3363
/* But the limit on the length is in code units */
3364
if (ptr - verbnamestart - 1 > (int)MAX_MARK)
3365
{
3366
ptr--;
3367
errorcode = ERR76;
3368
goto FAILED;
3369
}
3370
*verblengthptr = (uint32_t)verbnamelength;
3371
3372
/* If this name was on a verb such as (*ACCEPT) which does not continue,
3373
a (*MARK) was generated for the name. We now add the original verb as the
3374
next item. */
3375
3376
if (add_after_mark != 0)
3377
{
3378
*parsed_pattern++ = add_after_mark;
3379
add_after_mark = 0;
3380
}
3381
break;
3382
3383
case CHAR_BACKSLASH:
3384
if ((options & PCRE2_ALT_VERBNAMES) != 0)
3385
{
3386
escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3387
xoptions, cb->bracount, FALSE, cb);
3388
if (errorcode != 0) goto FAILED;
3389
}
3390
else escape = 0; /* Treat all as literal */
3391
3392
switch(escape)
3393
{
3394
case 0: /* Don't use PARSED_LITERAL() because it */
3395
#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
3396
if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3397
#endif
3398
*parsed_pattern++ = c;
3399
break;
3400
3401
case ESC_ub:
3402
*parsed_pattern++ = CHAR_u;
3403
PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3404
break;
3405
3406
case ESC_Q:
3407
inescq = TRUE;
3408
break;
3409
3410
case ESC_E: /* Ignore */
3411
break;
3412
3413
default:
3414
errorcode = ERR40; /* Invalid in verb name */
3415
goto FAILED;
3416
}
3417
}
3418
continue; /* Next character in pattern */
3419
}
3420
3421
/* Not a verb name character. At this point we must process everything that
3422
must not change the quantification state. This is mainly comments, but we
3423
handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3424
A+, as in Perl. An isolated \E is ignored. */
3425
3426
if (c == CHAR_BACKSLASH && ptr < ptrend)
3427
{
3428
if (*ptr == CHAR_Q || *ptr == CHAR_E)
3429
{
3430
/* A literal inside a \Q...\E is not allowed if we are expecting a
3431
conditional assertion, but an empty \Q\E sequence is OK. */
3432
if (expect_cond_assert > 0 && *ptr == CHAR_Q &&
3433
!(ptrend - ptr >= 3 && ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E))
3434
{
3435
ptr--;
3436
errorcode = ERR28;
3437
goto FAILED;
3438
}
3439
inescq = *ptr == CHAR_Q;
3440
ptr++;
3441
continue;
3442
}
3443
}
3444
3445
/* Skip over whitespace and # comments in extended mode. Note that c is a
3446
character, not a code unit, so we must not use MAX_255 to test its size
3447
because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3448
whitespace characters are those designated as "Pattern White Space" by
3449
Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3450
U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3451
subset of space characters that match \h and \v. */
3452
3453
if ((options & PCRE2_EXTENDED) != 0)
3454
{
3455
if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3456
#ifdef SUPPORT_UNICODE
3457
if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3458
#endif
3459
if (c == CHAR_NUMBER_SIGN)
3460
{
3461
while (ptr < ptrend)
3462
{
3463
if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
3464
{ /* IS_NEWLINE sets cb->nllen. */
3465
ptr += cb->nllen;
3466
break;
3467
}
3468
ptr++;
3469
#ifdef SUPPORT_UNICODE
3470
if (utf) FORWARDCHARTEST(ptr, ptrend);
3471
#endif
3472
}
3473
continue; /* Next character in pattern */
3474
}
3475
}
3476
3477
/* Skip over bracketed comments */
3478
3479
if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3480
ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3481
{
3482
while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3483
if (ptr >= ptrend)
3484
{
3485
errorcode = ERR18; /* A special error for missing ) in a comment */
3486
goto FAILED; /* to make it easier to debug. */
3487
}
3488
ptr++;
3489
continue; /* Next character in pattern */
3490
}
3491
3492
/* If the next item is not a quantifier, fill in length of any previous
3493
callout and create an auto callout if required. */
3494
3495
if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3496
(c != CHAR_LEFT_CURLY_BRACKET ||
3497
(tempptr = ptr,
3498
!read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3499
{
3500
if (after_manual_callout-- <= 0)
3501
{
3502
parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3503
parsed_pattern, cb);
3504
this_parsed_item = parsed_pattern; /* New start for current item */
3505
}
3506
}
3507
3508
/* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3509
assertion, possibly preceded by a callout. If the value is 1, we have just
3510
had the callout and expect an assertion. There must be at least 3 more
3511
characters in all cases. When expect_cond_assert is 2, we know that the
3512
current character is an opening parenthesis, as otherwise we wouldn't be
3513
here. However, when it is 1, we need to check, and it's easiest just to check
3514
always. Note that expect_cond_assert may be negative, since all callouts just
3515
decrement it. */
3516
3517
if (expect_cond_assert > 0)
3518
{
3519
BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3520
(ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3521
if (ok)
3522
{
3523
if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */
3524
{
3525
ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3526
}
3527
else switch(ptr[1]) /* Traditional symbolic format */
3528
{
3529
case CHAR_C:
3530
ok = expect_cond_assert == 2;
3531
break;
3532
3533
case CHAR_EQUALS_SIGN:
3534
case CHAR_EXCLAMATION_MARK:
3535
break;
3536
3537
case CHAR_LESS_THAN_SIGN:
3538
ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3539
break;
3540
3541
default:
3542
ok = FALSE;
3543
}
3544
}
3545
3546
if (!ok)
3547
{
3548
errorcode = ERR28;
3549
if (expect_cond_assert == 2) goto FAILED;
3550
goto FAILED_BACK;
3551
}
3552
}
3553
3554
/* Remember whether we are expecting a conditional assertion, and set the
3555
default for this item. */
3556
3557
prev_expect_cond_assert = expect_cond_assert;
3558
expect_cond_assert = 0;
3559
3560
/* Remember quantification status for the previous significant item, then set
3561
default for this item. */
3562
3563
prev_okquantifier = okquantifier;
3564
prev_meta_quantifier = meta_quantifier;
3565
okquantifier = FALSE;
3566
meta_quantifier = 0;
3567
3568
/* If the previous significant item was a quantifier, adjust the parsed code
3569
if there is a following modifier. The base meta value is always followed by
3570
the PLUS and QUERY values, in that order. We do this here rather than after
3571
reading a quantifier so that intervening comments and /x whitespace can be
3572
ignored without having to replicate code. */
3573
3574
if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3575
{
3576
parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3577
prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3578
0x00020000u : 0x00010000u);
3579
continue; /* Next character in pattern */
3580
}
3581
3582
/* Process the next item in the main part of a pattern. */
3583
3584
switch(c)
3585
{
3586
default: /* Non-special character */
3587
PARSED_LITERAL(c, parsed_pattern);
3588
break;
3589
3590
3591
/* ---- Escape sequence ---- */
3592
3593
case CHAR_BACKSLASH:
3594
tempptr = ptr;
3595
escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3596
xoptions, cb->bracount, FALSE, cb);
3597
if (errorcode != 0)
3598
{
3599
ESCAPE_FAILED:
3600
if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3601
goto FAILED;
3602
ptr = tempptr;
3603
if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3604
{
3605
GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3606
}
3607
escape = 0; /* Treat as literal character */
3608
}
3609
3610
/* The escape was a data escape or literal character. */
3611
3612
if (escape == 0)
3613
{
3614
PARSED_LITERAL(c, parsed_pattern);
3615
}
3616
3617
/* The escape was a back (or forward) reference. We keep the offset in
3618
order to give a more useful diagnostic for a bad forward reference. For
3619
references to groups numbered less than 10 we can't use more than two items
3620
in parsed_pattern because they may be just two characters in the input (and
3621
in a 64-bit world an offset may need two elements). So for them, the offset
3622
of the first occurrent is held in a special vector. */
3623
3624
else if (escape < 0)
3625
{
3626
offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
3627
escape = -escape - 1;
3628
*parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3629
if (escape < 10)
3630
{
3631
if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3632
cb->small_ref_offset[escape] = offset;
3633
}
3634
else
3635
{
3636
PUTOFFSET(offset, parsed_pattern);
3637
}
3638
okquantifier = TRUE;
3639
}
3640
3641
/* The escape was a character class such as \d etc. or other special
3642
escape indicator such as \A or \X. Most of them generate just a single
3643
parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3644
value. They are supported only when Unicode is available. The type and
3645
value are packed into a single 32-bit value so that the whole sequences
3646
uses only two elements in the parsed_vector. This is because the same
3647
coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3648
set.
3649
3650
There are also some cases where the escape sequence is followed by a name:
3651
\k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3652
and \g'name' are subroutine calls by name; \g{name} is a synonym for
3653
\k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3654
and returned as a negative value (handled above). A name is coded as an
3655
offset into the pattern and a length. */
3656
3657
else switch (escape)
3658
{
3659
case ESC_C:
3660
#ifdef NEVER_BACKSLASH_C
3661
errorcode = ERR85;
3662
goto ESCAPE_FAILED;
3663
#else
3664
if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3665
{
3666
errorcode = ERR83;
3667
goto ESCAPE_FAILED;
3668
}
3669
#endif
3670
okquantifier = TRUE;
3671
*parsed_pattern++ = META_ESCAPE + escape;
3672
break;
3673
3674
/* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3675
when \u{ is not followed by hex digits and }. It requests two literal
3676
characters, u and { and we need this, as otherwise \u{ 12} (for example)
3677
would be treated as u{12} now that spaces are allowed in quantifiers. */
3678
3679
case ESC_ub:
3680
*parsed_pattern++ = CHAR_u;
3681
PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3682
break;
3683
3684
case ESC_X:
3685
#ifndef SUPPORT_UNICODE
3686
errorcode = ERR45; /* Supported only with Unicode support */
3687
goto ESCAPE_FAILED;
3688
#endif
3689
case ESC_H:
3690
case ESC_h:
3691
case ESC_N:
3692
case ESC_R:
3693
case ESC_V:
3694
case ESC_v:
3695
okquantifier = TRUE;
3696
*parsed_pattern++ = META_ESCAPE + escape;
3697
break;
3698
3699
default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3700
*parsed_pattern++ = META_ESCAPE + escape;
3701
break;
3702
3703
/* Escapes that may change in UCP mode. */
3704
3705
case ESC_d:
3706
case ESC_D:
3707
case ESC_s:
3708
case ESC_S:
3709
case ESC_w:
3710
case ESC_W:
3711
okquantifier = TRUE;
3712
parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3713
xoptions);
3714
break;
3715
3716
/* Unicode property matching */
3717
3718
case ESC_P:
3719
case ESC_p:
3720
#ifdef SUPPORT_UNICODE
3721
{
3722
BOOL negated;
3723
uint16_t ptype = 0, pdata = 0;
3724
if (!get_ucp(&ptr, utf, &negated, &ptype, &pdata, &errorcode, cb))
3725
goto ESCAPE_FAILED;
3726
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3727
*parsed_pattern++ = META_ESCAPE + escape;
3728
*parsed_pattern++ = (ptype << 16) | pdata;
3729
okquantifier = TRUE;
3730
}
3731
#else
3732
errorcode = ERR45;
3733
goto ESCAPE_FAILED;
3734
#endif
3735
break; /* End \P and \p */
3736
3737
/* When \g is used with quotes or angle brackets as delimiters, it is a
3738
numerical or named subroutine call, and control comes here. When used
3739
with brace delimiters it is a numerical back reference and does not come
3740
here because check_escape() returns it directly as a reference. \k is
3741
always a named back reference. */
3742
3743
case ESC_g:
3744
case ESC_k:
3745
if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3746
*ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3747
{
3748
errorcode = (escape == ESC_g)? ERR57 : ERR69;
3749
goto ESCAPE_FAILED;
3750
}
3751
terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3752
CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3753
CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3754
3755
/* For a non-braced \g, check for a numerical recursion. */
3756
3757
if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3758
{
3759
PCRE2_SPTR p = ptr + 1;
3760
3761
if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3762
&errorcode))
3763
{
3764
if (p >= ptrend || *p != terminator)
3765
{
3766
ptr = p;
3767
errorcode = ERR119; /* Missing terminator for number */
3768
goto ESCAPE_FAILED;
3769
}
3770
ptr = p + 1;
3771
goto SET_RECURSION;
3772
}
3773
if (errorcode != 0) goto ESCAPE_FAILED;
3774
}
3775
3776
/* Not a numerical recursion. Perl allows spaces and tabs after { and
3777
before } but not for other delimiters. */
3778
3779
if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3780
&errorcode, cb)) goto ESCAPE_FAILED;
3781
3782
/* \k and \g when used with braces are back references, whereas \g used
3783
with quotes or angle brackets is a recursion */
3784
3785
*parsed_pattern++ =
3786
(escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3787
META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3788
*parsed_pattern++ = namelen;
3789
3790
PUTOFFSET(offset, parsed_pattern);
3791
okquantifier = TRUE;
3792
break; /* End special escape processing */
3793
}
3794
break; /* End escape sequence processing */
3795
3796
3797
/* ---- Single-character special items ---- */
3798
3799
case CHAR_CIRCUMFLEX_ACCENT:
3800
*parsed_pattern++ = META_CIRCUMFLEX;
3801
break;
3802
3803
case CHAR_DOLLAR_SIGN:
3804
*parsed_pattern++ = META_DOLLAR;
3805
break;
3806
3807
case CHAR_DOT:
3808
*parsed_pattern++ = META_DOT;
3809
okquantifier = TRUE;
3810
break;
3811
3812
3813
/* ---- Single-character quantifiers ---- */
3814
3815
case CHAR_ASTERISK:
3816
meta_quantifier = META_ASTERISK;
3817
goto CHECK_QUANTIFIER;
3818
3819
case CHAR_PLUS:
3820
meta_quantifier = META_PLUS;
3821
goto CHECK_QUANTIFIER;
3822
3823
case CHAR_QUESTION_MARK:
3824
meta_quantifier = META_QUERY;
3825
goto CHECK_QUANTIFIER;
3826
3827
3828
/* ---- Potential {n,m} quantifier ---- */
3829
3830
case CHAR_LEFT_CURLY_BRACKET:
3831
if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3832
&errorcode))
3833
{
3834
if (errorcode != 0) goto FAILED; /* Error in quantifier. */
3835
PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */
3836
break; /* No more quantifier processing */
3837
}
3838
meta_quantifier = META_MINMAX;
3839
/* Fall through */
3840
3841
3842
/* ---- Quantifier post-processing ---- */
3843
3844
/* Check that a quantifier is allowed after the previous item. This
3845
guarantees that there is a previous item. */
3846
3847
CHECK_QUANTIFIER:
3848
if (!prev_okquantifier)
3849
{
3850
errorcode = ERR9;
3851
goto FAILED;
3852
}
3853
3854
/* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3855
quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3856
sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3857
wrapping it in non-capturing brackets, but we have to allow for a preceding
3858
(*MARK) for when (*ACCEPT) has an argument. */
3859
3860
if (*prev_parsed_item == META_ACCEPT)
3861
{
3862
uint32_t *p;
3863
for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3864
*verbstartptr = META_NOCAPTURE;
3865
parsed_pattern[1] = META_KET;
3866
parsed_pattern += 2;
3867
3868
#ifdef PCRE2_DEBUG
3869
PCRE2_ASSERT(parsed_pattern_extra >= 2);
3870
parsed_pattern_extra -= 2;
3871
#endif
3872
}
3873
3874
/* Now we can put the quantifier into the parsed pattern vector. At this
3875
stage, we have only the basic quantifier. The check for a following + or ?
3876
modifier happens at the top of the loop, after any intervening comments
3877
have been removed. */
3878
3879
*parsed_pattern++ = meta_quantifier;
3880
if (c == CHAR_LEFT_CURLY_BRACKET)
3881
{
3882
*parsed_pattern++ = min_repeat;
3883
*parsed_pattern++ = max_repeat;
3884
}
3885
break;
3886
3887
3888
/* ---- Character class ---- */
3889
3890
case CHAR_LEFT_SQUARE_BRACKET:
3891
3892
/* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3893
used for "start of word" and "end of word". As these are otherwise illegal
3894
sequences, we don't break anything by recognizing them. They are replaced
3895
by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3896
erroneous and are handled by the normal code below. */
3897
3898
if (ptrend - ptr >= 6 &&
3899
(PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3900
PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3901
{
3902
*parsed_pattern++ = META_ESCAPE + ESC_b;
3903
3904
if (ptr[2] == CHAR_LESS_THAN_SIGN)
3905
{
3906
*parsed_pattern++ = META_LOOKAHEAD;
3907
}
3908
else
3909
{
3910
*parsed_pattern++ = META_LOOKBEHIND;
3911
*has_lookbehind = TRUE;
3912
3913
/* The offset is used only for the "non-fixed length" error; this won't
3914
occur here, so just store zero. */
3915
3916
PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3917
}
3918
3919
if ((options & PCRE2_UCP) == 0)
3920
*parsed_pattern++ = META_ESCAPE + ESC_w;
3921
else
3922
{
3923
*parsed_pattern++ = META_ESCAPE + ESC_p;
3924
*parsed_pattern++ = PT_WORD << 16;
3925
}
3926
*parsed_pattern++ = META_KET;
3927
ptr += 6;
3928
okquantifier = TRUE;
3929
break;
3930
}
3931
3932
/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3933
they are encountered at the top level, so we'll do that too. */
3934
3935
if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3936
*ptr == CHAR_EQUALS_SIGN) &&
3937
check_posix_syntax(ptr, ptrend, &tempptr))
3938
{
3939
errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3940
ptr = tempptr + 2;
3941
goto FAILED;
3942
}
3943
3944
class_mode_state = ((options & PCRE2_ALT_EXTENDED_CLASS) != 0)?
3945
CLASS_MODE_ALT_EXT : CLASS_MODE_NORMAL;
3946
3947
/* Jump here from '(?[...])'. That jump must initialize class_mode_state,
3948
set c to the '[' character, and ptr to just after the '['. */
3949
3950
FROM_PERL_EXTENDED_CLASS:
3951
okquantifier = TRUE;
3952
3953
/* In an EBCDIC environment, Perl treats alphabetic ranges specially
3954
because there are holes in the encoding, and simply using the range A-Z
3955
(for example) would include the characters in the holes. This applies only
3956
to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3957
in this respect. In order to accommodate this, we keep track of whether
3958
character values are literal or not, and a state variable for handling
3959
ranges. */
3960
3961
/* Loop for the contents of the class. Classes may be nested, if
3962
PCRE2_ALT_EXTENDED_CLASS is set, or the class is of the form (?[...]). */
3963
3964
/* c is still set to '[' so the loop will handle the start of the class. */
3965
3966
class_depth_m1 = -1;
3967
class_maxdepth_m1 = -1;
3968
class_range_state = RANGE_NO;
3969
class_op_state = CLASS_OP_EMPTY;
3970
class_start = NULL;
3971
3972
for (;;)
3973
{
3974
BOOL char_is_literal = TRUE;
3975
3976
/* Inside \Q...\E everything is literal except \E */
3977
3978
if (inescq)
3979
{
3980
if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3981
{
3982
inescq = FALSE; /* Reset literal state */
3983
ptr++; /* Skip the 'E' */
3984
goto CLASS_CONTINUE;
3985
}
3986
3987
/* Surprisingly, you cannot use \Q..\E to escape a character inside a
3988
Perl extended class. However, empty \Q\E sequences are allowed, so here
3989
we're only giving an error if the \Q..\E is non-empty. */
3990
3991
if (class_mode_state == CLASS_MODE_PERL_EXT)
3992
{
3993
errorcode = ERR116;
3994
goto FAILED;
3995
}
3996
3997
goto CLASS_LITERAL;
3998
}
3999
4000
/* Skip over space and tab (only) in extended-more mode, or anywhere
4001
inside a Perl extended class (which implies /xx). */
4002
4003
if ((c == CHAR_SPACE || c == CHAR_HT) &&
4004
((options & PCRE2_EXTENDED_MORE) != 0 ||
4005
class_mode_state >= CLASS_MODE_PERL_EXT))
4006
goto CLASS_CONTINUE;
4007
4008
/* Handle POSIX class names. Perl allows a negation extension of the
4009
form [:^name:]. A square bracket that doesn't match the syntax is
4010
treated as a literal. We also recognize the POSIX constructions
4011
[.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4012
5.6 and 5.8 do. */
4013
4014
if (class_depth_m1 >= 0 &&
4015
c == CHAR_LEFT_SQUARE_BRACKET &&
4016
ptrend - ptr >= 3 &&
4017
(*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
4018
*ptr == CHAR_EQUALS_SIGN) &&
4019
check_posix_syntax(ptr, ptrend, &tempptr))
4020
{
4021
BOOL posix_negate = FALSE;
4022
int posix_class;
4023
4024
/* Perl treats a hyphen before a POSIX class as a literal, not the
4025
start of a range. However, it gives a warning in its warning mode. PCRE
4026
does not have a warning mode, so we give an error, because this is
4027
likely an error on the user's part. */
4028
4029
if (class_range_state == RANGE_STARTED)
4030
{
4031
ptr = tempptr + 2;
4032
errorcode = ERR50;
4033
goto FAILED;
4034
}
4035
4036
/* Perl treats a hyphen after a POSIX class as a literal, not the
4037
start of a range. However, it gives a warning in its warning mode
4038
unless the hyphen is the last character in the class. PCRE does not
4039
have a warning mode, so we give an error, because this is likely an
4040
error on the user's part.
4041
4042
Roll back to the hyphen for the error position. */
4043
4044
if (class_range_state == RANGE_FORBID_STARTED)
4045
{
4046
ptr = class_range_forbid_ptr;
4047
errorcode = ERR50;
4048
goto FAILED;
4049
}
4050
4051
/* Disallow implicit union in Perl extended classes. */
4052
4053
if (class_op_state == CLASS_OP_OPERAND &&
4054
class_mode_state == CLASS_MODE_PERL_EXT)
4055
{
4056
ptr = tempptr + 2;
4057
errorcode = ERR113;
4058
goto FAILED;
4059
}
4060
4061
if (*ptr != CHAR_COLON)
4062
{
4063
ptr = tempptr + 2;
4064
errorcode = ERR13;
4065
goto FAILED;
4066
}
4067
4068
if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
4069
{
4070
posix_negate = TRUE;
4071
ptr++;
4072
}
4073
4074
posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4075
ptr = tempptr + 2;
4076
if (posix_class < 0)
4077
{
4078
errorcode = ERR30;
4079
goto FAILED;
4080
}
4081
4082
/* Set "a hyphen is forbidden to be the start of a range". For the '-]'
4083
case, the hyphen is treated as a literal, but for '-1' it is disallowed
4084
(because it would be interpreted as a range). */
4085
4086
class_range_state = RANGE_FORBID_NO;
4087
class_op_state = CLASS_OP_OPERAND;
4088
4089
/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
4090
of the POSIX classes are converted to use Unicode properties \p or \P
4091
or, in one case, \h or \H. The substitutes table has two values per
4092
class, containing the type and value of a \p or \P item. The special
4093
cases are specified with a negative type: a non-zero value causes \h or
4094
\H to be used, and a zero value falls through to behave like a non-UCP
4095
POSIX class. There are now also some extra options that force ASCII for
4096
some classes. */
4097
4098
#ifdef SUPPORT_UNICODE
4099
if ((options & PCRE2_UCP) != 0 &&
4100
(xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
4101
!((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
4102
(posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
4103
{
4104
int ptype = posix_substitutes[2*posix_class];
4105
int pvalue = posix_substitutes[2*posix_class + 1];
4106
4107
if (ptype >= 0)
4108
{
4109
*parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
4110
*parsed_pattern++ = (ptype << 16) | pvalue;
4111
goto CLASS_CONTINUE;
4112
}
4113
4114
if (pvalue != 0)
4115
{
4116
*parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
4117
goto CLASS_CONTINUE;
4118
}
4119
4120
/* Fall through */
4121
}
4122
#endif /* SUPPORT_UNICODE */
4123
4124
/* Non-UCP POSIX class */
4125
4126
*parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
4127
*parsed_pattern++ = posix_class;
4128
}
4129
4130
/* Check for the start of the outermost class, or the start of a nested class. */
4131
4132
else if ((c == CHAR_LEFT_SQUARE_BRACKET &&
4133
(class_depth_m1 < 0 || class_mode_state == CLASS_MODE_ALT_EXT ||
4134
class_mode_state == CLASS_MODE_PERL_EXT)) ||
4135
(c == CHAR_LEFT_PARENTHESIS &&
4136
class_mode_state == CLASS_MODE_PERL_EXT))
4137
{
4138
uint32_t start_c = c;
4139
uint32_t new_class_mode_state;
4140
4141
/* Update the class mode, if moving into a 'leaf' inside a Perl extended
4142
class. */
4143
4144
if (start_c == CHAR_LEFT_SQUARE_BRACKET &&
4145
class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 >= 0)
4146
new_class_mode_state = CLASS_MODE_PERL_EXT_LEAF;
4147
else
4148
new_class_mode_state = class_mode_state;
4149
4150
/* Tidy up the other class before starting the nested class. */
4151
/* -[ beginning a nested class is a literal '-' */
4152
4153
if (class_range_state == RANGE_STARTED)
4154
parsed_pattern[-1] = CHAR_MINUS;
4155
4156
/* Disallow implicit union in Perl extended classes. */
4157
4158
if (class_op_state == CLASS_OP_OPERAND &&
4159
class_mode_state == CLASS_MODE_PERL_EXT)
4160
{
4161
errorcode = ERR113;
4162
goto FAILED;
4163
}
4164
4165
/* Validate nesting depth */
4166
if (class_depth_m1 >= ECLASS_NEST_LIMIT - 1)
4167
{
4168
ptr--; /* Point rightwards at the paren, same as ERR19. */
4169
errorcode = ERR107; /* Classes too deeply nested */
4170
goto FAILED;
4171
}
4172
4173
/* Process the character class start. If the first character is '^', set
4174
the negation flag. If the first few characters (either before or after ^)
4175
are \Q\E or \E or space or tab in extended-more mode, we skip them too.
4176
This makes for compatibility with Perl. */
4177
4178
negate_class = FALSE;
4179
for (;;)
4180
{
4181
if (ptr >= ptrend)
4182
{
4183
if (start_c == CHAR_LEFT_PARENTHESIS)
4184
errorcode = ERR14; /* Missing terminating ')' */
4185
else
4186
errorcode = ERR6; /* Missing terminating ']' */
4187
goto FAILED;
4188
}
4189
4190
GETCHARINCTEST(c, ptr);
4191
if (new_class_mode_state == CLASS_MODE_PERL_EXT) break;
4192
else if (c == CHAR_BACKSLASH)
4193
{
4194
if (ptr < ptrend && *ptr == CHAR_E) ptr++;
4195
else if (ptrend - ptr >= 3 &&
4196
PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4197
ptr += 3;
4198
else
4199
break;
4200
}
4201
else if ((c == CHAR_SPACE || c == CHAR_HT) && /* Note: just these two */
4202
((options & PCRE2_EXTENDED_MORE) != 0 ||
4203
new_class_mode_state >= CLASS_MODE_PERL_EXT))
4204
continue;
4205
else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4206
negate_class = TRUE;
4207
else break;
4208
}
4209
4210
/* Now the real contents of the class; c has the first "real" character.
4211
Empty classes are permitted only if the option is set, and if it's not
4212
a Perl-extended class. */
4213
4214
if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4215
(cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0 &&
4216
new_class_mode_state < CLASS_MODE_PERL_EXT)
4217
{
4218
PCRE2_ASSERT(start_c == CHAR_LEFT_SQUARE_BRACKET);
4219
4220
if (class_start != NULL)
4221
{
4222
PCRE2_ASSERT(class_depth_m1 >= 0);
4223
/* Represents that the class is an extended class. */
4224
*class_start |= CLASS_IS_ECLASS;
4225
class_start = NULL;
4226
}
4227
4228
*parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
4229
4230
/* Leave nesting depth unchanged; but check for zero depth to handle the
4231
very first (top-level) class being empty. */
4232
if (class_depth_m1 < 0) break;
4233
4234
class_range_state = RANGE_NO; /* for processing the containing class */
4235
class_op_state = CLASS_OP_OPERAND;
4236
goto CLASS_CONTINUE;
4237
}
4238
4239
/* Enter a non-empty class. */
4240
4241
if (class_start != NULL)
4242
{
4243
PCRE2_ASSERT(class_depth_m1 >= 0);
4244
/* Represents that the class is an extended class. */
4245
*class_start |= CLASS_IS_ECLASS;
4246
class_start = NULL;
4247
}
4248
4249
class_start = parsed_pattern;
4250
*parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
4251
class_range_state = RANGE_NO;
4252
class_op_state = CLASS_OP_EMPTY;
4253
class_mode_state = new_class_mode_state;
4254
++class_depth_m1;
4255
if (class_maxdepth_m1 < class_depth_m1)
4256
class_maxdepth_m1 = class_depth_m1;
4257
/* Reset; no op seen yet at new depth. */
4258
cb->class_op_used[class_depth_m1] = 0;
4259
4260
/* Implement the special start-of-class literal meaning of ']'. */
4261
if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4262
new_class_mode_state != CLASS_MODE_PERL_EXT)
4263
{
4264
class_range_state = RANGE_OK_LITERAL;
4265
class_op_state = CLASS_OP_OPERAND;
4266
PARSED_LITERAL(c, parsed_pattern);
4267
goto CLASS_CONTINUE;
4268
}
4269
4270
continue; /* We have already loaded c with the next character */
4271
}
4272
4273
/* Check for the end of the class. */
4274
4275
else if (c == CHAR_RIGHT_SQUARE_BRACKET ||
4276
(c == CHAR_RIGHT_PARENTHESIS && class_mode_state == CLASS_MODE_PERL_EXT))
4277
{
4278
/* In Perl extended mode, the ']' can only be used to match the
4279
opening '[', and ')' must match an opening parenthesis. */
4280
if (class_mode_state == CLASS_MODE_PERL_EXT)
4281
{
4282
if (c == CHAR_RIGHT_SQUARE_BRACKET && class_depth_m1 != 0)
4283
{
4284
errorcode = ERR14;
4285
ptr--; /* Correct the offset */
4286
goto FAILED;
4287
}
4288
if (c == CHAR_RIGHT_PARENTHESIS && class_depth_m1 < 1)
4289
{
4290
errorcode = ERR22;
4291
goto FAILED;
4292
}
4293
}
4294
4295
/* Check no trailing operator. */
4296
if (class_op_state == CLASS_OP_OPERATOR)
4297
{
4298
errorcode = ERR110;
4299
goto FAILED;
4300
}
4301
4302
/* Check no empty expression for Perl extended expressions. */
4303
if (class_mode_state == CLASS_MODE_PERL_EXT &&
4304
class_op_state == CLASS_OP_EMPTY)
4305
{
4306
errorcode = ERR114;
4307
goto FAILED;
4308
}
4309
4310
/* -] at the end of a class is a literal '-' */
4311
if (class_range_state == RANGE_STARTED)
4312
parsed_pattern[-1] = CHAR_MINUS;
4313
4314
*parsed_pattern++ = META_CLASS_END;
4315
4316
if (--class_depth_m1 < 0)
4317
{
4318
/* Check for and consume ')' after '(?[...]'. */
4319
PCRE2_ASSERT(class_mode_state != CLASS_MODE_PERL_EXT_LEAF);
4320
if (class_mode_state == CLASS_MODE_PERL_EXT)
4321
{
4322
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4323
{
4324
errorcode = ERR115;
4325
goto FAILED;
4326
}
4327
4328
ptr++;
4329
}
4330
4331
break;
4332
}
4333
4334
class_range_state = RANGE_NO; /* for processing the containing class */
4335
class_op_state = CLASS_OP_OPERAND;
4336
if (class_mode_state == CLASS_MODE_PERL_EXT_LEAF)
4337
class_mode_state = CLASS_MODE_PERL_EXT;
4338
/* The extended class flag has already
4339
been set for the parent class. */
4340
class_start = NULL;
4341
}
4342
4343
/* Handle a Perl set binary operator */
4344
4345
else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4346
(c == CHAR_PLUS || c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4347
c == CHAR_AMPERSAND || c == CHAR_CIRCUMFLEX_ACCENT))
4348
{
4349
/* Check that there was a preceding operand. */
4350
if (class_op_state != CLASS_OP_OPERAND)
4351
{
4352
errorcode = ERR109;
4353
goto FAILED;
4354
}
4355
4356
if (class_start != NULL)
4357
{
4358
PCRE2_ASSERT(class_depth_m1 >= 0);
4359
/* Represents that the class is an extended class. */
4360
*class_start |= CLASS_IS_ECLASS;
4361
class_start = NULL;
4362
}
4363
4364
PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4365
class_range_state != RANGE_FORBID_STARTED);
4366
4367
*parsed_pattern++ = c == CHAR_PLUS? META_ECLASS_OR :
4368
c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4369
c == CHAR_MINUS? META_ECLASS_SUB :
4370
c == CHAR_AMPERSAND? META_ECLASS_AND :
4371
META_ECLASS_XOR;
4372
class_range_state = RANGE_NO;
4373
class_op_state = CLASS_OP_OPERATOR;
4374
}
4375
4376
/* Handle a Perl set unary operator */
4377
4378
else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4379
c == CHAR_EXCLAMATION_MARK)
4380
{
4381
/* Check that the "!" has not got a preceding operand (i.e. it's the
4382
start of the class, or follows an operator). */
4383
if (class_op_state == CLASS_OP_OPERAND)
4384
{
4385
errorcode = ERR113;
4386
goto FAILED;
4387
}
4388
4389
if (class_start != NULL)
4390
{
4391
PCRE2_ASSERT(class_depth_m1 >= 0);
4392
/* Represents that the class is an extended class. */
4393
*class_start |= CLASS_IS_ECLASS;
4394
class_start = NULL;
4395
}
4396
4397
PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4398
class_range_state != RANGE_FORBID_STARTED);
4399
4400
*parsed_pattern++ = META_ECLASS_NOT;
4401
class_range_state = RANGE_NO;
4402
class_op_state = CLASS_OP_OPERATOR;
4403
}
4404
4405
/* Handle a UTS#18 set operator */
4406
4407
else if (class_mode_state == CLASS_MODE_ALT_EXT &&
4408
(c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4409
c == CHAR_AMPERSAND || c == CHAR_TILDE) &&
4410
ptr < ptrend && *ptr == c)
4411
{
4412
++ptr;
4413
4414
/* Check there isn't a triple-repetition. */
4415
if (ptr < ptrend && *ptr == c)
4416
{
4417
while (ptr < ptrend && *ptr == c) ++ptr; /* Improve error offset. */
4418
errorcode = ERR108;
4419
goto FAILED;
4420
}
4421
4422
/* Check for a preceding operand. */
4423
if (class_op_state != CLASS_OP_OPERAND)
4424
{
4425
errorcode = ERR109;
4426
goto FAILED;
4427
}
4428
4429
/* Check for mixed precedence. Forbid [A--B&&C]. */
4430
if (cb->class_op_used[class_depth_m1] != 0 &&
4431
cb->class_op_used[class_depth_m1] != (uint8_t)c)
4432
{
4433
errorcode = ERR111;
4434
goto FAILED;
4435
}
4436
4437
if (class_start != NULL)
4438
{
4439
PCRE2_ASSERT(class_depth_m1 >= 0);
4440
/* Represents that the class is an extended class. */
4441
*class_start |= CLASS_IS_ECLASS;
4442
class_start = NULL;
4443
}
4444
4445
/* Dangling '-' before an operator is a literal */
4446
if (class_range_state == RANGE_STARTED)
4447
parsed_pattern[-1] = CHAR_MINUS;
4448
4449
*parsed_pattern++ = c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4450
c == CHAR_MINUS? META_ECLASS_SUB :
4451
c == CHAR_AMPERSAND? META_ECLASS_AND :
4452
META_ECLASS_XOR;
4453
class_range_state = RANGE_NO;
4454
class_op_state = CLASS_OP_OPERATOR;
4455
cb->class_op_used[class_depth_m1] = (uint8_t)c;
4456
}
4457
4458
/* Handle escapes in a class */
4459
4460
else if (c == CHAR_BACKSLASH)
4461
{
4462
tempptr = ptr;
4463
escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
4464
xoptions, cb->bracount, TRUE, cb);
4465
4466
if (errorcode != 0)
4467
{
4468
if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0 ||
4469
class_mode_state >= CLASS_MODE_PERL_EXT)
4470
goto FAILED;
4471
ptr = tempptr;
4472
if (ptr >= ptrend) c = CHAR_BACKSLASH; else
4473
{
4474
GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
4475
}
4476
escape = 0; /* Treat as literal character */
4477
}
4478
4479
switch(escape)
4480
{
4481
case 0: /* Escaped character code point is in c */
4482
char_is_literal = FALSE;
4483
goto CLASS_LITERAL; /* (a few lines above) */
4484
4485
case ESC_b:
4486
c = CHAR_BS; /* \b is backspace in a class */
4487
char_is_literal = FALSE;
4488
goto CLASS_LITERAL;
4489
4490
case ESC_k:
4491
c = CHAR_k; /* \k is not special in a class, just like \g */
4492
char_is_literal = FALSE;
4493
goto CLASS_LITERAL;
4494
4495
case ESC_Q:
4496
inescq = TRUE; /* Enter literal mode */
4497
goto CLASS_CONTINUE;
4498
4499
case ESC_E: /* Ignore orphan \E */
4500
goto CLASS_CONTINUE;
4501
4502
case ESC_B: /* Always an error in a class */
4503
case ESC_R:
4504
case ESC_X:
4505
errorcode = ERR7;
4506
goto FAILED;
4507
4508
case ESC_N: /* Not permitted by Perl either */
4509
errorcode = ERR71;
4510
goto FAILED;
4511
4512
case ESC_H:
4513
case ESC_h:
4514
case ESC_V:
4515
case ESC_v:
4516
*parsed_pattern++ = META_ESCAPE + escape;
4517
break;
4518
4519
/* These escapes may be converted to Unicode property tests when
4520
PCRE2_UCP is set. */
4521
4522
case ESC_d:
4523
case ESC_D:
4524
case ESC_s:
4525
case ESC_S:
4526
case ESC_w:
4527
case ESC_W:
4528
parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
4529
xoptions);
4530
break;
4531
4532
/* Explicit Unicode property matching */
4533
4534
case ESC_P:
4535
case ESC_p:
4536
#ifdef SUPPORT_UNICODE
4537
{
4538
BOOL negated;
4539
uint16_t ptype = 0, pdata = 0;
4540
if (!get_ucp(&ptr, utf, &negated, &ptype, &pdata, &errorcode, cb))
4541
goto FAILED;
4542
4543
/* In caseless matching, particular characteristics Lu, Ll, and Lt
4544
get converted to the general characteristic L&. That is, upper,
4545
lower, and title case letters are all conflated. */
4546
4547
if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
4548
(pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
4549
{
4550
ptype = PT_LAMP;
4551
pdata = 0;
4552
}
4553
4554
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
4555
*parsed_pattern++ = META_ESCAPE + escape;
4556
*parsed_pattern++ = (ptype << 16) | pdata;
4557
}
4558
#else
4559
errorcode = ERR45;
4560
goto FAILED;
4561
#endif
4562
break; /* End \P and \p */
4563
4564
/* All others are not allowed in a class */
4565
4566
/* LCOV_EXCL_START */
4567
default:
4568
PCRE2_DEBUG_UNREACHABLE();
4569
PCRE2_FALLTHROUGH /* Fall through */
4570
/* LCOV_EXCL_STOP */
4571
4572
case ESC_A:
4573
case ESC_Z:
4574
case ESC_z:
4575
case ESC_G:
4576
case ESC_K:
4577
case ESC_C:
4578
errorcode = ERR7;
4579
goto FAILED;
4580
}
4581
4582
/* All the switch-cases above which end in "break" describe a set
4583
of characters. None may start a range. */
4584
4585
/* The second part of a range can be a single-character escape
4586
sequence (detected above), but not any of the other escapes. Perl
4587
treats a hyphen as a literal in such circumstances. However, in Perl's
4588
warning mode, a warning is given, so PCRE now faults it, as it is
4589
almost certainly a mistake on the user's part. */
4590
4591
if (class_range_state == RANGE_STARTED)
4592
{
4593
errorcode = ERR50;
4594
goto FAILED;
4595
}
4596
4597
/* Perl gives a warning unless the hyphen following a multi-character
4598
escape is the last character in the class. PCRE throws an error. */
4599
4600
if (class_range_state == RANGE_FORBID_STARTED)
4601
{
4602
ptr = class_range_forbid_ptr;
4603
errorcode = ERR50;
4604
goto FAILED;
4605
}
4606
4607
/* Disallow implicit union in Perl extended classes. */
4608
4609
if (class_op_state == CLASS_OP_OPERAND &&
4610
class_mode_state == CLASS_MODE_PERL_EXT)
4611
{
4612
errorcode = ERR113;
4613
goto FAILED;
4614
}
4615
4616
class_range_state = RANGE_FORBID_NO;
4617
class_op_state = CLASS_OP_OPERAND;
4618
}
4619
4620
/* Forbid unescaped literals, and the special meaning of '-', inside a
4621
Perl extended class. */
4622
4623
else if (class_mode_state == CLASS_MODE_PERL_EXT)
4624
{
4625
errorcode = ERR116;
4626
goto FAILED;
4627
}
4628
4629
/* Handle potential start of range */
4630
4631
else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
4632
{
4633
*parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
4634
META_RANGE_LITERAL : META_RANGE_ESCAPED;
4635
class_range_state = RANGE_STARTED;
4636
}
4637
4638
/* Handle forbidden start of range */
4639
4640
else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)
4641
{
4642
*parsed_pattern++ = CHAR_MINUS;
4643
class_range_state = RANGE_FORBID_STARTED;
4644
class_range_forbid_ptr = ptr;
4645
}
4646
4647
/* Handle a literal character */
4648
4649
else
4650
{
4651
CLASS_LITERAL:
4652
4653
/* Disallow implicit union in Perl extended classes. */
4654
4655
if (class_op_state == CLASS_OP_OPERAND &&
4656
class_mode_state == CLASS_MODE_PERL_EXT)
4657
{
4658
errorcode = ERR113;
4659
goto FAILED;
4660
}
4661
4662
if (class_range_state == RANGE_STARTED)
4663
{
4664
if (c == parsed_pattern[-2]) /* Optimize one-char range */
4665
parsed_pattern--;
4666
else if (parsed_pattern[-2] > c) /* Check range is in order */
4667
{
4668
errorcode = ERR8;
4669
goto FAILED;
4670
}
4671
else
4672
{
4673
if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
4674
parsed_pattern[-1] = META_RANGE_ESCAPED;
4675
PARSED_LITERAL(c, parsed_pattern);
4676
}
4677
class_range_state = RANGE_NO;
4678
class_op_state = CLASS_OP_OPERAND;
4679
}
4680
else if (class_range_state == RANGE_FORBID_STARTED)
4681
{
4682
ptr = class_range_forbid_ptr;
4683
errorcode = ERR50;
4684
goto FAILED;
4685
}
4686
else /* Potential start of range */
4687
{
4688
class_range_state = char_is_literal?
4689
RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
4690
class_op_state = CLASS_OP_OPERAND;
4691
PARSED_LITERAL(c, parsed_pattern);
4692
}
4693
}
4694
4695
/* Proceed to next thing in the class. */
4696
4697
CLASS_CONTINUE:
4698
if (ptr >= ptrend)
4699
{
4700
if (class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 > 0)
4701
errorcode = ERR14; /* Missing terminating ')' */
4702
if (class_mode_state == CLASS_MODE_ALT_EXT &&
4703
class_depth_m1 == 0 && class_maxdepth_m1 == 1)
4704
errorcode = ERR112; /* Missing terminating ']', but we saw '[ [ ]...' */
4705
else
4706
errorcode = ERR6; /* Missing terminating ']' */
4707
goto FAILED;
4708
}
4709
GETCHARINCTEST(c, ptr);
4710
} /* End of class-processing loop */
4711
4712
break; /* End of character class */
4713
4714
4715
/* ---- Opening parenthesis ---- */
4716
4717
case CHAR_LEFT_PARENTHESIS:
4718
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4719
4720
/* If ( is not followed by ? it is either a capture or a special verb or an
4721
alpha assertion or a positive non-atomic lookahead. */
4722
4723
if (*ptr != CHAR_QUESTION_MARK)
4724
{
4725
const char *vn;
4726
4727
/* Handle capturing brackets (or non-capturing if auto-capture is turned
4728
off). */
4729
4730
if (*ptr != CHAR_ASTERISK)
4731
{
4732
nest_depth++;
4733
if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
4734
{
4735
if (cb->bracount >= MAX_GROUP_NUMBER)
4736
{
4737
errorcode = ERR97;
4738
goto FAILED;
4739
}
4740
cb->bracount++;
4741
*parsed_pattern++ = META_CAPTURE | cb->bracount;
4742
}
4743
else *parsed_pattern++ = META_NOCAPTURE;
4744
}
4745
4746
/* Do nothing for (* followed by end of pattern or ) so it gives a "bad
4747
quantifier" error rather than "(*MARK) must have an argument". */
4748
4749
else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
4750
break;
4751
4752
/* Handle "alpha assertions" such as (*pla:...). Most of these are
4753
synonyms for the historical symbolic assertions, but the script run and
4754
non-atomic lookaround ones are new. They are distinguished by starting
4755
with a lower case letter. Checking both ends of the alphabet makes this
4756
work in all character codes. */
4757
4758
else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
4759
{
4760
uint32_t meta;
4761
4762
vn = alasnames;
4763
if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4764
&errorcode, cb)) goto FAILED;
4765
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4766
if (*ptr != CHAR_COLON)
4767
{
4768
errorcode = ERR95; /* Malformed */
4769
goto FAILED_FORWARD;
4770
}
4771
4772
/* Scan the table of alpha assertion names */
4773
4774
for (i = 0; i < alascount; i++)
4775
{
4776
if (namelen == alasmeta[i].len &&
4777
PRIV(strncmp_c8)(name, vn, namelen) == 0)
4778
break;
4779
vn += alasmeta[i].len + 1;
4780
}
4781
4782
if (i >= alascount)
4783
{
4784
errorcode = ERR95; /* Alpha assertion not recognized */
4785
goto FAILED;
4786
}
4787
4788
/* Check for expecting an assertion condition. If so, only atomic
4789
lookaround assertions are valid. */
4790
4791
meta = alasmeta[i].meta;
4792
if (prev_expect_cond_assert > 0 &&
4793
(meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
4794
{
4795
errorcode = ERR28; /* Atomic assertion expected */
4796
goto FAILED;
4797
}
4798
4799
/* The lookaround alphabetic synonyms can mostly be handled by jumping
4800
to the code that handles the traditional symbolic forms. */
4801
4802
switch(meta)
4803
{
4804
/* LCOV_EXCL_START */
4805
default:
4806
PCRE2_DEBUG_UNREACHABLE();
4807
errorcode = ERR89; /* Unknown code; should never occur because */
4808
goto FAILED; /* the meta values come from a table above. */
4809
/* LCOV_EXCL_STOP */
4810
4811
case META_ATOMIC:
4812
goto ATOMIC_GROUP;
4813
4814
case META_LOOKAHEAD:
4815
goto POSITIVE_LOOK_AHEAD;
4816
4817
case META_LOOKAHEAD_NA:
4818
goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4819
4820
case META_LOOKAHEADNOT:
4821
goto NEGATIVE_LOOK_AHEAD;
4822
4823
case META_SCS:
4824
ptr++;
4825
*parsed_pattern++ = META_SCS;
4826
4827
parsed_pattern = parse_capture_list(&ptr, ptrend, utf, parsed_pattern,
4828
0, &errorcode, cb);
4829
if (parsed_pattern == NULL) goto FAILED;
4830
goto POST_ASSERTION;
4831
4832
case META_LOOKBEHIND:
4833
case META_LOOKBEHINDNOT:
4834
case META_LOOKBEHIND_NA:
4835
*parsed_pattern++ = meta;
4836
ptr--;
4837
goto POST_LOOKBEHIND;
4838
4839
/* The script run facilities are handled here. Unicode support is
4840
required (give an error if not, as this is a security issue). Always
4841
record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4842
META_ATOMIC and remember that we need two META_KETs at the end. */
4843
4844
case META_SCRIPT_RUN:
4845
case META_ATOMIC_SCRIPT_RUN:
4846
#ifdef SUPPORT_UNICODE
4847
*parsed_pattern++ = META_SCRIPT_RUN;
4848
nest_depth++;
4849
ptr++;
4850
if (meta == META_ATOMIC_SCRIPT_RUN)
4851
{
4852
*parsed_pattern++ = META_ATOMIC;
4853
if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4854
else if (++top_nest >= end_nests)
4855
{
4856
errorcode = ERR84;
4857
goto FAILED;
4858
}
4859
top_nest->nest_depth = nest_depth;
4860
top_nest->flags = NSF_ATOMICSR;
4861
top_nest->options = options & PARSE_TRACKED_OPTIONS;
4862
top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4863
4864
#ifdef PCRE2_DEBUG
4865
/* We'll write out two META_KETs for a single ")" in the input
4866
pattern, so we reserve space for that in our bounds check. */
4867
parsed_pattern_extra++;
4868
#endif
4869
}
4870
break;
4871
#else /* SUPPORT_UNICODE */
4872
errorcode = ERR96;
4873
goto FAILED;
4874
#endif
4875
}
4876
}
4877
4878
4879
/* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4880
4881
else
4882
{
4883
vn = verbnames;
4884
if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4885
&errorcode, cb)) goto FAILED;
4886
if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4887
*ptr != CHAR_RIGHT_PARENTHESIS))
4888
{
4889
errorcode = ERR60; /* Malformed */
4890
goto FAILED;
4891
}
4892
4893
/* Scan the table of verb names */
4894
4895
for (i = 0; i < verbcount; i++)
4896
{
4897
if (namelen == verbs[i].len &&
4898
PRIV(strncmp_c8)(name, vn, namelen) == 0)
4899
break;
4900
vn += verbs[i].len + 1;
4901
}
4902
4903
if (i >= verbcount)
4904
{
4905
errorcode = ERR60; /* Verb not recognized */
4906
goto FAILED;
4907
}
4908
4909
/* An empty argument is treated as no argument. */
4910
4911
if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4912
ptr[1] == CHAR_RIGHT_PARENTHESIS)
4913
ptr++; /* Advance to the closing parens */
4914
4915
/* Check for mandatory non-empty argument; this is (*MARK) */
4916
4917
if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4918
{
4919
errorcode = ERR66;
4920
goto FAILED;
4921
}
4922
4923
/* Remember where this verb, possibly with a preceding (*MARK), starts,
4924
for handling quantified (*ACCEPT). */
4925
4926
verbstartptr = parsed_pattern;
4927
okquantifier = (verbs[i].meta == META_ACCEPT);
4928
#ifdef PCRE2_DEBUG
4929
/* Reserve space in our bounds check for optionally wrapping the (*ACCEPT)
4930
with a non-capturing bracket, if there is a following quantifier. */
4931
if (okquantifier) parsed_pattern_extra += 2;
4932
#endif
4933
4934
/* It appears that Perl allows any characters whatsoever, other than a
4935
closing parenthesis, to appear in arguments ("names"), so we no longer
4936
insist on letters, digits, and underscores. Perl does not, however, do
4937
any interpretation within arguments, and has no means of including a
4938
closing parenthesis. PCRE supports escape processing but only when it
4939
is requested by an option. We set inverbname TRUE here, and let the
4940
main loop take care of this so that escape and \x processing is done by
4941
the main code above. */
4942
4943
if (*ptr++ == CHAR_COLON) /* Skip past : or ) */
4944
{
4945
/* Some optional arguments can be treated as a preceding (*MARK) */
4946
4947
if (verbs[i].has_arg < 0)
4948
{
4949
add_after_mark = verbs[i].meta;
4950
*parsed_pattern++ = META_MARK;
4951
}
4952
4953
/* The remaining verbs with arguments (except *MARK) need a different
4954
opcode. */
4955
4956
else
4957
{
4958
*parsed_pattern++ = verbs[i].meta +
4959
((verbs[i].meta != META_MARK)? 0x00010000u:0);
4960
}
4961
4962
/* Set up for reading the name in the main loop. */
4963
4964
verblengthptr = parsed_pattern++;
4965
verbnamestart = ptr;
4966
inverbname = TRUE;
4967
}
4968
else /* No verb "name" argument */
4969
{
4970
*parsed_pattern++ = verbs[i].meta;
4971
}
4972
} /* End of (*VERB) handling */
4973
break; /* Done with this parenthesis */
4974
} /* End of groups that don't start with (? */
4975
4976
4977
/* ---- Items starting (? ---- */
4978
4979
/* The type of item is determined by what follows (?. Handle (?| and option
4980
changes under "default" because both need a new block on the nest stack.
4981
Comments starting with (?# are handled above. Note that there is some
4982
ambiguity about the sequence (?- because if a digit follows it's a relative
4983
recursion or subroutine call whereas otherwise it's an option unsetting. */
4984
4985
if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4986
4987
switch(*ptr)
4988
{
4989
default:
4990
if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4991
goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */
4992
4993
/* We now have either (?| or a (possibly empty) option setting,
4994
optionally followed by a non-capturing group. */
4995
4996
nest_depth++;
4997
if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4998
else if (++top_nest >= end_nests)
4999
{
5000
errorcode = ERR84;
5001
goto FAILED;
5002
}
5003
top_nest->nest_depth = nest_depth;
5004
top_nest->flags = 0;
5005
top_nest->options = options & PARSE_TRACKED_OPTIONS;
5006
top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
5007
5008
/* Start of non-capturing group that resets the capture count for each
5009
branch. */
5010
5011
if (*ptr == CHAR_VERTICAL_LINE)
5012
{
5013
top_nest->reset_group = (uint16_t)cb->bracount;
5014
top_nest->max_group = (uint16_t)cb->bracount;
5015
top_nest->flags |= NSF_RESET;
5016
cb->external_flags |= PCRE2_DUPCAPUSED;
5017
*parsed_pattern++ = META_NOCAPTURE;
5018
ptr++;
5019
}
5020
5021
/* Scan for options aimnrsxJU (and aD, aP, aS, aT, aW) to be set or unset. */
5022
5023
else
5024
{
5025
BOOL hyphenok = TRUE;
5026
uint32_t oldoptions = options;
5027
uint32_t oldxoptions = xoptions;
5028
5029
top_nest->reset_group = 0;
5030
top_nest->max_group = 0;
5031
set = unset = 0;
5032
optset = &set;
5033
xset = xunset = 0;
5034
xoptset = &xset;
5035
5036
/* ^ at the start unsets irmnsx and disables the subsequent use of - */
5037
5038
if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
5039
{
5040
options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
5041
PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
5042
xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
5043
hyphenok = FALSE;
5044
ptr++;
5045
}
5046
5047
while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
5048
*ptr != CHAR_COLON)
5049
{
5050
switch (*ptr++)
5051
{
5052
case CHAR_MINUS:
5053
if (!hyphenok)
5054
{
5055
errorcode = ERR94;
5056
goto FAILED;
5057
}
5058
optset = &unset;
5059
xoptset = &xunset;
5060
hyphenok = FALSE;
5061
break;
5062
5063
/* There are some two-character sequences that start with 'a'. */
5064
5065
case CHAR_a:
5066
if (ptr < ptrend)
5067
{
5068
if (*ptr == CHAR_D)
5069
{
5070
*xoptset |= PCRE2_EXTRA_ASCII_BSD;
5071
ptr++;
5072
break;
5073
}
5074
if (*ptr == CHAR_P)
5075
{
5076
*xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
5077
ptr++;
5078
break;
5079
}
5080
if (*ptr == CHAR_S)
5081
{
5082
*xoptset |= PCRE2_EXTRA_ASCII_BSS;
5083
ptr++;
5084
break;
5085
}
5086
if (*ptr == CHAR_T)
5087
{
5088
*xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
5089
ptr++;
5090
break;
5091
}
5092
if (*ptr == CHAR_W)
5093
{
5094
*xoptset |= PCRE2_EXTRA_ASCII_BSW;
5095
ptr++;
5096
break;
5097
}
5098
}
5099
*xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
5100
PCRE2_EXTRA_ASCII_BSW|
5101
PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
5102
break;
5103
5104
case CHAR_J: /* Record that it changed in the external options */
5105
*optset |= PCRE2_DUPNAMES;
5106
cb->external_flags |= PCRE2_JCHANGED;
5107
break;
5108
5109
case CHAR_i: *optset |= PCRE2_CASELESS; break;
5110
case CHAR_m: *optset |= PCRE2_MULTILINE; break;
5111
case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
5112
case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
5113
case CHAR_s: *optset |= PCRE2_DOTALL; break;
5114
case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
5115
5116
/* If x appears twice it sets the extended extended option. */
5117
5118
case CHAR_x:
5119
*optset |= PCRE2_EXTENDED;
5120
if (ptr < ptrend && *ptr == CHAR_x)
5121
{
5122
*optset |= PCRE2_EXTENDED_MORE;
5123
ptr++;
5124
}
5125
break;
5126
5127
default:
5128
errorcode = ERR11;
5129
goto FAILED;
5130
}
5131
}
5132
5133
/* If we are setting extended without extended-more, ensure that any
5134
existing extended-more gets unset. Also, unsetting extended must also
5135
unset extended-more. */
5136
5137
if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
5138
(unset & PCRE2_EXTENDED) != 0)
5139
unset |= PCRE2_EXTENDED_MORE;
5140
5141
options = (options | set) & (~unset);
5142
xoptions = (xoptions | xset) & (~xunset);
5143
5144
/* If the options ended with ')' this is not the start of a nested
5145
group with option changes, so the options change at this level.
5146
In this case, if the previous level set up a nest block, discard the
5147
one we have just created. Otherwise adjust it for the previous level.
5148
If the options ended with ':' we are starting a non-capturing group,
5149
possibly with an options setting. */
5150
5151
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5152
if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
5153
{
5154
nest_depth--; /* This is not a nested group after all. */
5155
if (top_nest > (nest_save *)(cb->start_workspace) &&
5156
(top_nest-1)->nest_depth == nest_depth) top_nest--;
5157
else top_nest->nest_depth = nest_depth;
5158
}
5159
else *parsed_pattern++ = META_NOCAPTURE;
5160
5161
/* If nothing changed, no need to record. */
5162
5163
if (options != oldoptions || xoptions != oldxoptions)
5164
{
5165
*parsed_pattern++ = META_OPTIONS;
5166
*parsed_pattern++ = options;
5167
*parsed_pattern++ = xoptions;
5168
}
5169
} /* End options processing */
5170
break; /* End default case after (? */
5171
5172
5173
/* ---- Python syntax support ---- */
5174
5175
case CHAR_P:
5176
if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5177
5178
/* (?P<name> is the same as (?<name>, which defines a named group. */
5179
5180
if (*ptr == CHAR_LESS_THAN_SIGN)
5181
{
5182
terminator = CHAR_GREATER_THAN_SIGN;
5183
goto DEFINE_NAME;
5184
}
5185
5186
/* (?P>name) is the same as (?&name), which is a recursion or subroutine
5187
call. */
5188
5189
if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
5190
5191
/* (?P=name) is the same as \k<name>, a back reference by name. Anything
5192
else after (?P is an error. */
5193
5194
if (*ptr != CHAR_EQUALS_SIGN)
5195
{
5196
errorcode = ERR41;
5197
goto FAILED_FORWARD;
5198
}
5199
if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
5200
&namelen, &errorcode, cb)) goto FAILED;
5201
*parsed_pattern++ = META_BACKREF_BYNAME;
5202
*parsed_pattern++ = namelen;
5203
PUTOFFSET(offset, parsed_pattern);
5204
okquantifier = TRUE;
5205
break; /* End of (?P processing */
5206
5207
5208
/* ---- Recursion/subroutine calls by number ---- */
5209
5210
case CHAR_R:
5211
i = 0; /* (?R) == (?R0) */
5212
ptr++;
5213
if (ptr >= ptrend || (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_LEFT_PARENTHESIS))
5214
{
5215
errorcode = ERR58;
5216
goto FAILED;
5217
}
5218
terminator = CHAR_NUL;
5219
goto SET_RECURSION;
5220
5221
/* An item starting (?- followed by a digit comes here via the "default"
5222
case because (?- followed by a non-digit is an options setting. */
5223
5224
case CHAR_PLUS:
5225
if (ptr + 1 >= ptrend)
5226
{
5227
++ptr;
5228
goto UNCLOSED_PARENTHESIS;
5229
}
5230
if (!IS_DIGIT(ptr[1]))
5231
{
5232
errorcode = ERR29; /* Missing number */
5233
++ptr;
5234
goto FAILED_FORWARD;
5235
}
5236
PCRE2_FALLTHROUGH /* Fall through */
5237
5238
case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5239
case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5240
RECURSION_BYNUMBER:
5241
if (!read_number(&ptr, ptrend,
5242
(IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
5243
MAX_GROUP_NUMBER, ERR61,
5244
&i, &errorcode)) goto FAILED;
5245
PCRE2_ASSERT(i >= 0); /* NB (?0) is permitted, represented by i=0 */
5246
terminator = CHAR_NUL;
5247
5248
SET_RECURSION:
5249
*parsed_pattern++ = META_RECURSE | (uint32_t)i;
5250
offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5251
/* End of recursive call by number handling */
5252
goto READ_RECURSION_ARGUMENTS;
5253
5254
5255
/* ---- Recursion/subroutine calls by name ---- */
5256
5257
case CHAR_AMPERSAND:
5258
RECURSE_BY_NAME:
5259
if (!read_name(&ptr, ptrend, utf, 0, &offset, &name,
5260
&namelen, &errorcode, cb)) goto FAILED;
5261
*parsed_pattern++ = META_RECURSE_BYNAME;
5262
*parsed_pattern++ = namelen;
5263
terminator = CHAR_NUL;
5264
5265
READ_RECURSION_ARGUMENTS:
5266
PUTOFFSET(offset, parsed_pattern);
5267
okquantifier = TRUE;
5268
5269
/* Arguments are not supported for the \g construct. */
5270
if (terminator != CHAR_NUL) break;
5271
5272
if (ptr < ptrend && *ptr == CHAR_LEFT_PARENTHESIS)
5273
{
5274
parsed_pattern = parse_capture_list(&ptr, ptrend, utf, parsed_pattern,
5275
offset, &errorcode, cb);
5276
if (parsed_pattern == NULL) goto FAILED;
5277
}
5278
5279
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5280
goto UNCLOSED_PARENTHESIS;
5281
5282
ptr++;
5283
break;
5284
5285
/* ---- Callout with numerical or string argument ---- */
5286
5287
case CHAR_C:
5288
if ((xoptions & PCRE2_EXTRA_NEVER_CALLOUT) != 0)
5289
{
5290
ptr++;
5291
errorcode = ERR103;
5292
goto FAILED;
5293
}
5294
5295
if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5296
5297
/* If the previous item was a condition starting (?(? an assertion,
5298
optionally preceded by a callout, is expected. This is checked later on,
5299
during actual compilation. However we need to identify this kind of
5300
assertion in this pass because it must not be quantified. The value of
5301
expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5302
for a callout - still leaving a positive value that identifies the
5303
assertion. Multiple callouts or any other items will make it zero or
5304
less, which doesn't matter because they will cause an error later. */
5305
5306
expect_cond_assert = prev_expect_cond_assert - 1;
5307
5308
/* If previous_callout is not NULL, it means this follows a previous
5309
callout. If it was a manual callout, do nothing; this means its "length
5310
of next pattern item" field will remain zero. If it was an automatic
5311
callout, abolish it. */
5312
5313
if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
5314
previous_callout == parsed_pattern - 4 &&
5315
parsed_pattern[-1] == 255)
5316
parsed_pattern = previous_callout;
5317
5318
/* Save for updating next pattern item length, and skip one item before
5319
completing. */
5320
5321
previous_callout = parsed_pattern;
5322
after_manual_callout = 1;
5323
5324
/* Handle a string argument; specific delimiter is required. */
5325
5326
if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
5327
{
5328
PCRE2_SIZE calloutlength;
5329
PCRE2_SPTR startptr = ptr;
5330
5331
delimiter = 0;
5332
for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
5333
{
5334
if (*ptr == PRIV(callout_start_delims)[i])
5335
{
5336
delimiter = PRIV(callout_end_delims)[i];
5337
break;
5338
}
5339
}
5340
if (delimiter == 0)
5341
{
5342
errorcode = ERR82;
5343
goto FAILED_FORWARD;
5344
}
5345
5346
*parsed_pattern = META_CALLOUT_STRING;
5347
parsed_pattern += 3; /* Skip pattern info */
5348
5349
for (;;)
5350
{
5351
if (++ptr >= ptrend)
5352
{
5353
errorcode = ERR81;
5354
ptr = startptr; /* To give a more useful message */
5355
goto FAILED;
5356
}
5357
if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
5358
break;
5359
}
5360
5361
calloutlength = (PCRE2_SIZE)(ptr - startptr);
5362
if (calloutlength > UINT32_MAX)
5363
{
5364
errorcode = ERR72;
5365
goto FAILED;
5366
}
5367
*parsed_pattern++ = (uint32_t)calloutlength;
5368
offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
5369
PUTOFFSET(offset, parsed_pattern);
5370
}
5371
5372
/* Handle a callout with an optional numerical argument, which must be
5373
less than or equal to 255. A missing argument gives 0. */
5374
5375
else
5376
{
5377
int n = 0;
5378
*parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */
5379
parsed_pattern += 3; /* Skip pattern info */
5380
while (ptr < ptrend && IS_DIGIT(*ptr))
5381
{
5382
n = n * 10 + (*ptr++ - CHAR_0);
5383
if (n > 255)
5384
{
5385
errorcode = ERR38;
5386
goto FAILED;
5387
}
5388
}
5389
*parsed_pattern++ = n;
5390
}
5391
5392
/* Both formats must have a closing parenthesis */
5393
5394
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5395
{
5396
errorcode = ERR39;
5397
goto FAILED;
5398
}
5399
ptr++;
5400
5401
/* Remember the offset to the next item in the pattern, and set a default
5402
length. This should get updated after the next item is read. */
5403
5404
previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
5405
previous_callout[2] = 0;
5406
break; /* End callout */
5407
5408
5409
/* ---- Conditional group ---- */
5410
5411
/* A condition can be an assertion, a number (referring to a numbered
5412
group's having been set), a name (referring to a named group), or 'R',
5413
referring to overall recursion. R<digits> and R&name are also permitted
5414
for recursion state tests. Numbers may be preceded by + or - to specify a
5415
relative group number.
5416
5417
There are several syntaxes for testing a named group: (?(name)) is used
5418
by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5419
5420
There are two unfortunate ambiguities. 'R' can be the recursive thing or
5421
the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
5422
the Perl DEFINE feature or the Python named test. We look for a name
5423
first; if not found, we try the other case.
5424
5425
For compatibility with auto-callouts, we allow a callout to be specified
5426
before a condition that is an assertion. */
5427
5428
case CHAR_LEFT_PARENTHESIS:
5429
if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5430
nest_depth++;
5431
5432
/* If the next character is ? or * there must be an assertion next
5433
(optionally preceded by a callout). We do not check this here, but
5434
instead we set expect_cond_assert to 2. If this is still greater than
5435
zero (callouts decrement it) when the next assertion is read, it will be
5436
marked as a condition that must not be repeated. A value greater than
5437
zero also causes checking that an assertion (possibly with callout)
5438
follows. */
5439
5440
if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
5441
{
5442
*parsed_pattern++ = META_COND_ASSERT;
5443
ptr--; /* Pull pointer back to the opening parenthesis. */
5444
expect_cond_assert = 2;
5445
break; /* End of conditional */
5446
}
5447
5448
/* Handle (?([+-]number)... */
5449
5450
if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
5451
&errorcode))
5452
{
5453
PCRE2_ASSERT(i >= 0);
5454
if (i <= 0)
5455
{
5456
errorcode = ERR15;
5457
goto FAILED;
5458
}
5459
*parsed_pattern++ = META_COND_NUMBER;
5460
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5461
PUTOFFSET(offset, parsed_pattern);
5462
*parsed_pattern++ = i;
5463
}
5464
else if (errorcode != 0) goto FAILED; /* Number too big */
5465
5466
/* No number found. Handle the special case (?(VERSION[>]=n.m)... */
5467
5468
else if (ptrend - ptr >= 10 &&
5469
PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
5470
ptr[7] != CHAR_RIGHT_PARENTHESIS)
5471
{
5472
uint32_t ge = 0;
5473
int major = 0;
5474
int minor = 0;
5475
5476
ptr += 7;
5477
if (*ptr == CHAR_GREATER_THAN_SIGN)
5478
{
5479
ge = 1;
5480
ptr++;
5481
}
5482
5483
/* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
5484
references its argument twice. */
5485
5486
if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
5487
{
5488
errorcode = ERR79;
5489
if (!ge) goto FAILED_FORWARD;
5490
goto FAILED;
5491
}
5492
5493
if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
5494
goto FAILED;
5495
5496
if (ptr < ptrend && *ptr == CHAR_DOT)
5497
{
5498
if (++ptr >= ptrend || !IS_DIGIT(*ptr))
5499
{
5500
errorcode = ERR79;
5501
if (ptr < ptrend) goto FAILED_FORWARD;
5502
goto FAILED;
5503
}
5504
if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &minor, &errorcode))
5505
goto FAILED;
5506
}
5507
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5508
{
5509
errorcode = ERR79;
5510
if (ptr < ptrend) goto FAILED_FORWARD;
5511
goto FAILED;
5512
}
5513
5514
*parsed_pattern++ = META_COND_VERSION;
5515
*parsed_pattern++ = ge;
5516
*parsed_pattern++ = major;
5517
*parsed_pattern++ = minor;
5518
}
5519
5520
/* All the remaining cases now require us to read a name. We cannot at
5521
this stage distinguish ambiguous cases such as (?(R12) which might be a
5522
recursion test by number or a name, because the named groups have not yet
5523
all been identified. Those cases are treated as names, but given a
5524
different META code. */
5525
5526
else
5527
{
5528
BOOL was_r_ampersand = FALSE;
5529
5530
if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
5531
{
5532
terminator = CHAR_RIGHT_PARENTHESIS;
5533
was_r_ampersand = TRUE;
5534
ptr++;
5535
}
5536
else if (*ptr == CHAR_LESS_THAN_SIGN)
5537
terminator = CHAR_GREATER_THAN_SIGN;
5538
else if (*ptr == CHAR_APOSTROPHE)
5539
terminator = CHAR_APOSTROPHE;
5540
else
5541
{
5542
terminator = CHAR_RIGHT_PARENTHESIS;
5543
ptr--; /* Point to char before name */
5544
}
5545
5546
if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5547
&errorcode, cb)) goto FAILED;
5548
5549
/* Handle (?(R&name) */
5550
5551
if (was_r_ampersand)
5552
{
5553
*parsed_pattern = META_COND_RNAME;
5554
ptr--; /* Back to closing parens */
5555
}
5556
5557
/* Handle (?(name). If the name is "DEFINE" we identify it with a
5558
special code. Likewise if the name consists of R followed only by
5559
digits. Otherwise, handle it like a quoted name. */
5560
5561
else if (terminator == CHAR_RIGHT_PARENTHESIS)
5562
{
5563
if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
5564
*parsed_pattern = META_COND_DEFINE;
5565
else
5566
{
5567
for (i = 1; i < (int)namelen; i++)
5568
if (!IS_DIGIT(name[i])) break;
5569
*parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
5570
META_COND_RNUMBER : META_COND_NAME;
5571
}
5572
ptr--; /* Back to closing parens */
5573
}
5574
5575
/* Handle (?('name') or (?(<name>) */
5576
5577
else *parsed_pattern = META_COND_NAME;
5578
5579
/* All these cases except DEFINE end with the name length and offset;
5580
DEFINE just has an offset (for the "too many branches" error). */
5581
5582
if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
5583
PUTOFFSET(offset, parsed_pattern);
5584
} /* End cases that read a name */
5585
5586
/* Check the closing parenthesis of the condition */
5587
5588
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5589
{
5590
errorcode = ERR24;
5591
goto FAILED;
5592
}
5593
ptr++;
5594
break; /* End of condition processing */
5595
5596
5597
/* ---- Atomic group ---- */
5598
5599
case CHAR_GREATER_THAN_SIGN:
5600
ATOMIC_GROUP: /* Come from (*atomic: */
5601
*parsed_pattern++ = META_ATOMIC;
5602
nest_depth++;
5603
ptr++;
5604
break;
5605
5606
5607
/* ---- Lookahead assertions ---- */
5608
5609
case CHAR_EQUALS_SIGN:
5610
POSITIVE_LOOK_AHEAD: /* Come from (*pla: */
5611
*parsed_pattern++ = META_LOOKAHEAD;
5612
ptr++;
5613
goto POST_ASSERTION;
5614
5615
case CHAR_ASTERISK:
5616
POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (*napla: */
5617
*parsed_pattern++ = META_LOOKAHEAD_NA;
5618
ptr++;
5619
goto POST_ASSERTION;
5620
5621
case CHAR_EXCLAMATION_MARK:
5622
NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */
5623
*parsed_pattern++ = META_LOOKAHEADNOT;
5624
ptr++;
5625
goto POST_ASSERTION;
5626
5627
5628
/* ---- Lookbehind assertions ---- */
5629
5630
/* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
5631
is the start of the name of a capturing group. */
5632
5633
case CHAR_LESS_THAN_SIGN:
5634
if (ptrend - ptr <= 1 ||
5635
(ptr[1] != CHAR_EQUALS_SIGN &&
5636
ptr[1] != CHAR_EXCLAMATION_MARK &&
5637
ptr[1] != CHAR_ASTERISK))
5638
{
5639
terminator = CHAR_GREATER_THAN_SIGN;
5640
goto DEFINE_NAME;
5641
}
5642
*parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
5643
META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
5644
META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
5645
5646
POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
5647
*has_lookbehind = TRUE;
5648
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5649
PUTOFFSET(offset, parsed_pattern);
5650
ptr += 2;
5651
/* Fall through */
5652
5653
/* If the previous item was a condition starting (?(? an assertion,
5654
optionally preceded by a callout, is expected. This is checked later on,
5655
during actual compilation. However we need to identify this kind of
5656
assertion in this pass because it must not be quantified. The value of
5657
expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5658
for a callout - still leaving a positive value that identifies the
5659
assertion. Multiple callouts or any other items will make it zero or
5660
less, which doesn't matter because they will cause an error later. */
5661
5662
POST_ASSERTION:
5663
nest_depth++;
5664
if (prev_expect_cond_assert > 0)
5665
{
5666
if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
5667
else if (++top_nest >= end_nests)
5668
{
5669
errorcode = ERR84;
5670
goto FAILED;
5671
}
5672
top_nest->nest_depth = nest_depth;
5673
top_nest->flags = NSF_CONDASSERT;
5674
top_nest->options = options & PARSE_TRACKED_OPTIONS;
5675
top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
5676
}
5677
break;
5678
5679
5680
/* ---- Define a named group ---- */
5681
5682
/* A named group may be defined as (?'name') or (?<name>). In the latter
5683
case we jump to DEFINE_NAME from the disambiguation of (?< above with the
5684
terminator set to '>'. */
5685
5686
case CHAR_APOSTROPHE:
5687
terminator = CHAR_APOSTROPHE; /* Terminator */
5688
5689
DEFINE_NAME:
5690
if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5691
&errorcode, cb)) goto FAILED;
5692
5693
/* We have a name for this capturing group. It is also assigned a number,
5694
which is its primary means of identification. */
5695
5696
if (cb->bracount >= MAX_GROUP_NUMBER)
5697
{
5698
errorcode = ERR97;
5699
goto FAILED;
5700
}
5701
cb->bracount++;
5702
*parsed_pattern++ = META_CAPTURE | cb->bracount;
5703
nest_depth++;
5704
5705
/* Check not too many names */
5706
5707
if (cb->names_found >= MAX_NAME_COUNT)
5708
{
5709
errorcode = ERR49;
5710
goto FAILED;
5711
}
5712
5713
/* Adjust the entry size to accommodate the longest name found. */
5714
5715
if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
5716
cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
5717
5718
/* Scan the list to check for duplicates. For duplicate names, if the
5719
number is the same, break the loop, which causes the name to be
5720
discarded; otherwise, if DUPNAMES is not set, give an error.
5721
If it is set, allow the name with a different number, but continue
5722
scanning in case this is a duplicate with the same number. For
5723
non-duplicate names, give an error if the number is duplicated. */
5724
5725
is_dupname = FALSE;
5726
hash = PRIV(compile_get_hash_from_name)(name, namelen);
5727
ng = cb->named_groups;
5728
for (i = 0; i < cb->names_found; i++, ng++)
5729
{
5730
if (namelen == ng->length && hash == NAMED_GROUP_GET_HASH(ng) &&
5731
PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
5732
{
5733
/* When a bracket is referenced by the same name multiple
5734
times, it is not considered a duplicate and is ignored. */
5735
if (ng->number == cb->bracount) break;
5736
if ((options & PCRE2_DUPNAMES) == 0)
5737
{
5738
errorcode = ERR43;
5739
goto FAILED;
5740
}
5741
5742
ng->hash_dup |= NAMED_GROUP_IS_DUPNAME;
5743
is_dupname = TRUE; /* Mark as a duplicate */
5744
cb->dupnames = TRUE; /* Duplicate names exist */
5745
5746
/* The entry represents a duplicate. */
5747
name = ng->name;
5748
namelen = 0;
5749
5750
/* Even duplicated names may refer to the same
5751
capture index. These references are also ignored. */
5752
for (; i < cb->names_found; i++, ng++)
5753
if (ng->name == name && ng->number == cb->bracount)
5754
break;
5755
break;
5756
}
5757
else if (ng->number == cb->bracount)
5758
{
5759
errorcode = ERR65;
5760
goto FAILED;
5761
}
5762
}
5763
5764
/* Ignore duplicate with same number. */
5765
if (i < cb->names_found) break;
5766
5767
/* Increase the list size if necessary */
5768
5769
if (cb->names_found >= cb->named_group_list_size)
5770
{
5771
uint32_t newsize = cb->named_group_list_size * 2;
5772
named_group *newspace =
5773
cb->cx->memctl.malloc(newsize * sizeof(named_group),
5774
cb->cx->memctl.memory_data);
5775
if (newspace == NULL)
5776
{
5777
errorcode = ERR21;
5778
goto FAILED;
5779
}
5780
5781
memcpy(newspace, cb->named_groups,
5782
cb->named_group_list_size * sizeof(named_group));
5783
if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
5784
cb->cx->memctl.free((void *)cb->named_groups,
5785
cb->cx->memctl.memory_data);
5786
cb->named_groups = newspace;
5787
cb->named_group_list_size = newsize;
5788
}
5789
5790
/* Add this name to the list */
5791
if (is_dupname)
5792
hash |= NAMED_GROUP_IS_DUPNAME;
5793
5794
cb->named_groups[cb->names_found].name = name;
5795
cb->named_groups[cb->names_found].length = (uint16_t)namelen;
5796
cb->named_groups[cb->names_found].number = cb->bracount;
5797
cb->named_groups[cb->names_found].hash_dup = hash;
5798
cb->names_found++;
5799
break;
5800
5801
5802
/* ---- Perl extended character class ---- */
5803
5804
/* These are of the form '(?[...])'. We handle these via the same parser
5805
that consumes ordinary '[...]' classes, but with a flag set to activate
5806
the extended behaviour. */
5807
5808
case CHAR_LEFT_SQUARE_BRACKET:
5809
class_mode_state = CLASS_MODE_PERL_EXT;
5810
c = *ptr++;
5811
goto FROM_PERL_EXTENDED_CLASS;
5812
} /* End of (? switch */
5813
break; /* End of ( handling */
5814
5815
5816
/* ---- Branch terminators ---- */
5817
5818
/* Alternation: reset the capture count if we are in a (?| group. */
5819
5820
case CHAR_VERTICAL_LINE:
5821
if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
5822
(top_nest->flags & NSF_RESET) != 0)
5823
{
5824
if (cb->bracount > top_nest->max_group)
5825
top_nest->max_group = (uint16_t)cb->bracount;
5826
cb->bracount = top_nest->reset_group;
5827
}
5828
*parsed_pattern++ = META_ALT;
5829
break;
5830
5831
/* End of group; reset the capture count to the maximum if we are in a (?|
5832
group and/or reset the options that are tracked during parsing. Disallow
5833
quantifier for a condition that is an assertion. */
5834
5835
case CHAR_RIGHT_PARENTHESIS:
5836
okquantifier = TRUE;
5837
if (top_nest != NULL && top_nest->nest_depth == nest_depth)
5838
{
5839
options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
5840
xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
5841
if ((top_nest->flags & NSF_RESET) != 0 &&
5842
top_nest->max_group > cb->bracount)
5843
cb->bracount = top_nest->max_group;
5844
if ((top_nest->flags & NSF_CONDASSERT) != 0)
5845
okquantifier = FALSE;
5846
5847
if ((top_nest->flags & NSF_ATOMICSR) != 0)
5848
{
5849
*parsed_pattern++ = META_KET;
5850
5851
#ifdef PCRE2_DEBUG
5852
PCRE2_ASSERT(parsed_pattern_extra > 0);
5853
parsed_pattern_extra--;
5854
#endif
5855
}
5856
5857
if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
5858
else top_nest--;
5859
}
5860
if (nest_depth == 0) /* Unmatched closing parenthesis */
5861
{
5862
errorcode = ERR22;
5863
goto FAILED;
5864
}
5865
nest_depth--;
5866
*parsed_pattern++ = META_KET;
5867
break;
5868
} /* End of switch on pattern character */
5869
} /* End of main character scan loop */
5870
5871
/* End of pattern reached. Check for missing ) at the end of a verb name. */
5872
5873
if (inverbname && ptr >= ptrend)
5874
{
5875
errorcode = ERR60;
5876
goto FAILED;
5877
}
5878
5879
5880
PARSED_END:
5881
5882
PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
5883
(parsed_pattern_extra - parsed_pattern_extra_check) <=
5884
max_parsed_pattern(ptr_check, ptr, utf, options));
5885
5886
/* Manage callout for the final item */
5887
5888
parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
5889
parsed_pattern, cb);
5890
5891
/* Insert trailing items for word and line matching (features provided for the
5892
benefit of pcre2grep). */
5893
5894
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
5895
{
5896
*parsed_pattern++ = META_KET;
5897
*parsed_pattern++ = META_DOLLAR;
5898
}
5899
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5900
{
5901
*parsed_pattern++ = META_KET;
5902
*parsed_pattern++ = META_ESCAPE + ESC_b;
5903
}
5904
5905
/* Terminate the parsed pattern, then return success if all groups are closed.
5906
Otherwise we have unclosed parentheses. */
5907
5908
/* LCOV_EXCL_START */
5909
if (parsed_pattern >= parsed_pattern_end)
5910
{
5911
PCRE2_DEBUG_UNREACHABLE();
5912
errorcode = ERR63; /* Internal error (parsed pattern overflow) */
5913
goto FAILED;
5914
}
5915
/* LCOV_EXCL_STOP */
5916
5917
*parsed_pattern = META_END;
5918
if (nest_depth == 0) return 0;
5919
5920
UNCLOSED_PARENTHESIS:
5921
errorcode = ERR14;
5922
5923
/* Come here for all failures. */
5924
5925
FAILED:
5926
cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5927
return errorcode;
5928
5929
/* Some errors need to indicate the previous character. */
5930
5931
FAILED_BACK:
5932
ptr--;
5933
#ifdef SUPPORT_UNICODE
5934
if (utf) BACKCHAR(ptr);
5935
#endif
5936
goto FAILED;
5937
5938
/* Some errors need to indicate the next character. */
5939
5940
FAILED_FORWARD:
5941
ptr++;
5942
#ifdef SUPPORT_UNICODE
5943
if (utf) FORWARDCHARTEST(ptr, ptrend);
5944
#endif
5945
goto FAILED;
5946
}
5947
5948
5949
5950
/*************************************************
5951
* Find first significant opcode *
5952
*************************************************/
5953
5954
/* This is called by several functions that scan a compiled expression looking
5955
for a fixed first character, or an anchoring opcode etc. It skips over things
5956
that do not influence this. For some calls, it makes sense to skip negative
5957
forward and all backward assertions, and also the \b assertion; for others it
5958
does not.
5959
5960
Arguments:
5961
code pointer to the start of the group
5962
skipassert TRUE if certain assertions are to be skipped
5963
5964
Returns: pointer to the first significant opcode
5965
*/
5966
5967
static const PCRE2_UCHAR*
5968
first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5969
{
5970
for (;;)
5971
{
5972
switch ((int)*code)
5973
{
5974
case OP_ASSERT_NOT:
5975
case OP_ASSERTBACK:
5976
case OP_ASSERTBACK_NOT:
5977
case OP_ASSERTBACK_NA:
5978
if (!skipassert) return code;
5979
do code += GET(code, 1); while (*code == OP_ALT);
5980
code += PRIV(OP_lengths)[*code];
5981
break;
5982
5983
case OP_WORD_BOUNDARY:
5984
case OP_NOT_WORD_BOUNDARY:
5985
case OP_UCP_WORD_BOUNDARY:
5986
case OP_NOT_UCP_WORD_BOUNDARY:
5987
if (!skipassert) return code;
5988
PCRE2_FALLTHROUGH /* Fall through */
5989
5990
case OP_CALLOUT:
5991
case OP_CREF:
5992
case OP_DNCREF:
5993
case OP_RREF:
5994
case OP_DNRREF:
5995
case OP_FALSE:
5996
case OP_TRUE:
5997
code += PRIV(OP_lengths)[*code];
5998
break;
5999
6000
case OP_CALLOUT_STR:
6001
code += GET(code, 1 + 2*LINK_SIZE);
6002
break;
6003
6004
case OP_SKIPZERO:
6005
code += 2 + GET(code, 2) + LINK_SIZE;
6006
break;
6007
6008
case OP_COND:
6009
case OP_SCOND:
6010
if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */
6011
code[GET(code, 1)] != OP_KET) /* More than one branch */
6012
return code;
6013
code += GET(code, 1) + 1 + LINK_SIZE;
6014
break;
6015
6016
case OP_MARK:
6017
case OP_COMMIT_ARG:
6018
case OP_PRUNE_ARG:
6019
case OP_SKIP_ARG:
6020
case OP_THEN_ARG:
6021
code += code[1] + PRIV(OP_lengths)[*code];
6022
break;
6023
6024
default:
6025
return code;
6026
}
6027
}
6028
6029
/* LCOV_EXCL_START */
6030
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
6031
/* LCOV_EXCL_STOP */
6032
}
6033
6034
6035
6036
/*************************************************
6037
* Compile one branch *
6038
*************************************************/
6039
6040
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
6041
the options are changed during the branch, the pointer is used to change the
6042
external options bits. This function is used during the pre-compile phase when
6043
we are trying to find out the amount of memory needed, as well as during the
6044
real compile phase. The value of lengthptr distinguishes the two phases.
6045
6046
Arguments:
6047
optionsptr pointer to the option bits
6048
xoptionsptr pointer to the extra option bits
6049
codeptr points to the pointer to the current code point
6050
pptrptr points to the current parsed pattern pointer
6051
errorcodeptr points to error code variable
6052
firstcuptr place to put the first required code unit
6053
firstcuflagsptr place to put the first code unit flags
6054
reqcuptr place to put the last required code unit
6055
reqcuflagsptr place to put the last required code unit flags
6056
bcptr points to current branch chain
6057
open_caps points to current capitem
6058
cb contains pointers to tables etc.
6059
lengthptr NULL during the real compile phase
6060
points to length accumulator during pre-compile phase
6061
6062
Returns: 0 There's been an error, *errorcodeptr is non-zero
6063
+1 Success, this branch must match at least one character
6064
-1 Success, this branch may match an empty string
6065
*/
6066
6067
static int
6068
compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
6069
PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
6070
uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
6071
uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
6072
compile_block *cb, PCRE2_SIZE *lengthptr)
6073
{
6074
int bravalue = 0;
6075
int okreturn = -1;
6076
int group_return = 0;
6077
uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
6078
uint32_t greedy_default, greedy_non_default;
6079
uint32_t repeat_type, op_type;
6080
uint32_t options = *optionsptr; /* May change dynamically */
6081
uint32_t xoptions = *xoptionsptr; /* May change dynamically */
6082
uint32_t firstcu, reqcu;
6083
uint32_t zeroreqcu, zerofirstcu;
6084
uint32_t *pptr = *pptrptr;
6085
uint32_t meta, meta_arg;
6086
uint32_t firstcuflags, reqcuflags;
6087
uint32_t zeroreqcuflags, zerofirstcuflags;
6088
uint32_t req_caseopt, reqvary, tempreqvary;
6089
/* Some opcodes, such as META_CAPTURE_NUMBER or META_CAPTURE_NAME,
6090
depend on the previous value of offset. */
6091
PCRE2_SIZE offset = 0;
6092
PCRE2_SIZE length_prevgroup = 0;
6093
PCRE2_UCHAR *code = *codeptr;
6094
PCRE2_UCHAR *last_code = code;
6095
PCRE2_UCHAR *orig_code = code;
6096
PCRE2_UCHAR *tempcode;
6097
PCRE2_UCHAR *previous = NULL;
6098
PCRE2_UCHAR op_previous;
6099
BOOL groupsetfirstcu = FALSE;
6100
BOOL had_accept = FALSE;
6101
BOOL matched_char = FALSE;
6102
BOOL previous_matched_char = FALSE;
6103
BOOL reset_caseful = FALSE;
6104
6105
/* We can fish out the UTF setting once and for all into a BOOL, but we must
6106
not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
6107
as we process the pattern. */
6108
6109
#ifdef SUPPORT_UNICODE
6110
BOOL utf = (options & PCRE2_UTF) != 0;
6111
BOOL ucp = (options & PCRE2_UCP) != 0;
6112
#else /* No Unicode support */
6113
BOOL utf = FALSE;
6114
#endif
6115
6116
/* Set up the default and non-default settings for greediness */
6117
6118
greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6119
greedy_non_default = greedy_default ^ 1;
6120
6121
/* Initialize no first unit, no required unit. REQ_UNSET means "no char
6122
matching encountered yet". It gets changed to REQ_NONE if we hit something that
6123
matches a non-fixed first unit; reqcu just remains unset if we never find one.
6124
6125
When we hit a repeat whose minimum is zero, we may have to adjust these values
6126
to take the zero repeat into account. This is implemented by setting them to
6127
zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
6128
item types that can be repeated set these backoff variables appropriately. */
6129
6130
firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
6131
firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
6132
6133
/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
6134
according to the current setting of the caseless flag. The REQ_CASELESS value
6135
leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
6136
to record the case status of the value. This is used only for ASCII characters.
6137
*/
6138
6139
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6140
6141
/* Switch on next META item until the end of the branch */
6142
6143
for (;; pptr++)
6144
{
6145
BOOL possessive_quantifier;
6146
BOOL note_group_empty;
6147
uint32_t mclength;
6148
uint32_t skipunits;
6149
uint32_t subreqcu, subfirstcu;
6150
uint32_t groupnumber;
6151
uint32_t verbarglen, verbculen;
6152
uint32_t subreqcuflags, subfirstcuflags;
6153
open_capitem *oc;
6154
PCRE2_UCHAR mcbuffer[8];
6155
6156
/* Get next META item in the pattern and its potential argument. */
6157
6158
meta = META_CODE(*pptr);
6159
meta_arg = META_DATA(*pptr);
6160
6161
/* If we are in the pre-compile phase, accumulate the length used for the
6162
previous cycle of this loop, unless the next item is a quantifier. */
6163
6164
if (lengthptr != NULL)
6165
{
6166
/* LCOV_EXCL_START */
6167
if (code >= cb->start_workspace + cb->workspace_size)
6168
{
6169
PCRE2_DEBUG_UNREACHABLE();
6170
*errorcodeptr = ERR52; /* Over-ran workspace - internal error */
6171
cb->erroroffset = 0;
6172
return 0;
6173
}
6174
/* LCOV_EXCL_STOP */
6175
6176
if (code > cb->start_workspace + cb->workspace_size -
6177
WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
6178
{
6179
*errorcodeptr = ERR86; /* Pattern too complicated */
6180
cb->erroroffset = 0;
6181
return 0;
6182
}
6183
6184
/* There is at least one situation where code goes backwards: this is the
6185
case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
6186
is processed, the whole class is eliminated. However, it is created first,
6187
so we have to allow memory for it. Therefore, don't ever reduce the length
6188
at this point. */
6189
6190
if (code < last_code) code = last_code;
6191
6192
/* If the next thing is not a quantifier, we add the length of the previous
6193
item into the total, and reset the code pointer to the start of the
6194
workspace. Otherwise leave the previous item available to be quantified. */
6195
6196
if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6197
{
6198
if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
6199
{
6200
*errorcodeptr = ERR20; /* Integer overflow */
6201
cb->erroroffset = 0;
6202
return 0;
6203
}
6204
*lengthptr += (PCRE2_SIZE)(code - orig_code);
6205
if (*lengthptr > MAX_PATTERN_SIZE)
6206
{
6207
*errorcodeptr = ERR20; /* Pattern is too large */
6208
cb->erroroffset = 0;
6209
return 0;
6210
}
6211
code = orig_code;
6212
}
6213
6214
/* Remember where this code item starts so we can catch the "backwards"
6215
case above next time round. */
6216
6217
last_code = code;
6218
}
6219
6220
/* Process the next parsed pattern item. If it is not a quantifier, remember
6221
where it starts so that it can be quantified when a quantifier follows.
6222
Checking for the legality of quantifiers happens in parse_regex(), except for
6223
a quantifier after an assertion that is a condition. */
6224
6225
if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6226
{
6227
previous = code;
6228
if (matched_char && !had_accept) okreturn = 1;
6229
}
6230
6231
previous_matched_char = matched_char;
6232
matched_char = FALSE;
6233
note_group_empty = FALSE;
6234
skipunits = 0; /* Default value for most subgroups */
6235
6236
switch(meta)
6237
{
6238
/* ===================================================================*/
6239
/* The branch terminates at pattern end or | or ) */
6240
6241
case META_END:
6242
case META_ALT:
6243
case META_KET:
6244
*firstcuptr = firstcu;
6245
*firstcuflagsptr = firstcuflags;
6246
*reqcuptr = reqcu;
6247
*reqcuflagsptr = reqcuflags;
6248
*codeptr = code;
6249
*pptrptr = pptr;
6250
return okreturn;
6251
6252
6253
/* ===================================================================*/
6254
/* Handle single-character metacharacters. In multiline mode, ^ disables
6255
the setting of any following char as a first character. */
6256
6257
case META_CIRCUMFLEX:
6258
if ((options & PCRE2_MULTILINE) != 0)
6259
{
6260
if (firstcuflags == REQ_UNSET)
6261
zerofirstcuflags = firstcuflags = REQ_NONE;
6262
*code++ = OP_CIRCM;
6263
}
6264
else *code++ = OP_CIRC;
6265
break;
6266
6267
case META_DOLLAR:
6268
*code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
6269
break;
6270
6271
/* There can never be a first char if '.' is first, whatever happens about
6272
repeats. The value of reqcu doesn't change either. */
6273
6274
case META_DOT:
6275
matched_char = TRUE;
6276
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6277
zerofirstcu = firstcu;
6278
zerofirstcuflags = firstcuflags;
6279
zeroreqcu = reqcu;
6280
zeroreqcuflags = reqcuflags;
6281
*code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
6282
break;
6283
6284
6285
/* ===================================================================*/
6286
/* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
6287
Otherwise, an initial ']' is taken as a data character. When empty classes
6288
are allowed, [] must generate an empty class - we have no dedicated opcode
6289
to optimise the representation, but it's a rare case (the '(*FAIL)'
6290
construct would be a clearer way for a pattern author to represent a
6291
non-matching branch, but it does have different semantics to '[]' if both
6292
are followed by a quantifier). The empty-negated [^] matches any character,
6293
so is useful: generate OP_ALLANY for this. */
6294
6295
case META_CLASS_EMPTY:
6296
case META_CLASS_EMPTY_NOT:
6297
matched_char = TRUE;
6298
if (meta == META_CLASS_EMPTY_NOT) *code++ = OP_ALLANY;
6299
else
6300
{
6301
*code++ = OP_CLASS;
6302
memset(code, 0, 32);
6303
code += 32 / sizeof(PCRE2_UCHAR);
6304
}
6305
6306
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6307
zerofirstcu = firstcu;
6308
zerofirstcuflags = firstcuflags;
6309
break;
6310
6311
6312
/* ===================================================================*/
6313
/* Non-empty character class. If the included characters are all < 256, we
6314
build a 32-byte bitmap of the permitted characters, except in the special
6315
case where there is only one such character. For negated classes, we build
6316
the map as usual, then invert it at the end. However, we use a different
6317
opcode so that data characters > 255 can be handled correctly.
6318
6319
If the class contains characters outside the 0-255 range, a different
6320
opcode is compiled. It may optionally have a bit map for characters < 256,
6321
but those above are explicitly listed afterwards. A flag code unit tells
6322
whether the bitmap is present, and whether this is a negated class or
6323
not. */
6324
6325
case META_CLASS_NOT:
6326
case META_CLASS:
6327
matched_char = TRUE;
6328
6329
/* Check for complex extended classes and handle them separately. */
6330
6331
if ((*pptr & CLASS_IS_ECLASS) != 0)
6332
{
6333
if (!PRIV(compile_class_nested)(options, xoptions, &pptr, &code,
6334
errorcodeptr, cb, lengthptr))
6335
return 0;
6336
goto CLASS_END_PROCESSING;
6337
}
6338
6339
/* We can optimize the case of a single character in a class by generating
6340
OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
6341
negative. In the negative case there can be no first char if this item is
6342
first, whatever repeat count may follow. In the case of reqcu, save the
6343
previous value for reinstating. */
6344
6345
/* NOTE: at present this optimization is not effective if the only
6346
character in a class in 32-bit, non-UCP mode has its top bit set. */
6347
6348
if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
6349
{
6350
uint32_t c = pptr[1];
6351
6352
pptr += 2; /* Move on to class end */
6353
if (meta == META_CLASS) /* A positive one-char class can be */
6354
{ /* handled as a normal literal character. */
6355
meta = c; /* Set up the character */
6356
goto NORMAL_CHAR_SET;
6357
}
6358
6359
/* Handle a negative one-character class */
6360
6361
zeroreqcu = reqcu;
6362
zeroreqcuflags = reqcuflags;
6363
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6364
zerofirstcu = firstcu;
6365
zerofirstcuflags = firstcuflags;
6366
6367
/* For caseless UTF or UCP mode, check whether this character has more
6368
than one other case. If so, generate a special OP_NOTPROP item instead of
6369
OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
6370
caseless set that starts with an ASCII character. If the character is
6371
affected by the special Turkish rules, hardcode the not-matching
6372
characters using a caseset. */
6373
6374
#ifdef SUPPORT_UNICODE
6375
if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
6376
{
6377
uint32_t caseset;
6378
6379
if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6380
PCRE2_EXTRA_TURKISH_CASING &&
6381
UCD_ANY_I(c))
6382
{
6383
caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
6384
}
6385
else if ((caseset = UCD_CASESET(c)) != 0 &&
6386
(xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6387
PRIV(ucd_caseless_sets)[caseset] < 128)
6388
{
6389
caseset = 0; /* Ignore the caseless set if it's restricted. */
6390
}
6391
6392
if (caseset != 0)
6393
{
6394
*code++ = OP_NOTPROP;
6395
*code++ = PT_CLIST;
6396
*code++ = caseset;
6397
break; /* We are finished with this class */
6398
}
6399
}
6400
#endif
6401
/* Char has only one other (usable) case, or UCP not available */
6402
6403
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
6404
code += PUTCHAR(c, code);
6405
break; /* We are finished with this class */
6406
} /* End of 1-char optimization */
6407
6408
/* Handle character classes that contain more than just one literal
6409
character. If there are exactly two characters in a positive class, see if
6410
they are case partners. This can be optimized to generate a caseless single
6411
character match (which also sets first/required code units if relevant).
6412
When casing restrictions apply, ignore a caseless set if both characters
6413
are ASCII. When Turkish casing applies, an 'i' does not match its normal
6414
Unicode "othercase". */
6415
6416
if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
6417
pptr[3] == META_CLASS_END)
6418
{
6419
uint32_t c = pptr[1];
6420
6421
#ifdef SUPPORT_UNICODE
6422
if ((UCD_CASESET(c) == 0 ||
6423
((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6424
c < 128 && pptr[2] < 128)) &&
6425
!((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6426
PCRE2_EXTRA_TURKISH_CASING &&
6427
UCD_ANY_I(c)))
6428
#endif
6429
{
6430
uint32_t d;
6431
6432
#ifdef SUPPORT_UNICODE
6433
if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
6434
#endif
6435
{
6436
#if PCRE2_CODE_UNIT_WIDTH != 8
6437
if (c > 255) d = c; else
6438
#endif
6439
d = TABLE_GET(c, cb->fcc, c);
6440
}
6441
6442
if (c != d && pptr[2] == d)
6443
{
6444
pptr += 3; /* Move on to class end */
6445
meta = c;
6446
if ((options & PCRE2_CASELESS) == 0)
6447
{
6448
reset_caseful = TRUE;
6449
options |= PCRE2_CASELESS;
6450
req_caseopt = REQ_CASELESS;
6451
}
6452
goto CLASS_CASELESS_CHAR;
6453
}
6454
}
6455
}
6456
6457
/* Now emit the OP_CLASS/OP_NCLASS/OP_XCLASS/OP_ALLANY opcode. */
6458
6459
pptr = PRIV(compile_class_not_nested)(options, xoptions, pptr + 1,
6460
&code, meta == META_CLASS_NOT, NULL,
6461
errorcodeptr, cb, lengthptr);
6462
if (pptr == NULL) return 0;
6463
PCRE2_ASSERT(*pptr == META_CLASS_END);
6464
6465
CLASS_END_PROCESSING:
6466
6467
/* If this class is the first thing in the branch, there can be no first
6468
char setting, whatever the repeat count. Any reqcu setting must remain
6469
unchanged after any kind of repeat. */
6470
6471
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6472
zerofirstcu = firstcu;
6473
zerofirstcuflags = firstcuflags;
6474
zeroreqcu = reqcu;
6475
zeroreqcuflags = reqcuflags;
6476
break; /* End of class processing */
6477
6478
6479
/* ===================================================================*/
6480
/* Deal with (*VERB)s. */
6481
6482
/* Check for open captures before ACCEPT and close those that are within
6483
the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6484
assertion. In the first pass, just accumulate the length required;
6485
otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6486
workspace overflow. Do not set firstcu after *ACCEPT. */
6487
6488
case META_ACCEPT:
6489
cb->had_accept = had_accept = TRUE;
6490
for (oc = open_caps;
6491
oc != NULL && oc->assert_depth >= cb->assert_depth;
6492
oc = oc->next)
6493
{
6494
if (lengthptr != NULL)
6495
{
6496
*lengthptr += CU2BYTES(1) + IMM2_SIZE;
6497
}
6498
else
6499
{
6500
*code++ = OP_CLOSE;
6501
PUT2INC(code, 0, oc->number);
6502
}
6503
}
6504
*code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6505
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6506
break;
6507
6508
case META_PRUNE:
6509
case META_SKIP:
6510
cb->had_pruneorskip = TRUE;
6511
PCRE2_FALLTHROUGH /* Fall through */
6512
case META_COMMIT:
6513
case META_FAIL:
6514
*code++ = verbops[(meta - META_MARK) >> 16];
6515
break;
6516
6517
case META_THEN:
6518
cb->external_flags |= PCRE2_HASTHEN;
6519
*code++ = OP_THEN;
6520
break;
6521
6522
/* Handle verbs with arguments. Arguments can be very long, especially in
6523
16- and 32-bit modes, and can overflow the workspace in the first pass.
6524
However, the argument length is constrained to be small enough to fit in
6525
one code unit. This check happens in parse_regex(). In the first pass,
6526
instead of putting the argument into memory, we just update the length
6527
counter and set up an empty argument. */
6528
6529
case META_THEN_ARG:
6530
cb->external_flags |= PCRE2_HASTHEN;
6531
goto VERB_ARG;
6532
6533
case META_PRUNE_ARG:
6534
case META_SKIP_ARG:
6535
cb->had_pruneorskip = TRUE;
6536
PCRE2_FALLTHROUGH /* Fall through */
6537
case META_MARK:
6538
case META_COMMIT_ARG:
6539
VERB_ARG:
6540
*code++ = verbops[(meta - META_MARK) >> 16];
6541
/* The length is in characters. */
6542
verbarglen = *(++pptr);
6543
verbculen = 0;
6544
tempcode = code++;
6545
for (int i = 0; i < (int)verbarglen; i++)
6546
{
6547
meta = *(++pptr);
6548
#ifdef SUPPORT_UNICODE
6549
if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6550
#endif
6551
{
6552
mclength = 1;
6553
mcbuffer[0] = meta;
6554
}
6555
if (lengthptr != NULL) *lengthptr += mclength; else
6556
{
6557
memcpy(code, mcbuffer, CU2BYTES(mclength));
6558
code += mclength;
6559
verbculen += mclength;
6560
}
6561
}
6562
6563
*tempcode = verbculen; /* Fill in the code unit length */
6564
*code++ = 0; /* Terminating zero */
6565
break;
6566
6567
6568
/* ===================================================================*/
6569
/* Handle options change. The new setting must be passed back for use in
6570
subsequent branches. Reset the greedy defaults and the case value for
6571
firstcu and reqcu. */
6572
6573
case META_OPTIONS:
6574
*optionsptr = options = *(++pptr);
6575
*xoptionsptr = xoptions = *(++pptr);
6576
greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6577
greedy_non_default = greedy_default ^ 1;
6578
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6579
break;
6580
6581
/* ===================================================================*/
6582
/* Handle scan substring. Scan substring assertion starts with META_SCS,
6583
which recursively calls compile_branch. The first opcode processed by
6584
this recursive call is always META_OFFSET. */
6585
6586
case META_OFFSET:
6587
if (lengthptr != NULL)
6588
{
6589
pptr = PRIV(compile_parse_scan_substr_args)(pptr, errorcodeptr, cb, lengthptr);
6590
if (pptr == NULL)
6591
return 0;
6592
break;
6593
}
6594
6595
while (TRUE)
6596
{
6597
int count, index;
6598
named_group *ng;
6599
6600
switch (META_CODE(*pptr))
6601
{
6602
case META_OFFSET:
6603
pptr++;
6604
SKIPOFFSET(pptr);
6605
continue;
6606
6607
case META_CAPTURE_NAME:
6608
ng = cb->named_groups + pptr[1];
6609
pptr += 2;
6610
count = 0;
6611
index = 0;
6612
6613
if (!PRIV(compile_find_dupname_details)(ng->name, ng->length, &index,
6614
&count, errorcodeptr, cb)) return 0;
6615
6616
code[0] = OP_DNCREF;
6617
PUT2(code, 1, index);
6618
PUT2(code, 1 + IMM2_SIZE, count);
6619
code += 1 + 2 * IMM2_SIZE;
6620
continue;
6621
6622
case META_CAPTURE_NUMBER:
6623
pptr += 2;
6624
if (pptr[-1] == 0) continue;
6625
6626
code[0] = OP_CREF;
6627
PUT2(code, 1, pptr[-1]);
6628
code += 1 + IMM2_SIZE;
6629
continue;
6630
6631
default:
6632
break;
6633
}
6634
6635
break;
6636
}
6637
--pptr;
6638
break;
6639
6640
case META_SCS:
6641
bravalue = OP_ASSERT_SCS;
6642
cb->assert_depth += 1;
6643
goto GROUP_PROCESS;
6644
6645
6646
/* ===================================================================*/
6647
/* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6648
because it could be a numerical check on recursion, or a name check on a
6649
group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6650
we can handle it either way. We first try for a name; if not found, process
6651
the number. */
6652
6653
case META_COND_RNUMBER: /* (?(Rdigits) */
6654
case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */
6655
case META_COND_RNAME: /* (?(R&name) - test for recursion */
6656
bravalue = OP_COND;
6657
6658
if (lengthptr != NULL)
6659
{
6660
uint32_t i;
6661
PCRE2_SPTR name;
6662
named_group *ng;
6663
uint32_t *start_pptr = pptr;
6664
uint32_t length = *(++pptr);
6665
6666
GETPLUSOFFSET(offset, pptr);
6667
name = cb->start_pattern + offset;
6668
6669
/* In the first pass, the names generated in the pre-pass are available,
6670
but the main name table has not yet been created. Scan the list of names
6671
generated in the pre-pass in order to get a number and whether or not
6672
this name is duplicated. If it is not duplicated, we can handle it as a
6673
numerical group. */
6674
6675
ng = PRIV(compile_find_named_group)(name, length, cb);
6676
6677
if (ng == NULL)
6678
{
6679
/* If the name was not found we have a bad reference, unless we are
6680
dealing with R<digits>, which is treated as a recursion test by
6681
number. */
6682
6683
groupnumber = 0;
6684
if (meta == META_COND_RNUMBER)
6685
{
6686
for (i = 1; i < length; i++)
6687
{
6688
groupnumber = groupnumber * 10 + (name[i] - CHAR_0);
6689
if (groupnumber > MAX_GROUP_NUMBER)
6690
{
6691
*errorcodeptr = ERR61;
6692
cb->erroroffset = offset + i;
6693
return 0;
6694
}
6695
}
6696
}
6697
6698
if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6699
{
6700
*errorcodeptr = ERR15;
6701
cb->erroroffset = offset;
6702
return 0;
6703
}
6704
6705
/* (?Rdigits) treated as a recursion reference by number. A value of
6706
zero (which is the result of both (?R) and (?R0)) means "any", and is
6707
translated into RREF_ANY (which is 0xffff). */
6708
6709
if (groupnumber == 0) groupnumber = RREF_ANY;
6710
PCRE2_ASSERT(start_pptr[0] == META_COND_RNUMBER);
6711
start_pptr[1] = groupnumber;
6712
skipunits = 1+IMM2_SIZE;
6713
goto GROUP_PROCESS_NOTE_EMPTY;
6714
}
6715
6716
/* From here on, we know we have a name (not a number),
6717
so treat META_COND_RNUMBER the same as META_COND_NAME. */
6718
if (meta == META_COND_RNUMBER) meta = META_COND_NAME;
6719
6720
if ((ng->hash_dup & NAMED_GROUP_IS_DUPNAME) == 0)
6721
{
6722
/* Found a non-duplicated name. Since it is a global,
6723
it is enough to update it in the pre-processing phase. */
6724
if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6725
6726
start_pptr[0] = meta;
6727
start_pptr[1] = ng->number;
6728
6729
skipunits = 1 + IMM2_SIZE;
6730
goto GROUP_PROCESS_NOTE_EMPTY;
6731
}
6732
6733
/* We have a duplicated name. In the compile pass we have to search the
6734
main table in order to get the index and count values. */
6735
6736
start_pptr[0] = meta | 1;
6737
start_pptr[1] = (uint32_t)(ng - cb->named_groups);
6738
6739
/* A duplicated name was found. Note that if an R<digits> name is found
6740
(META_COND_RNUMBER), it is a reference test, not a recursion test. */
6741
skipunits = 1 + 2 * IMM2_SIZE;
6742
}
6743
else
6744
{
6745
/* Otherwise lengthptr equals to NULL,
6746
which is the second phase of compilation. */
6747
int count, index;
6748
named_group *ng;
6749
6750
/* Generate code using the data
6751
collected in the pre-processing phase. */
6752
6753
if (meta == META_COND_RNUMBER)
6754
{
6755
code[1+LINK_SIZE] = OP_RREF;
6756
PUT2(code, 2 + LINK_SIZE, pptr[1]);
6757
skipunits = 1 + IMM2_SIZE;
6758
pptr += 1 + SIZEOFFSET;
6759
goto GROUP_PROCESS_NOTE_EMPTY;
6760
}
6761
6762
if (meta_arg == 0)
6763
{
6764
code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6765
PUT2(code, 2 + LINK_SIZE, pptr[1]);
6766
skipunits = 1 + IMM2_SIZE;
6767
pptr += 1 + SIZEOFFSET;
6768
goto GROUP_PROCESS_NOTE_EMPTY;
6769
}
6770
6771
ng = cb->named_groups + pptr[1];
6772
count = 0; /* Values for first pass (avoids compiler warning) */
6773
index = 0;
6774
6775
/* The failed case is an internal error. */
6776
if (!PRIV(compile_find_dupname_details)(ng->name, ng->length, &index,
6777
&count, errorcodeptr, cb)) return 0;
6778
6779
/* A duplicated name was found. Note that if an R<digits> name is found
6780
(META_COND_RNUMBER), it is a reference test, not a recursion test. */
6781
6782
code[1 + LINK_SIZE] = (meta == META_COND_RNAME)? OP_DNRREF : OP_DNCREF;
6783
6784
/* Insert appropriate data values. */
6785
PUT2(code, 2 + LINK_SIZE, index);
6786
PUT2(code, 2 + LINK_SIZE + IMM2_SIZE, count);
6787
skipunits = 1 + 2 * IMM2_SIZE;
6788
pptr += 1 + SIZEOFFSET;
6789
}
6790
6791
PCRE2_ASSERT(meta != META_CAPTURE_NAME);
6792
goto GROUP_PROCESS_NOTE_EMPTY;
6793
6794
/* The DEFINE condition is always false. Its internal groups may never
6795
be called, so matched_char must remain false, hence the jump to
6796
GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6797
6798
case META_COND_DEFINE:
6799
bravalue = OP_COND;
6800
GETPLUSOFFSET(offset, pptr);
6801
code[1+LINK_SIZE] = OP_DEFINE;
6802
skipunits = 1;
6803
goto GROUP_PROCESS;
6804
6805
/* Conditional test of a group's being set. */
6806
6807
case META_COND_NUMBER:
6808
bravalue = OP_COND;
6809
GETPLUSOFFSET(offset, pptr);
6810
6811
groupnumber = *(++pptr);
6812
if (groupnumber > cb->bracount)
6813
{
6814
*errorcodeptr = ERR15;
6815
cb->erroroffset = offset;
6816
return 0;
6817
}
6818
if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6819
6820
/* Point at initial ( for too many branches error */
6821
offset -= 2;
6822
code[1+LINK_SIZE] = OP_CREF;
6823
skipunits = 1+IMM2_SIZE;
6824
PUT2(code, 2+LINK_SIZE, groupnumber);
6825
goto GROUP_PROCESS_NOTE_EMPTY;
6826
6827
/* Test for the PCRE2 version. */
6828
6829
case META_COND_VERSION:
6830
bravalue = OP_COND;
6831
if (pptr[1] > 0)
6832
code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6833
(PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6834
OP_TRUE : OP_FALSE;
6835
else
6836
code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6837
OP_TRUE : OP_FALSE;
6838
skipunits = 1;
6839
pptr += 3;
6840
goto GROUP_PROCESS_NOTE_EMPTY;
6841
6842
/* The condition is an assertion, possibly preceded by a callout. */
6843
6844
case META_COND_ASSERT:
6845
bravalue = OP_COND;
6846
goto GROUP_PROCESS_NOTE_EMPTY;
6847
6848
6849
/* ===================================================================*/
6850
/* Handle all kinds of nested bracketed groups. The non-capturing,
6851
non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6852
6853
case META_LOOKAHEAD:
6854
bravalue = OP_ASSERT;
6855
cb->assert_depth += 1;
6856
goto GROUP_PROCESS;
6857
6858
case META_LOOKAHEAD_NA:
6859
bravalue = OP_ASSERT_NA;
6860
cb->assert_depth += 1;
6861
goto GROUP_PROCESS;
6862
6863
/* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6864
thing to do, but Perl allows all assertions to be quantified, and when
6865
they contain capturing parentheses there may be a potential use for
6866
this feature. Not that that applies to a quantified (?!) but we allow
6867
it for uniformity. */
6868
6869
case META_LOOKAHEADNOT:
6870
if (pptr[1] == META_KET &&
6871
(pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6872
{
6873
*code++ = OP_FAIL;
6874
pptr++;
6875
}
6876
else
6877
{
6878
bravalue = OP_ASSERT_NOT;
6879
cb->assert_depth += 1;
6880
goto GROUP_PROCESS;
6881
}
6882
break;
6883
6884
case META_LOOKBEHIND:
6885
bravalue = OP_ASSERTBACK;
6886
cb->assert_depth += 1;
6887
goto GROUP_PROCESS;
6888
6889
case META_LOOKBEHINDNOT:
6890
bravalue = OP_ASSERTBACK_NOT;
6891
cb->assert_depth += 1;
6892
goto GROUP_PROCESS;
6893
6894
case META_LOOKBEHIND_NA:
6895
bravalue = OP_ASSERTBACK_NA;
6896
cb->assert_depth += 1;
6897
goto GROUP_PROCESS;
6898
6899
case META_ATOMIC:
6900
bravalue = OP_ONCE;
6901
goto GROUP_PROCESS_NOTE_EMPTY;
6902
6903
case META_SCRIPT_RUN:
6904
bravalue = OP_SCRIPT_RUN;
6905
goto GROUP_PROCESS_NOTE_EMPTY;
6906
6907
case META_NOCAPTURE:
6908
bravalue = OP_BRA;
6909
/* Fall through */
6910
6911
/* Process nested bracketed regex. The nesting depth is maintained for the
6912
benefit of the stackguard function. The test for too deep nesting is now
6913
done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6914
others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6915
note of whether or not they may match an empty string. */
6916
6917
GROUP_PROCESS_NOTE_EMPTY:
6918
note_group_empty = TRUE;
6919
6920
GROUP_PROCESS:
6921
cb->parens_depth += 1;
6922
*code = bravalue;
6923
pptr++;
6924
tempcode = code;
6925
tempreqvary = cb->req_varyopt; /* Save value before group */
6926
length_prevgroup = 0; /* Initialize for pre-compile phase */
6927
6928
if ((group_return =
6929
compile_regex(
6930
options, /* The options state */
6931
xoptions, /* The extra options state */
6932
&tempcode, /* Where to put code (updated) */
6933
&pptr, /* Input pointer (updated) */
6934
errorcodeptr, /* Where to put an error message */
6935
skipunits, /* Skip over bracket number */
6936
&subfirstcu, /* For possible first char */
6937
&subfirstcuflags,
6938
&subreqcu, /* For possible last char */
6939
&subreqcuflags,
6940
bcptr, /* Current branch chain */
6941
open_caps, /* Pointer to capture stack */
6942
cb, /* Compile data block */
6943
(lengthptr == NULL)? NULL : /* Actual compile phase */
6944
&length_prevgroup /* Pre-compile phase */
6945
)) == 0)
6946
return 0; /* Error */
6947
6948
cb->parens_depth -= 1;
6949
6950
/* If that was a non-conditional significant group (not an assertion, not a
6951
DEFINE) that matches at least one character, then the current item matches
6952
a character. Conditionals are handled below. */
6953
6954
if (note_group_empty && bravalue != OP_COND && group_return > 0)
6955
matched_char = TRUE;
6956
6957
/* If we've just compiled an assertion, pop the assert depth. */
6958
6959
if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERT_SCS)
6960
cb->assert_depth -= 1;
6961
6962
/* At the end of compiling, code is still pointing to the start of the
6963
group, while tempcode has been updated to point past the end of the group.
6964
The parsed pattern pointer (pptr) is on the closing META_KET.
6965
6966
If this is a conditional bracket, check that there are no more than
6967
two branches in the group, or just one if it's a DEFINE group. We do this
6968
in the real compile phase, not in the pre-pass, where the whole group may
6969
not be available. */
6970
6971
if (bravalue == OP_COND && lengthptr == NULL)
6972
{
6973
PCRE2_UCHAR *tc = code;
6974
int condcount = 0;
6975
6976
do {
6977
condcount++;
6978
tc += GET(tc,1);
6979
}
6980
while (*tc != OP_KET);
6981
6982
/* A DEFINE group is never obeyed inline (the "condition" is always
6983
false). It must have only one branch. Having checked this, change the
6984
opcode to OP_FALSE. */
6985
6986
if (code[LINK_SIZE+1] == OP_DEFINE)
6987
{
6988
if (condcount > 1)
6989
{
6990
cb->erroroffset = offset;
6991
*errorcodeptr = ERR54;
6992
return 0;
6993
}
6994
code[LINK_SIZE+1] = OP_FALSE;
6995
bravalue = OP_DEFINE; /* A flag to suppress char handling below */
6996
}
6997
6998
/* A "normal" conditional group. If there is just one branch, we must not
6999
make use of its firstcu or reqcu, because this is equivalent to an
7000
empty second branch. Also, it may match an empty string. If there are two
7001
branches, this item must match a character if the group must. */
7002
7003
else
7004
{
7005
if (condcount > 2)
7006
{
7007
cb->erroroffset = offset;
7008
*errorcodeptr = ERR27;
7009
return 0;
7010
}
7011
if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
7012
else if (group_return > 0) matched_char = TRUE;
7013
}
7014
}
7015
7016
/* In the pre-compile phase, update the length by the length of the group,
7017
less the brackets at either end. Then reduce the compiled code to just a
7018
set of non-capturing brackets so that it doesn't use much memory if it is
7019
duplicated by a quantifier. */
7020
7021
if (lengthptr != NULL)
7022
{
7023
if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7024
{
7025
*errorcodeptr = ERR20;
7026
return 0;
7027
}
7028
*lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7029
code++; /* This already contains bravalue */
7030
PUTINC(code, 0, 1 + LINK_SIZE);
7031
*code++ = OP_KET;
7032
PUTINC(code, 0, 1 + LINK_SIZE);
7033
break; /* No need to waste time with special character handling */
7034
}
7035
7036
/* Otherwise update the main code pointer to the end of the group. */
7037
7038
code = tempcode;
7039
7040
/* For a DEFINE group, required and first character settings are not
7041
relevant. */
7042
7043
if (bravalue == OP_DEFINE) break;
7044
7045
/* Handle updating of the required and first code units for other types of
7046
group. Update for normal brackets of all kinds, and conditions with two
7047
branches (see code above). If the bracket is followed by a quantifier with
7048
zero repeat, we have to back off. Hence the definition of zeroreqcu and
7049
zerofirstcu outside the main loop so that they can be accessed for the back
7050
off. */
7051
7052
zeroreqcu = reqcu;
7053
zeroreqcuflags = reqcuflags;
7054
zerofirstcu = firstcu;
7055
zerofirstcuflags = firstcuflags;
7056
groupsetfirstcu = FALSE;
7057
7058
if (bravalue >= OP_ONCE) /* Not an assertion */
7059
{
7060
/* If we have not yet set a firstcu in this branch, take it from the
7061
subpattern, remembering that it was set here so that a repeat of more
7062
than one can replicate it as reqcu if necessary. If the subpattern has
7063
no firstcu, set "none" for the whole branch. In both cases, a zero
7064
repeat forces firstcu to "none". */
7065
7066
if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
7067
{
7068
if (subfirstcuflags < REQ_NONE)
7069
{
7070
firstcu = subfirstcu;
7071
firstcuflags = subfirstcuflags;
7072
groupsetfirstcu = TRUE;
7073
}
7074
else firstcuflags = REQ_NONE;
7075
zerofirstcuflags = REQ_NONE;
7076
}
7077
7078
/* If firstcu was previously set, convert the subpattern's firstcu
7079
into reqcu if there wasn't one, using the vary flag that was in
7080
existence beforehand. */
7081
7082
else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
7083
{
7084
subreqcu = subfirstcu;
7085
subreqcuflags = subfirstcuflags | tempreqvary;
7086
}
7087
7088
/* If the subpattern set a required code unit (or set a first code unit
7089
that isn't really the first code unit - see above), set it. */
7090
7091
if (subreqcuflags < REQ_NONE)
7092
{
7093
reqcu = subreqcu;
7094
reqcuflags = subreqcuflags;
7095
}
7096
}
7097
7098
/* For a forward assertion, we take the reqcu, if set, provided that the
7099
group has also set a firstcu. This can be helpful if the pattern that
7100
follows the assertion doesn't set a different char. For example, it's
7101
useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
7102
because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7103
the "real" "a" would then become a reqcu instead of a firstcu. This is
7104
overcome by a scan at the end if there's no firstcu, looking for an
7105
asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
7106
we must only take the reqcu when the group also set a firstcu. Otherwise,
7107
in that example, 'X' ends up set for both. */
7108
7109
else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
7110
subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
7111
{
7112
reqcu = subreqcu;
7113
reqcuflags = subreqcuflags;
7114
}
7115
7116
break; /* End of nested group handling */
7117
7118
7119
/* ===================================================================*/
7120
/* Handle named backreferences and recursions. */
7121
7122
case META_BACKREF_BYNAME:
7123
case META_RECURSE_BYNAME:
7124
{
7125
int count, index;
7126
PCRE2_SPTR name;
7127
named_group *ng;
7128
uint32_t length = *(++pptr);
7129
7130
GETPLUSOFFSET(offset, pptr);
7131
name = cb->start_pattern + offset;
7132
7133
/* In the first pass, the names generated in the pre-pass are available,
7134
but the main name table has not yet been created. Scan the list of names
7135
generated in the pre-pass in order to get a number and whether or not
7136
this name is duplicated. */
7137
7138
ng = PRIV(compile_find_named_group)(name, length, cb);
7139
7140
if (ng == NULL)
7141
{
7142
/* If the name was not found we have a bad reference. */
7143
*errorcodeptr = ERR15;
7144
cb->erroroffset = offset;
7145
return 0;
7146
}
7147
7148
groupnumber = ng->number;
7149
7150
/* For a recursion, that's all that is needed. We can now go to
7151
the code that handles numerical recursion, applying it to the first
7152
group with the given name. */
7153
7154
if (meta == META_RECURSE_BYNAME)
7155
{
7156
meta_arg = groupnumber;
7157
goto HANDLE_NUMERICAL_RECURSION;
7158
}
7159
7160
/* For a back reference, update the back reference map and the
7161
maximum back reference. */
7162
7163
cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
7164
if (groupnumber > cb->top_backref)
7165
cb->top_backref = groupnumber;
7166
7167
/* If a back reference name is not duplicated, we can handle it as
7168
a numerical reference. */
7169
7170
if ((ng->hash_dup & NAMED_GROUP_IS_DUPNAME) == 0)
7171
{
7172
meta_arg = groupnumber;
7173
goto HANDLE_SINGLE_REFERENCE;
7174
}
7175
7176
/* If a back reference name is duplicated, we generate a different
7177
opcode to a numerical back reference. In the second pass we must
7178
search for the index and count in the final name table. */
7179
7180
count = 0; /* Values for first pass (avoids compiler warning) */
7181
index = 0;
7182
if (lengthptr == NULL && !PRIV(compile_find_dupname_details)(name, length,
7183
&index, &count, errorcodeptr, cb)) return 0;
7184
7185
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7186
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7187
PUT2INC(code, 0, index);
7188
PUT2INC(code, 0, count);
7189
if ((options & PCRE2_CASELESS) != 0)
7190
*code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
7191
REFI_FLAG_CASELESS_RESTRICT : 0) |
7192
(((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
7193
REFI_FLAG_TURKISH_CASING : 0);
7194
}
7195
break;
7196
7197
7198
/* ===================================================================*/
7199
/* Handle a numerical callout. */
7200
7201
case META_CALLOUT_NUMBER:
7202
code[0] = OP_CALLOUT;
7203
PUT(code, 1, pptr[1]); /* Offset to next pattern item */
7204
PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
7205
code[1 + 2*LINK_SIZE] = pptr[3];
7206
pptr += 3;
7207
code += PRIV(OP_lengths)[OP_CALLOUT];
7208
break;
7209
7210
7211
/* ===================================================================*/
7212
/* Handle a callout with a string argument. In the pre-pass we just compute
7213
the length without generating anything. The length in pptr[3] includes both
7214
delimiters; in the actual compile only the first one is copied, but a
7215
terminating zero is added. Any doubled delimiters within the string make
7216
this an overestimate, but it is not worth bothering about. */
7217
7218
case META_CALLOUT_STRING:
7219
if (lengthptr != NULL)
7220
{
7221
*lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7222
pptr += 3;
7223
SKIPOFFSET(pptr);
7224
}
7225
7226
/* In the real compile we can copy the string. The starting delimiter is
7227
included so that the client can discover it if they want. We also pass the
7228
start offset to help a script language give better error messages. */
7229
7230
else
7231
{
7232
PCRE2_SPTR pp;
7233
uint32_t delimiter;
7234
uint32_t length = pptr[3];
7235
PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7236
7237
code[0] = OP_CALLOUT_STR;
7238
PUT(code, 1, pptr[1]); /* Offset to next pattern item */
7239
PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
7240
7241
pptr += 3;
7242
GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */
7243
pp = cb->start_pattern + offset;
7244
delimiter = *callout_string++ = *pp++;
7245
if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7246
delimiter = CHAR_RIGHT_CURLY_BRACKET;
7247
PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */
7248
7249
/* The syntax of the pattern was checked in the parsing scan. The length
7250
includes both delimiters, but we have passed the opening one just above,
7251
so we reduce length before testing it. The test is for > 1 because we do
7252
not want to copy the final delimiter. This also ensures that pp[1] is
7253
accessible. */
7254
7255
while (--length > 1)
7256
{
7257
if (*pp == delimiter && pp[1] == delimiter)
7258
{
7259
*callout_string++ = delimiter;
7260
pp += 2;
7261
length--;
7262
}
7263
else *callout_string++ = *pp++;
7264
}
7265
*callout_string++ = CHAR_NUL;
7266
7267
/* Set the length of the entire item, then advance to its end. */
7268
7269
PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7270
code = callout_string;
7271
}
7272
break;
7273
7274
7275
/* ===================================================================*/
7276
/* Handle repetition. The different types are all sorted out in the parsing
7277
pass. */
7278
7279
case META_MINMAX_PLUS:
7280
case META_MINMAX_QUERY:
7281
case META_MINMAX:
7282
repeat_min = *(++pptr);
7283
repeat_max = *(++pptr);
7284
goto REPEAT;
7285
7286
case META_ASTERISK:
7287
case META_ASTERISK_PLUS:
7288
case META_ASTERISK_QUERY:
7289
repeat_min = 0;
7290
repeat_max = REPEAT_UNLIMITED;
7291
goto REPEAT;
7292
7293
case META_PLUS:
7294
case META_PLUS_PLUS:
7295
case META_PLUS_QUERY:
7296
repeat_min = 1;
7297
repeat_max = REPEAT_UNLIMITED;
7298
goto REPEAT;
7299
7300
case META_QUERY:
7301
case META_QUERY_PLUS:
7302
case META_QUERY_QUERY:
7303
repeat_min = 0;
7304
repeat_max = 1;
7305
7306
REPEAT:
7307
if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7308
7309
/* Remember whether this is a variable length repeat, and default to
7310
single-char opcodes. */
7311
7312
reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7313
7314
/* Adjust first and required code units for a zero repeat. */
7315
7316
if (repeat_min == 0)
7317
{
7318
firstcu = zerofirstcu;
7319
firstcuflags = zerofirstcuflags;
7320
reqcu = zeroreqcu;
7321
reqcuflags = zeroreqcuflags;
7322
}
7323
7324
/* Note the greediness and possessiveness. */
7325
7326
switch (meta)
7327
{
7328
case META_MINMAX_PLUS:
7329
case META_ASTERISK_PLUS:
7330
case META_PLUS_PLUS:
7331
case META_QUERY_PLUS:
7332
repeat_type = 0; /* Force greedy */
7333
possessive_quantifier = TRUE;
7334
break;
7335
7336
case META_MINMAX_QUERY:
7337
case META_ASTERISK_QUERY:
7338
case META_PLUS_QUERY:
7339
case META_QUERY_QUERY:
7340
repeat_type = greedy_non_default;
7341
possessive_quantifier = FALSE;
7342
break;
7343
7344
default:
7345
repeat_type = greedy_default;
7346
possessive_quantifier = FALSE;
7347
break;
7348
}
7349
7350
/* Save start of previous item, in case we have to move it up in order to
7351
insert something before it, and remember what it was. */
7352
7353
PCRE2_ASSERT(previous != NULL);
7354
tempcode = previous;
7355
op_previous = *previous;
7356
7357
/* Now handle repetition for the different types of item. If the repeat
7358
minimum and the repeat maximum are both 1, we can ignore the quantifier for
7359
non-parenthesized items, as they have only one alternative. For anything in
7360
parentheses, we must not ignore if {1} is possessive. */
7361
7362
switch (op_previous)
7363
{
7364
/* If previous was a character or negated character match, abolish the
7365
item and generate a repeat item instead. If a char item has a minimum of
7366
more than one, ensure that it is set in reqcu - it might not be if a
7367
sequence such as x{3} is the first thing in a branch because the x will
7368
have gone into firstcu instead. */
7369
7370
case OP_CHAR:
7371
case OP_CHARI:
7372
case OP_NOT:
7373
case OP_NOTI:
7374
if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7375
op_type = chartypeoffset[op_previous - OP_CHAR];
7376
7377
/* Deal with UTF characters that take up more than one code unit. */
7378
7379
#ifdef MAYBE_UTF_MULTI
7380
if (utf && NOT_FIRSTCU(code[-1]))
7381
{
7382
PCRE2_UCHAR *lastchar = code - 1;
7383
BACKCHAR(lastchar);
7384
mclength = (uint32_t)(code - lastchar); /* Length of UTF character */
7385
memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */
7386
}
7387
else
7388
#endif /* MAYBE_UTF_MULTI */
7389
7390
/* Handle the case of a single code unit - either with no UTF support, or
7391
with UTF disabled, or for a single-code-unit UTF character. In the latter
7392
case, for a repeated positive match, get the caseless flag for the
7393
required code unit from the previous character, because a class like [Aa]
7394
sets a caseless A but by now the req_caseopt flag has been reset. */
7395
7396
{
7397
mcbuffer[0] = code[-1];
7398
mclength = 1;
7399
if (op_previous <= OP_CHARI && repeat_min > 1)
7400
{
7401
reqcu = mcbuffer[0];
7402
reqcuflags = cb->req_varyopt;
7403
if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7404
}
7405
}
7406
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
7407
7408
/* If previous was a character class or a back reference, we put the
7409
repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7410
7411
#ifdef SUPPORT_WIDE_CHARS
7412
case OP_XCLASS:
7413
case OP_ECLASS:
7414
#endif
7415
case OP_CLASS:
7416
case OP_NCLASS:
7417
case OP_REF:
7418
case OP_REFI:
7419
case OP_DNREF:
7420
case OP_DNREFI:
7421
7422
if (repeat_max == 0)
7423
{
7424
code = previous;
7425
goto END_REPEAT;
7426
}
7427
if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7428
7429
if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7430
*code++ = OP_CRSTAR + repeat_type;
7431
else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7432
*code++ = OP_CRPLUS + repeat_type;
7433
else if (repeat_min == 0 && repeat_max == 1)
7434
*code++ = OP_CRQUERY + repeat_type;
7435
else
7436
{
7437
*code++ = OP_CRRANGE + repeat_type;
7438
PUT2INC(code, 0, repeat_min);
7439
if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */
7440
PUT2INC(code, 0, repeat_max);
7441
}
7442
break;
7443
7444
/* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7445
because pcre2_match() could not handle backtracking into recursively
7446
called groups. Now that this backtracking is available, we no longer need
7447
to do this. However, we still need to replicate recursions as we do for
7448
groups so as to have independent backtracking points. We can replicate
7449
for the minimum number of repeats directly. For optional repeats we now
7450
wrap the recursion in OP_BRA brackets and make use of the bracket
7451
repetition. */
7452
7453
case OP_RECURSE:
7454
if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7455
goto END_REPEAT;
7456
7457
/* Generate unwrapped repeats for a non-zero minimum, except when the
7458
minimum is 1 and the maximum unlimited, because that can be handled with
7459
OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7460
minimum, we just need to generate the appropriate additional copies.
7461
Otherwise we need to generate one more, to simulate the situation when
7462
the minimum is zero. */
7463
7464
if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7465
{
7466
int replicate = repeat_min;
7467
7468
if (repeat_min == repeat_max) replicate--;
7469
7470
/* In the pre-compile phase, we don't actually do the replication. We
7471
just adjust the length as if we had. Do some paranoid checks for
7472
potential integer overflow. */
7473
7474
if (lengthptr != NULL)
7475
{
7476
PCRE2_SIZE delta;
7477
if (PRIV(ckd_smul)(&delta, replicate, (int)length_prevgroup) ||
7478
OFLOW_MAX - *lengthptr < delta)
7479
{
7480
*errorcodeptr = ERR20;
7481
return 0;
7482
}
7483
*lengthptr += delta;
7484
}
7485
else for (int i = 0; i < replicate; i++)
7486
{
7487
memcpy(code, previous, CU2BYTES(length_prevgroup));
7488
previous = code;
7489
code += length_prevgroup;
7490
}
7491
7492
/* If the number of repeats is fixed, we are done. Otherwise, adjust
7493
the counts and fall through. */
7494
7495
if (repeat_min == repeat_max) break;
7496
if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7497
repeat_min = 0;
7498
}
7499
7500
/* Wrap the recursion call in OP_BRA brackets. */
7501
{
7502
PCRE2_SIZE length = (lengthptr != NULL) ? 1 + LINK_SIZE : length_prevgroup;
7503
7504
(void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(length));
7505
op_previous = *previous = OP_BRA;
7506
PUT(previous, 1, 1 + LINK_SIZE + length);
7507
previous[1 + LINK_SIZE + length] = OP_KET;
7508
PUT(previous, 2 + LINK_SIZE + length, 1 + LINK_SIZE + length);
7509
}
7510
code += 2 + 2 * LINK_SIZE;
7511
length_prevgroup += 2 + 2 * LINK_SIZE;
7512
group_return = -1; /* Set "may match empty string" */
7513
7514
/* Now treat as a repeated OP_BRA. */
7515
PCRE2_FALLTHROUGH /* Fall through */
7516
7517
/* If previous was a bracket group, we may have to replicate it in
7518
certain cases. Note that at this point we can encounter only the "basic"
7519
bracket opcodes such as BRA and CBRA, as this is the place where they get
7520
converted into the more special varieties such as BRAPOS and SBRA.
7521
Originally, PCRE did not allow repetition of assertions, but now it does,
7522
for Perl compatibility. */
7523
7524
case OP_ASSERT:
7525
case OP_ASSERT_NOT:
7526
case OP_ASSERT_NA:
7527
case OP_ASSERTBACK:
7528
case OP_ASSERTBACK_NOT:
7529
case OP_ASSERTBACK_NA:
7530
case OP_ASSERT_SCS:
7531
case OP_ONCE:
7532
case OP_SCRIPT_RUN:
7533
case OP_BRA:
7534
case OP_CBRA:
7535
case OP_COND:
7536
{
7537
int len = (int)(code - previous);
7538
PCRE2_UCHAR *bralink = NULL;
7539
PCRE2_UCHAR *brazeroptr = NULL;
7540
7541
if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7542
goto END_REPEAT;
7543
7544
/* Repeating a DEFINE group (or any group where the condition is always
7545
FALSE and there is only one branch) is pointless, but Perl allows the
7546
syntax, so we just ignore the repeat. */
7547
7548
if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7549
previous[GET(previous, 1)] != OP_ALT)
7550
goto END_REPEAT;
7551
7552
/* Perl allows all assertions to be quantified, and when they contain
7553
capturing parentheses and/or are optional there are potential uses for
7554
this feature. PCRE2 used to force the maximum quantifier to 1 on the
7555
invalid grounds that further repetition was never useful. This was
7556
always a bit pointless, since an assertion could be wrapped with a
7557
repeated group to achieve the effect. General repetition is now
7558
permitted, but if the maximum is unlimited it is set to one more than
7559
the minimum. */
7560
7561
if (op_previous < OP_ONCE) /* Assertion */
7562
{
7563
if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7564
}
7565
7566
/* The case of a zero minimum is special because of the need to stick
7567
OP_BRAZERO in front of it, and because the group appears once in the
7568
data, whereas in other cases it appears the minimum number of times. For
7569
this reason, it is simplest to treat this case separately, as otherwise
7570
the code gets far too messy. There are several special subcases when the
7571
minimum is zero. */
7572
7573
if (repeat_min == 0)
7574
{
7575
/* If the maximum is also zero, we used to just omit the group from
7576
the output altogether, like this:
7577
7578
** if (repeat_max == 0)
7579
** {
7580
** code = previous;
7581
** goto END_REPEAT;
7582
** }
7583
7584
However, that fails when a group or a subgroup within it is
7585
referenced as a subroutine from elsewhere in the pattern, so now we
7586
stick in OP_SKIPZERO in front of it so that it is skipped on
7587
execution. As we don't have a list of which groups are referenced, we
7588
cannot do this selectively.
7589
7590
If the maximum is 1 or unlimited, we just have to stick in the
7591
BRAZERO and do no more at this point. */
7592
7593
if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7594
{
7595
(void)memmove(previous + 1, previous, CU2BYTES(len));
7596
code++;
7597
if (repeat_max == 0)
7598
{
7599
*previous++ = OP_SKIPZERO;
7600
goto END_REPEAT;
7601
}
7602
brazeroptr = previous; /* Save for possessive optimizing */
7603
*previous++ = OP_BRAZERO + repeat_type;
7604
}
7605
7606
/* If the maximum is greater than 1 and limited, we have to replicate
7607
in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7608
The first one has to be handled carefully because it's the original
7609
copy, which has to be moved up. The remainder can be handled by code
7610
that is common with the non-zero minimum case below. We have to
7611
adjust the value of repeat_max, since one less copy is required. */
7612
7613
else
7614
{
7615
int linkoffset;
7616
(void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7617
code += 2 + LINK_SIZE;
7618
*previous++ = OP_BRAZERO + repeat_type;
7619
*previous++ = OP_BRA;
7620
7621
/* We chain together the bracket link offset fields that have to be
7622
filled in later when the ends of the brackets are reached. */
7623
7624
linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7625
bralink = previous;
7626
PUTINC(previous, 0, linkoffset);
7627
}
7628
7629
if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7630
}
7631
7632
/* If the minimum is greater than zero, replicate the group as many
7633
times as necessary, and adjust the maximum to the number of subsequent
7634
copies that we need. */
7635
7636
else
7637
{
7638
if (repeat_min > 1)
7639
{
7640
/* In the pre-compile phase, we don't actually do the replication.
7641
We just adjust the length as if we had. Do some paranoid checks for
7642
potential integer overflow. */
7643
7644
if (lengthptr != NULL)
7645
{
7646
PCRE2_SIZE delta;
7647
if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7648
(int)length_prevgroup) ||
7649
OFLOW_MAX - *lengthptr < delta)
7650
{
7651
*errorcodeptr = ERR20;
7652
return 0;
7653
}
7654
*lengthptr += delta;
7655
}
7656
7657
/* This is compiling for real. If there is a set first code unit
7658
for the group, and we have not yet set a "required code unit", set
7659
it. */
7660
7661
else
7662
{
7663
if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7664
{
7665
reqcu = firstcu;
7666
reqcuflags = firstcuflags;
7667
}
7668
for (uint32_t i = 1; i < repeat_min; i++)
7669
{
7670
memcpy(code, previous, CU2BYTES(len));
7671
code += len;
7672
}
7673
}
7674
}
7675
7676
if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7677
}
7678
7679
/* This code is common to both the zero and non-zero minimum cases. If
7680
the maximum is limited, it replicates the group in a nested fashion,
7681
remembering the bracket starts on a stack. In the case of a zero
7682
minimum, the first one was set up above. In all cases the repeat_max
7683
now specifies the number of additional copies needed. Again, we must
7684
remember to replicate entries on the forward reference list. */
7685
7686
if (repeat_max != REPEAT_UNLIMITED)
7687
{
7688
/* In the pre-compile phase, we don't actually do the replication. We
7689
just adjust the length as if we had. For each repetition we must add
7690
1 to the length for BRAZERO and for all but the last repetition we
7691
must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7692
paranoid checks to avoid integer overflow. */
7693
7694
if (lengthptr != NULL && repeat_max > 0)
7695
{
7696
PCRE2_SIZE delta;
7697
if (PRIV(ckd_smul)(&delta, repeat_max,
7698
(int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7699
OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7700
{
7701
*errorcodeptr = ERR20;
7702
return 0;
7703
}
7704
delta -= (2 + 2*LINK_SIZE); /* Last one doesn't nest */
7705
*lengthptr += delta;
7706
}
7707
7708
/* This is compiling for real */
7709
7710
else for (uint32_t i = repeat_max; i >= 1; i--)
7711
{
7712
*code++ = OP_BRAZERO + repeat_type;
7713
7714
/* All but the final copy start a new nesting, maintaining the
7715
chain of brackets outstanding. */
7716
7717
if (i != 1)
7718
{
7719
int linkoffset;
7720
*code++ = OP_BRA;
7721
linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7722
bralink = code;
7723
PUTINC(code, 0, linkoffset);
7724
}
7725
7726
memcpy(code, previous, CU2BYTES(len));
7727
code += len;
7728
}
7729
7730
/* Now chain through the pending brackets, and fill in their length
7731
fields (which are holding the chain links pro tem). */
7732
7733
while (bralink != NULL)
7734
{
7735
int oldlinkoffset;
7736
int linkoffset = (int)(code - bralink + 1);
7737
PCRE2_UCHAR *bra = code - linkoffset;
7738
oldlinkoffset = GET(bra, 1);
7739
bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7740
*code++ = OP_KET;
7741
PUTINC(code, 0, linkoffset);
7742
PUT(bra, 1, linkoffset);
7743
}
7744
}
7745
7746
/* If the maximum is unlimited, set a repeater in the final copy. For
7747
SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7748
possessively repeated ONCE brackets can be converted into non-capturing
7749
brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7750
saves having to deal with possessive ONCEs specially.
7751
7752
Otherwise, when we are doing the actual compile phase, check to see
7753
whether this group is one that could match an empty string. If so,
7754
convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7755
that runtime checking can be done. [This check is also applied to ONCE
7756
and SCRIPT_RUN groups at runtime, but in a different way.]
7757
7758
Then, if the quantifier was possessive and the bracket is not a
7759
conditional, we convert the BRA code to the POS form, and the KET code
7760
to KETRPOS. (It turns out to be convenient at runtime to detect this
7761
kind of subpattern at both the start and at the end.) The use of
7762
special opcodes makes it possible to reduce greatly the stack usage in
7763
pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7764
OP_BRAPOSZERO.
7765
7766
Then, if the minimum number of matches is 1 or 0, cancel the possessive
7767
flag so that the default action below, of wrapping everything inside
7768
atomic brackets, does not happen. When the minimum is greater than 1,
7769
there will be earlier copies of the group, and so we still have to wrap
7770
the whole thing. */
7771
7772
else
7773
{
7774
PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7775
PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7776
7777
/* Convert possessive ONCE brackets to non-capturing */
7778
7779
if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7780
7781
/* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7782
to do is to set the KET. */
7783
7784
if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7785
*ketcode = OP_KETRMAX + repeat_type;
7786
7787
/* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7788
(which have been converted to non-capturing above). */
7789
7790
else
7791
{
7792
/* In the compile phase, adjust the opcode if the group can match
7793
an empty string. For a conditional group with only one branch, the
7794
value of group_return will not show "could be empty", so we must
7795
check that separately. */
7796
7797
if (lengthptr == NULL)
7798
{
7799
if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7800
if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7801
*bracode = OP_SCOND;
7802
}
7803
7804
/* Handle possessive quantifiers. */
7805
7806
if (possessive_quantifier)
7807
{
7808
/* For COND brackets, we wrap the whole thing in a possessively
7809
repeated non-capturing bracket, because we have not invented POS
7810
versions of the COND opcodes. */
7811
7812
if (*bracode == OP_COND || *bracode == OP_SCOND)
7813
{
7814
int nlen = (int)(code - bracode);
7815
(void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7816
code += 1 + LINK_SIZE;
7817
nlen += 1 + LINK_SIZE;
7818
*bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7819
*code++ = OP_KETRPOS;
7820
PUTINC(code, 0, nlen);
7821
PUT(bracode, 1, nlen);
7822
}
7823
7824
/* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7825
7826
else
7827
{
7828
*bracode += 1; /* Switch to xxxPOS opcodes */
7829
*ketcode = OP_KETRPOS;
7830
}
7831
7832
/* If the minimum is zero, mark it as possessive, then unset the
7833
possessive flag when the minimum is 0 or 1. */
7834
7835
if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7836
if (repeat_min < 2) possessive_quantifier = FALSE;
7837
}
7838
7839
/* Non-possessive quantifier */
7840
7841
else *ketcode = OP_KETRMAX + repeat_type;
7842
}
7843
}
7844
}
7845
break;
7846
7847
/* If previous was a character type match (\d or similar), abolish it and
7848
create a suitable repeat item. The code is shared with single-character
7849
repeats by setting op_type to add a suitable offset into repeat_type.
7850
Note that the Unicode property types will be present only when
7851
SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7852
here because it just makes it horribly messy. */
7853
7854
default:
7855
7856
/* LCOV_EXCL_START */
7857
if (op_previous >= OP_EODN || op_previous <= OP_WORD_BOUNDARY)
7858
{
7859
PCRE2_DEBUG_UNREACHABLE();
7860
*errorcodeptr = ERR10; /* Not a character type - internal error */
7861
return 0;
7862
}
7863
/* LCOV_EXCL_STOP */
7864
7865
{
7866
int prop_type, prop_value;
7867
PCRE2_UCHAR *oldcode;
7868
7869
if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7870
7871
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
7872
mclength = 0; /* Not a character */
7873
7874
if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7875
{
7876
prop_type = previous[1];
7877
prop_value = previous[2];
7878
}
7879
else
7880
{
7881
/* Come here from just above with a character in mcbuffer/mclength.
7882
You must also set op_type before the jump. */
7883
OUTPUT_SINGLE_REPEAT:
7884
prop_type = prop_value = -1;
7885
}
7886
7887
/* At this point, if prop_type == prop_value == -1 we either have a
7888
character in mcbuffer when mclength is greater than zero, or we have
7889
mclength zero, in which case there is a non-property character type in
7890
op_previous. If prop_type/value are not negative, we have a property
7891
character type in op_previous. */
7892
7893
oldcode = code; /* Save where we were */
7894
code = previous; /* Usually overwrite previous item */
7895
7896
/* If the maximum is zero then the minimum must also be zero; Perl allows
7897
this case, so we do too - by simply omitting the item altogether. */
7898
7899
if (repeat_max == 0) goto END_REPEAT;
7900
7901
/* Combine the op_type with the repeat_type */
7902
7903
repeat_type += op_type;
7904
7905
/* A minimum of zero is handled either as the special case * or ?, or as
7906
an UPTO, with the maximum given. */
7907
7908
if (repeat_min == 0)
7909
{
7910
if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7911
else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7912
else
7913
{
7914
*code++ = OP_UPTO + repeat_type;
7915
PUT2INC(code, 0, repeat_max);
7916
}
7917
}
7918
7919
/* A repeat minimum of 1 is optimized into some special cases. If the
7920
maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7921
left in place and, if the maximum is greater than 1, we use OP_UPTO with
7922
one less than the maximum. */
7923
7924
else if (repeat_min == 1)
7925
{
7926
if (repeat_max == REPEAT_UNLIMITED)
7927
*code++ = OP_PLUS + repeat_type;
7928
else
7929
{
7930
code = oldcode; /* Leave previous item in place */
7931
if (repeat_max == 1) goto END_REPEAT;
7932
*code++ = OP_UPTO + repeat_type;
7933
PUT2INC(code, 0, repeat_max - 1);
7934
}
7935
}
7936
7937
/* The case {n,n} is just an EXACT, while the general case {n,m} is
7938
handled as an EXACT followed by an UPTO or STAR or QUERY. */
7939
7940
else
7941
{
7942
*code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
7943
PUT2INC(code, 0, repeat_min);
7944
7945
/* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7946
and then generate the second opcode. For a repeated Unicode property
7947
match, there are two extra values that define the required property,
7948
and mclength is set zero to indicate this. */
7949
7950
if (repeat_max != repeat_min)
7951
{
7952
if (mclength > 0)
7953
{
7954
memcpy(code, mcbuffer, CU2BYTES(mclength));
7955
code += mclength;
7956
}
7957
else
7958
{
7959
*code++ = op_previous;
7960
if (prop_type >= 0)
7961
{
7962
*code++ = prop_type;
7963
*code++ = prop_value;
7964
}
7965
}
7966
7967
/* Now set up the following opcode */
7968
7969
if (repeat_max == REPEAT_UNLIMITED)
7970
*code++ = OP_STAR + repeat_type;
7971
else
7972
{
7973
repeat_max -= repeat_min;
7974
if (repeat_max == 1)
7975
{
7976
*code++ = OP_QUERY + repeat_type;
7977
}
7978
else
7979
{
7980
*code++ = OP_UPTO + repeat_type;
7981
PUT2INC(code, 0, repeat_max);
7982
}
7983
}
7984
}
7985
}
7986
7987
/* Fill in the character or character type for the final opcode. */
7988
7989
if (mclength > 0)
7990
{
7991
memcpy(code, mcbuffer, CU2BYTES(mclength));
7992
code += mclength;
7993
}
7994
else
7995
{
7996
*code++ = op_previous;
7997
if (prop_type >= 0)
7998
{
7999
*code++ = prop_type;
8000
*code++ = prop_value;
8001
}
8002
}
8003
}
8004
break;
8005
} /* End of switch on different op_previous values */
8006
8007
8008
/* If the character following a repeat is '+', possessive_quantifier is
8009
TRUE. For some opcodes, there are special alternative opcodes for this
8010
case. For anything else, we wrap the entire repeated item inside OP_ONCE
8011
brackets. Logically, the '+' notation is just syntactic sugar, taken from
8012
Sun's Java package, but the special opcodes can optimize it.
8013
8014
Some (but not all) possessively repeated subpatterns have already been
8015
completely handled in the code just above. For them, possessive_quantifier
8016
is always FALSE at this stage. Note that the repeated item starts at
8017
tempcode, not at previous, which might be the first part of a string whose
8018
(former) last char we repeated. */
8019
8020
if (possessive_quantifier)
8021
{
8022
int len;
8023
8024
/* Possessifying an EXACT quantifier has no effect, so we can ignore it.
8025
However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
8026
{5,}, or {5,10}). We skip over an EXACT item; if the length of what
8027
remains is greater than zero, there's a further opcode that can be
8028
handled. If not, do nothing, leaving the EXACT alone. */
8029
8030
switch(*tempcode)
8031
{
8032
case OP_TYPEEXACT:
8033
tempcode += PRIV(OP_lengths)[*tempcode] +
8034
((tempcode[1 + IMM2_SIZE] == OP_PROP
8035
|| tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
8036
break;
8037
8038
/* CHAR opcodes are used for exacts whose count is 1. */
8039
8040
case OP_CHAR:
8041
case OP_CHARI:
8042
case OP_NOT:
8043
case OP_NOTI:
8044
case OP_EXACT:
8045
case OP_EXACTI:
8046
case OP_NOTEXACT:
8047
case OP_NOTEXACTI:
8048
tempcode += PRIV(OP_lengths)[*tempcode];
8049
#ifdef SUPPORT_UNICODE
8050
if (utf && HAS_EXTRALEN(tempcode[-1]))
8051
tempcode += GET_EXTRALEN(tempcode[-1]);
8052
#endif
8053
break;
8054
8055
/* For the class opcodes, the repeat operator appears at the end;
8056
adjust tempcode to point to it. */
8057
8058
case OP_CLASS:
8059
case OP_NCLASS:
8060
tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
8061
break;
8062
8063
#ifdef SUPPORT_WIDE_CHARS
8064
case OP_XCLASS:
8065
case OP_ECLASS:
8066
tempcode += GET(tempcode, 1);
8067
break;
8068
#endif
8069
}
8070
8071
/* If tempcode is equal to code (which points to the end of the repeated
8072
item), it means we have skipped an EXACT item but there is no following
8073
QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
8074
all other cases, tempcode will be pointing to the repeat opcode, and will
8075
be less than code, so the value of len will be greater than 0. */
8076
8077
len = (int)(code - tempcode);
8078
if (len > 0)
8079
{
8080
unsigned int repcode = *tempcode;
8081
8082
/* There is a table for possessifying opcodes, all of which are less
8083
than OP_CALLOUT. A zero entry means there is no possessified version.
8084
*/
8085
8086
if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
8087
*tempcode = opcode_possessify[repcode];
8088
8089
/* For opcode without a special possessified version, wrap the item in
8090
ONCE brackets. */
8091
8092
else
8093
{
8094
(void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
8095
code += 1 + LINK_SIZE;
8096
len += 1 + LINK_SIZE;
8097
tempcode[0] = OP_ONCE;
8098
*code++ = OP_KET;
8099
PUTINC(code, 0, len);
8100
PUT(tempcode, 1, len);
8101
}
8102
}
8103
}
8104
8105
/* We set the "follows varying string" flag for subsequently encountered
8106
reqcus if it isn't already set and we have just passed a varying length
8107
item. */
8108
8109
END_REPEAT:
8110
cb->req_varyopt |= reqvary;
8111
break;
8112
8113
8114
/* ===================================================================*/
8115
/* Handle a 32-bit data character with a value greater than META_END. */
8116
8117
case META_BIGVALUE:
8118
pptr++;
8119
goto NORMAL_CHAR;
8120
8121
8122
/* ===============================================================*/
8123
/* Handle a back reference by number, which is the meta argument. The
8124
pattern offsets for back references to group numbers less than 10 are held
8125
in a special vector, to avoid using more than two parsed pattern elements
8126
in 64-bit environments. We only need the offset to the first occurrence,
8127
because if that doesn't fail, subsequent ones will also be OK. */
8128
8129
case META_BACKREF:
8130
if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
8131
else GETPLUSOFFSET(offset, pptr);
8132
8133
if (meta_arg > cb->bracount)
8134
{
8135
cb->erroroffset = offset;
8136
*errorcodeptr = ERR15; /* Non-existent subpattern */
8137
return 0;
8138
}
8139
8140
/* Come here from named backref handling when the reference is to a
8141
single group (that is, not to a duplicated name). The back reference
8142
data will have already been updated. We must disable firstcu if not
8143
set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
8144
later. */
8145
8146
HANDLE_SINGLE_REFERENCE:
8147
if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
8148
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
8149
PUT2INC(code, 0, meta_arg);
8150
if ((options & PCRE2_CASELESS) != 0)
8151
*code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
8152
REFI_FLAG_CASELESS_RESTRICT : 0) |
8153
(((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
8154
REFI_FLAG_TURKISH_CASING : 0);
8155
8156
/* Update the map of back references, and keep the highest one. We
8157
could do this in parse_regex() for numerical back references, but not
8158
for named back references, because we don't know the numbers to which
8159
named back references refer. So we do it all in this function. */
8160
8161
cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
8162
if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
8163
break;
8164
8165
8166
/* ===============================================================*/
8167
/* Handle recursion by inserting the number of the called group (which is
8168
the meta argument) after OP_RECURSE. At the end of compiling the pattern is
8169
scanned and these numbers are replaced by offsets within the pattern. It is
8170
done like this to avoid problems with forward references and adjusting
8171
offsets when groups are duplicated and moved (as discovered in previous
8172
implementations). Note that a recursion does not have a set first
8173
character. */
8174
8175
case META_RECURSE:
8176
GETPLUSOFFSET(offset, pptr);
8177
if (meta_arg > cb->bracount)
8178
{
8179
cb->erroroffset = offset;
8180
*errorcodeptr = ERR15; /* Non-existent subpattern */
8181
return 0;
8182
}
8183
HANDLE_NUMERICAL_RECURSION:
8184
*code = OP_RECURSE;
8185
PUT(code, 1, meta_arg);
8186
code += 1 + LINK_SIZE;
8187
/* Repeat processing requires this information to
8188
determine the real length in pre-compile phase. */
8189
length_prevgroup = 1 + LINK_SIZE;
8190
8191
if (META_CODE(pptr[1]) == META_OFFSET ||
8192
META_CODE(pptr[1]) == META_CAPTURE_NAME ||
8193
META_CODE(pptr[1]) == META_CAPTURE_NUMBER)
8194
{
8195
recurse_arguments *args;
8196
8197
if (lengthptr != NULL)
8198
{
8199
if (!PRIV(compile_parse_recurse_args)(pptr, offset, errorcodeptr, cb))
8200
return 0;
8201
8202
args = (recurse_arguments*)cb->last_data;
8203
length_prevgroup += (args->size * (1 + IMM2_SIZE));
8204
*lengthptr += (args->size * (1 + IMM2_SIZE));
8205
pptr += args->skip_size;
8206
}
8207
else
8208
{
8209
uint16_t *current, *end;
8210
8211
args = (recurse_arguments*)cb->first_data;
8212
PCRE2_ASSERT(args != NULL && args->header.type == CDATA_RECURSE_ARGS);
8213
8214
current = (uint16_t*)(args + 1);
8215
end = current + args->size;
8216
PCRE2_ASSERT(end > current);
8217
8218
do
8219
{
8220
code[0] = OP_CREF;
8221
PUT2(code, 1, *current);
8222
code += 1 + IMM2_SIZE;
8223
}
8224
while (++current < end);
8225
8226
length_prevgroup += (args->size * (1 + IMM2_SIZE));
8227
pptr += args->skip_size;
8228
cb->first_data = args->header.next;
8229
cb->cx->memctl.free(args, cb->cx->memctl.memory_data);
8230
}
8231
}
8232
8233
groupsetfirstcu = FALSE;
8234
cb->had_recurse = TRUE;
8235
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8236
zerofirstcu = firstcu;
8237
zerofirstcuflags = firstcuflags;
8238
break;
8239
8240
8241
/* ===============================================================*/
8242
/* Handle capturing parentheses; the number is the meta argument. */
8243
8244
case META_CAPTURE:
8245
bravalue = OP_CBRA;
8246
skipunits = IMM2_SIZE;
8247
PUT2(code, 1+LINK_SIZE, meta_arg);
8248
cb->lastcapture = meta_arg;
8249
goto GROUP_PROCESS_NOTE_EMPTY;
8250
8251
8252
/* ===============================================================*/
8253
/* Handle escape sequence items. For ones like \d, the ESC_values are
8254
arranged to be the same as the corresponding OP_values in the default case
8255
when PCRE2_UCP is not set (which is the only case in which they will appear
8256
here).
8257
8258
Note: \Q and \E are never seen here, as they were dealt with in
8259
parse_pattern(). Neither are numerical back references or recursions, which
8260
were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8261
\g, when followed by names, are turned into META_BACKREF_BYNAME or
8262
META_RECURSE_BYNAME. */
8263
8264
case META_ESCAPE:
8265
8266
/* We can test for escape sequences that consume a character because their
8267
values lie between ESC_b and ESC_Z; this may have to change if any new ones
8268
are ever created. For these sequences, we disable the setting of a first
8269
character if it hasn't already been set. */
8270
8271
if (meta_arg > ESC_b && meta_arg < ESC_Z)
8272
{
8273
matched_char = TRUE;
8274
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8275
}
8276
8277
/* Set values to reset to if this is followed by a zero repeat. */
8278
8279
zerofirstcu = firstcu;
8280
zerofirstcuflags = firstcuflags;
8281
zeroreqcu = reqcu;
8282
zeroreqcuflags = reqcuflags;
8283
8284
/* If Unicode is not supported, \P and \p are not allowed and are
8285
faulted at parse time, so will never appear here. */
8286
8287
#ifdef SUPPORT_UNICODE
8288
if (meta_arg == ESC_P || meta_arg == ESC_p)
8289
{
8290
uint32_t ptype = *(++pptr) >> 16;
8291
uint32_t pdata = *pptr & 0xffff;
8292
8293
/* In caseless matching, particular characteristics Lu, Ll, and Lt get
8294
converted to the general characteristic L&. That is, upper, lower, and
8295
title case letters are all conflated. */
8296
8297
if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
8298
(pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
8299
{
8300
ptype = PT_LAMP;
8301
pdata = 0;
8302
}
8303
8304
/* The special case of \p{Any} is compiled to OP_ALLANY and \P{Any}
8305
is compiled to [] so as to benefit from the auto-anchoring code. */
8306
8307
if (ptype == PT_ANY)
8308
{
8309
if (meta_arg == ESC_P)
8310
{
8311
*code++ = OP_CLASS;
8312
memset(code, 0, 32);
8313
code += 32 / sizeof(PCRE2_UCHAR);
8314
}
8315
else
8316
*code++ = OP_ALLANY;
8317
}
8318
else
8319
{
8320
*code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8321
*code++ = ptype;
8322
*code++ = pdata;
8323
}
8324
break; /* End META_ESCAPE */
8325
}
8326
#endif
8327
8328
/* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8329
done. However, there's an option, in case anyone was relying on it. */
8330
8331
if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8332
(xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8333
{
8334
*errorcodeptr = ERR99;
8335
return 0;
8336
}
8337
8338
/* For the rest (including \X when Unicode is supported - if not it's
8339
faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8340
not set; if it is set, most of them do not show up here because they are
8341
converted into Unicode property tests in parse_regex().
8342
8343
In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8344
instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8345
There are special UCP codes for \B and \b which are used in UCP mode unless
8346
"word" matching is being forced to ASCII.
8347
8348
Note that \b and \B do a one-character lookbehind, and \A also behaves as
8349
if it does. */
8350
8351
switch(meta_arg)
8352
{
8353
case ESC_C:
8354
cb->external_flags |= PCRE2_HASBKC; /* Record */
8355
#if PCRE2_CODE_UNIT_WIDTH == 32
8356
meta_arg = OP_ALLANY;
8357
(void)utf; /* Avoid compiler warning. */
8358
#else
8359
if (!utf) meta_arg = OP_ALLANY;
8360
#endif
8361
break;
8362
8363
case ESC_B:
8364
case ESC_b:
8365
if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8366
meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8367
OP_UCP_WORD_BOUNDARY;
8368
PCRE2_FALLTHROUGH /* Fall through */
8369
8370
case ESC_A:
8371
if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8372
break;
8373
8374
case ESC_K:
8375
cb->external_flags |= PCRE2_HASBSK; /* Record */
8376
break;
8377
}
8378
8379
*code++ = meta_arg;
8380
break; /* End META_ESCAPE */
8381
8382
8383
/* ===================================================================*/
8384
/* Handle an unrecognized meta value. A parsed pattern value less than
8385
META_END is a literal. Otherwise we have a problem. */
8386
8387
default:
8388
/* LCOV_EXCL_START */
8389
if (meta >= META_END)
8390
{
8391
PCRE2_DEBUG_UNREACHABLE();
8392
*errorcodeptr = ERR89; /* Internal error - unrecognized. */
8393
return 0;
8394
}
8395
/* LCOV_EXCL_STOP */
8396
8397
/* Handle a literal character. We come here by goto in the case of a
8398
32-bit, non-UTF character whose value is greater than META_END. */
8399
8400
NORMAL_CHAR:
8401
meta = *pptr; /* Get the full 32 bits */
8402
NORMAL_CHAR_SET: /* Character is already in meta */
8403
matched_char = TRUE;
8404
8405
/* For caseless UTF or UCP mode, check whether this character has more than
8406
one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8407
When casing restrictions apply, ignore caseless sets that start with an
8408
ASCII character. If the character is affected by the special Turkish rules,
8409
hardcode the matching characters using a caseset. */
8410
8411
#ifdef SUPPORT_UNICODE
8412
if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8413
{
8414
uint32_t caseset;
8415
8416
if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
8417
PCRE2_EXTRA_TURKISH_CASING &&
8418
UCD_ANY_I(meta))
8419
{
8420
caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(meta)? 0 : 3);
8421
}
8422
else if ((caseset = UCD_CASESET(meta)) != 0 &&
8423
(xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
8424
PRIV(ucd_caseless_sets)[caseset] < 128)
8425
{
8426
caseset = 0; /* Ignore the caseless set if it's restricted. */
8427
}
8428
8429
if (caseset != 0)
8430
{
8431
*code++ = OP_PROP;
8432
*code++ = PT_CLIST;
8433
*code++ = caseset;
8434
if (firstcuflags == REQ_UNSET)
8435
firstcuflags = zerofirstcuflags = REQ_NONE;
8436
break; /* End handling this meta item */
8437
}
8438
}
8439
#endif
8440
8441
/* Caseful matches, or caseless and not one of the multicase characters. We
8442
come here by goto in the case of a positive class that contains only
8443
case-partners of a character with just two cases; matched_char has already
8444
been set TRUE and options fudged if necessary. */
8445
8446
CLASS_CASELESS_CHAR:
8447
8448
/* Get the character's code units into mcbuffer, with the length in
8449
mclength. When not in UTF mode, the length is always 1. */
8450
8451
#ifdef SUPPORT_UNICODE
8452
if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8453
#endif
8454
{
8455
mclength = 1;
8456
mcbuffer[0] = meta;
8457
}
8458
8459
/* Generate the appropriate code */
8460
8461
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8462
memcpy(code, mcbuffer, CU2BYTES(mclength));
8463
code += mclength;
8464
8465
/* Remember if \r or \n were seen */
8466
8467
if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8468
cb->external_flags |= PCRE2_HASCRORLF;
8469
8470
/* Set the first and required code units appropriately. If no previous
8471
first code unit, set it from this character, but revert to none on a zero
8472
repeat. Otherwise, leave the firstcu value alone, and don't change it on
8473
a zero repeat. */
8474
8475
if (firstcuflags == REQ_UNSET)
8476
{
8477
zerofirstcuflags = REQ_NONE;
8478
zeroreqcu = reqcu;
8479
zeroreqcuflags = reqcuflags;
8480
8481
/* If the character is more than one code unit long, we can set a single
8482
firstcu only if it is not to be matched caselessly. Multiple possible
8483
starting code units may be picked up later in the studying code. */
8484
8485
if (mclength == 1 || req_caseopt == 0)
8486
{
8487
firstcu = mcbuffer[0];
8488
firstcuflags = req_caseopt;
8489
if (mclength != 1)
8490
{
8491
reqcu = code[-1];
8492
reqcuflags = cb->req_varyopt;
8493
}
8494
}
8495
else firstcuflags = reqcuflags = REQ_NONE;
8496
}
8497
8498
/* firstcu was previously set; we can set reqcu only if the length is
8499
1 or the matching is caseful. */
8500
8501
else
8502
{
8503
zerofirstcu = firstcu;
8504
zerofirstcuflags = firstcuflags;
8505
zeroreqcu = reqcu;
8506
zeroreqcuflags = reqcuflags;
8507
if (mclength == 1 || req_caseopt == 0)
8508
{
8509
reqcu = code[-1];
8510
reqcuflags = req_caseopt | cb->req_varyopt;
8511
}
8512
}
8513
8514
/* If caselessness was temporarily instated, reset it. */
8515
8516
if (reset_caseful)
8517
{
8518
options &= ~PCRE2_CASELESS;
8519
req_caseopt = 0;
8520
reset_caseful = FALSE;
8521
}
8522
8523
break; /* End literal character handling */
8524
} /* End of big switch */
8525
} /* End of big loop */
8526
8527
/* LCOV_EXCL_START */
8528
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8529
return 0; /* Avoid compiler warnings */
8530
/* LCOV_EXCL_STOP */
8531
}
8532
8533
8534
8535
/*************************************************
8536
* Compile regex: a sequence of alternatives *
8537
*************************************************/
8538
8539
/* On entry, pptr is pointing past the bracket meta, but on return it points to
8540
the closing bracket or META_END. The code variable is pointing at the code unit
8541
into which the BRA operator has been stored. This function is used during the
8542
pre-compile phase when we are trying to find out the amount of memory needed,
8543
as well as during the real compile phase. The value of lengthptr distinguishes
8544
the two phases.
8545
8546
Arguments:
8547
options option bits, including any changes for this subpattern
8548
xoptions extra option bits, ditto
8549
codeptr -> the address of the current code pointer
8550
pptrptr -> the address of the current parsed pattern pointer
8551
errorcodeptr -> pointer to error code variable
8552
skipunits skip this many code units at start (for brackets and OP_COND)
8553
firstcuptr place to put the first required code unit
8554
firstcuflagsptr place to put the first code unit flags
8555
reqcuptr place to put the last required code unit
8556
reqcuflagsptr place to put the last required code unit flags
8557
bcptr pointer to the chain of currently open branches
open_caps pointer to the chain of currently open capturing items
8558
cb points to the data block with tables pointers etc.
8559
lengthptr NULL during the real compile phase
8560
points to length accumulator during pre-compile phase
8561
8562
Returns: 0 There has been an error
8563
+1 Success, this group must match at least one character
8564
-1 Success, this group may match an empty string
8565
*/
8566
8567
static int
8568
compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8569
uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8570
uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8571
uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8572
compile_block *cb, PCRE2_SIZE *lengthptr)
8573
{
8574
PCRE2_UCHAR *code = *codeptr;
8575
PCRE2_UCHAR *last_branch = code;
8576
PCRE2_UCHAR *start_bracket = code;
8577
BOOL lookbehind;
8578
open_capitem capitem;
8579
int capnumber = 0;
8580
int okreturn = 1;
8581
uint32_t *pptr = *pptrptr;
8582
uint32_t firstcu, reqcu;
8583
uint32_t lookbehindlength;
8584
uint32_t lookbehindminlength;
8585
uint32_t firstcuflags, reqcuflags;
8586
PCRE2_SIZE length;
8587
branch_chain bc;
8588
8589
/* If set, call the external function that checks for stack availability. */
8590
8591
if (cb->cx->stack_guard != NULL &&
8592
cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8593
{
8594
*errorcodeptr= ERR33;
8595
cb->erroroffset = 0;
8596
return 0;
8597
}
8598
8599
/* Miscellaneous initialization */
8600
8601
bc.outer = bcptr;
8602
bc.current_branch = code;
8603
8604
firstcu = reqcu = 0;
8605
firstcuflags = reqcuflags = REQ_UNSET;
8606
8607
/* Accumulate the length for use in the pre-compile phase. Start with the
8608
length of the BRA and KET and any extra code units that are required at the
8609
beginning. We accumulate in a local variable to save frequent testing of
8610
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8611
start and end of each alternative, because compiled items are discarded during
8612
the pre-compile phase so that the workspace is not exceeded. */
8613
8614
length = 2 + 2*LINK_SIZE + skipunits;
8615
8616
/* Remember if this is a lookbehind assertion, and if it is, save its length
8617
and skip over the pattern offset. */
8618
8619
lookbehind = *code == OP_ASSERTBACK ||
8620
*code == OP_ASSERTBACK_NOT ||
8621
*code == OP_ASSERTBACK_NA;
8622
8623
if (lookbehind)
8624
{
8625
lookbehindlength = META_DATA(pptr[-1]);
8626
lookbehindminlength = *pptr;
8627
pptr += SIZEOFFSET;
8628
}
8629
else lookbehindlength = lookbehindminlength = 0;
8630
8631
/* If this is a capturing subpattern, add to the chain of open capturing items
8632
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8633
need be tested here; changing this opcode to one of its variants, e.g.
8634
OP_SCBRAPOS, happens later, after the group has been compiled. */
8635
8636
if (*code == OP_CBRA)
8637
{
8638
capnumber = GET2(code, 1 + LINK_SIZE);
8639
capitem.number = capnumber;
8640
capitem.next = open_caps;
8641
capitem.assert_depth = cb->assert_depth;
8642
open_caps = &capitem;
8643
}
8644
8645
/* Offset is set zero to mark that this bracket is still open */
8646
8647
PUT(code, 1, 0);
8648
code += 1 + LINK_SIZE + skipunits;
8649
8650
/* Loop for each alternative branch */
8651
8652
for (;;)
8653
{
8654
int branch_return;
8655
uint32_t branchfirstcu = 0, branchreqcu = 0;
8656
uint32_t branchfirstcuflags = REQ_UNSET, branchreqcuflags = REQ_UNSET;
8657
8658
/* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8659
is only a single minimum length for the whole assertion. When the minimum
8660
length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8661
though not necessarily the same length. In this case, the original OP_REVERSE
8662
can be used. It can also be used if a branch in a variable length lookbehind
8663
has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8664
maximum and minimum values. */
8665
8666
if (lookbehind && lookbehindlength > 0)
8667
{
8668
if (lookbehindminlength == LOOKBEHIND_MAX ||
8669
lookbehindminlength == lookbehindlength)
8670
{
8671
*code++ = OP_REVERSE;
8672
PUT2INC(code, 0, lookbehindlength);
8673
length += 1 + IMM2_SIZE;
8674
}
8675
else
8676
{
8677
*code++ = OP_VREVERSE;
8678
PUT2INC(code, 0, lookbehindminlength);
8679
PUT2INC(code, 0, lookbehindlength);
8680
length += 1 + 2*IMM2_SIZE;
8681
}
8682
}
8683
8684
/* Now compile the branch; in the pre-compile phase its length gets added
8685
into the length. */
8686
8687
if ((branch_return =
8688
compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8689
&branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8690
&bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8691
return 0;
8692
8693
/* If a branch can match an empty string, so can the whole group. */
8694
8695
if (branch_return < 0) okreturn = -1;
8696
8697
/* In the real compile phase, there is some post-processing to be done. */
8698
8699
if (lengthptr == NULL)
8700
{
8701
/* If this is the first branch, the firstcu and reqcu values for the
8702
branch become the values for the regex. */
8703
8704
if (*last_branch != OP_ALT)
8705
{
8706
firstcu = branchfirstcu;
8707
firstcuflags = branchfirstcuflags;
8708
reqcu = branchreqcu;
8709
reqcuflags = branchreqcuflags;
8710
}
8711
8712
/* If this is not the first branch, the first char and reqcu have to
8713
match the values from all the previous branches, except that if the
8714
previous value for reqcu didn't have REQ_VARY set, it can still match,
8715
and we set REQ_VARY for the group from this branch's value. */
8716
8717
else
8718
{
8719
/* If we previously had a firstcu, but it doesn't match the new branch,
8720
we have to abandon the firstcu for the regex, but if there was
8721
previously no reqcu, it takes on the value of the old firstcu. */
8722
8723
if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8724
{
8725
if (firstcuflags < REQ_NONE)
8726
{
8727
if (reqcuflags >= REQ_NONE)
8728
{
8729
reqcu = firstcu;
8730
reqcuflags = firstcuflags;
8731
}
8732
}
8733
firstcuflags = REQ_NONE;
8734
}
8735
8736
/* If we (now or from before) have no firstcu, a firstcu from the
8737
branch becomes a reqcu if there isn't a branch reqcu. */
8738
8739
if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8740
branchreqcuflags >= REQ_NONE)
8741
{
8742
branchreqcu = branchfirstcu;
8743
branchreqcuflags = branchfirstcuflags;
8744
}
8745
8746
/* Now ensure that the reqcus match */
8747
8748
if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8749
reqcu != branchreqcu)
8750
reqcuflags = REQ_NONE;
8751
else
8752
{
8753
reqcu = branchreqcu;
8754
reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8755
}
8756
}
8757
}
8758
8759
/* Handle reaching the end of the expression, either ')' or end of pattern.
8760
In the real compile phase, go back through the alternative branches and
8761
reverse the chain of offsets, with the field in the BRA item now becoming an
8762
offset to the first alternative. If there are no alternatives, it points to
8763
the end of the group. The length in the terminating ket is always the length
8764
of the whole bracketed item. Return leaving the pointer at the terminating
8765
char. */
8766
8767
if (META_CODE(*pptr) != META_ALT)
8768
{
8769
if (lengthptr == NULL)
8770
{
8771
uint32_t branch_length = (uint32_t)(code - last_branch);
8772
do
8773
{
8774
uint32_t prev_length = GET(last_branch, 1);
8775
PUT(last_branch, 1, branch_length);
8776
branch_length = prev_length;
8777
last_branch -= branch_length;
8778
}
8779
while (branch_length > 0);
8780
}
8781
8782
/* Fill in the ket */
8783
8784
*code = OP_KET;
8785
PUT(code, 1, (uint32_t)(code - start_bracket));
8786
code += 1 + LINK_SIZE;
8787
8788
/* Set values to pass back */
8789
8790
*codeptr = code;
8791
*pptrptr = pptr;
8792
*firstcuptr = firstcu;
8793
*firstcuflagsptr = firstcuflags;
8794
*reqcuptr = reqcu;
8795
*reqcuflagsptr = reqcuflags;
8796
if (lengthptr != NULL)
8797
{
8798
if (OFLOW_MAX - *lengthptr < length)
8799
{
8800
*errorcodeptr = ERR20;
8801
return 0;
8802
}
8803
*lengthptr += length;
8804
}
8805
return okreturn;
8806
}
8807
8808
/* Another branch follows. In the pre-compile phase, we can move the code
8809
pointer back to where it was for the start of the first branch. (That is,
8810
pretend that each branch is the only one.)
8811
8812
In the real compile phase, insert an ALT node. Its length field points back
8813
to the previous branch while the bracket remains open. At the end the chain
8814
is reversed. It's done like this so that the start of the bracket has a
8815
zero offset until it is closed, making it possible to detect recursion. */
8816
8817
if (lengthptr != NULL)
8818
{
8819
code = *codeptr + 1 + LINK_SIZE + skipunits;
8820
length += 1 + LINK_SIZE;
8821
}
8822
else
8823
{
8824
*code = OP_ALT;
8825
PUT(code, 1, (int)(code - last_branch));
8826
bc.current_branch = last_branch = code;
8827
code += 1 + LINK_SIZE;
8828
}
8829
8830
/* Set the maximum lookbehind length for the next branch (if not in a
8831
lookbehind the value will be zero) and then advance past the vertical bar. */
8832
8833
lookbehindlength = META_DATA(*pptr);
8834
pptr++;
8835
}
8836
8837
/* LCOV_EXCL_START */
8838
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8839
return 0; /* Avoid compiler warnings */
8840
/* LCOV_EXCL_STOP */
8841
}
8842
8843
8844
8845
/*************************************************
8846
* Check for anchored pattern *
8847
*************************************************/
8848
8849
/* Try to find out if this is an anchored regular expression. Consider each
8850
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8851
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8852
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8853
be found, because ^ generates OP_CIRCM in that mode.
8854
8855
We can also consider a regex to be anchored if OP_SOM starts all its branches.
8856
This is the code for \G, which means "match at start of match position, taking
8857
into account the match offset".
8858
8859
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8860
because that will try the rest of the pattern at all possible matching points,
8861
so there is no point trying again.... er ....
8862
8863
.... except when the .* appears inside capturing parentheses, and there is a
8864
subsequent back reference to those parentheses. We haven't enough information
8865
to catch that case precisely.
8866
8867
At first, the best we could do was to detect when .* was in capturing brackets
8868
and the highest back reference was greater than or equal to that level.
8869
However, by keeping a bitmap of the first 31 back references, we can catch some
8870
of the more common cases more precisely.
8871
8872
... A second exception is when the .* appears inside an atomic group, because
8873
this prevents the number of characters it matches from being adjusted.
8874
8875
Arguments:
8876
code points to start of the compiled pattern
8877
bracket_map a bitmap of which brackets we are inside while testing; this
8878
handles up to substring 31; after that we just have to take
8879
the less precise approach
8880
cb points to the compile data block
8881
atomcount atomic group level
8882
inassert TRUE if in an assertion
8883
dotstar_anchor TRUE if automatic anchoring optimization is enabled
8884
8885
Returns: TRUE or FALSE
8886
*/
8887
8888
static BOOL
8889
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8890
int atomcount, BOOL inassert, BOOL dotstar_anchor)
8891
{
8892
do {
8893
PCRE2_SPTR scode = first_significant_code(
8894
code + PRIV(OP_lengths)[*code], FALSE);
8895
int op = *scode;
8896
8897
/* Non-capturing brackets */
8898
8899
if (op == OP_BRA || op == OP_BRAPOS ||
8900
op == OP_SBRA || op == OP_SBRAPOS)
8901
{
8902
if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8903
return FALSE;
8904
}
8905
8906
/* Capturing brackets */
8907
8908
else if (op == OP_CBRA || op == OP_CBRAPOS ||
8909
op == OP_SCBRA || op == OP_SCBRAPOS)
8910
{
8911
int n = GET2(scode, 1+LINK_SIZE);
8912
uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8913
if (!is_anchored(scode, new_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE;
8914
}
8915
8916
/* Positive forward assertion */
8917
8918
else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8919
{
8920
if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) return FALSE;
8921
}
8922
8923
/* Condition. If there is no second branch, it can't be anchored. */
8924
8925
else if (op == OP_COND || op == OP_SCOND)
8926
{
8927
if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8928
if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8929
return FALSE;
8930
}
8931
8932
/* Atomic groups */
8933
8934
else if (op == OP_ONCE)
8935
{
8936
if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
8937
return FALSE;
8938
}
8939
8940
/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8941
it isn't in brackets that are or may be referenced or inside an atomic
8942
group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8943
because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8944
with the subject "aab", which matches "b", i.e. not at the start of a line.
8945
There is also an option that disables auto-anchoring. */
8946
8947
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8948
op == OP_TYPEPOSSTAR))
8949
{
8950
if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8951
atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
8952
return FALSE;
8953
}
8954
8955
/* Check for explicit anchoring */
8956
8957
else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8958
8959
code += GET(code, 1);
8960
}
8961
while (*code == OP_ALT); /* Loop for each alternative */
8962
return TRUE;
8963
}
8964
8965
8966
8967
/*************************************************
8968
* Check for starting with ^ or .* *
8969
*************************************************/
8970
8971
/* This is called to find out if every branch starts with ^ or .* so that
8972
"first char" processing can be done to speed things up in multiline
8973
matching and for non-DOTALL patterns that start with .* (which must start at
8974
the beginning or after \n). As in the case of is_anchored() (see above), we
8975
have to take account of back references to capturing brackets that contain .*
8976
because in that case we can't make the assumption. Also, the appearance of .*
8977
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8978
or *SKIP does not count, because once again the assumption no longer holds.
8979
8980
Arguments:
8981
code points to start of the compiled pattern or a group
8982
bracket_map a bitmap of which brackets we are inside while testing; this
8983
handles up to substring 31; after that we just have to take
8984
the less precise approach
8985
cb points to the compile data
8986
atomcount atomic group level
8987
inassert TRUE if in an assertion
8988
dotstar_anchor TRUE if automatic anchoring optimization is enabled
8989
8990
Returns: TRUE or FALSE
8991
*/
8992
8993
static BOOL
8994
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8995
int atomcount, BOOL inassert, BOOL dotstar_anchor)
8996
{
8997
do {
8998
PCRE2_SPTR scode = first_significant_code(
8999
code + PRIV(OP_lengths)[*code], FALSE);
9000
int op = *scode;
9001
9002
/* If we are at the start of a conditional assertion group, *both* the
9003
conditional assertion *and* what follows the condition must satisfy the test
9004
for start of line. Other kinds of condition fail. Note that there may be an
9005
auto-callout at the start of a condition. */
9006
9007
if (op == OP_COND)
9008
{
9009
scode += 1 + LINK_SIZE;
9010
9011
if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
9012
else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
9013
9014
switch (*scode)
9015
{
9016
case OP_CREF:
9017
case OP_DNCREF:
9018
case OP_RREF:
9019
case OP_DNRREF:
9020
case OP_FAIL:
9021
case OP_FALSE:
9022
case OP_TRUE:
9023
return FALSE;
9024
9025
default: /* Assertion */
9026
if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
9027
return FALSE;
9028
do scode += GET(scode, 1); while (*scode == OP_ALT);
9029
scode += 1 + LINK_SIZE;
9030
break;
9031
}
9032
scode = first_significant_code(scode, FALSE);
9033
op = *scode;
9034
}
9035
9036
/* Non-capturing brackets */
9037
9038
if (op == OP_BRA || op == OP_BRAPOS ||
9039
op == OP_SBRA || op == OP_SBRAPOS)
9040
{
9041
if (!is_startline(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
9042
return FALSE;
9043
}
9044
9045
/* Capturing brackets */
9046
9047
else if (op == OP_CBRA || op == OP_CBRAPOS ||
9048
op == OP_SCBRA || op == OP_SCBRAPOS)
9049
{
9050
int n = GET2(scode, 1+LINK_SIZE);
9051
unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
9052
if (!is_startline(scode, new_map, cb, atomcount, inassert, dotstar_anchor))
9053
return FALSE;
9054
}
9055
9056
/* Positive forward assertions */
9057
9058
else if (op == OP_ASSERT || op == OP_ASSERT_NA)
9059
{
9060
if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
9061
return FALSE;
9062
}
9063
9064
/* Atomic brackets */
9065
9066
else if (op == OP_ONCE)
9067
{
9068
if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
9069
return FALSE;
9070
}
9071
9072
/* .* means "start at start or after \n" if it isn't in atomic brackets or
9073
brackets that may be referenced or an assertion, and as long as the pattern
9074
does not contain *PRUNE or *SKIP, because these break the feature. Consider,
9075
for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
9076
i.e. not at the start of a line. There is also an option that disables this
9077
optimization. */
9078
9079
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
9080
{
9081
if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
9082
atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
9083
return FALSE;
9084
}
9085
9086
/* Check for explicit circumflex; anything else gives a FALSE result. Note
9087
in particular that this includes atomic brackets OP_ONCE because the number
9088
of characters matched by .* cannot be adjusted inside them. */
9089
9090
else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
9091
9092
/* Move on to the next alternative */
9093
9094
code += GET(code, 1);
9095
}
9096
while (*code == OP_ALT); /* Loop for each alternative */
9097
return TRUE;
9098
}
9099
9100
9101
9102
/*************************************************
9103
* Scan compiled regex for recursion reference *
9104
*************************************************/
9105
9106
/* This function scans through a compiled pattern until it finds an instance of
9107
OP_RECURSE.
9108
9109
Arguments:
9110
code points to start of expression
9111
utf TRUE in UTF mode
9112
9113
Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
9114
*/
9115
9116
static PCRE2_UCHAR *
9117
find_recurse(PCRE2_UCHAR *code, BOOL utf)
9118
{
9119
for (;;)
9120
{
9121
PCRE2_UCHAR c = *code;
9122
if (c == OP_END) return NULL;
9123
if (c == OP_RECURSE) return code;
9124
9125
/* XCLASS is used for classes that cannot be represented just by a bit map.
9126
This includes negated single high-valued characters. ECLASS is used for
9127
classes that use set operations internally. CALLOUT_STR is used for
9128
callouts with string arguments. In each case the length in the table is
9129
zero; the actual length is stored in the compiled code. */
9130
9131
if (c == OP_XCLASS || c == OP_ECLASS) code += GET(code, 1);
9132
else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
9133
9134
/* Otherwise, we can get the item's length from the table, except that for
9135
repeated character types, we have to test for \p and \P, which have an extra
9136
two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
9137
we must add in its length. */
9138
9139
else
9140
{
9141
switch(c)
9142
{
9143
case OP_TYPESTAR:
9144
case OP_TYPEMINSTAR:
9145
case OP_TYPEPLUS:
9146
case OP_TYPEMINPLUS:
9147
case OP_TYPEQUERY:
9148
case OP_TYPEMINQUERY:
9149
case OP_TYPEPOSSTAR:
9150
case OP_TYPEPOSPLUS:
9151
case OP_TYPEPOSQUERY:
9152
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
9153
break;
9154
9155
case OP_TYPEPOSUPTO:
9156
case OP_TYPEUPTO:
9157
case OP_TYPEMINUPTO:
9158
case OP_TYPEEXACT:
9159
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
9160
code += 2;
9161
break;
9162
9163
case OP_MARK:
9164
case OP_COMMIT_ARG:
9165
case OP_PRUNE_ARG:
9166
case OP_SKIP_ARG:
9167
case OP_THEN_ARG:
9168
code += code[1];
9169
break;
9170
}
9171
9172
/* Add in the fixed length from the table */
9173
9174
code += PRIV(OP_lengths)[c];
9175
9176
/* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
9177
be followed by a multi-unit character. The length in the table is a
9178
minimum, so we have to arrange to skip the extra units. */
9179
9180
#ifdef MAYBE_UTF_MULTI
9181
if (utf) switch(c)
9182
{
9183
case OP_CHAR:
9184
case OP_CHARI:
9185
case OP_NOT:
9186
case OP_NOTI:
9187
case OP_EXACT:
9188
case OP_EXACTI:
9189
case OP_NOTEXACT:
9190
case OP_NOTEXACTI:
9191
case OP_UPTO:
9192
case OP_UPTOI:
9193
case OP_NOTUPTO:
9194
case OP_NOTUPTOI:
9195
case OP_MINUPTO:
9196
case OP_MINUPTOI:
9197
case OP_NOTMINUPTO:
9198
case OP_NOTMINUPTOI:
9199
case OP_POSUPTO:
9200
case OP_POSUPTOI:
9201
case OP_NOTPOSUPTO:
9202
case OP_NOTPOSUPTOI:
9203
case OP_STAR:
9204
case OP_STARI:
9205
case OP_NOTSTAR:
9206
case OP_NOTSTARI:
9207
case OP_MINSTAR:
9208
case OP_MINSTARI:
9209
case OP_NOTMINSTAR:
9210
case OP_NOTMINSTARI:
9211
case OP_POSSTAR:
9212
case OP_POSSTARI:
9213
case OP_NOTPOSSTAR:
9214
case OP_NOTPOSSTARI:
9215
case OP_PLUS:
9216
case OP_PLUSI:
9217
case OP_NOTPLUS:
9218
case OP_NOTPLUSI:
9219
case OP_MINPLUS:
9220
case OP_MINPLUSI:
9221
case OP_NOTMINPLUS:
9222
case OP_NOTMINPLUSI:
9223
case OP_POSPLUS:
9224
case OP_POSPLUSI:
9225
case OP_NOTPOSPLUS:
9226
case OP_NOTPOSPLUSI:
9227
case OP_QUERY:
9228
case OP_QUERYI:
9229
case OP_NOTQUERY:
9230
case OP_NOTQUERYI:
9231
case OP_MINQUERY:
9232
case OP_MINQUERYI:
9233
case OP_NOTMINQUERY:
9234
case OP_NOTMINQUERYI:
9235
case OP_POSQUERY:
9236
case OP_POSQUERYI:
9237
case OP_NOTPOSQUERY:
9238
case OP_NOTPOSQUERYI:
9239
if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
9240
break;
9241
}
9242
#else
9243
(void)(utf); /* Keep compiler happy by referencing function argument */
9244
#endif /* MAYBE_UTF_MULTI */
9245
}
9246
}
9247
}
9248
9249
9250
9251
/*************************************************
9252
* Check for asserted fixed first code unit *
9253
*************************************************/
9254
9255
/* During compilation, the "first code unit" settings from forward assertions
9256
are discarded, because they can cause conflicts with actual literals that
9257
follow. However, if we end up without a first code unit setting for an
9258
unanchored pattern, it is worth scanning the regex to see if there is an
9259
initial asserted first code unit. If all branches start with the same asserted
9260
code unit, or with a non-conditional bracket all of whose alternatives start
9261
with the same asserted code unit (recurse ad lib), then we return that code
9262
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9263
REQ_NONE in the flags.
9264
9265
Arguments:
9266
code points to start of compiled pattern
9267
flags points to the first code unit flags
9268
inassert non-zero if in an assertion
9269
9270
Returns: the fixed first code unit, or 0 with REQ_NONE in flags
9271
*/
9272
9273
static uint32_t
9274
find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9275
{
9276
uint32_t c = 0;
9277
uint32_t cflags = REQ_NONE;
9278
9279
*flags = REQ_NONE;
9280
do {
9281
uint32_t d;
9282
uint32_t dflags;
9283
int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9284
*code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9285
PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9286
PCRE2_UCHAR op = *scode;
9287
9288
switch(op)
9289
{
9290
default:
9291
return 0;
9292
9293
case OP_BRA:
9294
case OP_BRAPOS:
9295
case OP_CBRA:
9296
case OP_SCBRA:
9297
case OP_CBRAPOS:
9298
case OP_SCBRAPOS:
9299
case OP_ASSERT:
9300
case OP_ASSERT_NA:
9301
case OP_ONCE:
9302
case OP_SCRIPT_RUN:
9303
d = find_firstassertedcu(scode, &dflags, inassert +
9304
((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9305
if (dflags >= REQ_NONE) return 0;
9306
if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9307
else if (c != d || cflags != dflags) return 0;
9308
break;
9309
9310
case OP_EXACT:
9311
scode += IMM2_SIZE;
9312
PCRE2_FALLTHROUGH /* Fall through */
9313
9314
case OP_CHAR:
9315
case OP_PLUS:
9316
case OP_MINPLUS:
9317
case OP_POSPLUS:
9318
if (inassert == 0) return 0;
9319
if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9320
else if (c != scode[1]) return 0;
9321
break;
9322
9323
case OP_EXACTI:
9324
scode += IMM2_SIZE;
9325
PCRE2_FALLTHROUGH /* Fall through */
9326
9327
case OP_CHARI:
9328
case OP_PLUSI:
9329
case OP_MINPLUSI:
9330
case OP_POSPLUSI:
9331
if (inassert == 0) return 0;
9332
9333
/* If the character is more than one code unit long, we cannot set its
9334
first code unit when matching caselessly. Later scanning may pick up
9335
multiple code units. */
9336
9337
#ifdef SUPPORT_UNICODE
9338
#if PCRE2_CODE_UNIT_WIDTH == 8
9339
if (scode[1] >= 0x80) return 0;
9340
#elif PCRE2_CODE_UNIT_WIDTH == 16
9341
if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9342
#endif
9343
#endif
9344
9345
if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9346
else if (c != scode[1]) return 0;
9347
break;
9348
}
9349
9350
code += GET(code, 1);
9351
}
9352
while (*code == OP_ALT);
9353
9354
*flags = cflags;
9355
return c;
9356
}
9357
9358
9359
9360
/*************************************************
9361
* Skip in parsed pattern *
9362
*************************************************/
9363
9364
/* This function is called to skip parts of the parsed pattern when finding the
9365
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9366
the end of the branch, it is called to skip over an internal lookaround or
9367
(DEFINE) group, and it is also called to skip to the end of a class, during
9368
which it will never encounter nested groups (but there's no need to have
9369
special code for that).
9370
9371
When called to find the end of a branch or group, pptr must point to the first
9372
meta code inside the branch, not the branch-starting code. In other cases it
9373
can point to the item that causes the function to be called.
9374
9375
Arguments:
9376
pptr current pointer to skip from
9377
skiptype PSKIP_CLASS when skipping to end of class
9378
PSKIP_ALT when META_ALT ends the skip
9379
PSKIP_KET when only META_KET ends the skip
9380
9381
Returns: new value of pptr
9382
NULL if META_END is reached - should never occur
9383
or for an unknown meta value - likewise
9384
*/
9385
9386
static uint32_t *
9387
parsed_skip(uint32_t *pptr, uint32_t skiptype)
9388
{
9389
uint32_t nestlevel = 0;
9390
9391
for (;; pptr++)
9392
{
9393
uint32_t meta = META_CODE(*pptr);
9394
9395
switch(meta)
9396
{
9397
default: /* Just skip over most items */
9398
if (meta < META_END) continue; /* Literal */
9399
break;
9400
9401
/* The parsed regex is malformed; we have reached the end and did
9402
not find the end of the construct which we are skipping over. */
9403
9404
/* LCOV_EXCL_START */
9405
case META_END:
9406
PCRE2_DEBUG_UNREACHABLE();
9407
return NULL;
9408
/* LCOV_EXCL_STOP */
9409
9410
/* The data for these items is variable in length. */
9411
9412
case META_BACKREF: /* Offset is present only if group >= 10 */
9413
if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9414
break;
9415
9416
case META_ESCAPE:
9417
if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9418
pptr += 1; /* Skip prop data */
9419
break;
9420
9421
case META_MARK: /* Add the length of the name. */
9422
case META_COMMIT_ARG:
9423
case META_PRUNE_ARG:
9424
case META_SKIP_ARG:
9425
case META_THEN_ARG:
9426
pptr += pptr[1];
9427
break;
9428
9429
/* These are the "active" items in this loop. */
9430
9431
case META_CLASS_END:
9432
if (skiptype == PSKIP_CLASS) return pptr;
9433
break;
9434
9435
case META_ATOMIC:
9436
case META_CAPTURE:
9437
case META_COND_ASSERT:
9438
case META_COND_DEFINE:
9439
case META_COND_NAME:
9440
case META_COND_NUMBER:
9441
case META_COND_RNAME:
9442
case META_COND_RNUMBER:
9443
case META_COND_VERSION:
9444
case META_SCS:
9445
case META_LOOKAHEAD:
9446
case META_LOOKAHEADNOT:
9447
case META_LOOKAHEAD_NA:
9448
case META_LOOKBEHIND:
9449
case META_LOOKBEHINDNOT:
9450
case META_LOOKBEHIND_NA:
9451
case META_NOCAPTURE:
9452
case META_SCRIPT_RUN:
9453
nestlevel++;
9454
break;
9455
9456
case META_ALT:
9457
if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9458
break;
9459
9460
case META_KET:
9461
if (nestlevel == 0) return pptr;
9462
nestlevel--;
9463
break;
9464
}
9465
9466
/* The extra data item length for each meta is in a table. */
9467
9468
meta = (meta >> 16) & 0x7fff;
9469
if (meta >= sizeof(meta_extra_lengths)) return NULL;
9470
pptr += meta_extra_lengths[meta];
9471
}
9472
9473
/* LCOV_EXCL_START */
9474
PCRE2_UNREACHABLE(); /* Control never reaches here */
9475
/* LCOV_EXCL_STOP */
9476
}
9477
9478
9479
9480
/*************************************************
9481
* Find length of a parsed group *
9482
*************************************************/
9483
9484
/* This is called for nested groups within a branch of a lookbehind whose
9485
length is being computed. On entry, the pointer must be at the first element
9486
after the group initializing code. On exit it points to META_KET. Caching is used
9487
to improve processing speed when the same capturing group occurs many times.
9488
9489
Arguments:
9490
pptrptr pointer to pointer in the parsed pattern
9491
minptr where to return the minimum length
9492
isinline FALSE if a reference or recursion; TRUE for inline group
9493
errcodeptr pointer to the errorcode
9494
lcptr pointer to the loop counter
9495
group number of captured group or -1 for a non-capturing group
9496
recurses chain of recurse_check to catch mutual recursion
9497
cb pointer to the compile data
9498
9499
Returns: the maximum group length or a negative number
9500
*/
9501
9502
static int
9503
get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9504
int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9505
{
9506
uint32_t *gi = cb->groupinfo + 2 * group;
9507
int branchlength, branchminlength;
9508
int grouplength = -1;
9509
int groupminlength = INT_MAX;
9510
9511
/* The cache can be used only if there is no possibility of there being two
9512
groups with the same number. We do not need to set the end pointer for a group
9513
that is being processed as a back reference or recursion, but we must do so for
9514
an inline group. */
9515
9516
if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9517
{
9518
uint32_t groupinfo = gi[0];
9519
if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9520
if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9521
{
9522
if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9523
*minptr = gi[1];
9524
return groupinfo & GI_FIXED_LENGTH_MASK;
9525
}
9526
}
9527
9528
/* Scan the group. In this case we find the end pointer of necessity. */
9529
9530
for(;;)
9531
{
9532
branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9533
recurses, cb);
9534
if (branchlength < 0) goto ISNOTFIXED;
9535
if (branchlength > grouplength) grouplength = branchlength;
9536
if (branchminlength < groupminlength) groupminlength = branchminlength;
9537
if (**pptrptr == META_KET) break;
9538
*pptrptr += 1; /* Skip META_ALT */
9539
}
9540
9541
if (group > 0)
9542
{
9543
gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9544
gi[1] = groupminlength;
9545
}
9546
9547
*minptr = groupminlength;
9548
return grouplength;
9549
9550
ISNOTFIXED:
9551
if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9552
return -1;
9553
}
9554
9555
9556
9557
/*************************************************
9558
* Find length of a parsed branch *
9559
*************************************************/
9560
9561
/* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9562
giving an error if the length is not limited. On entry, *pptrptr points to the
9563
first element inside the branch. On exit it is set to point to the ALT or KET.
9564
9565
Arguments:
9566
pptrptr pointer to pointer in the parsed pattern
9567
minptr where to return the minimum length
9568
errcodeptr pointer to error code
9569
lcptr pointer to loop counter
9570
recurses chain of recurse_check to catch mutual recursion
9571
cb pointer to compile block
9572
9573
Returns: the maximum length, or a negative value on error
9574
*/
9575
9576
static int
9577
get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9578
parsed_recurse_check *recurses, compile_block *cb)
9579
{
9580
int branchlength = 0;
9581
int branchminlength = 0;
9582
int grouplength, groupminlength;
9583
uint32_t lastitemlength = 0;
9584
uint32_t lastitemminlength = 0;
9585
uint32_t *pptr = *pptrptr;
9586
PCRE2_SIZE offset;
9587
parsed_recurse_check this_recurse;
9588
9589
/* A large and/or complex regex can take too long to process. This can happen
9590
more often when (?| groups are present in the pattern because their length
9591
cannot be cached. */
9592
9593
if ((*lcptr)++ > 2000)
9594
{
9595
*errcodeptr = ERR35; /* Lookbehind is too complicated */
9596
return -1;
9597
}
9598
9599
/* Scan the branch, accumulating the length. */
9600
9601
for (;; pptr++)
9602
{
9603
parsed_recurse_check *r;
9604
uint32_t *gptr, *gptrend;
9605
uint32_t escape;
9606
uint32_t min, max;
9607
uint32_t group = 0;
9608
uint32_t itemlength = 0;
9609
uint32_t itemminlength = 0;
9610
9611
if (*pptr < META_END)
9612
{
9613
itemlength = itemminlength = 1;
9614
}
9615
9616
else switch (META_CODE(*pptr))
9617
{
9618
case META_KET:
9619
case META_ALT:
9620
goto EXIT;
9621
9622
/* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9623
actual termination. */
9624
9625
case META_ACCEPT:
9626
case META_FAIL:
9627
pptr = parsed_skip(pptr, PSKIP_ALT);
9628
if (pptr == NULL) goto PARSED_SKIP_FAILED;
9629
goto EXIT;
9630
9631
case META_MARK:
9632
case META_COMMIT_ARG:
9633
case META_PRUNE_ARG:
9634
case META_SKIP_ARG:
9635
case META_THEN_ARG:
9636
pptr += pptr[1] + 1;
9637
break;
9638
9639
case META_CIRCUMFLEX:
9640
case META_COMMIT:
9641
case META_DOLLAR:
9642
case META_PRUNE:
9643
case META_SKIP:
9644
case META_THEN:
9645
break;
9646
9647
case META_OPTIONS:
9648
pptr += 2;
9649
break;
9650
9651
case META_BIGVALUE:
9652
itemlength = itemminlength = 1;
9653
pptr += 1;
9654
break;
9655
9656
case META_CLASS:
9657
case META_CLASS_NOT:
9658
itemlength = itemminlength = 1;
9659
pptr = parsed_skip(pptr, PSKIP_CLASS);
9660
if (pptr == NULL) goto PARSED_SKIP_FAILED;
9661
break;
9662
9663
case META_CLASS_EMPTY_NOT:
9664
case META_DOT:
9665
itemlength = itemminlength = 1;
9666
break;
9667
9668
case META_CALLOUT_NUMBER:
9669
pptr += 3;
9670
break;
9671
9672
case META_CALLOUT_STRING:
9673
pptr += 3 + SIZEOFFSET;
9674
break;
9675
9676
/* Only some escapes consume a character. Of those, \R can match one or two
9677
characters, but \X is never allowed because it matches an unknown number of
9678
characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9679
9680
case META_ESCAPE:
9681
escape = META_DATA(*pptr);
9682
if (escape == ESC_X) return -1;
9683
if (escape == ESC_R)
9684
{
9685
itemminlength = 1;
9686
itemlength = 2;
9687
}
9688
else if (escape > ESC_b && escape < ESC_Z)
9689
{
9690
#if PCRE2_CODE_UNIT_WIDTH != 32
9691
if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9692
{
9693
*errcodeptr = ERR36;
9694
return -1;
9695
}
9696
#endif
9697
itemlength = itemminlength = 1;
9698
if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */
9699
}
9700
break;
9701
9702
/* Lookaheads do not contribute to the length of this branch, but they may
9703
contain lookbehinds within them whose lengths need to be set. */
9704
9705
case META_LOOKAHEAD:
9706
case META_LOOKAHEADNOT:
9707
case META_LOOKAHEAD_NA:
9708
case META_SCS:
9709
*errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9710
if (*errcodeptr != 0) return -1;
9711
9712
/* Ignore any qualifiers that follow a lookahead assertion. */
9713
9714
switch (pptr[1])
9715
{
9716
case META_ASTERISK:
9717
case META_ASTERISK_PLUS:
9718
case META_ASTERISK_QUERY:
9719
case META_PLUS:
9720
case META_PLUS_PLUS:
9721
case META_PLUS_QUERY:
9722
case META_QUERY:
9723
case META_QUERY_PLUS:
9724
case META_QUERY_QUERY:
9725
pptr++;
9726
break;
9727
9728
case META_MINMAX:
9729
case META_MINMAX_PLUS:
9730
case META_MINMAX_QUERY:
9731
pptr += 3;
9732
break;
9733
9734
default:
9735
break;
9736
}
9737
break;
9738
9739
/* A nested lookbehind does not contribute any length to this lookbehind,
9740
but must itself be checked and have its lengths set. Note that
9741
set_lookbehind_lengths() updates pptr, leaving it pointing to the final ket
9742
of the group, so no need to update it here. */
9743
9744
case META_LOOKBEHIND:
9745
case META_LOOKBEHINDNOT:
9746
case META_LOOKBEHIND_NA:
9747
if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9748
return -1;
9749
break;
9750
9751
/* Back references and recursions are handled by very similar code. At this
9752
stage, the names generated in the parsing pass are available, but the main
9753
name table has not yet been created. So for the named varieties, scan the
9754
list of names in order to get the number of the first one in the pattern,
9755
and whether or not this name is duplicated. */
9756
9757
case META_BACKREF_BYNAME:
9758
if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9759
goto ISNOTFIXED;
9760
PCRE2_FALLTHROUGH /* Fall through */
9761
9762
case META_RECURSE_BYNAME:
9763
{
9764
PCRE2_SPTR name;
9765
BOOL is_dupname = FALSE;
9766
named_group *ng;
9767
uint32_t meta_code = META_CODE(*pptr);
9768
uint32_t length = *(++pptr);
9769
9770
GETPLUSOFFSET(offset, pptr);
9771
name = cb->start_pattern + offset;
9772
ng = PRIV(compile_find_named_group)(name, length, cb);
9773
9774
if (ng == NULL)
9775
{
9776
*errcodeptr = ERR15; /* Non-existent subpattern */
9777
cb->erroroffset = offset;
9778
return -1;
9779
}
9780
9781
group = ng->number;
9782
is_dupname = (ng->hash_dup & NAMED_GROUP_IS_DUPNAME) != 0;
9783
9784
/* A numerical back reference can be fixed length if duplicate capturing
9785
groups are not being used. A non-duplicate named back reference can also
9786
be handled. */
9787
9788
if (meta_code == META_RECURSE_BYNAME ||
9789
(!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9790
goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */
9791
}
9792
goto ISNOTFIXED; /* Duplicate name or number */
9793
9794
/* The offset values for back references < 10 are in a separate vector
9795
because otherwise they would use more than two parsed pattern elements on
9796
64-bit systems. */
9797
9798
case META_BACKREF:
9799
if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9800
(cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9801
goto ISNOTFIXED;
9802
group = META_DATA(*pptr);
9803
if (group < 10)
9804
{
9805
offset = cb->small_ref_offset[group];
9806
goto RECURSE_OR_BACKREF_LENGTH;
9807
}
9808
9809
PCRE2_FALLTHROUGH /* Fall through */
9810
/* For groups >= 10 - picking up group twice does no harm. */
9811
9812
/* A true recursion implies not fixed length, but a subroutine call may
9813
be OK. Back reference "recursions" are also failed. */
9814
9815
case META_RECURSE:
9816
group = META_DATA(*pptr);
9817
GETPLUSOFFSET(offset, pptr);
9818
9819
RECURSE_OR_BACKREF_LENGTH:
9820
if (group > cb->bracount)
9821
{
9822
cb->erroroffset = offset;
9823
*errcodeptr = ERR15; /* Non-existent subpattern */
9824
return -1;
9825
}
9826
if (group == 0) goto ISNOTFIXED; /* Local recursion */
9827
for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9828
{
9829
if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9830
else if (*gptr == (META_CAPTURE | group)) break;
9831
}
9832
9833
/* We must start the search for the end of the group at the first meta code
9834
inside the group. Otherwise it will be treated as an enclosed group. */
9835
9836
gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9837
if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9838
if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */
9839
for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9840
if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
9841
this_recurse.prev = recurses;
9842
this_recurse.groupptr = gptr;
9843
9844
/* We do not need to know the position of the end of the group, that is,
9845
gptr is not used after the call to get_grouplength(). Setting the second
9846
argument FALSE stops it scanning for the end when the length can be found
9847
in the cache. */
9848
9849
gptr++;
9850
grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9851
lcptr, group, &this_recurse, cb);
9852
if (grouplength < 0)
9853
{
9854
if (*errcodeptr == 0) goto ISNOTFIXED;
9855
return -1; /* Error already set */
9856
}
9857
itemlength = grouplength;
9858
itemminlength = groupminlength;
9859
break;
9860
9861
/* A (DEFINE) group is never obeyed inline and so it does not contribute to
9862
the length of this branch. Skip from the following item to the next
9863
unpaired ket. */
9864
9865
case META_COND_DEFINE:
9866
pptr = parsed_skip(pptr + 1, PSKIP_KET);
9867
break;
9868
9869
/* Check other nested groups - advance past the initial data for each type
9870
and then seek a fixed length with get_grouplength(). */
9871
9872
case META_COND_NAME:
9873
case META_COND_NUMBER:
9874
case META_COND_RNAME:
9875
case META_COND_RNUMBER:
9876
pptr += 2 + SIZEOFFSET;
9877
goto CHECK_GROUP;
9878
9879
case META_COND_ASSERT:
9880
pptr += 1;
9881
goto CHECK_GROUP;
9882
9883
case META_COND_VERSION:
9884
pptr += 4;
9885
goto CHECK_GROUP;
9886
9887
case META_CAPTURE:
9888
group = META_DATA(*pptr);
9889
PCRE2_FALLTHROUGH /* Fall through */
9890
9891
case META_ATOMIC:
9892
case META_NOCAPTURE:
9893
case META_SCRIPT_RUN:
9894
pptr++;
9895
CHECK_GROUP:
9896
grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9897
lcptr, group, recurses, cb);
9898
if (grouplength < 0) return -1;
9899
itemlength = grouplength;
9900
itemminlength = groupminlength;
9901
break;
9902
9903
case META_QUERY:
9904
case META_QUERY_PLUS:
9905
case META_QUERY_QUERY:
9906
min = 0;
9907
max = 1;
9908
goto REPETITION;
9909
9910
/* Exact repetition is OK; variable repetition is not. A repetition of zero
9911
must subtract the length that has already been added. */
9912
9913
case META_MINMAX:
9914
case META_MINMAX_PLUS:
9915
case META_MINMAX_QUERY:
9916
min = pptr[1];
9917
max = pptr[2];
9918
pptr += 2;
9919
9920
REPETITION:
9921
if (max != REPEAT_UNLIMITED)
9922
{
9923
if (lastitemlength != 0 && /* Should not occur, but just in case */
9924
max != 0 &&
9925
(INT_MAX - branchlength)/lastitemlength < max - 1)
9926
{
9927
*errcodeptr = ERR87; /* Integer overflow; lookbehind too big */
9928
return -1;
9929
}
9930
if (min == 0) branchminlength -= lastitemminlength;
9931
else itemminlength = (min - 1) * lastitemminlength;
9932
if (max == 0) branchlength -= lastitemlength;
9933
else itemlength = (max - 1) * lastitemlength;
9934
break;
9935
}
9936
PCRE2_FALLTHROUGH /* Fall through */
9937
9938
/* Any other item means this branch does not have a fixed length. */
9939
9940
default:
9941
ISNOTFIXED:
9942
*errcodeptr = ERR25; /* Not fixed length */
9943
return -1;
9944
}
9945
9946
/* Add the item length to the branchlength, checking for integer overflow and
9947
for the branch length exceeding the overall limit. Later, if there is at
9948
least one variable-length branch in the group, there is a test for the
9949
(smaller) variable-length branch length limit. */
9950
9951
if (INT_MAX - branchlength < (int)itemlength ||
9952
(branchlength += itemlength) > LOOKBEHIND_MAX)
9953
{
9954
*errcodeptr = ERR87;
9955
return -1;
9956
}
9957
9958
branchminlength += itemminlength;
9959
9960
/* Save this item length for use if the next item is a quantifier. */
9961
9962
lastitemlength = itemlength;
9963
lastitemminlength = itemminlength;
9964
}
9965
9966
EXIT:
9967
*pptrptr = pptr;
9968
*minptr = branchminlength;
9969
return branchlength;
9970
9971
/* LCOV_EXCL_START */
9972
PARSED_SKIP_FAILED:
9973
PCRE2_DEBUG_UNREACHABLE();
9974
*errcodeptr = ERR90; /* Unhandled META code - internal error */
9975
return -1;
9976
/* LCOV_EXCL_STOP */
9977
}
9978
9979
9980
9981
/*************************************************
9982
* Set lengths in a lookbehind *
9983
*************************************************/
9984
9985
/* This function is called for each lookbehind, to set the lengths in its
9986
branches. An error occurs if any branch does not have a limited maximum length
9987
that is less than the limit (65535). On exit, the pointer must be left on the
9988
final ket.
9989
9990
The function also maintains the max_lookbehind value. Any lookbehind branch
9991
that contains a nested lookbehind may actually look further back than the
9992
length of the branch. The additional amount is passed back from
9993
get_branchlength() as an "extra" value.
9994
9995
Arguments:
9996
pptrptr pointer to pointer in the parsed pattern
9997
errcodeptr pointer to error code
9998
lcptr pointer to loop counter
9999
recurses chain of recurse_check to catch mutual recursion
10000
cb pointer to compile block
10001
10002
Returns: TRUE if all is well
10003
FALSE otherwise, with error code and offset set
10004
*/
10005
10006
static BOOL
10007
set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
10008
parsed_recurse_check *recurses, compile_block *cb)
10009
{
10010
PCRE2_SIZE offset;
10011
uint32_t *bptr = *pptrptr;
10012
uint32_t *gbptr = bptr;
10013
int maxlength = 0;
10014
int minlength = INT_MAX;
10015
BOOL variable = FALSE;
10016
10017
READPLUSOFFSET(offset, bptr); /* Offset for error messages */
10018
*pptrptr += SIZEOFFSET;
10019
10020
/* Each branch can have a different maximum length, but we can keep only a
10021
single minimum for the whole group, because there's nowhere to save individual
10022
values in the META_ALT item. */
10023
10024
do
10025
{
10026
int branchlength, branchminlength;
10027
10028
*pptrptr += 1;
10029
branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
10030
recurses, cb);
10031
10032
if (branchlength < 0)
10033
{
10034
/* The errorcode and offset may already be set from a nested lookbehind. */
10035
if (*errcodeptr == 0) *errcodeptr = ERR25;
10036
if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
10037
return FALSE;
10038
}
10039
10040
if (branchlength != branchminlength) variable = TRUE;
10041
if (branchminlength < minlength) minlength = branchminlength;
10042
if (branchlength > maxlength) maxlength = branchlength;
10043
if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
10044
*bptr |= branchlength; /* branchlength never more than 65535 */
10045
bptr = *pptrptr;
10046
}
10047
while (META_CODE(*bptr) == META_ALT);
10048
10049
/* If any branch is of variable length, the whole lookbehind is of variable
10050
length. If the maximum length of any branch exceeds the maximum for variable
10051
lookbehinds, give an error. Otherwise, the minimum length is set in the word
10052
that follows the original group META value. For a fixed-length lookbehind, this
10053
is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
10054
possibly different) length. */
10055
10056
if (variable)
10057
{
10058
gbptr[1] = minlength;
10059
if ((PCRE2_SIZE)maxlength > cb->max_varlookbehind)
10060
{
10061
*errcodeptr = ERR100;
10062
cb->erroroffset = offset;
10063
return FALSE;
10064
}
10065
}
10066
else gbptr[1] = LOOKBEHIND_MAX;
10067
10068
return TRUE;
10069
}
10070
10071
10072
10073
/*************************************************
10074
* Check parsed pattern lookbehinds *
10075
*************************************************/
10076
10077
/* This function is called at the end of parsing a pattern if any lookbehinds
10078
were encountered. It scans the parsed pattern for them, calling
10079
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
10080
the error offset is marked unset. This enables the functions above not to
10081
override settings from deeper nestings.
10082
10083
This function is called recursively from get_branchlength() for lookaheads in
10084
order to process any lookbehinds that they may contain. It stops when it hits a
10085
non-nested closing parenthesis in this case, returning a pointer to it.
10086
10087
Arguments
10088
pptr points to where to start (start of pattern or start of lookahead)
10089
retptr if not NULL, return the ket pointer here
10090
recurses chain of recurse_check to catch mutual recursion
10091
cb points to the compile block
10092
lcptr points to loop counter
10093
10094
Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
10095
*/
10096
10097
static int
10098
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
10099
parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
10100
{
10101
int errorcode = 0;
10102
int nestlevel = 0;
10103
10104
cb->erroroffset = PCRE2_UNSET;
10105
10106
for (; *pptr != META_END; pptr++)
10107
{
10108
if (*pptr < META_END) continue; /* Literal */
10109
10110
switch (META_CODE(*pptr))
10111
{
10112
/* The following erroroffset is a bogus but safe value. This branch should
10113
be avoided by providing a proper implementation for all supported cases
10114
below. */
10115
10116
/* LCOV_EXCL_START */
10117
default:
10118
PCRE2_DEBUG_UNREACHABLE();
10119
cb->erroroffset = 0;
10120
return ERR70; /* Unrecognized meta code */
10121
/* LCOV_EXCL_STOP */
10122
10123
case META_ESCAPE:
10124
if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
10125
pptr += 1; /* Skip prop data */
10126
break;
10127
10128
case META_KET:
10129
if (--nestlevel < 0)
10130
{
10131
if (retptr != NULL) *retptr = pptr;
10132
return 0;
10133
}
10134
break;
10135
10136
case META_ATOMIC:
10137
case META_CAPTURE:
10138
case META_COND_ASSERT:
10139
case META_SCS:
10140
case META_LOOKAHEAD:
10141
case META_LOOKAHEADNOT:
10142
case META_LOOKAHEAD_NA:
10143
case META_NOCAPTURE:
10144
case META_SCRIPT_RUN:
10145
nestlevel++;
10146
break;
10147
10148
case META_ACCEPT:
10149
case META_ALT:
10150
case META_ASTERISK:
10151
case META_ASTERISK_PLUS:
10152
case META_ASTERISK_QUERY:
10153
case META_BACKREF:
10154
case META_CIRCUMFLEX:
10155
case META_CLASS:
10156
case META_CLASS_EMPTY:
10157
case META_CLASS_EMPTY_NOT:
10158
case META_CLASS_END:
10159
case META_CLASS_NOT:
10160
case META_COMMIT:
10161
case META_DOLLAR:
10162
case META_DOT:
10163
case META_FAIL:
10164
case META_PLUS:
10165
case META_PLUS_PLUS:
10166
case META_PLUS_QUERY:
10167
case META_PRUNE:
10168
case META_QUERY:
10169
case META_QUERY_PLUS:
10170
case META_QUERY_QUERY:
10171
case META_RANGE_ESCAPED:
10172
case META_RANGE_LITERAL:
10173
case META_SKIP:
10174
case META_THEN:
10175
break;
10176
10177
case META_OFFSET:
10178
case META_RECURSE:
10179
pptr += SIZEOFFSET;
10180
break;
10181
10182
case META_BACKREF_BYNAME:
10183
case META_RECURSE_BYNAME:
10184
pptr += 1 + SIZEOFFSET;
10185
break;
10186
10187
case META_COND_DEFINE:
10188
pptr += SIZEOFFSET;
10189
nestlevel++;
10190
break;
10191
10192
case META_COND_NAME:
10193
case META_COND_NUMBER:
10194
case META_COND_RNAME:
10195
case META_COND_RNUMBER:
10196
pptr += 1 + SIZEOFFSET;
10197
nestlevel++;
10198
break;
10199
10200
case META_COND_VERSION:
10201
pptr += 3;
10202
nestlevel++;
10203
break;
10204
10205
case META_CALLOUT_STRING:
10206
pptr += 3 + SIZEOFFSET;
10207
break;
10208
10209
case META_BIGVALUE:
10210
case META_POSIX:
10211
case META_POSIX_NEG:
10212
case META_CAPTURE_NAME:
10213
case META_CAPTURE_NUMBER:
10214
pptr += 1;
10215
break;
10216
10217
case META_MINMAX:
10218
case META_MINMAX_QUERY:
10219
case META_MINMAX_PLUS:
10220
case META_OPTIONS:
10221
pptr += 2;
10222
break;
10223
10224
case META_CALLOUT_NUMBER:
10225
pptr += 3;
10226
break;
10227
10228
case META_MARK:
10229
case META_COMMIT_ARG:
10230
case META_PRUNE_ARG:
10231
case META_SKIP_ARG:
10232
case META_THEN_ARG:
10233
pptr += 1 + pptr[1];
10234
break;
10235
10236
/* Note that set_lookbehind_lengths() updates pptr, leaving it pointing to
10237
the final ket of the group, so no need to update it here. */
10238
10239
case META_LOOKBEHIND:
10240
case META_LOOKBEHINDNOT:
10241
case META_LOOKBEHIND_NA:
10242
if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10243
return errorcode;
10244
break;
10245
}
10246
}
10247
10248
return 0;
10249
}
10250
10251
10252
10253
/*************************************************
10254
* External function to compile a pattern *
10255
*************************************************/
10256
10257
/* This function reads a regular expression in the form of a string and returns
10258
a pointer to a block of store holding a compiled version of the expression.
10259
10260
Arguments:
10261
pattern the regular expression
10262
patlen the length of the pattern, or PCRE2_ZERO_TERMINATED
10263
options option bits
10264
errorptr pointer to errorcode
10265
erroroffset pointer to error offset
10266
ccontext points to a compile context or is NULL
10267
10268
Returns: pointer to compiled data block, or NULL on error,
10269
with errorcode and erroroffset set
10270
*/
10271
10272
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
10273
pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10274
int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10275
{
10276
BOOL utf; /* Set TRUE for UTF mode */
10277
BOOL ucp; /* Set TRUE for UCP mode */
10278
BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
10279
BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
10280
pcre2_real_code *re = NULL; /* What we will return */
10281
compile_block cb; /* "Static" compile-time data */
10282
const uint8_t *tables; /* Char tables base pointer */
10283
10284
PCRE2_UCHAR null_str[1] = { 0xcd }; /* Dummy for handling null inputs */
10285
PCRE2_UCHAR *code; /* Current pointer in compiled code */
10286
PCRE2_UCHAR *codestart; /* Start of compiled code */
10287
PCRE2_SPTR ptr; /* Current pointer in pattern */
10288
uint32_t *pptr; /* Current pointer in parsed pattern */
10289
10290
PCRE2_SIZE length = 1; /* Allow for final END opcode */
10291
PCRE2_SIZE usedlength; /* Actual length used */
10292
PCRE2_SIZE re_blocksize; /* Size of memory block */
10293
PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
10294
10295
uint32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
10296
uint32_t firstcu, reqcu; /* Value of first/req code unit */
10297
uint32_t setflags = 0; /* NL and BSR set flags */
10298
uint32_t xoptions; /* Flags from context, modified */
10299
10300
uint32_t skipatstart; /* When checking (*UTF) etc */
10301
uint32_t limit_heap = UINT32_MAX;
10302
uint32_t limit_match = UINT32_MAX; /* Unset match limits */
10303
uint32_t limit_depth = UINT32_MAX;
10304
10305
int newline = 0; /* Unset; can be set by the pattern */
10306
int bsr = 0; /* Unset; can be set by the pattern */
10307
int errorcode = 0; /* Initialize to avoid compiler warn */
10308
int regexrc; /* Return from compile */
10309
10310
uint32_t i; /* Local loop counter */
10311
10312
/* Enable all optimizations by default. */
10313
uint32_t optim_flags = ccontext != NULL ? ccontext->optimization_flags :
10314
PCRE2_OPTIMIZATION_ALL;
10315
10316
/* Comments at the head of this file explain about these variables. */
10317
10318
uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10319
uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10320
named_group named_groups[NAMED_GROUP_LIST_SIZE];
10321
10322
/* The workspace is used in different ways in the different compiling phases.
10323
It needs to be 16-bit aligned for the preliminary parsing scan. */
10324
10325
uint32_t c16workspace[C16_WORK_SIZE];
10326
PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10327
10328
10329
/* -------------- Check arguments and set up the pattern ----------------- */
10330
10331
/* There must be error code and offset pointers. */
10332
10333
if (errorptr == NULL)
10334
{
10335
if (erroroffset != NULL) *erroroffset = 0;
10336
return NULL;
10337
}
10338
if (erroroffset == NULL)
10339
{
10340
if (errorptr != NULL) *errorptr = ERR120;
10341
return NULL;
10342
}
10343
*errorptr = ERR0;
10344
*erroroffset = 0;
10345
10346
/* There must be a pattern, but NULL is allowed with zero length. */
10347
10348
if (pattern == NULL)
10349
{
10350
if (patlen == 0)
10351
pattern = null_str;
10352
else
10353
{
10354
*errorptr = ERR16;
10355
return NULL;
10356
}
10357
}
10358
10359
/* A NULL compile context means "use a default context" */
10360
10361
if (ccontext == NULL)
10362
ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10363
10364
/* PCRE2_MATCH_INVALID_UTF implies UTF */
10365
10366
if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10367
10368
/* Check that all undefined public option bits are zero. */
10369
10370
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10371
(ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10372
{
10373
*errorptr = ERR17;
10374
return NULL;
10375
}
10376
10377
if ((options & PCRE2_LITERAL) != 0 &&
10378
((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10379
(ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10380
{
10381
*errorptr = ERR92;
10382
return NULL;
10383
}
10384
10385
/* A zero-terminated pattern is indicated by the special length value
10386
PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10387
10388
if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10389
patlen = PRIV(strlen)(pattern);
10390
(void)zero_terminated; /* Silence compiler; only used if Valgrind enabled */
10391
10392
if (patlen > ccontext->max_pattern_length)
10393
{
10394
*errorptr = ERR88;
10395
return NULL;
10396
}
10397
10398
/* Optimization flags in 'options' can override those in the compile context.
10399
This is because some options to disable optimizations were added before the
10400
optimization flags word existed, and we need to continue supporting them
10401
for backwards compatibility. */
10402
10403
if ((options & PCRE2_NO_AUTO_POSSESS) != 0)
10404
optim_flags &= ~PCRE2_OPTIM_AUTO_POSSESS;
10405
if ((options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
10406
optim_flags &= ~PCRE2_OPTIM_DOTSTAR_ANCHOR;
10407
if ((options & PCRE2_NO_START_OPTIMIZE) != 0)
10408
optim_flags &= ~PCRE2_OPTIM_START_OPTIMIZE;
10409
10410
/* From here on, all returns from this function should end up going via the
10411
EXIT label. */
10412
10413
10414
/* ------------ Initialize the "static" compile data -------------- */
10415
10416
tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10417
10418
cb.lcc = tables + lcc_offset; /* Individual */
10419
cb.fcc = tables + fcc_offset; /* character */
10420
cb.cbits = tables + cbits_offset; /* tables */
10421
cb.ctypes = tables + ctypes_offset;
10422
10423
cb.assert_depth = 0;
10424
cb.bracount = 0;
10425
cb.cx = ccontext;
10426
cb.dupnames = FALSE;
10427
cb.end_pattern = pattern + patlen;
10428
cb.erroroffset = 0;
10429
cb.external_flags = 0;
10430
cb.external_options = options;
10431
cb.groupinfo = stack_groupinfo;
10432
cb.had_recurse = FALSE;
10433
cb.lastcapture = 0;
10434
cb.max_lookbehind = 0; /* Max encountered */
10435
cb.max_varlookbehind = ccontext->max_varlookbehind; /* Limit */
10436
cb.name_entry_size = 0;
10437
cb.name_table = NULL;
10438
cb.named_groups = named_groups;
10439
cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10440
cb.names_found = 0;
10441
cb.parens_depth = 0;
10442
cb.parsed_pattern = stack_parsed_pattern;
10443
cb.req_varyopt = 0;
10444
cb.start_code = cworkspace;
10445
cb.start_pattern = pattern;
10446
cb.start_workspace = cworkspace;
10447
cb.workspace_size = COMPILE_WORK_SIZE;
10448
cb.first_data = NULL;
10449
cb.last_data = NULL;
10450
#ifdef SUPPORT_WIDE_CHARS
10451
cb.char_lists_size = 0;
10452
#endif
10453
10454
/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10455
references to help in deciding whether (.*) can be treated as anchored or not.
10456
*/
10457
10458
cb.top_backref = 0;
10459
cb.backref_map = 0;
10460
10461
/* Escape sequences \1 to \9 are always back references, but as they are only
10462
two characters long, only two elements can be used in the parsed_pattern
10463
vector. The first contains the reference, and we'd like to use the second to
10464
record the offset in the pattern, so that forward references to non-existent
10465
groups can be diagnosed later with an offset. However, on 64-bit systems,
10466
PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10467
occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10468
references have enough space for the offset to be put into the parsed pattern.
10469
*/
10470
10471
for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10472
10473
10474
/* --------------- Start looking at the pattern --------------- */
10475
10476
/* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10477
the start of the pattern, and remember the offset to the actual regex. With
10478
valgrind support, make the terminator of a zero-terminated pattern
10479
inaccessible. This catches bugs that would otherwise only show up for
10480
non-zero-terminated patterns. */
10481
10482
#ifdef SUPPORT_VALGRIND
10483
if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10484
#endif
10485
10486
xoptions = ccontext->extra_options;
10487
ptr = pattern;
10488
skipatstart = 0;
10489
10490
if ((options & PCRE2_LITERAL) == 0)
10491
{
10492
while (patlen - skipatstart >= 2 &&
10493
ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10494
ptr[skipatstart+1] == CHAR_ASTERISK)
10495
{
10496
for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10497
{
10498
const pso *p = pso_list + i;
10499
10500
if (patlen - skipatstart - 2 >= p->length &&
10501
PRIV(strncmp_c8)(ptr + skipatstart + 2, p->name, p->length) == 0)
10502
{
10503
uint32_t c, pp;
10504
10505
skipatstart += p->length + 2;
10506
switch(p->type)
10507
{
10508
case PSO_OPT:
10509
cb.external_options |= p->value;
10510
break;
10511
10512
case PSO_XOPT:
10513
xoptions |= p->value;
10514
break;
10515
10516
case PSO_FLG:
10517
setflags |= p->value;
10518
break;
10519
10520
case PSO_NL:
10521
newline = p->value;
10522
setflags |= PCRE2_NL_SET;
10523
break;
10524
10525
case PSO_BSR:
10526
bsr = p->value;
10527
setflags |= PCRE2_BSR_SET;
10528
break;
10529
10530
case PSO_LIMM:
10531
case PSO_LIMD:
10532
case PSO_LIMH:
10533
c = 0;
10534
pp = skipatstart;
10535
while (pp < patlen && IS_DIGIT(ptr[pp]))
10536
{
10537
if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
10538
c = c*10 + (ptr[pp++] - CHAR_0);
10539
}
10540
if (pp >= patlen || pp == skipatstart || ptr[pp] != CHAR_RIGHT_PARENTHESIS)
10541
{
10542
errorcode = ERR60;
10543
ptr += pp;
10544
utf = FALSE; /* Used by HAD_EARLY_ERROR */
10545
goto HAD_EARLY_ERROR;
10546
}
10547
if (p->type == PSO_LIMH) limit_heap = c;
10548
else if (p->type == PSO_LIMM) limit_match = c;
10549
else limit_depth = c;
10550
skipatstart = ++pp;
10551
break;
10552
10553
case PSO_OPTMZ:
10554
optim_flags &= ~(p->value);
10555
10556
/* For backward compatibility the three original VERBs to disable
10557
optimizations need to also update the corresponding bit in the
10558
external options. */
10559
10560
switch(p->value)
10561
{
10562
case PCRE2_OPTIM_AUTO_POSSESS:
10563
cb.external_options |= PCRE2_NO_AUTO_POSSESS;
10564
break;
10565
10566
case PCRE2_OPTIM_DOTSTAR_ANCHOR:
10567
cb.external_options |= PCRE2_NO_DOTSTAR_ANCHOR;
10568
break;
10569
10570
case PCRE2_OPTIM_START_OPTIMIZE:
10571
cb.external_options |= PCRE2_NO_START_OPTIMIZE;
10572
break;
10573
}
10574
10575
break;
10576
10577
/* LCOV_EXCL_START */
10578
default:
10579
/* All values in the enum need an explicit entry for this switch
10580
but until a better way to prevent coding mistakes is invented keep
10581
a catch all that triggers a debug build assert as a failsafe */
10582
PCRE2_DEBUG_UNREACHABLE();
10583
/* LCOV_EXCL_STOP */
10584
}
10585
break; /* Out of the table scan loop */
10586
}
10587
}
10588
if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
10589
}
10590
PCRE2_ASSERT(skipatstart <= patlen);
10591
}
10592
10593
/* End of pattern-start options; advance to start of real regex. */
10594
10595
ptr += skipatstart;
10596
10597
/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10598
10599
#ifndef SUPPORT_UNICODE
10600
if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10601
{
10602
errorcode = ERR32;
10603
goto HAD_EARLY_ERROR;
10604
}
10605
#endif
10606
10607
/* Check UTF. We have the original options in 'options', with that value as
10608
modified by (*UTF) etc in cb->external_options. The extra option
10609
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10610
surrogate code points cannot be represented in UTF-16. */
10611
10612
utf = (cb.external_options & PCRE2_UTF) != 0;
10613
if (utf)
10614
{
10615
if ((options & PCRE2_NEVER_UTF) != 0)
10616
{
10617
errorcode = ERR74;
10618
goto HAD_EARLY_ERROR;
10619
}
10620
if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10621
(errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10622
goto HAD_ERROR; /* Offset was set by valid_utf() */
10623
10624
#if PCRE2_CODE_UNIT_WIDTH == 16
10625
if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10626
{
10627
errorcode = ERR91;
10628
goto HAD_EARLY_ERROR;
10629
}
10630
#endif
10631
}
10632
10633
/* Check UCP lockout. */
10634
10635
ucp = (cb.external_options & PCRE2_UCP) != 0;
10636
if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10637
{
10638
errorcode = ERR75;
10639
goto HAD_EARLY_ERROR;
10640
}
10641
10642
/* PCRE2_EXTRA_TURKISH_CASING checks */
10643
10644
if ((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)
10645
{
10646
if (!utf && !ucp)
10647
{
10648
errorcode = ERR104;
10649
goto HAD_EARLY_ERROR;
10650
}
10651
10652
#if PCRE2_CODE_UNIT_WIDTH == 8
10653
if (!utf)
10654
{
10655
errorcode = ERR105;
10656
goto HAD_EARLY_ERROR;
10657
}
10658
#endif
10659
10660
if ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)
10661
{
10662
errorcode = ERR106;
10663
goto HAD_EARLY_ERROR;
10664
}
10665
}
10666
10667
/* Process the BSR setting. */
10668
10669
if (bsr == 0) bsr = ccontext->bsr_convention;
10670
10671
/* Process the newline setting. */
10672
10673
if (newline == 0) newline = ccontext->newline_convention;
10674
cb.nltype = NLTYPE_FIXED;
10675
switch(newline)
10676
{
10677
case PCRE2_NEWLINE_CR:
10678
cb.nllen = 1;
10679
cb.nl[0] = CHAR_CR;
10680
break;
10681
10682
case PCRE2_NEWLINE_LF:
10683
cb.nllen = 1;
10684
cb.nl[0] = CHAR_NL;
10685
break;
10686
10687
case PCRE2_NEWLINE_NUL:
10688
cb.nllen = 1;
10689
cb.nl[0] = CHAR_NUL;
10690
break;
10691
10692
case PCRE2_NEWLINE_CRLF:
10693
cb.nllen = 2;
10694
cb.nl[0] = CHAR_CR;
10695
cb.nl[1] = CHAR_NL;
10696
break;
10697
10698
case PCRE2_NEWLINE_ANY:
10699
cb.nltype = NLTYPE_ANY;
10700
break;
10701
10702
case PCRE2_NEWLINE_ANYCRLF:
10703
cb.nltype = NLTYPE_ANYCRLF;
10704
break;
10705
10706
/* LCOV_EXCL_START */
10707
default:
10708
PCRE2_DEBUG_UNREACHABLE();
10709
errorcode = ERR56;
10710
goto HAD_EARLY_ERROR;
10711
/* LCOV_EXCL_STOP */
10712
}
10713
10714
/* Pre-scan the pattern to do two things: (1) Discover the named groups and
10715
their numerical equivalents, so that this information is always available for
10716
the remaining processing. (2) At the same time, parse the pattern and put a
10717
processed version into the parsed_pattern vector. This has escapes interpreted
10718
and comments removed (amongst other things). */
10719
10720
/* Ensure that the parsed pattern buffer is big enough. For many smaller
10721
patterns the vector on the stack (which was set up above) can be used. */
10722
10723
parsed_size_needed = max_parsed_pattern(ptr, cb.end_pattern, utf, options);
10724
10725
/* Allow for 2x uint32_t at the start and 2 at the end, for
10726
PCRE2_EXTRA_MATCH_WORD or PCRE2_EXTRA_MATCH_LINE (which are exclusive). */
10727
10728
if ((ccontext->extra_options &
10729
(PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10730
parsed_size_needed += 4;
10731
10732
/* When PCRE2_AUTO_CALLOUT is set we allow for one callout at the end. */
10733
10734
if ((options & PCRE2_AUTO_CALLOUT) != 0)
10735
parsed_size_needed += 4;
10736
10737
parsed_size_needed += 1; /* For the final META_END */
10738
10739
if (parsed_size_needed > PARSED_PATTERN_DEFAULT_SIZE)
10740
{
10741
uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10742
parsed_size_needed * sizeof(uint32_t), ccontext->memctl.memory_data);
10743
if (heap_parsed_pattern == NULL)
10744
{
10745
*errorptr = ERR21;
10746
goto EXIT;
10747
}
10748
cb.parsed_pattern = heap_parsed_pattern;
10749
}
10750
cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed;
10751
10752
/* Do the parsing scan. */
10753
10754
errorcode = parse_regex(ptr, cb.external_options, xoptions, &has_lookbehind, &cb);
10755
if (errorcode != 0) goto HAD_CB_ERROR;
10756
10757
/* If there are any lookbehinds, scan the parsed pattern to figure out their
10758
lengths. Workspace is needed to remember whether numbered groups are or are not
10759
of limited length, and if limited, what the minimum and maximum lengths are.
10760
This caching saves re-computing the length of any group that is referenced more
10761
than once, which is particularly relevant when recursion is involved.
10762
Unnumbered groups do not have this exposure because they cannot be referenced.
10763
If there are sufficiently few groups, the default index vector on the stack, as
10764
set up above, can be used. Otherwise we have to get/free some heap memory. The
10765
vector must be initialized to zero. */
10766
10767
if (has_lookbehind)
10768
{
10769
int loopcount = 0;
10770
if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10771
{
10772
cb.groupinfo = ccontext->memctl.malloc(
10773
(2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10774
if (cb.groupinfo == NULL)
10775
{
10776
errorcode = ERR21;
10777
cb.erroroffset = 0;
10778
goto HAD_CB_ERROR;
10779
}
10780
}
10781
memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10782
errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10783
if (errorcode != 0) goto HAD_CB_ERROR;
10784
}
10785
10786
/* For debugging, there is a function that shows the parsed pattern vector. */
10787
10788
#ifdef DEBUG_SHOW_PARSED
10789
fprintf(stderr, "+++ Pre-scan complete:\n");
10790
show_parsed(&cb);
10791
#endif
10792
10793
/* For debugging capturing information this code can be enabled. */
10794
10795
#ifdef DEBUG_SHOW_CAPTURES
10796
{
10797
named_group *ng = cb.named_groups;
10798
fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10799
for (i = 0; i < cb.names_found; i++, ng++)
10800
{
10801
fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10802
}
10803
}
10804
#endif
10805
10806
/* Pretend to compile the pattern while actually just accumulating the amount
10807
of memory required in the 'length' variable. This behaviour is triggered by
10808
passing a non-NULL final argument to compile_regex(). We pass a block of
10809
workspace (cworkspace) for it to compile parts of the pattern into; the
10810
compiled code is discarded when it is no longer needed, so hopefully this
10811
workspace will never overflow, though there is a test for its doing so.
10812
10813
On error, errorcode will be set non-zero, so we don't need to look at the
10814
result of the function. The initial options have been put into the cb block,
10815
but we still have to pass a separate options variable (the first argument)
10816
because the options may change as the pattern is processed. */
10817
10818
cb.erroroffset = patlen; /* For any subsequent errors that do not set it */
10819
pptr = cb.parsed_pattern;
10820
code = cworkspace;
10821
*code = OP_BRA;
10822
10823
(void)compile_regex(cb.external_options, xoptions, &code, &pptr,
10824
&errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10825
&cb, &length);
10826
10827
if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */
10828
10829
/* This should be caught in compile_regex(), but just in case... */
10830
10831
#if defined SUPPORT_WIDE_CHARS
10832
PCRE2_ASSERT((cb.char_lists_size & 0x3) == 0);
10833
if (length > MAX_PATTERN_SIZE ||
10834
MAX_PATTERN_SIZE - length < (cb.char_lists_size / sizeof(PCRE2_UCHAR)))
10835
#else
10836
if (length > MAX_PATTERN_SIZE)
10837
#endif
10838
{
10839
errorcode = ERR20;
10840
cb.erroroffset = 0;
10841
goto HAD_CB_ERROR;
10842
}
10843
10844
/* Compute the size of, then, if not too large, get and initialize the data
10845
block for storing the compiled pattern and names table. Integer overflow should
10846
no longer be possible because nowadays we limit the maximum value of
10847
cb.names_found and cb.name_entry_size. */
10848
10849
re_blocksize =
10850
CU2BYTES((PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10851
10852
#if defined SUPPORT_WIDE_CHARS
10853
if (cb.char_lists_size != 0)
10854
{
10855
#if PCRE2_CODE_UNIT_WIDTH != 32
10856
/* Align to 32 bit first. This ensures the
10857
allocated area will also be 32 bit aligned. */
10858
re_blocksize = (PCRE2_SIZE)CLIST_ALIGN_TO(re_blocksize, sizeof(uint32_t));
10859
#endif
10860
re_blocksize += cb.char_lists_size;
10861
}
10862
#endif
10863
10864
re_blocksize += CU2BYTES(length);
10865
10866
if (re_blocksize > ccontext->max_pattern_compiled_length)
10867
{
10868
errorcode = ERR101;
10869
cb.erroroffset = 0;
10870
goto HAD_CB_ERROR;
10871
}
10872
10873
re_blocksize += sizeof(pcre2_real_code);
10874
re = (pcre2_real_code *)
10875
ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10876
if (re == NULL)
10877
{
10878
errorcode = ERR21;
10879
cb.erroroffset = 0;
10880
goto HAD_CB_ERROR;
10881
}
10882
10883
/* The compiler may put padding at the end of the pcre2_real_code structure in
10884
order to round it up to a multiple of 4 or 8 bytes. This means that when a
10885
compiled pattern is copied (for example, when serialized) undefined bytes are
10886
read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10887
write to the last 8 bytes of the structure before setting the fields. */
10888
10889
memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10890
re->memctl = ccontext->memctl;
10891
re->tables = tables;
10892
re->executable_jit = NULL;
10893
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10894
re->blocksize = re_blocksize;
10895
re->code_start = re_blocksize - CU2BYTES(length);
10896
re->magic_number = MAGIC_NUMBER;
10897
re->compile_options = options;
10898
re->overall_options = cb.external_options;
10899
re->extra_options = xoptions;
10900
re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10901
re->limit_heap = limit_heap;
10902
re->limit_match = limit_match;
10903
re->limit_depth = limit_depth;
10904
re->first_codeunit = 0;
10905
re->last_codeunit = 0;
10906
re->bsr_convention = bsr;
10907
re->newline_convention = newline;
10908
re->max_lookbehind = 0;
10909
re->minlength = 0;
10910
re->top_bracket = 0;
10911
re->top_backref = 0;
10912
re->name_entry_size = cb.name_entry_size;
10913
re->name_count = cb.names_found;
10914
re->optimization_flags = optim_flags;
10915
10916
/* The basic block is immediately followed by the name table, and the compiled
10917
code follows after that. */
10918
10919
codestart = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
10920
10921
/* Update the compile data block for the actual compile. The starting points of
10922
the name/number translation table and of the code are passed around in the
10923
compile data block. The start/end pattern and initial options are already set
10924
from the pre-compile phase, as is the name_entry_size field. */
10925
10926
cb.parens_depth = 0;
10927
cb.assert_depth = 0;
10928
cb.lastcapture = 0;
10929
cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10930
cb.start_code = codestart;
10931
cb.req_varyopt = 0;
10932
cb.had_accept = FALSE;
10933
cb.had_pruneorskip = FALSE;
10934
#ifdef SUPPORT_WIDE_CHARS
10935
cb.char_lists_size = 0;
10936
#endif
10937
10938
10939
/* If any named groups were found, create the name/number table from the list
10940
created in the pre-pass. */
10941
10942
if (cb.names_found > 0)
10943
{
10944
named_group *ng = cb.named_groups;
10945
uint32_t tablecount = 0;
10946
10947
/* Length 0 represents duplicates, and they have already been handled. */
10948
for (i = 0; i < cb.names_found; i++, ng++)
10949
if (ng->length > 0)
10950
tablecount = PRIV(compile_add_name_to_table)(&cb, ng, tablecount);
10951
10952
PCRE2_ASSERT(tablecount == cb.names_found);
10953
}
10954
10955
/* Set up a starting, non-extracting bracket, then compile the expression. On
10956
error, errorcode will be set non-zero, so we don't need to look at the result
10957
of the function here. */
10958
10959
pptr = cb.parsed_pattern;
10960
code = (PCRE2_UCHAR *)codestart;
10961
*code = OP_BRA;
10962
regexrc = compile_regex(re->overall_options, re->extra_options, &code,
10963
&pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10964
NULL, &cb, NULL);
10965
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10966
re->top_bracket = cb.bracount;
10967
re->top_backref = cb.top_backref;
10968
re->max_lookbehind = cb.max_lookbehind;
10969
10970
if (cb.had_accept)
10971
{
10972
reqcu = 0; /* Must disable after (*ACCEPT) */
10973
reqcuflags = REQ_NONE;
10974
re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */
10975
}
10976
10977
/* Fill in the final opcode and check for disastrous overflow. If no overflow,
10978
but the estimated length exceeds the really used length, adjust the value of
10979
re->blocksize, and if valgrind support is configured, mark the extra allocated
10980
memory as unaddressable, so that any out-of-bound reads can be detected. */
10981
10982
*code++ = OP_END;
10983
usedlength = code - codestart;
10984
/* LCOV_EXCL_START */
10985
if (usedlength > length)
10986
{
10987
PCRE2_DEBUG_UNREACHABLE();
10988
errorcode = ERR23; /* Overflow of code block - internal error */
10989
cb.erroroffset = 0;
10990
goto HAD_CB_ERROR;
10991
}
10992
/* LCOV_EXCL_STOP */
10993
10994
re->blocksize -= CU2BYTES(length - usedlength);
10995
#ifdef SUPPORT_VALGRIND
10996
VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10997
#endif
10998
10999
/* Scan the pattern for recursion/subroutine calls and convert the group
11000
numbers into offsets. Maintain a small cache so that repeated groups containing
11001
recursions are efficiently handled. */
11002
11003
#define RSCAN_CACHE_SIZE 8
11004
11005
if (errorcode == 0 && cb.had_recurse)
11006
{
11007
PCRE2_UCHAR *rcode;
11008
PCRE2_SPTR rgroup;
11009
unsigned int ccount = 0;
11010
int start = RSCAN_CACHE_SIZE;
11011
recurse_cache rc[RSCAN_CACHE_SIZE];
11012
11013
for (rcode = find_recurse(codestart, utf);
11014
rcode != NULL;
11015
rcode = find_recurse(rcode + 1 + LINK_SIZE, utf))
11016
{
11017
int p, groupnumber;
11018
11019
groupnumber = (int)GET(rcode, 1);
11020
if (groupnumber == 0) rgroup = codestart; else
11021
{
11022
PCRE2_SPTR search_from = codestart;
11023
rgroup = NULL;
11024
for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
11025
{
11026
if (groupnumber == rc[p].groupnumber)
11027
{
11028
rgroup = rc[p].group;
11029
break;
11030
}
11031
11032
/* Group n+1 must always start to the right of group n, so we can save
11033
search time below when the new group number is greater than any of the
11034
previously found groups. */
11035
11036
if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
11037
}
11038
11039
if (rgroup == NULL)
11040
{
11041
rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
11042
/* LCOV_EXCL_START */
11043
if (rgroup == NULL)
11044
{
11045
PCRE2_DEBUG_UNREACHABLE();
11046
errorcode = ERR53;
11047
break;
11048
}
11049
/* LCOV_EXCL_STOP */
11050
11051
if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
11052
rc[start].groupnumber = groupnumber;
11053
rc[start].group = rgroup;
11054
if (ccount < RSCAN_CACHE_SIZE) ccount++;
11055
}
11056
}
11057
11058
PUT(rcode, 1, (uint32_t)(rgroup - codestart));
11059
}
11060
}
11061
11062
/* In rare debugging situations we sometimes need to look at the compiled code
11063
at this stage. */
11064
11065
#ifdef DEBUG_CALL_PRINTINT
11066
pcre2_printint(re, stderr, TRUE);
11067
fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
11068
#endif
11069
11070
/* Unless disabled, check whether any single character iterators can be
11071
auto-possessified. The function overwrites the appropriate opcode values, so
11072
the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
11073
used in this code because at least one compiler gives a warning about loss of
11074
"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
11075
function call. */
11076
11077
if (errorcode == 0 && (optim_flags & PCRE2_OPTIM_AUTO_POSSESS) != 0)
11078
{
11079
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
11080
int possessify_rc = PRIV(auto_possessify)(temp, &cb);
11081
/* LCOV_EXCL_START */
11082
if (possessify_rc != 0)
11083
{
11084
PCRE2_DEBUG_UNREACHABLE();
11085
errorcode = ERR80;
11086
cb.erroroffset = 0;
11087
}
11088
/* LCOV_EXCL_STOP */
11089
}
11090
11091
/* Failed to compile, or error while post-processing. */
11092
11093
if (errorcode != 0) goto HAD_CB_ERROR;
11094
11095
/* Successful compile. If the anchored option was not passed, set it if
11096
we can determine that the pattern is anchored by virtue of ^ characters or \A
11097
or anything else, such as starting with non-atomic .* when DOTALL is set and
11098
there are no occurrences of *PRUNE or *SKIP (though there is an option to
11099
disable this case). */
11100
11101
if ((re->overall_options & PCRE2_ANCHORED) == 0)
11102
{
11103
BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
11104
if (is_anchored(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
11105
re->overall_options |= PCRE2_ANCHORED;
11106
}
11107
11108
/* Set up the first code unit or startline flag, the required code unit, and
11109
then study the pattern. This code need not be obeyed if PCRE2_OPTIM_START_OPTIMIZE
11110
is disabled, as the data it would create will not be used. Note that a first code
11111
unit (but not the startline flag) is useful for anchored patterns because it
11112
can still give a quick "no match" and also avoid searching for a last code
11113
unit. */
11114
11115
if ((optim_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)
11116
{
11117
int minminlength = 0; /* For minimal minlength from first/required CU */
11118
int study_rc;
11119
11120
/* If we do not have a first code unit, see if there is one that is asserted
11121
(these are not saved during the compile because they can cause conflicts with
11122
actual literals that follow). */
11123
11124
if (firstcuflags >= REQ_NONE) {
11125
uint32_t assertedcuflags = 0;
11126
uint32_t assertedcu = find_firstassertedcu(codestart, &assertedcuflags, 0);
11127
/* It would be wrong to use the asserted first code unit as `firstcu` for
11128
* regexes which are able to match a 1-character string (e.g. /(?=a)b?a/)
11129
* For that example, if we set both firstcu and reqcu to 'a', it would mean
11130
* the subject string needs to be at least 2 characters long, which is wrong.
11131
* With more analysis, we would be able to set firstcu in more cases. */
11132
if (assertedcuflags < REQ_NONE && assertedcu != reqcu) {
11133
firstcu = assertedcu;
11134
firstcuflags = assertedcuflags;
11135
}
11136
}
11137
11138
/* Save the data for a first code unit. The existence of one means the
11139
minimum length must be at least 1. */
11140
11141
if (firstcuflags < REQ_NONE)
11142
{
11143
re->first_codeunit = firstcu;
11144
re->flags |= PCRE2_FIRSTSET;
11145
minminlength++;
11146
11147
/* Handle caseless first code units. */
11148
11149
if ((firstcuflags & REQ_CASELESS) != 0)
11150
{
11151
if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
11152
{
11153
if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
11154
}
11155
11156
/* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
11157
In 8-bit UTF mode, code units in the range 128-255 are introductory code
11158
units and cannot have another case, but if UCP is set they may do. */
11159
11160
#ifdef SUPPORT_UNICODE
11161
#if PCRE2_CODE_UNIT_WIDTH == 8
11162
else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
11163
re->flags |= PCRE2_FIRSTCASELESS;
11164
#else
11165
else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
11166
UCD_OTHERCASE(firstcu) != firstcu)
11167
re->flags |= PCRE2_FIRSTCASELESS;
11168
#endif
11169
#endif /* SUPPORT_UNICODE */
11170
}
11171
}
11172
11173
/* When there is no first code unit, for non-anchored patterns, see if we can
11174
set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
11175
branches start with ^ and also when all branches start with non-atomic .* for
11176
non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
11177
that disables this case.) */
11178
11179
else if ((re->overall_options & PCRE2_ANCHORED) == 0)
11180
{
11181
BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
11182
if (is_startline(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
11183
re->flags |= PCRE2_STARTLINE;
11184
}
11185
11186
/* Handle the "required code unit", if one is set. In the UTF case we can
11187
increment the minimum minimum length only if we are sure this really is a
11188
different character and not a non-starting code unit of the first character,
11189
because the minimum length count is in characters, not code units. */
11190
11191
if (reqcuflags < REQ_NONE)
11192
{
11193
#if PCRE2_CODE_UNIT_WIDTH == 16
11194
if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
11195
firstcuflags >= REQ_NONE || /* First not set */
11196
(firstcu & 0xf800) != 0xd800 || /* First not surrogate */
11197
(reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */
11198
#elif PCRE2_CODE_UNIT_WIDTH == 8
11199
if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
11200
firstcuflags >= REQ_NONE || /* First not set */
11201
(firstcu & 0x80) == 0 || /* First is ASCII */
11202
(reqcu & 0x80) == 0) /* Req is ASCII */
11203
#endif
11204
{
11205
minminlength++;
11206
}
11207
11208
/* In the case of an anchored pattern, set up the value only if it follows
11209
a variable length item in the pattern. */
11210
11211
if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
11212
(reqcuflags & REQ_VARY) != 0)
11213
{
11214
re->last_codeunit = reqcu;
11215
re->flags |= PCRE2_LASTSET;
11216
11217
/* Handle caseless required code units as for first code units (above). */
11218
11219
if ((reqcuflags & REQ_CASELESS) != 0)
11220
{
11221
if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
11222
{
11223
if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
11224
}
11225
#ifdef SUPPORT_UNICODE
11226
#if PCRE2_CODE_UNIT_WIDTH == 8
11227
else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
11228
re->flags |= PCRE2_LASTCASELESS;
11229
#else
11230
else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
11231
UCD_OTHERCASE(reqcu) != reqcu)
11232
re->flags |= PCRE2_LASTCASELESS;
11233
#endif
11234
#endif /* SUPPORT_UNICODE */
11235
}
11236
}
11237
}
11238
11239
/* Study the compiled pattern to set up information such as a bitmap of
11240
starting code units and a minimum matching length. */
11241
11242
study_rc = PRIV(study)(re);
11243
/* LCOV_EXCL_START */
11244
if (study_rc != 0)
11245
{
11246
PCRE2_DEBUG_UNREACHABLE();
11247
errorcode = ERR31;
11248
cb.erroroffset = 0;
11249
goto HAD_CB_ERROR;
11250
}
11251
/* LCOV_EXCL_STOP */
11252
11253
/* If study() set a bitmap of starting code units, it implies a minimum
11254
length of at least one. */
11255
11256
if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
11257
minminlength = 1;
11258
11259
/* If the minimum length set (or not set) by study() is less than the minimum
11260
implied by required code units, override it. */
11261
11262
if (re->minlength < minminlength) re->minlength = minminlength;
11263
} /* End of start-of-match optimizations. */
11264
11265
/* Control ends up here in all cases. When running under valgrind, make a
11266
pattern's terminating zero defined again. If memory was obtained for the parsed
11267
version of the pattern, free it before returning. Also free the list of named
11268
groups if a larger one had to be obtained, and likewise the group information
11269
vector. */
11270
11271
#ifdef SUPPORT_UNICODE
11272
/* All items must be freed. */
11273
PCRE2_ASSERT(cb.first_data == NULL);
11274
#endif
11275
11276
EXIT:
11277
#ifdef SUPPORT_VALGRIND
11278
if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
11279
#endif
11280
if (cb.parsed_pattern != stack_parsed_pattern)
11281
ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
11282
if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
11283
ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
11284
if (cb.groupinfo != stack_groupinfo)
11285
ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
11286
11287
return re; /* Will be NULL after an error */
11288
11289
/* Errors discovered in parse_regex() set the offset value in the compile
11290
block. Errors discovered before it is called must compute it from the ptr
11291
value. After parse_regex() is called, the offset in the compile block is set to
11292
the end of the pattern, but certain errors in compile_regex() may reset it if
11293
an offset is available in the parsed pattern. */
11294
11295
HAD_CB_ERROR:
11296
ptr = pattern + cb.erroroffset;
11297
11298
HAD_EARLY_ERROR:
11299
/* Ensure we don't return out-of-range erroroffset. */
11300
PCRE2_ASSERT(ptr >= pattern);
11301
PCRE2_ASSERT(ptr <= (pattern + patlen));
11302
/* Ensure that the erroroffset never slices a UTF-encoded character in half.
11303
If the input is invalid, then we return an offset just before the first invalid
11304
character, so the text to the left of the offset must always be valid. */
11305
#if defined PCRE2_DEBUG && defined SUPPORT_UNICODE
11306
if (ptr > pattern && utf)
11307
{
11308
PCRE2_SPTR prev = ptr - 1;
11309
PCRE2_SIZE dummyoffset;
11310
BACKCHAR(prev);
11311
PCRE2_ASSERT(prev >= pattern);
11312
PCRE2_ASSERT(PRIV(valid_utf)(prev, ptr - prev, &dummyoffset) == 0);
11313
}
11314
#endif
11315
*erroroffset = ptr - pattern;
11316
11317
HAD_ERROR:
11318
*errorptr = errorcode;
11319
pcre2_code_free(re);
11320
re = NULL;
11321
11322
if (cb.first_data != NULL)
11323
{
11324
compile_data* current_data = cb.first_data;
11325
do
11326
{
11327
compile_data* next_data = current_data->next;
11328
cb.cx->memctl.free(current_data, cb.cx->memctl.memory_data);
11329
current_data = next_data;
11330
}
11331
while (current_data != NULL);
11332
}
11333
11334
goto EXIT;
11335
}
11336
11337
/* These #undefs are here to enable unity builds with CMake. In a unity build
several source files are compiled together as a single translation unit, so a
macro still defined at the end of this file would remain visible in whatever
source follows it and could clash with a definition made there. */
11338
11339
#undef NLBLOCK /* Block containing newline information */
11340
#undef PSSTART /* Field containing processed string start */
11341
#undef PSEND /* Field containing processed string end */
11342
11343
/* End of pcre2_compile.c */
11344
11345