CoCalc -- pcre2

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_compile.c
⁹⁸⁹⁸ views
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4

5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7

8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11

12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15

16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18

19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22

23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26

27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40

41

42
#ifdef HAVE_CONFIG_H
43
#include "config.h"
44
#endif
45

46
/* Short aliases for a block name and two field names. They are set up ahead
of the #include of pcre2_compile.h that follows.
NOTE(review): presumably consumed by generic macros in the internal headers -
confirm against those headers. */

#define NLBLOCK cb             /* Block containing newline information */
#define PSSTART start_pattern  /* Field containing processed string start */
#define PSEND   end_pattern    /* Field containing processed string end */
49

50
#include "pcre2_compile.h"
51

52
/* In rare error cases debugging might require calling pcre2_printint(). */
53

54
/* Disabled by default. Changing "#if 0" to "#if 1" defines PRINTABLE() for the
character coding in use, textually includes pcre2_printint.c, and defines
DEBUG_CALL_PRINTINT. */

#if 0
#ifdef EBCDIC
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
#else
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
#endif
#include "pcre2_printint.c"
#define DEBUG_CALL_PRINTINT
#endif
63

64
/* Other debugging code can be enabled by these defines. */
65

66
/* #define DEBUG_SHOW_CAPTURES */
67
/* #define DEBUG_SHOW_PARSED */
68

69
/* There are a few things that vary with different code unit sizes. Handle them
70
by defining macros in order to minimize #if usage. */
71

72
/* XDIGIT(c) looks c up in xdigitab. In the 8-bit library the code unit is used
directly as the index; for wider code units the lookup is guarded by MAX_255()
and anything that fails the guard yields 0xff. STRING_UTFn_RIGHTPAR expands to
TWO comma-separated items: the width-specific string macro and its length. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
#define XDIGIT(c)                xdigitab[c]

#else  /* Either 16-bit or 32-bit */
#define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)

#if PCRE2_CODE_UNIT_WIDTH == 16
#define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6

#else  /* 32-bit */
#define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
#endif  /* 16-bit or 32-bit */
#endif  /* 8-bit or wider */
86

87
/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88
consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89
them will be able to (i.e. assume a 64-bit world). */
90

91
/* s is a PCRE2_SIZE (an lvalue, except for PUTOFFSET); p is a uint32_t pointer
lvalue into the parsed pattern.

  PUTOFFSET       store s at p and advance p past it
  GETOFFSET       load s from p and advance p past it
  GETPLUSOFFSET   load s from the element(s) following p[0], leaving p on the
                  last element of the offset
  READPLUSOFFSET  as GETPLUSOFFSET, but p is not moved
  SKIPOFFSET      advance p past one offset
  SIZEOFFSET      number of uint32_t elements occupied by one offset

Every use of an argument is parenthesized so that an arbitrary expression can
be passed. The two-element forms evaluate p more than once, so p must not have
side effects. Those forms expand to a braced block, not an expression. */

#if PCRE2_SIZE_MAX <= UINT32_MAX
#define PUTOFFSET(s,p) (*(p)++ = (s))
#define GETOFFSET(s,p) ((s) = *(p)++)
#define GETPLUSOFFSET(s,p) ((s) = *(++(p)))
#define READPLUSOFFSET(s,p) ((s) = (p)[1])
#define SKIPOFFSET(p) ((p)++)
#define SIZEOFFSET 1
#else
#define PUTOFFSET(s,p) \
  { *(p)++ = (uint32_t)((s) >> 32); *(p)++ = (uint32_t)((s) & 0xffffffff); }
#define GETOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]; (p) += 2; }
#define GETPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; (p) += 2; }
#define READPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; }
#define SKIPOFFSET(p) ((p) += 2)
#define SIZEOFFSET 2
#endif
110

111
/* Function declarations to allow mutual recursion */
112

113
static int
114
  compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
115
    uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
116
    open_capitem *, compile_block *, PCRE2_SIZE *);
117

118
static int
119
  get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
120
    compile_block *);
121

122
static BOOL
123
  set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
124
    compile_block *);
125

126
static int
127
  check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
128
    compile_block *, int *);
129

130

131
/*************************************************
132
*      Code parameters and static tables         *
133
*************************************************/
134

135
/* Upper limits for group numbers and repeat counts. REPEAT_UNLIMITED is one
more than the largest permitted repeat count. */

#define MAX_GROUP_NUMBER   65535u
#define MAX_REPEAT_COUNT   65535u
#define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
138

139
/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
140
different ways in the different pattern scans. The parsing and group-
141
identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
142
aligned for this. Having defined the size in code units, we set up
143
C16_WORK_SIZE as the number of elements in the 16-bit vector.
144

145
During the first compiling phase, when determining how much memory is required,
146
the regex is partly compiled into this space, but the compiled parts are
147
discarded as soon as they can be, so that hopefully there will never be an
148
overrun. The code does, however, check for an overrun, which can occur for
149
pathological patterns. The size of the workspace depends on LINK_SIZE because
150
the length of compiled items varies with this.
151

152
In the real compile phase, this workspace is not currently used. */
153

154
/* COMPILE_WORK_SIZE is a count of code units and scales with LINK_SIZE.
C16_WORK_SIZE is the same amount of storage expressed as a count of uint16_t
elements. */

#define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */

#define C16_WORK_SIZE \
  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
158

159
/* A uint32_t vector is used for caching information about the size of
capturing groups, to improve performance. A default is created on the stack of
this size. */

#define GROUPINFO_DEFAULT_SIZE 256

/* The overrun tests check for a slightly smaller size so that they detect the
overrun before it actually does run off the end of the data block. */

#define WORK_SIZE_SAFETY_MARGIN (100)

/* This value determines the size of the initial vector that is used for
remembering named groups during the pre-compile. It is allocated on the stack,
but if it is too small, it is expanded, in a similar way to the workspace. The
value is the number of slots in the list. */

#define NAMED_GROUP_LIST_SIZE  20

/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
of uint32_t. For short patterns this lives on the stack, with this size. Heap
memory is used for longer patterns. */

#define PARSED_PATTERN_DEFAULT_SIZE 1024

/* Maximum length value to check against when making sure that the variable
that holds the compiled pattern length does not overflow. We make it a bit less
than INT_MAX to allow for adding in group terminating code units, so that we
don't have to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)
189

190
/* Table of extra lengths for each of the meta codes. Must be kept in step with
191
the definitions above. For some items these values are a basic length to which
192
a variable amount has to be added. */
193

194
static unsigned char meta_extra_lengths[] = {
195
  0,             /* META_END */
196
  0,             /* META_ALT */
197
  0,             /* META_ATOMIC */
198
  0,             /* META_BACKREF - more if group is >= 10 */
199
  1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
200
  1,             /* META_BIGVALUE */
201
  3,             /* META_CALLOUT_NUMBER */
202
  3+SIZEOFFSET,  /* META_CALLOUT_STRING */
203
  0,             /* META_CAPTURE */
204
  0,             /* META_CIRCUMFLEX */
205
  0,             /* META_CLASS */
206
  0,             /* META_CLASS_EMPTY */
207
  0,             /* META_CLASS_EMPTY_NOT */
208
  0,             /* META_CLASS_END */
209
  0,             /* META_CLASS_NOT */
210
  0,             /* META_COND_ASSERT */
211
  SIZEOFFSET,    /* META_COND_DEFINE */
212
  1+SIZEOFFSET,  /* META_COND_NAME */
213
  1+SIZEOFFSET,  /* META_COND_NUMBER */
214
  1+SIZEOFFSET,  /* META_COND_RNAME */
215
  1+SIZEOFFSET,  /* META_COND_RNUMBER */
216
  3,             /* META_COND_VERSION */
217
  SIZEOFFSET,    /* META_OFFSET */
218
  0,             /* META_SCS */
219
  1,             /* META_SCS_NAME */
220
  1,             /* META_SCS_NUMBER */
221
  0,             /* META_DOLLAR */
222
  0,             /* META_DOT */
223
  0,             /* META_ESCAPE - one more for ESC_P and ESC_p */
224
  0,             /* META_KET */
225
  0,             /* META_NOCAPTURE */
226
  2,             /* META_OPTIONS */
227
  1,             /* META_POSIX */
228
  1,             /* META_POSIX_NEG */
229
  0,             /* META_RANGE_ESCAPED */
230
  0,             /* META_RANGE_LITERAL */
231
  SIZEOFFSET,    /* META_RECURSE */
232
  1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
233
  0,             /* META_SCRIPT_RUN */
234
  0,             /* META_LOOKAHEAD */
235
  0,             /* META_LOOKAHEADNOT */
236
  SIZEOFFSET,    /* META_LOOKBEHIND */
237
  SIZEOFFSET,    /* META_LOOKBEHINDNOT */
238
  0,             /* META_LOOKAHEAD_NA */
239
  SIZEOFFSET,    /* META_LOOKBEHIND_NA */
240
  1,             /* META_MARK - plus the string length */
241
  0,             /* META_ACCEPT */
242
  0,             /* META_FAIL */
243
  0,             /* META_COMMIT */
244
  1,             /* META_COMMIT_ARG - plus the string length */
245
  0,             /* META_PRUNE */
246
  1,             /* META_PRUNE_ARG - plus the string length */
247
  0,             /* META_SKIP */
248
  1,             /* META_SKIP_ARG - plus the string length */
249
  0,             /* META_THEN */
250
  1,             /* META_THEN_ARG - plus the string length */
251
  0,             /* META_ASTERISK */
252
  0,             /* META_ASTERISK_PLUS */
253
  0,             /* META_ASTERISK_QUERY */
254
  0,             /* META_PLUS */
255
  0,             /* META_PLUS_PLUS */
256
  0,             /* META_PLUS_QUERY */
257
  0,             /* META_QUERY */
258
  0,             /* META_QUERY_PLUS */
259
  0,             /* META_QUERY_QUERY */
260
  2,             /* META_MINMAX */
261
  2,             /* META_MINMAX_PLUS */
262
  2,             /* META_MINMAX_QUERY */
263
  0,             /* META_ECLASS_AND */
264
  0,             /* META_ECLASS_OR */
265
  0,             /* META_ECLASS_SUB */
266
  0,             /* META_ECLASS_XOR */
267
  0              /* META_ECLASS_NOT */
268
};
269

270
/* Types for skipping parts of a parsed pattern. */
271

272
enum {
  PSKIP_ALT,    /* value 0 */
  PSKIP_CLASS,  /* value 1 */
  PSKIP_KET     /* value 2 */
};
273

274
/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
275
variables, which are concerned with first and required code units. A value
276
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
277
matching xxcu variable is set, and the low valued bits are relevant. */
278

279
/* REQ_UNSET and REQ_NONE are whole-word sentinels at the top of the unsigned
range; REQ_CASELESS and REQ_VARY are individual low-order flag bits. */

#define REQ_UNSET     0xffffffffu  /* Not yet found anything */
#define REQ_NONE      0xfffffffeu  /* Found not fixed character */
#define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
#define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */
283

284
/* These flags are used in the groupinfo vector. */
285

286
/* Two flag bits in the top of the word, and a mask for the low 16 bits. The
three values do not overlap. */

#define GI_SET_FIXED_LENGTH    0x80000000u
#define GI_NOT_FIXED_LENGTH    0x40000000u
#define GI_FIXED_LENGTH_MASK   0x0000ffffu
289

290
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
291
and is fast (a good compiler can turn it into a subtraction and unsigned
292
comparison). */
293

294
#define IS_DIGIT(x) (!((x) < CHAR_0 || (x) > CHAR_9))
295

296
/* Table to identify hex digits. The tables in chartables are dependent on the
297
locale, and may mark arbitrary characters as digits. We want to recognize only
298
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
299
costs 256 bytes, but it is a lot faster than doing character value tests (at
300
least in some simple cases I timed), and in some applications one wants PCRE2
301
to compile efficiently as well as match efficiently. The value in the table is
302
the binary hex digit value, or 0xff for non-hex digits. */
303

304
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
305
UTF-8 mode. */
306

307
/* xdigitab has 256 entries, one per 8-bit value: the entry is 0x00-0x0f for
the digits 0-9 and the letters a-f/A-F, and 0xff for everything else. One of
two layouts is selected by the EBCDIC macro. */

#ifndef EBCDIC
static const uint8_t xdigitab[] =
  {
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */

#else

/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */

static const uint8_t xdigitab[] =
  {
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
#endif  /* EBCDIC */
382

383

384
/* Table for handling alphanumeric escaped characters. Positive returns are
385
simple data values; negative values are for special things like \d and so on.
386
Zero means further processing is needed (for things like \x), or the escape is
387
invalid. */
388

389
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
390
in UTF-8 mode. It runs from '0' to 'z'. */
391

392
#ifndef EBCDIC
393
#define ESCAPES_FIRST       CHAR_0
394
#define ESCAPES_LAST        CHAR_z
395
#define UPPER_CASE(c)       (c-32)
396

397
static const short int escapes[] = {
398
    /* 0 */ 0,                       /* 1 */ 0,
399
    /* 2 */ 0,                       /* 3 */ 0,
400
    /* 4 */ 0,                       /* 5 */ 0,
401
    /* 6 */ 0,                       /* 7 */ 0,
402
    /* 8 */ 0,                       /* 9 */ 0,
403
    /* : */ CHAR_COLON,              /* ; */ CHAR_SEMICOLON,
404
    /* < */ CHAR_LESS_THAN_SIGN,     /* = */ CHAR_EQUALS_SIGN,
405
    /* > */ CHAR_GREATER_THAN_SIGN,  /* ? */ CHAR_QUESTION_MARK,
406
    /* @ */ CHAR_COMMERCIAL_AT,      /* A */ -ESC_A,
407
    /* B */ -ESC_B,                  /* C */ -ESC_C,
408
    /* D */ -ESC_D,                  /* E */ -ESC_E,
409
    /* F */ 0,                       /* G */ -ESC_G,
410
    /* H */ -ESC_H,                  /* I */ 0,
411
    /* J */ 0,                       /* K */ -ESC_K,
412
    /* L */ 0,                       /* M */ 0,
413
    /* N */ -ESC_N,                  /* O */ 0,
414
    /* P */ -ESC_P,                  /* Q */ -ESC_Q,
415
    /* R */ -ESC_R,                  /* S */ -ESC_S,
416
    /* T */ 0,                       /* U */ 0,
417
    /* V */ -ESC_V,                  /* W */ -ESC_W,
418
    /* X */ -ESC_X,                  /* Y */ 0,
419
    /* Z */ -ESC_Z,                  /* [ */ CHAR_LEFT_SQUARE_BRACKET,
420
    /* \ */ CHAR_BACKSLASH,          /* ] */ CHAR_RIGHT_SQUARE_BRACKET,
421
    /* ^ */ CHAR_CIRCUMFLEX_ACCENT,  /* _ */ CHAR_UNDERSCORE,
422
    /* ` */ CHAR_GRAVE_ACCENT,       /* a */ CHAR_BEL,
423
    /* b */ -ESC_b,                  /* c */ 0,
424
    /* d */ -ESC_d,                  /* e */ CHAR_ESC,
425
    /* f */ CHAR_FF,                 /* g */ 0,
426
    /* h */ -ESC_h,                  /* i */ 0,
427
    /* j */ 0,                       /* k */ -ESC_k,
428
    /* l */ 0,                       /* m */ 0,
429
    /* n */ CHAR_LF,                 /* o */ 0,
430
    /* p */ -ESC_p,                  /* q */ 0,
431
    /* r */ CHAR_CR,                 /* s */ -ESC_s,
432
    /* t */ CHAR_HT,                 /* u */ 0,
433
    /* v */ -ESC_v,                  /* w */ -ESC_w,
434
    /* x */ 0,                       /* y */ 0,
435
    /* z */ -ESC_z
436
};
437

438
#else
439

440
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
441
It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
442
is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
443
because it is defined as 'a', which of course picks up the ASCII value. */
444

445
#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
446
#define ESCAPES_FIRST       CHAR_a
447
#define ESCAPES_LAST        CHAR_9
448
#define UPPER_CASE(c)       (c+64)
449
#else                              /* Testing in an ASCII environment */
450
#define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
451
#define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
452
#define UPPER_CASE(c)  (c-32)
453
#endif
454

455
static const short int escapes[] = {
456
/*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
457
/*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
458
/*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
459
/*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
460
/*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
461
/*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
462
/*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
463
/*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
464
/*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
465
/*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
466
/*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
467
/*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
468
/*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
469
/*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
470
/*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
471
/*  F8 */      0,        0
472
};
473

474
/* We also need a table of characters that may follow \c in an EBCDIC
475
environment for characters 0-31. */
476

477
static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
478

479
#endif   /* EBCDIC */
480

481

482
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
483
searched linearly. Put all the names into a single string, in order to reduce
484
the number of relocations when a shared library is dynamically linked. The
485
string is built from string macros so that it works in UTF-8 mode on EBCDIC
486
platforms. */
487

488
/* One entry of the verbs[] table. The sign of has_arg encodes the argument
rule: > 0 means an argument is required, < 0 means an optional argument that is
converted to a preceding MARK, and 0 means an optional argument that bumps the
META code when present. */

typedef struct verbitem {
  unsigned int len;          /* Length of verb name */
  uint32_t meta;             /* Base META_ code */
  int has_arg;               /* Argument requirement */
} verbitem;
493

494
static const char verbnames[] =
495
  "\0"                       /* Empty name is a shorthand for MARK */
496
  STRING_MARK0
497
  STRING_ACCEPT0
498
  STRING_F0
499
  STRING_FAIL0
500
  STRING_COMMIT0
501
  STRING_PRUNE0
502
  STRING_SKIP0
503
  STRING_THEN;
504

505
static const verbitem verbs[] = {
506
  { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
507
  { 4, META_MARK,   +1 },
508
  { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
509
  { 1, META_FAIL,   -1 },
510
  { 4, META_FAIL,   -1 },
511
  { 6, META_COMMIT,  0 },
512
  { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
513
  { 4, META_SKIP,    0 },
514
  { 4, META_THEN,    0 }
515
};
516

517
static const int verbcount = sizeof(verbs)/sizeof(verbitem);
518

519
/* Verb opcodes, indexed by their META code offset from META_MARK. */
520

521
static const uint32_t verbops[] = {
522
  OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
523
  OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
524

525
/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
526

527
/* One entry of the alasmeta[] table: the length of an alpha assertion name and
the META code it maps to. */

typedef struct alasitem {
  unsigned int len;          /* Length of name */
  uint32_t meta;             /* Base META_ code */
} alasitem;
531

532
static const char alasnames[] =
533
  STRING_pla0
534
  STRING_plb0
535
  STRING_napla0
536
  STRING_naplb0
537
  STRING_nla0
538
  STRING_nlb0
539
  STRING_positive_lookahead0
540
  STRING_positive_lookbehind0
541
  STRING_non_atomic_positive_lookahead0
542
  STRING_non_atomic_positive_lookbehind0
543
  STRING_negative_lookahead0
544
  STRING_negative_lookbehind0
545
  STRING_scs0
546
  STRING_scan_substring0
547
  STRING_atomic0
548
  STRING_sr0
549
  STRING_asr0
550
  STRING_script_run0
551
  STRING_atomic_script_run;
552

553
static const alasitem alasmeta[] = {
554
  {  3, META_LOOKAHEAD         },
555
  {  3, META_LOOKBEHIND        },
556
  {  5, META_LOOKAHEAD_NA      },
557
  {  5, META_LOOKBEHIND_NA     },
558
  {  3, META_LOOKAHEADNOT      },
559
  {  3, META_LOOKBEHINDNOT     },
560
  { 18, META_LOOKAHEAD         },
561
  { 19, META_LOOKBEHIND        },
562
  { 29, META_LOOKAHEAD_NA      },
563
  { 30, META_LOOKBEHIND_NA     },
564
  { 18, META_LOOKAHEADNOT      },
565
  { 19, META_LOOKBEHINDNOT     },
566
  {  3, META_SCS               },
567
  { 14, META_SCS               },
568
  {  6, META_ATOMIC            },
569
  {  2, META_SCRIPT_RUN        }, /* sr = script run */
570
  {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
571
  { 10, META_SCRIPT_RUN        }, /* script run */
572
  { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
573
};
574

575
static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
576

577
/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
578

579
static uint32_t chartypeoffset[] = {
580
  OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
581
  OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
582

583
/* Tables of names of POSIX character classes and their lengths. The names are
584
now all in a single string, to reduce the number of relocations when a shared
585
library is dynamically loaded. The list of lengths is terminated by a zero
586
length entry. The first three must be alpha, lower, upper, as this is assumed
587
for handling case independence.
588

589
The indices for several classes are stored in pcre2_compile.h - these must
590
be kept in sync with posix_names, posix_name_lengths, posix_class_maps,
591
and posix_substitutes. */
592

593
static const char posix_names[] =
594
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
595
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
596
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
597
  STRING_word0  STRING_xdigit;
598

599
/* Name lengths in posix_names order: twelve 5-letter names, then "word" (4)
and "xdigit" (6). The final 0 terminates the list. */

static const uint8_t posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
601

602
/* Table of class bit maps for each POSIX class. Each class is formed from a
603
base map, with an optional addition or removal of another map. Then, for some
604
classes, there is some additional tweaking: for [:blank:] the vertical space
605
characters are removed, and for [:alpha:] and [:alnum:] the underscore
606
character is removed. The triples in the table consist of the base map offset,
607
second map offset or -1 if no second map, and a non-negative value for map
608
addition or a negative value for map subtraction (if there are two maps). The
609
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
610
remove vertical space characters, 2 => remove underscore. */
611

612
const int PRIV(posix_class_maps)[] = {
613
  cbit_word,   cbit_digit, -2,            /* alpha */
614
  cbit_lower,  -1,          0,            /* lower */
615
  cbit_upper,  -1,          0,            /* upper */
616
  cbit_word,   -1,          2,            /* alnum - word without underscore */
617
  cbit_print,  cbit_cntrl,  0,            /* ascii */
618
  cbit_space,  -1,          1,            /* blank - a GNU extension */
619
  cbit_cntrl,  -1,          0,            /* cntrl */
620
  cbit_digit,  -1,          0,            /* digit */
621
  cbit_graph,  -1,          0,            /* graph */
622
  cbit_print,  -1,          0,            /* print */
623
  cbit_punct,  -1,          0,            /* punct */
624
  cbit_space,  -1,          0,            /* space */
625
  cbit_word,   -1,          0,            /* word - a Perl extension */
626
  cbit_xdigit, -1,          0             /* xdigit */
627
};
628

629
#ifdef SUPPORT_UNICODE

/* The POSIX class Unicode property substitutes that are used in UCP mode must
be in the order of the POSIX class names, defined above. Each class has a pair
of ints. The table is a read-only lookup, hence const.
NOTE(review): const assumes nothing elsewhere in the file writes to the table
- confirm. */

static const int posix_substitutes[] = {
  PT_GC, ucp_L,     /* alpha */
  PT_PC, ucp_Ll,    /* lower */
  PT_PC, ucp_Lu,    /* upper */
  PT_ALNUM, 0,      /* alnum */
  -1, 0,            /* ascii, treat as non-UCP */
  -1, 1,            /* blank, treat as \h */
  PT_PC, ucp_Cc,    /* cntrl */
  PT_PC, ucp_Nd,    /* digit */
  PT_PXGRAPH, 0,    /* graph */
  PT_PXPRINT, 0,    /* print */
  PT_PXPUNCT, 0,    /* punct */
  PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
  PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
  PT_PXXDIGIT, 0    /* xdigit */  /* Perl has additional hex digits */
};
#endif  /* SUPPORT_UNICODE */
651

652
/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
653
are allowed. */
654

655
/* Each full mask is built as a superset of the corresponding LITERAL mask, so
a bit added to a LITERAL mask is automatically part of the full mask too. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \
   PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
   PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_ALT_EXTENDED_CLASS)

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD| \
    PCRE2_EXTRA_CASELESS_RESTRICT|PCRE2_EXTRA_TURKISH_CASING)

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
    PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \
    PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \
    PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_PYTHON_OCTAL|PCRE2_EXTRA_NO_BS0| \
    PCRE2_EXTRA_NEVER_CALLOUT)
681

682
/* What follows describes start-of-pattern options such as (*UTF) and settings
such as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
generic and always supported.

These enumerators classify each start-of-pattern item: they say how the value
associated with the item is to be interpreted. */

enum { PSO_OPT,     /* Value is an option bit */
       PSO_XOPT,    /* Value is an xoption bit */
       PSO_FLG,     /* Value is a flag bit */
       PSO_NL,      /* Value is a newline type */
       PSO_BSR,     /* Value is a \R type */
       PSO_LIMH,    /* Read integer value for heap limit */
       PSO_LIMM,    /* Read integer value for match limit */
       PSO_LIMD,    /* Read integer value for depth limit */
       PSO_OPTMZ    /* Value is an optimization bit */
     };
697

698
/* One entry in the table of start-of-pattern items. */

typedef struct pso {
  const char *name;   /* Text of the item, e.g. "UTF)" or "LIMIT_HEAP=" */
  uint16_t length;    /* Number of characters in name */
  uint16_t type;      /* One of the PSO_xxx values */
  uint32_t value;     /* Bit or type to apply; 0 when a number is read instead */
} pso;
704

705
/* NB: STRING_UTFn_RIGHTPAR contains the length as well */
706

707
static const pso pso_list[] = {
708
  { STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
709
  { STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
710
  { STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
711
  { STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
712
  { STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
713
  { STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPTMZ, PCRE2_OPTIM_AUTO_POSSESS },
714
  { STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPTMZ, PCRE2_OPTIM_DOTSTAR_ANCHOR },
715
  { STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
716
  { STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPTMZ, PCRE2_OPTIM_START_OPTIMIZE },
717
  { STRING_CASELESS_RESTRICT_RIGHTPAR, 18, PSO_XOPT, PCRE2_EXTRA_CASELESS_RESTRICT },
718
  { STRING_TURKISH_CASING_RIGHTPAR,    15, PSO_XOPT, PCRE2_EXTRA_TURKISH_CASING },
719
  { STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
720
  { STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
721
  { STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
722
  { STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
723
  { STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
724
  { STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
725
  { STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
726
  { STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
727
  { STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
728
  { STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
729
  { STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
730
  { STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
731
};
732

733
/* This table is used when converting repeating opcodes into possessified
734
versions as a result of an explicit possessive quantifier such as ++. A zero
735
value means there is no possessified version - in those cases the item in
736
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
737
because all relevant opcodes are less than that. */
738

739
static const uint8_t opcode_possessify[] = {
740
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
741
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
742

743
  0,                       /* NOTI */
744
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
745
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
746
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
747
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
748
  0,                       /* EXACT */
749
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
750

751
  OP_POSSTARI, 0,          /* STARI, MINSTARI */
752
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
753
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
754
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
755
  0,                       /* EXACTI */
756
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
757

758
  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
759
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
760
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
761
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
762
  0,                       /* NOTEXACT */
763
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
764

765
  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
766
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
767
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
768
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
769
  0,                       /* NOTEXACTI */
770
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
771

772
  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
773
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
774
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
775
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
776
  0,                       /* TYPEEXACT */
777
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
778

779
  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
780
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
781
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
782
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
783
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
784

785
  0, 0, 0, 0,              /* CLASS, NCLASS, XCLASS, ECLASS */
786
  0, 0,                    /* REF, REFI */
787
  0, 0,                    /* DNREF, DNREFI */
788
  0, 0,                    /* RECURSE, CALLOUT */
789
};
790

791
/* Compile-time check that the table has the correct size. */
792
STATIC_ASSERT(sizeof(opcode_possessify) == OP_CALLOUT+1, opcode_possessify);
793

794

795
#ifdef DEBUG_SHOW_PARSED
/*************************************************
*     Show the parsed pattern for debugging      *
*************************************************/

/* For debugging the pre-scan, this code, which outputs the parsed data vector,
can be enabled. It walks the vector from cb->parsed_pattern, writing one line
per item to stderr. Each line starts with the item's index in the vector and
its raw value in hex, followed by a readable rendering. The walk stops after
META_END has been shown, or as soon as an unrecognized META code is met.

Values of type PCRE2_SIZE are printed with %zu and uint32_t values with %u so
that the conversion specifiers match the (unsigned) argument types.

Argument:  cb   the compile block; only parsed_pattern and small_ref_offset
                are read
Returns:   nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Values below META_END are literal characters; show the printable ones. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr);
    pptr++;
    }

  /* Otherwise it is a META item. Note that pptr has already been advanced
  past the META word itself when the cases below run, so each case consumes
  only the item's additional data words. */

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* For group numbers below 10 the offset is taken from the compile block
    instead of being read from the vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? CHAR_P:CHAR_p,
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    /* The three lookbehind items are followed by two data words, of which
    only the first is shown. */

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Here the offset comes first in the vector and the number follows it,
    so the number is fetched by indexing past the offset, and is skipped
    after the offset has been read. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_OFFSET:
    fprintf(stderr, "META_OFFSET offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_SCS:
    fprintf(stderr, "META (*scan_substring:");
    break;

    case META_SCS_NAME:
    fprintf(stderr, "META_SCS_NAME length=%u relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    case META_SCS_NUMBER:
    fprintf(stderr, "META_SCS_NUMBER %u relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    /* The verbs that take an argument share the code at SHOWARG, which prints
    the argument: a length word followed by that many character words. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;

    case META_ECLASS_AND: fprintf(stderr, "META_ECLASS_AND"); break;
    case META_ECLASS_OR: fprintf(stderr, "META_ECLASS_OR"); break;
    case META_ECLASS_SUB: fprintf(stderr, "META_ECLASS_SUB"); break;
    case META_ECLASS_XOR: fprintf(stderr, "META_ECLASS_XOR"); break;
    case META_ECLASS_NOT: fprintf(stderr, "META_ECLASS_NOT"); break;
    }
  fprintf(stderr, "\n");
  }
return;
}
#endif  /* DEBUG_SHOW_PARSED */
1088

1089

1090

1091
/*************************************************
1092
*               Copy compiled code               *
1093
*************************************************/
1094

1095
/* Compiled JIT code cannot be copied, so the new compiled block has no
1096
associated JIT data. */
1097

1098
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1099
pcre2_code_copy(const pcre2_code *code)
1100
{
1101
PCRE2_SIZE *ref_count;
1102
pcre2_code *newcode;
1103

1104
if (code == NULL) return NULL;
1105
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1106
if (newcode == NULL) return NULL;
1107
memcpy(newcode, code, code->blocksize);
1108
newcode->executable_jit = NULL;
1109

1110
/* If the code is one that has been deserialized, increment the reference count
1111
in the decoded tables. */
1112

1113
if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1114
  {
1115
  ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1116
  (*ref_count)++;
1117
  }
1118

1119
return newcode;
1120
}
1121

1122

1123

1124
/*************************************************
1125
*     Copy compiled code and character tables    *
1126
*************************************************/
1127

1128
/* Compiled JIT code cannot be copied, so the new compiled block has no
1129
associated JIT data. This version of code_copy also makes a separate copy of
1130
the character tables. */
1131

1132
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1133
pcre2_code_copy_with_tables(const pcre2_code *code)
1134
{
1135
PCRE2_SIZE* ref_count;
1136
pcre2_code *newcode;
1137
uint8_t *newtables;
1138

1139
if (code == NULL) return NULL;
1140
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1141
if (newcode == NULL) return NULL;
1142
memcpy(newcode, code, code->blocksize);
1143
newcode->executable_jit = NULL;
1144

1145
newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1146
  code->memctl.memory_data);
1147
if (newtables == NULL)
1148
  {
1149
  code->memctl.free((void *)newcode, code->memctl.memory_data);
1150
  return NULL;
1151
  }
1152
memcpy(newtables, code->tables, TABLES_LENGTH);
1153
ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1154
*ref_count = 1;
1155

1156
newcode->tables = newtables;
1157
newcode->flags |= PCRE2_DEREF_TABLES;
1158
return newcode;
1159
}
1160

1161

1162

1163
/*************************************************
1164
*               Free compiled code               *
1165
*************************************************/
1166

1167
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1168
pcre2_code_free(pcre2_code *code)
1169
{
1170
PCRE2_SIZE* ref_count;
1171

1172
if (code != NULL)
1173
  {
1174
#ifdef SUPPORT_JIT
1175
  if (code->executable_jit != NULL)
1176
    PRIV(jit_free)(code->executable_jit, &code->memctl);
1177
#endif
1178

1179
  if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1180
    {
1181
    /* Decoded tables belong to the codes after deserialization, and they must
1182
    be freed when there are no more references to them. The *ref_count should
1183
    always be > 0. */
1184

1185
    ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1186
    if (*ref_count > 0)
1187
      {
1188
      (*ref_count)--;
1189
      if (*ref_count == 0)
1190
        code->memctl.free((void *)code->tables, code->memctl.memory_data);
1191
      }
1192
    }
1193

1194
  code->memctl.free(code, code->memctl.memory_data);
1195
  }
1196
}
1197

1198

1199

1200
/*************************************************
1201
*         Read a number, possibly signed         *
1202
*************************************************/
1203

1204
/* This function is used to read numbers in the pattern. The initial pointer
1205
must be at the sign or first digit of the number. When relative values
1206
(introduced by + or -) are allowed, they are relative group numbers, and the
1207
result must be greater than zero.
1208

1209
Arguments:
1210
  ptrptr      points to the character pointer variable
1211
  ptrend      points to the end of the input string
1212
  allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1213
  max_value   the largest number allowed;
1214
              you must not pass a value for max_value larger than
1215
              INT_MAX/10 - 1 because this function relies on max_value to
1216
              avoid integer overflow
1217
  max_error   the error to give for an over-large number
1218
  intptr      where to put the result
1219
  errcodeptr  where to put an error code
1220

1221
Returns:      TRUE  - a number was read
1222
              FALSE - errorcode == 0 => no number was found
1223
                      errorcode != 0 => an error occurred
1224
*/
1225

1226
static BOOL
1227
read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1228
  uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1229
{
1230
int sign = 0;
1231
uint32_t n = 0;
1232
PCRE2_SPTR ptr = *ptrptr;
1233
BOOL yield = FALSE;
1234

1235
PCRE2_ASSERT(max_value <= INT_MAX/10 - 1);
1236

1237
*errorcodeptr = 0;
1238

1239
if (allow_sign >= 0 && ptr < ptrend)
1240
  {
1241
  if (*ptr == CHAR_PLUS)
1242
    {
1243
    sign = +1;
1244
    max_value -= allow_sign;
1245
    ptr++;
1246
    }
1247
  else if (*ptr == CHAR_MINUS)
1248
    {
1249
    sign = -1;
1250
    ptr++;
1251
    }
1252
  }
1253

1254
if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1255
while (ptr < ptrend && IS_DIGIT(*ptr))
1256
  {
1257
  n = n * 10 + (*ptr++ - CHAR_0);
1258
  if (n > max_value)
1259
    {
1260
    *errorcodeptr = max_error;
1261
    while (ptr < ptrend && IS_DIGIT(*ptr)) ptr++;
1262
    goto EXIT;
1263
    }
1264
  }
1265

1266
if (allow_sign >= 0 && sign != 0)
1267
  {
1268
  if (n == 0)
1269
    {
1270
    *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1271
    goto EXIT;
1272
    }
1273

1274
  if (sign > 0) n += allow_sign;
1275
  else if (n > (uint32_t)allow_sign)
1276
    {
1277
    *errorcodeptr = ERR15;  /* Non-existent subpattern */
1278
    goto EXIT;
1279
    }
1280
  else n = allow_sign + 1 - n;
1281
  }
1282

1283
yield = TRUE;
1284

1285
EXIT:
1286
*intptr = n;
1287
*ptrptr = ptr;
1288
return yield;
1289
}
1290

1291

1292

1293
/*************************************************
1294
*         Read repeat counts                     *
1295
*************************************************/
1296

1297
/* Read an item of the form {n,m} and return the values when non-NULL pointers
1298
are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1299
larger value is used for "unlimited". We have to use signed arguments for
1300
read_number() because it is capable of returning a signed value. As of Perl
1301
5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1302
tabs after { and before } and between the numbers and the comma, so we do too.
1303

1304
Arguments:
1305
  ptrptr         points to pointer to character after '{'
1306
  ptrend         pointer to end of input
1307
  minp           if not NULL, pointer to int for min
1308
  maxp           if not NULL, pointer to int for max
1309
  errorcodeptr   points to error code variable
1310

1311
Returns:         FALSE if not a repeat quantifier, errorcode set zero
1312
                 FALSE on error, with errorcode set non-zero
1313
                 TRUE on success, with pointer updated to point after '}'
1314
*/
1315

1316
static BOOL
1317
read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1318
  uint32_t *maxp, int *errorcodeptr)
1319
{
1320
PCRE2_SPTR p = *ptrptr;
1321
PCRE2_SPTR pp;
1322
BOOL yield = FALSE;
1323
BOOL had_minimum = FALSE;
1324
int32_t min = 0;
1325
int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1326

1327
*errorcodeptr = 0;
1328
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1329

1330
/* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1331
such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1332
error. */
1333

1334
pp = p;
1335
if (pp < ptrend && IS_DIGIT(*pp))
1336
  {
1337
  had_minimum = TRUE;
1338
  while (++pp < ptrend && IS_DIGIT(*pp)) {}
1339
  }
1340

1341
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1342
if (pp >= ptrend) return FALSE;
1343

1344
if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1345
  {
1346
  if (!had_minimum) return FALSE;
1347
  }
1348
else
1349
  {
1350
  if (*pp++ != CHAR_COMMA) return FALSE;
1351
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1352
  if (pp >= ptrend) return FALSE;
1353
  if (IS_DIGIT(*pp))
1354
    {
1355
    while (++pp < ptrend && IS_DIGIT(*pp)) {}
1356
    }
1357
  else if (!had_minimum) return FALSE;
1358
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1359
  if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1360
  }
1361

1362
/* Now process the quantifier for real. We know it must be {n} or {n,} or {,m}
1363
or {n,m}. The only error that read_number() can return is for a number that is
1364
too big. If *errorcodeptr is returned as zero it means no number was found. */
1365

1366
/* Deal with {,m} or n too big. If we successfully read m there is no need to
1367
check m >= n because n defaults to zero. */
1368

1369
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1370
  {
1371
  if (*errorcodeptr != 0) goto EXIT;    /* n too big */
1372
  p++;  /* Skip comma and subsequent spaces */
1373
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1374
  if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1375
    {
1376
    if (*errorcodeptr != 0) goto EXIT;  /* m too big */
1377
    }
1378
  }
1379

1380
/* Have read one number. Deal with {n} or {n,} or {n,m} */
1381

1382
else
1383
  {
1384
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1385
  if (*p == CHAR_RIGHT_CURLY_BRACKET)
1386
    {
1387
    max = min;
1388
    }
1389
  else   /* Handle {n,} or {n,m} */
1390
    {
1391
    p++;    /* Skip comma and subsequent spaces */
1392
    while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1393
    if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1394
      {
1395
      if (*errorcodeptr != 0) goto EXIT;   /* m too big */
1396
      }
1397

1398
    if (max < min)
1399
      {
1400
      *errorcodeptr = ERR4;
1401
      goto EXIT;
1402
      }
1403
    }
1404
  }
1405

1406
/* Valid quantifier exists */
1407

1408
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1409
p++;
1410
yield = TRUE;
1411
if (minp != NULL) *minp = (uint32_t)min;
1412
if (maxp != NULL) *maxp = (uint32_t)max;
1413

1414
/* Update the pattern pointer */
1415

1416
EXIT:
1417
*ptrptr = p;
1418
return yield;
1419
}
1420

1421

1422

1423
/*************************************************
1424
*            Handle escapes                      *
1425
*************************************************/
1426

1427
/* This function is called when a \ has been encountered. It either returns a
1428
positive value for a simple escape such as \d, or 0 for a data character, which
1429
is placed in chptr. A backreference to group n is returned as -(n+1). On
1430
entry, ptr is pointing at the character after \. On exit, it points after the
1431
final code unit of the escape sequence.
1432

1433
This function is also called from pcre2_substitute() to handle escape sequences
1434
in replacement strings. In this case, the cb argument is NULL, and in the case
1435
of escapes that have further processing, only sequences that define a data
1436
character are recognised. The options argument is the final value of the
1437
compiled pattern's options.
1438

1439
Arguments:
1440
  ptrptr         points to the input position pointer
1441
  ptrend         points to the end of the input
1442
  chptr          points to a returned data character
1443
  errorcodeptr   points to the errorcode variable (containing zero)
1444
  options        the current options bits
1445
  xoptions       the current extra options bits
1446
  bracount       the number of capturing parentheses encountered so far
1447
  isclass        TRUE if in a character class
1448
  cb             compile data block or NULL when called from pcre2_substitute()
1449

1450
Returns:         zero => a data character
1451
                 positive => a special escape sequence
1452
                 negative => a numerical back reference
1453
                 on error, errorcodeptr is set non-zero
1454
*/
1455

1456
int
1457
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1458
  int *errorcodeptr, uint32_t options, uint32_t xoptions, uint32_t bracount,
1459
  BOOL isclass, compile_block *cb)
1460
{
1461
BOOL utf = (options & PCRE2_UTF) != 0;
1462
BOOL alt_bsux =
1463
  ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1464
PCRE2_SPTR ptr = *ptrptr;
1465
uint32_t c, cc;
1466
int escape = 0;
1467
int i;
1468

1469
/* If backslash is at the end of the string, it's an error. */
1470

1471
if (ptr >= ptrend)
1472
  {
1473
  *errorcodeptr = ERR1;
1474
  return 0;
1475
  }
1476

1477
GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1478
*errorcodeptr = 0;              /* Be optimistic */
1479

1480
/* Non-alphanumerics are literals, so we just leave the value in c. An initial
1481
value test saves a memory lookup for code points outside the alphanumeric
1482
range. */
1483

1484
if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1485

1486
/* Otherwise, do a table lookup. Non-zero values need little processing here. A
1487
positive value is a literal value for something like \n. A negative value is
1488
the negation of one of the ESC_ macros that is passed back for handling by the
1489
calling function. Some extra checking is needed for \N because only \N{U+dddd}
1490
is supported. If the value is zero, further processing is handled below. */
1491

1492
else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1493
  {
1494
  if (i > 0)
1495
    {
1496
    c = (uint32_t)i;
1497
    if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1498
      c = CHAR_LF;
1499
    }
1500
  else  /* Negative table entry */
1501
    {
1502
    escape = -i;                    /* Else return a special escape */
1503
    if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1504
      cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1505

1506
    /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1507
    Unicode code points, as well as plain \N for "not newline". PCRE does not
1508
    support \N{name}. However, it does support quantification such as \N{2,3},
1509
    so if \N{ is not followed by U+dddd we check for a quantifier. */
1510

1511
    if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1512
      {
1513
      PCRE2_SPTR p = ptr + 1;
1514

1515
      /* Perl ignores spaces and tabs after { */
1516

1517
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1518

1519
      /* \N{U+ can be handled by the \x{ code. However, this construction is
1520
      not valid in EBCDIC environments because it specifies a Unicode
1521
      character, not a codepoint in the local code. For example \N{U+0041}
1522
      must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1523
      casing semantics for the entire pattern, so allow it only in UTF (i.e.
1524
      Unicode) mode. */
1525

1526
      if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1527
        {
1528
#ifndef EBCDIC
1529
        if (utf)
1530
          {
1531
          ptr = p + 2;
1532
          escape = 0;   /* Not a fancy escape after all */
1533
          goto COME_FROM_NU;
1534
          }
1535
#endif
1536
        *errorcodeptr = ERR93;
1537
        }
1538

1539
      /* Give an error in contexts where quantifiers are not allowed
1540
      (character classes; substitution strings). */
1541

1542
      else if (isclass || cb == NULL)
1543
        {
1544
        *errorcodeptr = ERR37;
1545
        }
1546

1547
      /* Give an error if what follows is not a quantifier, but don't override
1548
      an error set by the quantifier reader (e.g. number overflow). */
1549

1550
      else
1551
        {
1552
        if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1553
             *errorcodeptr == 0)
1554
          *errorcodeptr = ERR37;
1555
        }
1556
      }
1557
    }
1558
  }
1559

1560
/* Escapes that need further processing, including those that are unknown, have
1561
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1562
\o, and \x are recognized (\u and \U can never appear as they are used for case
1563
forcing). */
1564

1565
else
1566
  {
1567
  int s;
1568
  PCRE2_SPTR oldptr;
1569
  BOOL overflow;
1570

1571
  /* Filter calls from pcre2_substitute(). */
1572

1573
  if (cb == NULL)
1574
    {
1575
    if (c < CHAR_0 ||
1576
       (c > CHAR_9 && (c != CHAR_c && c != CHAR_o && c != CHAR_x && c != CHAR_g)))
1577
      {
1578
      *errorcodeptr = ERR3;
1579
      return 0;
1580
      }
1581
    alt_bsux = FALSE;   /* Do not modify \x handling */
1582
    }
1583

1584
  switch (c)
1585
    {
1586
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
1587
    error. */
1588

1589
    case CHAR_F:
1590
    case CHAR_l:
1591
    case CHAR_L:
1592
    *errorcodeptr = ERR37;
1593
    break;
1594

1595
    /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1596
    is set. Otherwise, \u must be followed by exactly four hex digits or, if
1597
    PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1598
    Otherwise it is a lowercase u letter. This gives some compatibility with
1599
    ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1600
    allowed. When \u{ is not followed by hex digits, a special return is given
1601
    because otherwise \u{ 12} (for example) would be treated as u{12}. */
1602

1603
    case CHAR_u:
1604
    if (!alt_bsux) *errorcodeptr = ERR37; else
1605
      {
1606
      uint32_t xc;
1607

1608
      if (ptr >= ptrend) break;
1609
      if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1610
          (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1611
        {
1612
        PCRE2_SPTR hptr = ptr + 1;
1613

1614
        cc = 0;
1615
        while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1616
          {
1617
          if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1618
            {
1619
            *errorcodeptr = ERR77;
1620
            ptr = hptr;   /* Show where */
1621
            break;        /* *hptr != } will cause another break below */
1622
            }
1623
          cc = (cc << 4) | xc;
1624
          hptr++;
1625
          }
1626

1627
        if (hptr == ptr + 1 ||   /* No hex digits */
1628
            hptr >= ptrend ||    /* Hit end of input */
1629
            *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1630
          {
1631
          if (isclass) break; /* In a class, just treat as '\u' literal */
1632
          escape = ESC_ub;    /* Special return */
1633
          ptr++;              /* Skip { */
1634
          break;              /* Hex escape not recognized */
1635
          }
1636

1637
        c = cc;          /* Accept the code point */
1638
        ptr = hptr + 1;
1639
        }
1640

1641
      else  /* Must be exactly 4 hex digits */
1642
        {
1643
        if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1644
        if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1645
        if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1646
        cc = (cc << 4) | xc;
1647
        if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1648
        cc = (cc << 4) | xc;
1649
        if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1650
        c = (cc << 4) | xc;
1651
        ptr += 4;
1652
        }
1653

1654
      if (utf)
1655
        {
1656
        if (c > 0x10ffffU) *errorcodeptr = ERR77;
1657
        else
1658
          if (c >= 0xd800 && c <= 0xdfff &&
1659
              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1660
                *errorcodeptr = ERR73;
1661
        }
1662
      else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1663
      }
1664
    break;
1665

1666
    /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1667
    in which case it is an upper case letter. */
1668

1669
    case CHAR_U:
1670
    if (!alt_bsux) *errorcodeptr = ERR37;
1671
    break;
1672

1673
    /* In a character class, \g is just a literal "g". Outside a character
1674
    class, \g must be followed by one of a number of specific things:
1675

1676
    (1) A number, either plain or braced. If positive, it is an absolute
1677
    backreference. If negative, it is a relative backreference. This is a Perl
1678
    5.10 feature.
1679

1680
    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1681
    is part of Perl's movement towards a unified syntax for back references. As
1682
    this is synonymous with \k{name}, we fudge it up by pretending it really
1683
    was \k{name}.
1684

1685
    (3) For Oniguruma compatibility we also support \g followed by a name or a
1686
    number either in angle brackets or in single quotes. However, these are
1687
    (possibly recursive) subroutine calls, _not_ backreferences. We return
1688
    the ESC_g code.
1689

1690
    Summary: Return a negative number for a numerical back reference (offset
1691
    by 1), ESC_k for a named back reference, and ESC_g for a named or
1692
    numbered subroutine call.
1693

1694
    The above describes the \g behaviour inside patterns. Inside replacement
1695
    strings (pcre2_substitute) we support only \g<nameornum> for Python
1696
    compatibility. Return ESC_g for the named case, and -(num+1) for the
1697
    numbered case.
1698
    */
1699

1700
    case CHAR_g:
1701
    if (isclass) break;
1702

1703
    if (ptr >= ptrend)
1704
      {
1705
      *errorcodeptr = ERR57;
1706
      break;
1707
      }
1708

1709
    if (cb == NULL)
1710
      {
1711
      PCRE2_SPTR p;
1712
      /* Substitution strings */
1713
      if (*ptr != CHAR_LESS_THAN_SIGN)
1714
        {
1715
        *errorcodeptr = ERR57;
1716
        break;
1717
        }
1718

1719
      p = ptr + 1;
1720

1721
      if (!read_number(&p, ptrend, -1, MAX_GROUP_NUMBER, ERR61, &s,
1722
          errorcodeptr))
1723
        {
1724
        if (*errorcodeptr == 0) escape = ESC_g;  /* No number found */
1725
        break;
1726
        }
1727

1728
      if (p >= ptrend || *p != CHAR_GREATER_THAN_SIGN)
1729
        {
1730
        /* not advancing ptr; report error at the \g character */
1731
        *errorcodeptr = ERR57;
1732
        break;
1733
        }
1734

1735
      /* This is the reason that back references are returned as -(s+1) rather
1736
      than just -s. In a pattern, \0 is not a back reference, but \g<0> is
1737
      valid in a substitution string, so this must be representable. */
1738
      ptr = p + 1;
1739
      escape = -(s+1);
1740
      break;
1741
      }
1742

1743
    if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1744
      {
1745
      escape = ESC_g;
1746
      break;
1747
      }
1748

1749
    /* If there is a brace delimiter, try to read a numerical reference. If
1750
    there isn't one, assume we have a name and treat it as \k. */
1751

1752
    if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1753
      {
1754
      PCRE2_SPTR p = ptr + 1;
1755

1756
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1757
      if (!read_number(&p, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1758
          errorcodeptr))
1759
        {
1760
        if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1761
        break;
1762
        }
1763
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1764

1765
      if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1766
        {
1767
        /* not advancing ptr; report error at the \g character */
1768
        *errorcodeptr = ERR57;
1769
        break;
1770
        }
1771
      ptr = p + 1;
1772
      }
1773

1774
    /* Read an undelimited number */
1775

1776
    else
1777
      {
1778
      if (!read_number(&ptr, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1779
          errorcodeptr))
1780
        {
1781
        if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1782
        break;
1783
        }
1784
      }
1785

1786
    if (s <= 0)
1787
      {
1788
      *errorcodeptr = ERR15;
1789
      break;
1790
      }
1791

1792
    escape = -(s+1);
1793
    break;
1794

1795
    /* The handling of escape sequences consisting of a string of digits
1796
    starting with one that is not zero is not straightforward. Perl has changed
1797
    over the years. Nowadays \g{} for backreferences and \o{} for octal are
1798
    recommended to avoid the ambiguities in the old syntax.
1799

1800
    Outside a character class, the digits are read as a decimal number. If the
1801
    number is less than 10, or if there are that many previous extracting left
1802
    brackets, it is a back reference. Otherwise, up to three octal digits are
1803
    read to form an escaped character code. Thus \123 is likely to be octal 123
1804
    (cf \0123, which is octal 012 followed by the literal 3). This is the "Perl
1805
    style" of handling ambiguous octal/backreferences such as \12.
1806

1807
    There is an alternative disambiguation strategy, selected by
1808
    PCRE2_EXTRA_PYTHON_OCTAL, which follows Python's behaviour. An octal must
1809
    have either a leading zero, or exactly three octal digits; otherwise it's
1810
    a backreference. The disambiguation is stable, and does not depend on how
1811
    many capture groups are defined (it's simply an invalid backreference if
1812
    there is no corresponding capture group). Additionally, octal values above
1813
    \377 (\xff) are rejected.
1814

1815
    Inside a character class, \ followed by a digit is always either a literal
1816
    8 or 9 or an octal number. */
1817

1818
    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1819
    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1820

1821
    if (isclass)
1822
      {
1823
      /* Fall through to octal handling; never a backreference inside a class. */
1824
      }
1825
    else if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0)
1826
      {
1827
      /* Python-style disambiguation. */
1828
      if (ptr[-1] <= CHAR_7 && ptr + 1 < ptrend && ptr[0] >= CHAR_0 &&
1829
          ptr[0] <= CHAR_7 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1830
        {
1831
        /* We peeked a three-digit octal, so fall through */
1832
        }
1833
      else
1834
        {
1835
        /* We are at a digit, so the only possible error from read_number() is
1836
        a number that is too large. */
1837
        ptr--;   /* Back to the digit */
1838

1839
        if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1840
          {
1841
          *errorcodeptr = ERR61;
1842
          break;
1843
          }
1844

1845
        escape = -(s+1);
1846
        break;
1847
        }
1848
      }
1849
    else
1850
      {
1851
      /* Perl-style disambiguation. */
1852
      oldptr = ptr;
1853
      ptr--;   /* Back to the digit */
1854

1855
      /* As we know we are at a digit, the only possible error from
1856
      read_number() is a number that is too large to be a group number. Because
1857
      that number might be still valid if read as an octal, errorcodeptr is not
1858
      set on failure and therefore a sentinel value of INT_MAX is used instead
1859
      of the original value, and will be used later to properly set the error,
1860
      if not falling through. */
1861

1862
      if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1863
        s = INT_MAX;
1864

1865
      /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1866
      are octal escapes if there are not that many previous captures. */
1867

1868
      if (s < 10 || c >= CHAR_8 || (unsigned)s <= bracount)
1869
        {
1870
        /* s > MAX_GROUP_NUMBER should not be possible because of read_number(),
1871
        but we keep it just to be safe and because it will also catch the
1872
        sentinel value that was set on failure by that function. */
1873

1874
        if ((unsigned)s > MAX_GROUP_NUMBER)
1875
          {
1876
          PCRE2_ASSERT(s == INT_MAX);
1877
          *errorcodeptr = ERR61;
1878
          }
1879
        else escape = -(s+1);     /* Indicates a back reference */
1880
        break;
1881
        }
1882

1883
      ptr = oldptr;      /* Put the pointer back and fall through */
1884
      }
1885

1886
    /* Handle a digit following \ when the number is not a back reference, or
1887
    we are within a character class. If the first digit is 8 or 9, Perl used to
1888
    generate a binary zero and then treat the digit as a following literal. At
1889
    least by Perl 5.18 this changed so as not to insert the binary zero. */
1890

1891
    if (c >= CHAR_8) break;
1892

1893
    /* Fall through */
1894

1895
    /* \0 always starts an octal number, but we may drop through to here with a
1896
    larger first octal digit. The original code used just to take the least
1897
    significant 8 bits of octal numbers (I think this is what early Perls used
1898
    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16/32-bit mode,
1899
    but no more than 3 octal digits. */
1900

1901
    case CHAR_0:
1902
    c -= CHAR_0;
1903
    while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1904
        c = c * 8 + *ptr++ - CHAR_0;
1905
    if (c > 0xff)
1906
      {
1907
      if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0) *errorcodeptr = ERR102;
1908
#if PCRE2_CODE_UNIT_WIDTH == 8
1909
      else if (!utf) *errorcodeptr = ERR51;
1910
#endif
1911
      }
1912

1913
    /* PCRE2_EXTRA_NO_BS0 disables the NUL escape '\0' but doesn't affect
1914
    two- or three-character octal escapes \00 and \000, nor \x00. */
1915

1916
    if ((xoptions & PCRE2_EXTRA_NO_BS0) != 0 && c == 0 && i == 1)
1917
        *errorcodeptr = ERR98;
1918
    break;
1919

1920
    /* \o is a relatively new Perl feature, supporting a more general way of
1921
    specifying character codes in octal. The only supported form is \o{ddd},
1922
    with optional spaces or tabs after { and before }. */
1923

1924
    case CHAR_o:
1925
    if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1926
      {
1927
      ptr--;
1928
      *errorcodeptr = ERR55;
1929
      break;
1930
      }
1931

1932
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1933
    if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1934
      {
1935
      *errorcodeptr = ERR78;
1936
      break;
1937
      }
1938

1939
    c = 0;
1940
    overflow = FALSE;
1941
    while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1942
      {
1943
      cc = *ptr++;
1944
      if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1945
#if PCRE2_CODE_UNIT_WIDTH == 32
1946
      if (c >= 0x20000000u) { overflow = TRUE; break; }
1947
#endif
1948
      c = (c << 3) + (cc - CHAR_0);
1949
#if PCRE2_CODE_UNIT_WIDTH == 8
1950
      if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1951
#elif PCRE2_CODE_UNIT_WIDTH == 16
1952
      if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1953
#elif PCRE2_CODE_UNIT_WIDTH == 32
1954
      if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1955
#endif
1956
      }
1957

1958
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1959

1960
    if (overflow)
1961
      {
1962
      while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1963
      *errorcodeptr = ERR34;
1964
      }
1965
    else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1966
      {
1967
      if (utf && c >= 0xd800 && c <= 0xdfff &&
1968
          (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1969
        {
1970
        ptr--;
1971
        *errorcodeptr = ERR73;
1972
        }
1973
      }
1974
    else
1975
      {
1976
      ptr--;
1977
      *errorcodeptr = ERR64;
1978
      }
1979
    break;
1980

1981
    /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1982
    by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1983

1984
    case CHAR_x:
1985
    if (alt_bsux)
1986
      {
1987
      uint32_t xc;
1988
      if (ptrend - ptr < 2) break;               /* Less than 2 characters */
1989
      if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1990
      if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1991
      c = (cc << 4) | xc;
1992
      ptr += 2;
1993
      }
1994

1995
    /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1996
    greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1997
    digits. If not, { used to be treated as a data character. However, Perl
1998
    seems to read hex digits up to the first non-such, and ignore the rest, so
1999
    that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
2000
    now gives an error. */
2001

2002
    else
2003
      {
2004
      if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
2005
        {
2006
        ptr++;
2007
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2008

2009
#ifndef EBCDIC
2010
        COME_FROM_NU:
2011
#endif
2012
        if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
2013
          {
2014
          *errorcodeptr = ERR78;
2015
          break;
2016
          }
2017
        c = 0;
2018
        overflow = FALSE;
2019

2020
        while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2021
          {
2022
          ptr++;
2023
          if (c == 0 && cc == 0) continue;   /* Leading zeroes */
2024
#if PCRE2_CODE_UNIT_WIDTH == 32
2025
          if (c >= 0x10000000l) { overflow = TRUE; break; }
2026
#endif
2027
          c = (c << 4) | cc;
2028
          if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2029
            {
2030
            overflow = TRUE;
2031
            break;
2032
            }
2033
          }
2034

2035
        /* Perl ignores spaces and tabs before } */
2036

2037
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2038

2039
        /* On overflow, skip remaining hex digits */
2040

2041
        if (overflow)
2042
          {
2043
          while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2044
          *errorcodeptr = ERR34;
2045
          }
2046
        else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
2047
          {
2048
          if (utf && c >= 0xd800 && c <= 0xdfff &&
2049
              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2050
            {
2051
            ptr--;
2052
            *errorcodeptr = ERR73;
2053
            }
2054
          }
2055

2056
        /* If the sequence of hex digits (followed by optional space) does not
2057
        end with '}', give an error. We used just to recognize this construct
2058
        and fall through to the normal \x handling, but nowadays Perl gives an
2059
        error, which seems much more sensible, so we do too. */
2060

2061
        else
2062
          {
2063
          ptr--;
2064
          *errorcodeptr = ERR67;
2065
          }
2066
        }   /* End of \x{} processing */
2067

2068
      /* Read up to two hex digits after \x */
2069

2070
      else
2071
        {
2072
        /* Perl has the surprising/broken behaviour that \x without following
2073
        hex digits is treated as an escape for NUL. Their source code laments
2074
        this but keeps it for backwards compatibility. A warning is printed
2075
        when "use warnings" is enabled. Because we don't have warnings, we
2076
        simply forbid it. */
2077
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff)
2078
          {
2079
          /* Not a hex digit */
2080
          *errorcodeptr = ERR78;
2081
          break;
2082
          }
2083
        ptr++;
2084
        c = cc;
2085

2086
        /* With "use re 'strict'" Perl actually requires exactly two digits (error
2087
        for \x, \xA and \xAAA). While \x was already rejected, this seems overly
2088
        strict, and there seems little incentive to align with that, given the
2089
        backwards-compatibility cost.
2090

2091
        For comparison, note that other engines disagree. For example:
2092
          - Java allows 1 or 2 hex digits. Error if 0 digits. No error if >2 digits
2093
          - .NET requires 2 hex digits. Error if 0, 1 digits. No error if >2 digits.
2094
        */
2095
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2096
        ptr++;
2097
        c = (c << 4) | cc;
2098
        }     /* End of \xdd handling */
2099
      }       /* End of Perl-style \x handling */
2100
    break;
2101

2102
    /* The handling of \c is different in ASCII and EBCDIC environments. In an
2103
    ASCII (or Unicode) environment, an error is given if the character
2104
    following \c is not a printable ASCII character. Otherwise, the following
2105
    character is upper-cased if it is a letter, and after that the 0x40 bit is
2106
    flipped. The result is the value of the escape.
2107

2108
    In an EBCDIC environment the handling of \c is compatible with the
2109
    specification in the perlebcdic document. The following character must be
2110
    a letter or one of a small number of special characters. These provide a
2111
    means of defining the character values 0-31.
2112

2113
    For testing the EBCDIC handling of \c in an ASCII environment, recognize
2114
    the EBCDIC value of 'c' explicitly. */
2115

2116
#if defined EBCDIC && 'a' != 0x81
2117
    case 0x83:
2118
#else
2119
    case CHAR_c:
2120
#endif
2121
    if (ptr >= ptrend)
2122
      {
2123
      *errorcodeptr = ERR2;
2124
      break;
2125
      }
2126
    c = *ptr;
2127
    if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2128

2129
    /* Handle \c in an ASCII/Unicode environment. */
2130

2131
#ifndef EBCDIC    /* ASCII/UTF-8 coding */
2132
    if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2133
      {
2134
      *errorcodeptr = ERR68;
2135
      break;
2136
      }
2137
    c ^= 0x40;
2138

2139
    /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2140
    255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2141
    POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2142
    The other valid sequences correspond to a list of specific characters. */
2143

2144
#else
2145
    if (c == CHAR_QUESTION_MARK)
2146
      c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2147
    else
2148
      {
2149
      for (i = 0; i < 32; i++)
2150
        {
2151
        if (c == ebcdic_escape_c[i]) break;
2152
        }
2153
      if (i < 32) c = i; else *errorcodeptr = ERR68;
2154
      }
2155
#endif  /* EBCDIC */
2156

2157
    ptr++;
2158
    break;
2159

2160
    /* Any other alphanumeric following \ is an error. Perl gives an error only
2161
    if in warning mode, but PCRE doesn't have a warning mode. */
2162

2163
    default:
2164
    *errorcodeptr = ERR3;
2165
    *ptrptr = ptr - 1;     /* Point to the character at fault */
2166
    return 0;
2167
    }
2168
  }
2169

2170
/* Set the pointer to the next character before returning. */
2171

2172
*ptrptr = ptr;
2173
*chptr = c;
2174
return escape;
2175
}
2176

2177

2178

2179
#ifdef SUPPORT_UNICODE
2180
/*************************************************
2181
*               Handle \P and \p                 *
2182
*************************************************/
2183

2184
/* This function is called after \P or \p has been encountered, provided that
2185
PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2186
contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2187
after the final code unit of the escape sequence.
2188

2189
Arguments:
2190
  ptrptr         the pattern position pointer
2191
  negptr         a boolean that is set TRUE for negation else FALSE
2192
  ptypeptr       an unsigned int that is set to the type value
2193
  pdataptr       an unsigned int that is set to the detailed property value
2194
  errorcodeptr   the error code variable
2195
  cb             the compile data
2196

2197
Returns:         TRUE if the type value was found, or FALSE for an invalid type
2198
*/
2199

2200
static BOOL
2201
get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2202
  uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2203
{
2204
PCRE2_UCHAR c;
2205
PCRE2_SIZE i, bot, top;
2206
PCRE2_SPTR ptr = *ptrptr;
2207
PCRE2_UCHAR name[50];
2208
PCRE2_UCHAR *vptr = NULL;
2209
uint16_t ptscript = PT_NOTSCRIPT;
2210

2211
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2212
c = *ptr++;
2213
*negptr = FALSE;
2214

2215
/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2216
negation. We must be handling Unicode encoding here, though we may be compiling
2217
for UTF-8 input in an EBCDIC environment. (PCRE2 does not support both EBCDIC
2218
input and Unicode input in the same build.) In accordance with Unicode's "loose
2219
matching" rules, ASCII white space, hyphens, and underscores are ignored. We
2220
don't use isspace() or tolower() because (a) code points may be greater than
2221
255, and (b) they wouldn't work when compiling for Unicode in an EBCDIC
2222
environment. */
2223

2224
if (c == CHAR_LEFT_CURLY_BRACKET)
2225
  {
2226
  if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2227

2228
  for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2229
    {
2230
    REDO:
2231

2232
    if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2233
    c = *ptr++;
2234

2235
    /* Skip ignorable Unicode characters. */
2236

2237
    while (c == CHAR_UNDERSCORE || c == CHAR_MINUS || c == CHAR_SPACE ||
2238
          (c >= CHAR_HT && c <= CHAR_CR))
2239
      {
2240
      if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2241
      c = *ptr++;
2242
      }
2243

2244
    /* The first significant character being circumflex negates the meaning of
2245
    the item. */
2246

2247
    if (i == 0 && !*negptr && c == CHAR_CIRCUMFLEX_ACCENT)
2248
      {
2249
      *negptr = TRUE;
2250
      goto REDO;
2251
      }
2252

2253
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2254

2255
    /* Names consist of ASCII letters and digits, but equals and colon may also
2256
    occur as a name/value separator. We must also allow for \p{L&}. A simple
2257
    check for a value between '&' and 'z' suffices because anything else in a
2258
    name or value will cause an "unknown property" error anyway. */
2259

2260
    if (c < CHAR_AMPERSAND || c > CHAR_z) goto ERROR_RETURN;
2261

2262
    /* Lower case a capital letter or remember where the name/value separator
2263
    is. */
2264

2265
    if (c >= CHAR_A && c <= CHAR_Z) c |= 0x20;
2266
    else if ((c == CHAR_COLON || c == CHAR_EQUALS_SIGN) && vptr == NULL)
2267
      vptr = name + i;
2268

2269
    name[i] = c;
2270
    }
2271

2272
  /* Error if the loop didn't end with '}' - either we hit the end of the
2273
  pattern or the name was longer than any legal property name. */
2274

2275
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2276
  name[i] = 0;
2277
  }
2278

2279
/* If { doesn't follow \p or \P there is just one following character, which
2280
must be an ASCII letter. */
2281

2282
else if (c >= CHAR_A && c <= CHAR_Z)
2283
  {
2284
  name[0] = c | 0x20;  /* Lower case */
2285
  name[1] = 0;
2286
  }
2287
else if (c >= CHAR_a && c <= CHAR_z)
2288
  {
2289
  name[0] = c;
2290
  name[1] = 0;
2291
  }
2292
else goto ERROR_RETURN;
2293

2294
*ptrptr = ptr;   /* Update pattern pointer */
2295

2296
/* If the property contains ':' or '=' we have class name and value separately
2297
specified. The following are supported:
2298

2299
  . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2300
  . Script (synonym sc) for which the property name is the script name
2301
  . Script_Extensions (synonym scx), ditto
2302

2303
As this is a small number, we currently just check the names directly. If this
2304
grows, a sorted table and a switch will be neater.
2305

2306
For both the script properties, set a PT_xxx value so that (1) they can be
2307
distinguished and (2) invalid script names that happen to be the name of
2308
another property can be diagnosed. */
2309

2310
if (vptr != NULL)
2311
  {
2312
  int offset = 0;
2313
  PCRE2_UCHAR sname[8];
2314

2315
  *vptr = 0;   /* Terminate property name */
2316
  if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2317
      PRIV(strcmp_c8)(name, STRING_bc) == 0)
2318
    {
2319
    offset = 4;
2320
    sname[0] = CHAR_b;
2321
    sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
2322
    sname[2] = CHAR_d;
2323
    sname[3] = CHAR_i;
2324
    }
2325

2326
  else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2327
           PRIV(strcmp_c8)(name, STRING_sc) == 0)
2328
    ptscript = PT_SC;
2329

2330
  else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2331
           PRIV(strcmp_c8)(name, STRING_scx) == 0)
2332
    ptscript = PT_SCX;
2333

2334
  else
2335
    {
2336
    *errorcodeptr = ERR47;
2337
    return FALSE;
2338
    }
2339

2340
  /* Adjust the string in name[] as needed */
2341

2342
  memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2343
  if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2344
  }
2345

2346
/* Search for a recognized property using binary chop. */
2347

2348
bot = 0;
2349
top = PRIV(utt_size);
2350

2351
while (bot < top)
2352
  {
2353
  int r;
2354
  i = (bot + top) >> 1;
2355
  r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2356

2357
  /* When a matching property is found, some extra checking is needed when the
2358
  \p{xx:yy} syntax is used and xx is either sc or scx. */
2359

2360
  if (r == 0)
2361
    {
2362
    *pdataptr = PRIV(utt)[i].value;
2363
    if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2364
      {
2365
      *ptypeptr = PRIV(utt)[i].type;
2366
      return TRUE;
2367
      }
2368

2369
    switch (PRIV(utt)[i].type)
2370
      {
2371
      case PT_SC:
2372
      *ptypeptr = PT_SC;
2373
      return TRUE;
2374

2375
      case PT_SCX:
2376
      *ptypeptr = ptscript;
2377
      return TRUE;
2378
      }
2379

2380
    break;  /* Non-script found */
2381
    }
2382

2383
  if (r > 0) bot = i + 1; else top = i;
2384
  }
2385

2386
*errorcodeptr = ERR47;   /* Unrecognized property */
2387
return FALSE;
2388

2389
ERROR_RETURN:            /* Malformed \P or \p */
2390
*errorcodeptr = ERR46;
2391
*ptrptr = ptr;
2392
return FALSE;
2393
}
2394
#endif
2395

2396

2397

2398
/*************************************************
2399
*           Check for POSIX class syntax         *
2400
*************************************************/
2401

2402
/* This function is called when the sequence "[:" or "[." or "[=" is
2403
encountered in a character class. It checks whether this is followed by a
2404
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2405
reach an unescaped ']' without the special preceding character, return FALSE.
2406

2407
Originally, this function only recognized a sequence of letters between the
2408
terminators, but it seems that Perl recognizes any sequence of characters,
2409
though of course unknown POSIX names are subsequently rejected. Perl gives an
2410
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2411
didn't consider this to be a POSIX class. Likewise for [:1234:].
2412

2413
The problem in trying to be exactly like Perl is in the handling of escapes. We
2414
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2415
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2416
below handles the special cases \\ and \], but does not try to do any other
2417
escape processing. This makes it different from Perl for cases such as
2418
[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2419
not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2420
when Perl does, I think.
2421

2422
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2423
It seems that the appearance of a nested POSIX class supersedes an apparent
2424
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2425
a digit. This is handled by returning FALSE if the start of a new group with
2426
the same terminator is encountered, since the next closing sequence must close
2427
the nested group, not the outer one.
2428

2429
In Perl, unescaped square brackets may also appear as part of class names. For
2430
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2431
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2432
seem right at all. PCRE does not allow closing square brackets in POSIX class
2433
names.
2434

2435
Arguments:
2436
  ptr      pointer to the character after the initial [ (colon, dot, equals)
2437
  ptrend   pointer to the end of the pattern
2438
  endptr   where to return a pointer to the terminating ':', '.', or '='
2439

2440
Returns:   TRUE or FALSE
2441
*/
2442

2443
static BOOL
2444
check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2445
{
2446
PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2447
terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2448

2449
for (; ptrend - ptr >= 2; ptr++)
2450
  {
2451
  if (*ptr == CHAR_BACKSLASH &&
2452
      (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2453
    ptr++;
2454

2455
  else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2456
            *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2457

2458
  else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2459
    {
2460
    *endptr = ptr;
2461
    return TRUE;
2462
    }
2463
  }
2464

2465
return FALSE;
2466
}
2467

2468

2469

2470
/*************************************************
2471
*          Check POSIX class name                *
2472
*************************************************/
2473

2474
/* This function is called to check the name given in a POSIX-style class entry
2475
such as [:alnum:].
2476

2477
Arguments:
2478
  ptr        points to the first letter
2479
  len        the length of the name
2480

2481
Returns:     a value representing the name, or -1 if unknown
2482
*/
2483

2484
static int
2485
check_posix_name(PCRE2_SPTR ptr, int len)
2486
{
2487
const char *pn = posix_names;
2488
int yield = 0;
2489
while (posix_name_lengths[yield] != 0)
2490
  {
2491
  if (len == posix_name_lengths[yield] &&
2492
    PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2493
  pn += posix_name_lengths[yield] + 1;
2494
  yield++;
2495
  }
2496
return -1;
2497
}
2498

2499

2500

2501
/*************************************************
2502
*       Read a subpattern or VERB name           *
2503
*************************************************/
2504

2505
/* This function is called from parse_regex() below whenever it needs to read
2506
the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2507
pointer must be to the preceding character. If that character is '*' we are
2508
reading a verb or alpha assertion name. The pointer is updated to point after
2509
the name, for a VERB or alpha assertion name, or after tha name's terminator
2510
for a subpattern name. Returning both the offset and the name pointer is
2511
redundant information, but some callers use one and some the other, so it is
2512
simplest just to return both. When the name is in braces, spaces and tabs are
2513
allowed (and ignored) at either end.
2514

2515
Arguments:
2516
  ptrptr      points to the character pointer variable
2517
  ptrend      points to the end of the input string
2518
  utf         true if the input is UTF-encoded
2519
  terminator  the terminator of a subpattern name must be this
2520
  offsetptr   where to put the offset from the start of the pattern
2521
  nameptr     where to put a pointer to the name in the input
2522
  namelenptr  where to put the length of the name
2523
  errcodeptr  where to put an error code
2524
  cb          pointer to the compile data block
2525

2526
Returns:    TRUE if a name was read
2527
            FALSE otherwise, with error code set
2528
*/
2529

2530
static BOOL
2531
read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2532
  PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2533
  int *errorcodeptr, compile_block *cb)
2534
{
2535
PCRE2_SPTR ptr = *ptrptr;
2536
BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2537
BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2538

2539
if (is_braced)
2540
  while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2541

2542
if (ptr >= ptrend)                 /* No characters in name */
2543
  {
2544
  *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2545
                            ERR60; /* Verb not recognized or malformed */
2546
  goto FAILED;
2547
  }
2548

2549
*nameptr = ptr;
2550
*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2551

2552
/* If this logic were ever to change, the matching function in pcre2_substitute.c
2553
ought to be updated to match. */
2554

2555
/* In UTF mode, a group name may contain letters and decimal digits as defined
2556
by Unicode properties, and underscores, but must not start with a digit. */
2557

2558
#ifdef SUPPORT_UNICODE
2559
if (utf && is_group)
2560
  {
2561
  uint32_t c, type;
2562

2563
  GETCHAR(c, ptr);
2564
  type = UCD_CHARTYPE(c);
2565

2566
  if (type == ucp_Nd)
2567
    {
2568
    *errorcodeptr = ERR44;
2569
    goto FAILED;
2570
    }
2571

2572
  for(;;)
2573
    {
2574
    if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2575
        c != CHAR_UNDERSCORE) break;
2576
    ptr++;
2577
    FORWARDCHARTEST(ptr, ptrend);
2578
    if (ptr >= ptrend) break;
2579
    GETCHAR(c, ptr);
2580
    type = UCD_CHARTYPE(c);
2581
    }
2582
  }
2583
else
2584
#else
2585
(void)utf;  /* Avoid compiler warning */
2586
#endif      /* SUPPORT_UNICODE */
2587

2588
/* Handle non-group names and group names in non-UTF modes. A group name must
2589
not start with a digit. If either of the others start with a digit it just
2590
won't be recognized. */
2591

2592
  {
2593
  if (is_group && IS_DIGIT(*ptr))
2594
    {
2595
    *errorcodeptr = ERR44;
2596
    goto FAILED;
2597
    }
2598

2599
  while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2600
    {
2601
    ptr++;
2602
    }
2603
  }
2604

2605
/* Check name length */
2606

2607
if (ptr > *nameptr + MAX_NAME_SIZE)
2608
  {
2609
  *errorcodeptr = ERR48;
2610
  goto FAILED;
2611
  }
2612
*namelenptr = (uint32_t)(ptr - *nameptr);
2613

2614
/* Subpattern names must not be empty, and their terminator is checked here.
2615
(What follows a verb or alpha assertion name is checked separately.) */
2616

2617
if (is_group)
2618
  {
2619
  if (ptr == *nameptr)
2620
    {
2621
    *errorcodeptr = ERR62;   /* Subpattern name expected */
2622
    goto FAILED;
2623
    }
2624
  if (is_braced)
2625
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2626
  if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2627
    {
2628
    *errorcodeptr = ERR42;
2629
    goto FAILED;
2630
    }
2631
  ptr++;
2632
  }
2633

2634
*ptrptr = ptr;
2635
return TRUE;
2636

2637
FAILED:
2638
*ptrptr = ptr;
2639
return FALSE;
2640
}
2641

2642

2643

2644
/*************************************************
2645
*          Manage callouts at start of cycle     *
2646
*************************************************/
2647

2648
/* At the start of a new item in parse_regex() we are able to record the
2649
details of the previous item in a prior callout, and also to set up an
2650
automatic callout if enabled. Avoid having two adjacent automatic callouts,
2651
which would otherwise happen for items such as \Q that contribute nothing to
2652
the parsed pattern.
2653

2654
Arguments:
2655
  ptr              current pattern pointer
2656
  pcalloutptr      points to a pointer to previous callout, or NULL
2657
  auto_callout     TRUE if auto_callouts are enabled
2658
  parsed_pattern   the parsed pattern pointer
2659
  cb               compile block
2660

2661
Returns: possibly updated parsed_pattern pointer.
2662
*/
2663

2664
static uint32_t *
2665
manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2666
  uint32_t *parsed_pattern, compile_block *cb)
2667
{
2668
uint32_t *previous_callout = *pcalloutptr;
2669

2670
if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2671
  cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2672

2673
if (!auto_callout) previous_callout = NULL; else
2674
  {
2675
  if (previous_callout == NULL ||
2676
      previous_callout != parsed_pattern - 4 ||
2677
      previous_callout[3] != 255)
2678
    {
2679
    previous_callout = parsed_pattern;  /* Set up new automatic callout */
2680
    parsed_pattern += 4;
2681
    previous_callout[0] = META_CALLOUT_NUMBER;
2682
    previous_callout[2] = 0;
2683
    previous_callout[3] = 255;
2684
    }
2685
  previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2686
  }
2687

2688
*pcalloutptr = previous_callout;
2689
return parsed_pattern;
2690
}
2691

2692

2693

2694
/*************************************************
2695
*          Handle \d, \D, \s, \S, \w, \W         *
2696
*************************************************/
2697

2698
/* This function is called from parse_regex() below, both for freestanding
2699
escapes, and those within classes, to handle those escapes that may change when
2700
Unicode property support is requested. Note that PCRE2_UCP will never be set
2701
without Unicode support because that is checked when pcre2_compile() is called.
2702

2703
Arguments:
2704
  escape          the ESC_... value
2705
  parsed_pattern  where to add the code
2706
  options         options bits
2707
  xoptions        extra options bits
2708

2709
Returns:          updated value of parsed_pattern
2710
*/
2711
static uint32_t *
2712
handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2713
  uint32_t xoptions)
2714
{
2715
uint32_t ascii_option = 0;
2716
uint32_t prop = ESC_p;
2717

2718
switch(escape)
2719
  {
2720
  case ESC_D:
2721
  prop = ESC_P;
2722
  /* Fall through */
2723
  case ESC_d:
2724
  ascii_option = PCRE2_EXTRA_ASCII_BSD;
2725
  break;
2726

2727
  case ESC_S:
2728
  prop = ESC_P;
2729
  /* Fall through */
2730
  case ESC_s:
2731
  ascii_option = PCRE2_EXTRA_ASCII_BSS;
2732
  break;
2733

2734
  case ESC_W:
2735
  prop = ESC_P;
2736
  /* Fall through */
2737
  case ESC_w:
2738
  ascii_option = PCRE2_EXTRA_ASCII_BSW;
2739
  break;
2740
  }
2741

2742
if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2743
  {
2744
  *parsed_pattern++ = META_ESCAPE + escape;
2745
  }
2746
else
2747
  {
2748
  *parsed_pattern++ = META_ESCAPE + prop;
2749
  switch(escape)
2750
    {
2751
    case ESC_d:
2752
    case ESC_D:
2753
    *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2754
    break;
2755

2756
    case ESC_s:
2757
    case ESC_S:
2758
    *parsed_pattern++ = PT_SPACE << 16;
2759
    break;
2760

2761
    case ESC_w:
2762
    case ESC_W:
2763
    *parsed_pattern++ = PT_WORD << 16;
2764
    break;
2765
    }
2766
  }
2767

2768
return parsed_pattern;
2769
}
2770

2771

2772

2773
/*************************************************
2774
* Maximum size of parsed_pattern for given input *
2775
*************************************************/
2776

2777
/* This function is called from parse_regex() below, to determine the amount
2778
of memory to allocate for parsed_pattern. It is also called to check whether
2779
the amount of data written respects the amount of memory allocated.
2780

2781
Arguments:
2782
  ptr             points to the start of the pattern
2783
  ptrend          points to the end of the pattern
2784
  utf             TRUE in UTF mode
2785
  options         the options bits
2786

2787
Returns:          the number of uint32_t units for parsed_pattern
2788
*/
2789
static ptrdiff_t
2790
max_parsed_pattern(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, BOOL utf,
2791
  uint32_t options)
2792
{
2793
PCRE2_SIZE big32count = 0;
2794
ptrdiff_t parsed_size_needed;
2795

2796
/* When PCRE2_AUTO_CALLOUT is not set, in all but one case the number of
2797
unsigned 32-bit ints written out to the parsed pattern is bounded by the length
2798
of the pattern. The exceptional case is when running in 32-bit, non-UTF mode,
2799
when literal characters greater than META_END (0x80000000) have to be coded as
2800
two units. In this case, therefore, we scan the pattern to check for such
2801
values. */
2802

2803
#if PCRE2_CODE_UNIT_WIDTH == 32
2804
if (!utf)
2805
  {
2806
  PCRE2_SPTR p;
2807
  for (p = ptr; p < ptrend; p++) if (*p >= META_END) big32count++;
2808
  }
2809
#else
2810
(void)utf;  /* Avoid compiler warning */
2811
#endif
2812

2813
parsed_size_needed = (ptrend - ptr) + big32count;
2814

2815
/* When PCRE2_AUTO_CALLOUT is set we have to assume a numerical callout (4
2816
elements) for each character. This is overkill, but memory is plentiful these
2817
days. */
2818

2819
if ((options & PCRE2_AUTO_CALLOUT) != 0)
2820
  parsed_size_needed += (ptrend - ptr) * 4;
2821

2822
return parsed_size_needed;
2823
}
2824

2825

2826

2827
/*************************************************
2828
*      Parse regex and identify named groups     *
2829
*************************************************/
2830

2831
/* This function is called first of all. It scans the pattern and does two
2832
things: (1) It identifies capturing groups and makes a table of named capturing
2833
groups so that information about them is fully available to both the compiling
2834
scans. (2) It writes a parsed version of the pattern with comments omitted and
2835
escapes processed into the parsed_pattern vector.
2836

2837
Arguments:
2838
  ptr             points to the start of the pattern
2839
  options         compiling dynamic options (may change during the scan)
  xoptions        compiling dynamic extra options (may change during the scan)
2840
  has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
2841
  cb              pointer to the compile data block
2842

2843
Returns:   zero on success or a non-zero error code, with the
2844
             error offset placed in the cb field
2845
*/
2846

2847
/* A structure and some flags for dealing with nested groups. The NSF_ values
are single-bit masks; presumably they are combined in the "flags" field. */

typedef struct nest_save {
  uint16_t  nest_depth;
  uint16_t  reset_group;
  uint16_t  max_group;
  uint16_t  flags;
  uint32_t  options;
  uint32_t  xoptions;
} nest_save;

#define NSF_RESET          0x0001u
#define NSF_CONDASSERT     0x0002u
#define NSF_ATOMICSR       0x0004u
2861

2862
/* Options that are changeable within the pattern must be tracked during
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
but all must be tracked so that META_OPTIONS items set the correct values for
the main compiling phase. The first mask covers the main option bits and the
second covers the extra option bits. */

#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
  PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
  PCRE2_UNGREEDY)

#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \
  PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \
  PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX)
2874

2875
/* States used for analyzing ranges in character classes. The two OK values
must be last. */

enum {
  RANGE_NO,             /* State after '[' (initial), or '[a-z'; hyphen is literal */
  RANGE_STARTED,        /* State after '[1-'; last-emitted code is META_RANGE_XYZ */
  RANGE_FORBID_NO,      /* State after '[\d'; '-]' is allowed but not '-1]' */
  RANGE_FORBID_STARTED, /* State after '[\d-' */
  RANGE_OK_ESCAPED,     /* State after '[\1'; hyphen may be a range */
  RANGE_OK_LITERAL      /* State after '[1'; hyphen may be a range */
};
2886

2887
/* States used for analyzing operators and operands in extended character
classes. */

enum {
  CLASS_OP_EMPTY,    /* At start of an expression; empty previous contents */
  CLASS_OP_OPERAND,  /* Have preceding operand; after "z" a "--" can follow */
  CLASS_OP_OPERATOR  /* Have preceding operator; after "--" operand must follow */
};
2895

2896
/* States used for determining the parse mode in character classes. The two
PERL_EXT values must be last. */

enum {
  CLASS_MODE_NORMAL,        /* Ordinary PCRE2 '[...]' class. */
  CLASS_MODE_ALT_EXT,       /* UTS#18-style extended '[...]' class. */
  CLASS_MODE_PERL_EXT,      /* Perl extended '(?[...])' class. */
  CLASS_MODE_PERL_EXT_LEAF  /* Leaf within extended '(?[ [...] ])' class. */
};
2905

2906
/* Only in 32-bit mode can there be literals >= META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
be quantified.

Both variants expand to a single braced block, so that the macro behaves as
one statement in every code unit width: in particular, when it is the body of
an unbraced "if", the setting of okquantifier is conditional too. The macro
relies on a variable called okquantifier being in scope at the point of use.
In the 32-bit variant the value argument is evaluated more than once, so it
must not have side effects. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) \
  { \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#endif
2920

2921
/* Here's the actual function. */
2922

2923
static int parse_regex(PCRE2_SPTR ptr, uint32_t options, uint32_t xoptions,
2924
  BOOL *has_lookbehind, compile_block *cb)
2925
{
2926
uint32_t c;
2927
uint32_t delimiter;
2928
uint32_t namelen;
2929
uint32_t class_range_state;
2930
uint32_t class_op_state;
2931
uint32_t class_mode_state;
2932
uint32_t *class_start;
2933
uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
2934
uint32_t *verbstartptr = NULL;
2935
uint32_t *previous_callout = NULL;
2936
uint32_t *parsed_pattern = cb->parsed_pattern;
2937
uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2938
uint32_t *this_parsed_item = NULL;
2939
uint32_t *prev_parsed_item = NULL;
2940
uint32_t meta_quantifier = 0;
2941
uint32_t add_after_mark = 0;
2942
uint16_t nest_depth = 0;
2943
int16_t class_depth_m1 = -1; /* The m1 means minus 1. */
2944
int16_t class_maxdepth_m1 = -1;
2945
int after_manual_callout = 0;
2946
int expect_cond_assert = 0;
2947
int errorcode = 0;
2948
int escape;
2949
int i;
2950
BOOL inescq = FALSE;
2951
BOOL inverbname = FALSE;
2952
BOOL utf = (options & PCRE2_UTF) != 0;
2953
BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2954
BOOL isdupname;
2955
BOOL negate_class;
2956
BOOL okquantifier = FALSE;
2957
PCRE2_SPTR thisptr;
2958
PCRE2_SPTR name;
2959
PCRE2_SPTR ptrend = cb->end_pattern;
2960
PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
2961
PCRE2_SPTR class_range_forbid_ptr = NULL;
2962
named_group *ng;
2963
nest_save *top_nest, *end_nests;
2964
#ifdef PCRE2_DEBUG
2965
uint32_t *parsed_pattern_check;
2966
ptrdiff_t parsed_pattern_extra = 0;
2967
ptrdiff_t parsed_pattern_extra_check = 0;
2968
PCRE2_SPTR ptr_check;
2969
#endif
2970

2971
PCRE2_ASSERT(parsed_pattern != NULL);
2972

2973
/* Insert leading items for word and line matching (features provided for the
2974
benefit of pcre2grep). */
2975

2976
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
2977
  {
2978
  *parsed_pattern++ = META_CIRCUMFLEX;
2979
  *parsed_pattern++ = META_NOCAPTURE;
2980
  }
2981
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
2982
  {
2983
  *parsed_pattern++ = META_ESCAPE + ESC_b;
2984
  *parsed_pattern++ = META_NOCAPTURE;
2985
  }
2986

2987
#ifdef PCRE2_DEBUG
2988
parsed_pattern_check = parsed_pattern;
2989
ptr_check = ptr;
2990
#endif
2991

2992
/* If the pattern is actually a literal string, process it separately to avoid
2993
cluttering up the main loop. */
2994

2995
if ((options & PCRE2_LITERAL) != 0)
2996
  {
2997
  while (ptr < ptrend)
2998
    {
2999
    if (parsed_pattern >= parsed_pattern_end)
3000
      {
3001
      PCRE2_DEBUG_UNREACHABLE();
3002
      errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
3003
      goto FAILED;
3004
      }
3005
    thisptr = ptr;
3006
    GETCHARINCTEST(c, ptr);
3007
    if (auto_callout)
3008
      parsed_pattern = manage_callouts(thisptr, &previous_callout,
3009
        auto_callout, parsed_pattern, cb);
3010
    PARSED_LITERAL(c, parsed_pattern);
3011
    }
3012
  goto PARSED_END;
3013
  }
3014

3015
/* Process a real regex which may contain meta-characters. */
3016

3017
top_nest = NULL;
3018
end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
3019

3020
/* The size of the nest_save structure might not be a factor of the size of the
3021
workspace. Therefore we must round down end_nests so as to correctly avoid
3022
creating a nest_save that spans the end of the workspace. */
3023

3024
end_nests = (nest_save *)((char *)end_nests -
3025
  ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
3026

3027
/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
3028

3029
if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
3030

3031
/* Now scan the pattern */
3032

3033
while (ptr < ptrend)
3034
  {
3035
  int prev_expect_cond_assert;
3036
  uint32_t min_repeat = 0, max_repeat = 0;
3037
  uint32_t set, unset, *optset;
3038
  uint32_t xset, xunset, *xoptset;
3039
  uint32_t terminator;
3040
  uint32_t prev_meta_quantifier;
3041
  BOOL prev_okquantifier;
3042
  PCRE2_SPTR tempptr;
3043
  PCRE2_SIZE offset;
3044

3045
  if (nest_depth > cb->cx->parens_nest_limit)
3046
    {
3047
    errorcode = ERR19;
3048
    goto FAILED;        /* Parentheses too deeply nested */
3049
    }
3050

3051
  /* Check that we haven't emitted too much into parsed_pattern. We allocate
3052
  a suitably-sized buffer upfront, then do unchecked writes to it. If we only
3053
  write a little bit too much, everything will appear to be OK, because the
3054
  upfront size is an overestimate... but a malicious pattern could end up
3055
  forcing a write past the buffer end. We must catch this during
3056
  development. */
3057

3058
#ifdef PCRE2_DEBUG
3059
  /* Strong post-write check. Won't help in release builds - at this point
3060
  the write has already occurred so it's too late. However, should stop us
3061
  committing unsafe code. */
3062
  PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
3063
               (parsed_pattern_extra - parsed_pattern_extra_check) <=
3064
                 max_parsed_pattern(ptr_check, ptr, utf, options));
3065
  parsed_pattern_check = parsed_pattern;
3066
  parsed_pattern_extra_check = parsed_pattern_extra;
3067
  ptr_check = ptr;
3068
#endif
3069

3070
  if (parsed_pattern >= parsed_pattern_end)
3071
    {
3072
    /* Weak pre-write check; only ensures parsed_pattern[0] is writeable
3073
    (but the code below can write many chars). Better than nothing. */
3074
    PCRE2_DEBUG_UNREACHABLE();
3075
    errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
3076
    goto FAILED;
3077
    }
3078

3079
  /* If the last time round this loop something was added, parsed_pattern will
3080
  no longer be equal to this_parsed_item. Remember where the previous item
3081
  started and reset for the next item. Note that sometimes round the loop,
3082
  nothing gets added (e.g. for ignored white space). */
3083

3084
  if (this_parsed_item != parsed_pattern)
3085
    {
3086
    prev_parsed_item = this_parsed_item;
3087
    this_parsed_item = parsed_pattern;
3088
    }
3089

3090
  /* Get next input character, save its position for callout handling. */
3091

3092
  thisptr = ptr;
3093
  GETCHARINCTEST(c, ptr);
3094

3095
  /* Copy quoted literals until \E, allowing for the possibility of automatic
3096
  callouts, except when processing a (*VERB) "name".  */
3097

3098
  if (inescq)
3099
    {
3100
    if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3101
      {
3102
      inescq = FALSE;
3103
      ptr++;   /* Skip E */
3104
      }
3105
    else
3106
      {
3107
      if (expect_cond_assert > 0)   /* A literal is not allowed if we are */
3108
        {                           /* expecting a conditional assertion, */
3109
        ptr--;                      /* but an empty \Q\E sequence is OK.  */
3110
        errorcode = ERR28;
3111
        goto FAILED;
3112
        }
3113
      if (inverbname)
3114
        {                          /* Don't use PARSED_LITERAL() because it */
3115
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3116
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3117
#endif
3118
        *parsed_pattern++ = c;
3119
        }
3120
      else
3121
        {
3122
        if (after_manual_callout-- <= 0)
3123
          parsed_pattern = manage_callouts(thisptr, &previous_callout,
3124
            auto_callout, parsed_pattern, cb);
3125
        PARSED_LITERAL(c, parsed_pattern);
3126
        }
3127
      meta_quantifier = 0;
3128
      }
3129
    continue;  /* Next character */
3130
    }
3131

3132
  /* If we are processing the "name" part of a (*VERB:NAME) item, all
3133
  characters up to the closing parenthesis are literals except when
3134
  PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
3135
  and \E and escaped characters are allowed (no character types such as \d). If
3136
  PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
3137
  this by not entering the special (*VERB:NAME) processing - they are then
3138
  picked up below. Note that c is a character, not a code unit, so we must not
3139
  use MAX_255 to test its size because MAX_255 tests code units and is assumed
3140
  TRUE in 8-bit mode. */
3141

3142
  if (inverbname &&
3143
       (
3144
        /* EITHER: not both options set */
3145
        ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
3146
                    (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
3147
#ifdef SUPPORT_UNICODE
3148
        /* OR: character > 255 AND not Unicode Pattern White Space */
3149
        (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
3150
#endif
3151
        /* OR: not a # comment or isspace() white space */
3152
        (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
3153
#ifdef SUPPORT_UNICODE
3154
        /* and not CHAR_NEL when Unicode is supported */
3155
          && c != CHAR_NEL
3156
#endif
3157
       )))
3158
    {
3159
    PCRE2_SIZE verbnamelength;
3160

3161
    switch(c)
3162
      {
3163
      default:                     /* Don't use PARSED_LITERAL() because it */
3164
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3165
      if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3166
#endif
3167
      *parsed_pattern++ = c;
3168
      break;
3169

3170
      case CHAR_RIGHT_PARENTHESIS:
3171
      inverbname = FALSE;
3172
      /* This is the length in characters */
3173
      verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
3174
      /* But the limit on the length is in code units */
3175
      if (ptr - verbnamestart - 1 > (int)MAX_MARK)
3176
        {
3177
        ptr--;
3178
        errorcode = ERR76;
3179
        goto FAILED;
3180
        }
3181
      *verblengthptr = (uint32_t)verbnamelength;
3182

3183
      /* If this name was on a verb such as (*ACCEPT) which does not continue,
3184
      a (*MARK) was generated for the name. We now add the original verb as the
3185
      next item. */
3186

3187
      if (add_after_mark != 0)
3188
        {
3189
        *parsed_pattern++ = add_after_mark;
3190
        add_after_mark = 0;
3191
        }
3192
      break;
3193

3194
      case CHAR_BACKSLASH:
3195
      if ((options & PCRE2_ALT_VERBNAMES) != 0)
3196
        {
3197
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3198
          xoptions, cb->bracount, FALSE, cb);
3199
        if (errorcode != 0) goto FAILED;
3200
        }
3201
      else escape = 0;   /* Treat all as literal */
3202

3203
      switch(escape)
3204
        {
3205
        case 0:                    /* Don't use PARSED_LITERAL() because it */
3206
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3207
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3208
#endif
3209
        *parsed_pattern++ = c;
3210
        break;
3211

3212
        case ESC_ub:
3213
        *parsed_pattern++ = CHAR_u;
3214
        PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3215
        break;
3216

3217
        case ESC_Q:
3218
        inescq = TRUE;
3219
        break;
3220

3221
        case ESC_E:           /* Ignore */
3222
        break;
3223

3224
        default:
3225
        errorcode = ERR40;    /* Invalid in verb name */
3226
        goto FAILED;
3227
        }
3228
      }
3229
    continue;   /* Next character in pattern */
3230
    }
3231

3232
  /* Not a verb name character. At this point we must process everything that
3233
  must not change the quantification state. This is mainly comments, but we
3234
  handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3235
  A+, as in Perl. An isolated \E is ignored. */
3236

3237
  if (c == CHAR_BACKSLASH && ptr < ptrend)
3238
    {
3239
    if (*ptr == CHAR_Q || *ptr == CHAR_E)
3240
      {
3241
      inescq = *ptr == CHAR_Q;
3242
      ptr++;
3243
      continue;
3244
      }
3245
    }
3246

3247
  /* Skip over whitespace and # comments in extended mode. Note that c is a
3248
  character, not a code unit, so we must not use MAX_255 to test its size
3249
  because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3250
  whitespace characters are those designated as "Pattern White Space" by
3251
  Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3252
  U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3253
  subset of space characters that match \h and \v. */
3254

3255
  if ((options & PCRE2_EXTENDED) != 0)
3256
    {
3257
    if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3258
#ifdef SUPPORT_UNICODE
3259
    if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3260
#endif
3261
    if (c == CHAR_NUMBER_SIGN)
3262
      {
3263
      while (ptr < ptrend)
3264
        {
3265
        if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
3266
          {                       /* IS_NEWLINE sets cb->nllen. */
3267
          ptr += cb->nllen;
3268
          break;
3269
          }
3270
        ptr++;
3271
#ifdef SUPPORT_UNICODE
3272
        if (utf) FORWARDCHARTEST(ptr, ptrend);
3273
#endif
3274
        }
3275
      continue;  /* Next character in pattern */
3276
      }
3277
    }
3278

3279
  /* Skip over bracketed comments */
3280

3281
  if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3282
      ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3283
    {
3284
    while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3285
    if (ptr >= ptrend)
3286
      {
3287
      errorcode = ERR18;  /* A special error for missing ) in a comment */
3288
      goto FAILED;        /* to make it easier to debug. */
3289
      }
3290
    ptr++;
3291
    continue;  /* Next character in pattern */
3292
    }
3293

3294
  /* If the next item is not a quantifier, fill in length of any previous
3295
  callout and create an auto callout if required. */
3296

3297
  if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3298
       (c != CHAR_LEFT_CURLY_BRACKET ||
3299
         (tempptr = ptr,
3300
         !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3301
    {
3302
    if (after_manual_callout-- <= 0)
3303
      {
3304
      parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3305
        parsed_pattern, cb);
3306
      this_parsed_item = parsed_pattern;  /* New start for current item */
3307
      }
3308
    }
3309

3310
  /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3311
  assertion, possibly preceded by a callout. If the value is 1, we have just
3312
  had the callout and expect an assertion. There must be at least 3 more
3313
  characters in all cases. When expect_cond_assert is 2, we know that the
3314
  current character is an opening parenthesis, as otherwise we wouldn't be
3315
  here. However, when it is 1, we need to check, and it's easiest just to check
3316
  always. Note that expect_cond_assert may be negative, since all callouts just
3317
  decrement it. */
3318

3319
  if (expect_cond_assert > 0)
3320
    {
3321
    BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3322
              (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3323
    if (ok)
3324
      {
3325
      if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
3326
        {
3327
        ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3328
        }
3329
      else switch(ptr[1])  /* Traditional symbolic format */
3330
        {
3331
        case CHAR_C:
3332
        ok = expect_cond_assert == 2;
3333
        break;
3334

3335
        case CHAR_EQUALS_SIGN:
3336
        case CHAR_EXCLAMATION_MARK:
3337
        break;
3338

3339
        case CHAR_LESS_THAN_SIGN:
3340
        ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3341
        break;
3342

3343
        default:
3344
        ok = FALSE;
3345
        }
3346
      }
3347

3348
    if (!ok)
3349
      {
3350
      ptr--;   /* Adjust error offset */
3351
      errorcode = ERR28;
3352
      goto FAILED;
3353
      }
3354
    }
3355

3356
  /* Remember whether we are expecting a conditional assertion, and set the
3357
  default for this item. */
3358

3359
  prev_expect_cond_assert = expect_cond_assert;
3360
  expect_cond_assert = 0;
3361

3362
  /* Remember quantification status for the previous significant item, then set
3363
  default for this item. */
3364

3365
  prev_okquantifier = okquantifier;
3366
  prev_meta_quantifier = meta_quantifier;
3367
  okquantifier = FALSE;
3368
  meta_quantifier = 0;
3369

3370
  /* If the previous significant item was a quantifier, adjust the parsed code
3371
  if there is a following modifier. The base meta value is always followed by
3372
  the PLUS and QUERY values, in that order. We do this here rather than after
3373
  reading a quantifier so that intervening comments and /x whitespace can be
3374
  ignored without having to replicate code. */
3375

3376
  if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3377
    {
3378
    parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3379
      prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3380
        0x00020000u : 0x00010000u);
3381
    continue;  /* Next character in pattern */
3382
    }
3383

3384
  /* Process the next item in the main part of a pattern. */
3385

3386
  switch(c)
3387
    {
3388
    default:              /* Non-special character */
3389
    PARSED_LITERAL(c, parsed_pattern);
3390
    break;
3391

3392

3393
    /* ---- Escape sequence ---- */
3394

3395
    case CHAR_BACKSLASH:
3396
    tempptr = ptr;
3397
    escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3398
      xoptions, cb->bracount, FALSE, cb);
3399
    if (errorcode != 0)
3400
      {
3401
      ESCAPE_FAILED:
3402
      if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3403
        goto FAILED;
3404
      ptr = tempptr;
3405
      if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3406
        {
3407
        GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3408
        }
3409
      escape = 0;                 /* Treat as literal character */
3410
      }
3411

3412
    /* The escape was a data escape or literal character. */
3413

3414
    if (escape == 0)
3415
      {
3416
      PARSED_LITERAL(c, parsed_pattern);
3417
      }
3418

3419
    /* The escape was a back (or forward) reference. We keep the offset in
3420
    order to give a more useful diagnostic for a bad forward reference. For
3421
    references to groups numbered less than 10 we can't use more than two items
3422
    in parsed_pattern because they may be just two characters in the input (and
3423
    in a 64-bit world an offset may need two elements). So for them, the offset
3424
    of the first occurrence is held in a special vector. */
3425

3426
    else if (escape < 0)
3427
      {
3428
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3429
      escape = -escape - 1;
3430
      *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3431
      if (escape < 10)
3432
        {
3433
        if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3434
          cb->small_ref_offset[escape] = offset;
3435
        }
3436
      else
3437
        {
3438
        PUTOFFSET(offset, parsed_pattern);
3439
        }
3440
      okquantifier = TRUE;
3441
      }
3442

3443
    /* The escape was a character class such as \d etc. or other special
3444
    escape indicator such as \A or \X. Most of them generate just a single
3445
    parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3446
    value. They are supported only when Unicode is available. The type and
3447
    value are packed into a single 32-bit value so that the whole sequence
3448
    uses only two elements in the parsed_vector. This is because the same
3449
    coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3450
    set.
3451

3452
    There are also some cases where the escape sequence is followed by a name:
3453
    \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3454
    and \g'name' are subroutine calls by name; \g{name} is a synonym for
3455
    \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3456
    and returned as a negative value (handled above). A name is coded as an
3457
    offset into the pattern and a length. */
3458

3459
    else switch (escape)
3460
      {
3461
      case ESC_C:
3462
#ifdef NEVER_BACKSLASH_C
3463
      errorcode = ERR85;
3464
      goto ESCAPE_FAILED;
3465
#else
3466
      if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3467
        {
3468
        errorcode = ERR83;
3469
        goto ESCAPE_FAILED;
3470
        }
3471
#endif
3472
      okquantifier = TRUE;
3473
      *parsed_pattern++ = META_ESCAPE + escape;
3474
      break;
3475

3476
      /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3477
      when \u{ is not followed by hex digits and }. It requests two literal
3478
      characters, u and { and we need this, as otherwise \u{ 12} (for example)
3479
      would be treated as u{12} now that spaces are allowed in quantifiers. */
3480

3481
      case ESC_ub:
3482
      *parsed_pattern++ = CHAR_u;
3483
      PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3484
      break;
3485

3486
      case ESC_X:
3487
#ifndef SUPPORT_UNICODE
3488
      errorcode = ERR45;   /* Supported only with Unicode support */
3489
      goto ESCAPE_FAILED;
3490
#endif
3491
      case ESC_H:
3492
      case ESC_h:
3493
      case ESC_N:
3494
      case ESC_R:
3495
      case ESC_V:
3496
      case ESC_v:
3497
      okquantifier = TRUE;
3498
      *parsed_pattern++ = META_ESCAPE + escape;
3499
      break;
3500

3501
      default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3502
      *parsed_pattern++ = META_ESCAPE + escape;
3503
      break;
3504

3505
      /* Escapes that may change in UCP mode. */
3506

3507
      case ESC_d:
3508
      case ESC_D:
3509
      case ESC_s:
3510
      case ESC_S:
3511
      case ESC_w:
3512
      case ESC_W:
3513
      okquantifier = TRUE;
3514
      parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3515
        xoptions);
3516
      break;
3517

3518
      /* Unicode property matching */
3519

3520
      case ESC_P:
3521
      case ESC_p:
3522
#ifdef SUPPORT_UNICODE
3523
        {
3524
        BOOL negated;
3525
        uint16_t ptype = 0, pdata = 0;
3526
        if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3527
          goto ESCAPE_FAILED;
3528
        if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3529
        *parsed_pattern++ = META_ESCAPE + escape;
3530
        *parsed_pattern++ = (ptype << 16) | pdata;
3531
        okquantifier = TRUE;
3532
        }
3533
#else
3534
      errorcode = ERR45;
3535
      goto ESCAPE_FAILED;
3536
#endif
3537
      break;  /* End \P and \p */
3538

3539
      /* When \g is used with quotes or angle brackets as delimiters, it is a
3540
      numerical or named subroutine call, and control comes here. When used
3541
      with brace delimiters it is a numerical back reference and does not come
3542
      here because check_escape() returns it directly as a reference. \k is
3543
      always a named back reference. */
3544

3545
      case ESC_g:
3546
      case ESC_k:
3547
      if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3548
          *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3549
        {
3550
        errorcode = (escape == ESC_g)? ERR57 : ERR69;
3551
        goto ESCAPE_FAILED;
3552
        }
3553
      terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3554
        CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3555
        CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3556

3557
      /* For a non-braced \g, check for a numerical recursion. */
3558

3559
      if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3560
        {
3561
        PCRE2_SPTR p = ptr + 1;
3562

3563
        if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3564
            &errorcode))
3565
          {
3566
          if (p >= ptrend || *p != terminator)
3567
            {
3568
            errorcode = ERR57;
3569
            goto ESCAPE_FAILED;
3570
            }
3571
          ptr = p;
3572
          goto SET_RECURSION;
3573
          }
3574
        if (errorcode != 0) goto ESCAPE_FAILED;
3575
        }
3576

3577
      /* Not a numerical recursion. Perl allows spaces and tabs after { and
3578
      before } but not for other delimiters. */
3579

3580
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3581
          &errorcode, cb)) goto ESCAPE_FAILED;
3582

3583
      /* \k and \g when used with braces are back references, whereas \g used
3584
      with quotes or angle brackets is a recursion */
3585

3586
      *parsed_pattern++ =
3587
        (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3588
          META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3589
      *parsed_pattern++ = namelen;
3590

3591
      PUTOFFSET(offset, parsed_pattern);
3592
      okquantifier = TRUE;
3593
      break;  /* End special escape processing */
3594
      }
3595
    break;    /* End escape sequence processing */
3596

3597

3598
    /* ---- Single-character special items ---- */
3599

3600
    case CHAR_CIRCUMFLEX_ACCENT:
3601
    *parsed_pattern++ = META_CIRCUMFLEX;
3602
    break;
3603

3604
    case CHAR_DOLLAR_SIGN:
3605
    *parsed_pattern++ = META_DOLLAR;
3606
    break;
3607

3608
    case CHAR_DOT:
3609
    *parsed_pattern++ = META_DOT;
3610
    okquantifier = TRUE;
3611
    break;
3612

3613

3614
    /* ---- Single-character quantifiers ---- */
3615

3616
    case CHAR_ASTERISK:
3617
    meta_quantifier = META_ASTERISK;
3618
    goto CHECK_QUANTIFIER;
3619

3620
    case CHAR_PLUS:
3621
    meta_quantifier = META_PLUS;
3622
    goto CHECK_QUANTIFIER;
3623

3624
    case CHAR_QUESTION_MARK:
3625
    meta_quantifier = META_QUERY;
3626
    goto CHECK_QUANTIFIER;
3627

3628

3629
    /* ---- Potential {n,m} quantifier ---- */
3630

3631
    case CHAR_LEFT_CURLY_BRACKET:
3632
    if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3633
        &errorcode))
3634
      {
3635
      if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3636
      PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3637
      break;                               /* No more quantifier processing */
3638
      }
3639
    meta_quantifier = META_MINMAX;
3640
    /* Fall through */
3641

3642

3643
    /* ---- Quantifier post-processing ---- */
3644

3645
    /* Check that a quantifier is allowed after the previous item. This
3646
    guarantees that there is a previous item. */
3647

3648
    CHECK_QUANTIFIER:
3649
    if (!prev_okquantifier)
3650
      {
3651
      errorcode = ERR9;
3652
      goto FAILED_BACK;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
3653
      }
3654

3655
    /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3656
    quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3657
    sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3658
    wrapping it in non-capturing brackets, but we have to allow for a preceding
3659
    (*MARK) for when (*ACCEPT) has an argument. */
3660

3661
    if (*prev_parsed_item == META_ACCEPT)
3662
      {
3663
      uint32_t *p;
3664
      for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3665
      *verbstartptr = META_NOCAPTURE;
3666
      parsed_pattern[1] = META_KET;
3667
      parsed_pattern += 2;
3668

3669
#ifdef PCRE2_DEBUG
3670
      PCRE2_ASSERT(parsed_pattern_extra >= 2);
3671
      parsed_pattern_extra -= 2;
3672
#endif
3673
      }
3674

3675
    /* Now we can put the quantifier into the parsed pattern vector. At this
3676
    stage, we have only the basic quantifier. The check for a following + or ?
3677
    modifier happens at the top of the loop, after any intervening comments
3678
    have been removed. */
3679

3680
    *parsed_pattern++ = meta_quantifier;
3681
    if (c == CHAR_LEFT_CURLY_BRACKET)
3682
      {
3683
      *parsed_pattern++ = min_repeat;
3684
      *parsed_pattern++ = max_repeat;
3685
      }
3686
    break;
3687

3688

3689
    /* ---- Character class ---- */
3690

3691
    case CHAR_LEFT_SQUARE_BRACKET:
3692

3693
    /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3694
    used for "start of word" and "end of word". As these are otherwise illegal
3695
    sequences, we don't break anything by recognizing them. They are replaced
3696
    by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3697
    erroneous and are handled by the normal code below. */
3698

3699
    if (ptrend - ptr >= 6 &&
3700
         (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3701
          PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3702
      {
3703
      *parsed_pattern++ = META_ESCAPE + ESC_b;
3704

3705
      if (ptr[2] == CHAR_LESS_THAN_SIGN)
3706
        {
3707
        *parsed_pattern++ = META_LOOKAHEAD;
3708
        }
3709
      else
3710
        {
3711
        *parsed_pattern++ = META_LOOKBEHIND;
3712
        *has_lookbehind = TRUE;
3713

3714
        /* The offset is used only for the "non-fixed length" error; this won't
3715
        occur here, so just store zero. */
3716

3717
        PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3718
        }
3719

3720
      if ((options & PCRE2_UCP) == 0)
3721
        *parsed_pattern++ = META_ESCAPE + ESC_w;
3722
      else
3723
        {
3724
        *parsed_pattern++ = META_ESCAPE + ESC_p;
3725
        *parsed_pattern++ = PT_WORD << 16;
3726
        }
3727
      *parsed_pattern++ = META_KET;
3728
      ptr += 6;
3729
      okquantifier = TRUE;
3730
      break;
3731
      }
3732

3733
    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3734
    they are encountered at the top level, so we'll do that too. */
3735

3736
    if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3737
         *ptr == CHAR_EQUALS_SIGN) &&
3738
        check_posix_syntax(ptr, ptrend, &tempptr))
3739
      {
3740
      errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3741
      goto FAILED;
3742
      }
3743

3744
    class_mode_state = ((options & PCRE2_ALT_EXTENDED_CLASS) != 0)?
3745
        CLASS_MODE_ALT_EXT : CLASS_MODE_NORMAL;
3746

3747
    /* Jump here from '(?[...])'. That jump must initialize class_mode_state,
3748
    set c to the '[' character, and ptr to just after the '['. */
3749

3750
    FROM_PERL_EXTENDED_CLASS:
3751
    okquantifier = TRUE;
3752

3753
    /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3754
    because there are holes in the encoding, and simply using the range A-Z
3755
    (for example) would include the characters in the holes. This applies only
3756
    to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3757
    in this respect. In order to accommodate this, we keep track of whether
3758
    character values are literal or not, and a state variable for handling
3759
    ranges. */
3760

3761
    /* Loop for the contents of the class. Classes may be nested, if
3762
    PCRE2_ALT_EXTENDED_CLASS is set, or the class is of the form (?[...]). */
3763

3764
    /* c is still set to '[' so the loop will handle the start of the class. */
3765

3766
    class_depth_m1 = -1;
3767
    class_maxdepth_m1 = -1;
3768
    class_range_state = RANGE_NO;
3769
    class_op_state = CLASS_OP_EMPTY;
3770
    class_start = NULL;
3771

3772
    for (;;)
3773
      {
3774
      BOOL char_is_literal = TRUE;
3775

3776
      /* Inside \Q...\E everything is literal except \E */
3777

3778
      if (inescq)
3779
        {
3780
        if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3781
          {
3782
          inescq = FALSE;                   /* Reset literal state */
3783
          ptr++;                            /* Skip the 'E' */
3784
          goto CLASS_CONTINUE;
3785
          }
3786

3787
        /* Surprisingly, you cannot use \Q..\E to escape a character inside a
3788
        Perl extended class. However, empty \Q\E sequences are allowed, so here
3789
        we're only giving an error if the \Q..\E is non-empty. */
3790

3791
        if (class_mode_state == CLASS_MODE_PERL_EXT)
3792
          {
3793
          errorcode = ERR116;
3794
          goto FAILED;
3795
          }
3796

3797
        goto CLASS_LITERAL;
3798
        }
3799

3800
      /* Skip over space and tab (only) in extended-more mode, or anywhere
3801
      inside a Perl extended class (which implies /xx). */
3802

3803
      if ((c == CHAR_SPACE || c == CHAR_HT) &&
3804
          ((options & PCRE2_EXTENDED_MORE) != 0 ||
3805
           class_mode_state >= CLASS_MODE_PERL_EXT))
3806
        goto CLASS_CONTINUE;
3807

3808
      /* Handle POSIX class names. Perl allows a negation extension of the
3809
      form [:^name:]. A square bracket that doesn't match the syntax is
3810
      treated as a literal. We also recognize the POSIX constructions
3811
      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3812
      5.6 and 5.8 do. */
3813

3814
      if (class_depth_m1 >= 0 &&
3815
          c == CHAR_LEFT_SQUARE_BRACKET &&
3816
          ptrend - ptr >= 3 &&
3817
          (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3818
           *ptr == CHAR_EQUALS_SIGN) &&
3819
          check_posix_syntax(ptr, ptrend, &tempptr))
3820
        {
3821
        BOOL posix_negate = FALSE;
3822
        int posix_class;
3823

3824
        /* Perl treats a hyphen before a POSIX class as a literal, not the
3825
        start of a range. However, it gives a warning in its warning mode. PCRE
3826
        does not have a warning mode, so we give an error, because this is
3827
        likely an error on the user's part. */
3828

3829
        if (class_range_state == RANGE_STARTED)
3830
          {
3831
          ptr = tempptr + 2;
3832
          errorcode = ERR50;
3833
          goto FAILED;
3834
          }
3835

3836
        /* Perl treats a hyphen after a POSIX class as a literal, not the
3837
        start of a range. However, it gives a warning in its warning mode
3838
        unless the hyphen is the last character in the class. PCRE does not
3839
        have a warning mode, so we give an error, because this is likely an
3840
        error on the user's part.
3841

3842
        Roll back to the hyphen for the error position. */
3843

3844
        if (class_range_state == RANGE_FORBID_STARTED)
3845
          {
3846
          ptr = class_range_forbid_ptr;
3847
          errorcode = ERR50;
3848
          goto FAILED;
3849
          }
3850

3851
        /* Disallow implicit union in Perl extended classes. */
3852

3853
        if (class_op_state == CLASS_OP_OPERAND &&
3854
            class_mode_state == CLASS_MODE_PERL_EXT)
3855
          {
3856
          ptr = tempptr + 2;
3857
          errorcode = ERR113;
3858
          goto FAILED;
3859
          }
3860

3861
        if (*ptr != CHAR_COLON)
3862
          {
3863
          ptr = tempptr + 2;
3864
          errorcode = ERR13;
3865
          goto FAILED;
3866
          }
3867

3868
        if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3869
          {
3870
          posix_negate = TRUE;
3871
          ptr++;
3872
          }
3873

3874
        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3875
        ptr = tempptr + 2;
3876
        if (posix_class < 0)
3877
          {
3878
          errorcode = ERR30;
3879
          goto FAILED;
3880
          }
3881

3882
        /* Set "a hyphen is forbidden to be the start of a range". For the '-]'
3883
        case, the hyphen is treated as a literal, but for '-1' it is disallowed
3884
        (because it would be interpreted as range). */
3885

3886
        class_range_state = RANGE_FORBID_NO;
3887
        class_op_state = CLASS_OP_OPERAND;
3888

3889
        /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
3890
        of the POSIX classes are converted to use Unicode properties \p or \P
3891
        or, in one case, \h or \H. The substitutes table has two values per
3892
        class, containing the type and value of a \p or \P item. The special
3893
        cases are specified with a negative type: a non-zero value causes \h or
3894
        \H to be used, and a zero value falls through to behave like a non-UCP
3895
        POSIX class. There are now also some extra options that force ASCII for
3896
        some classes. */
3897

3898
#ifdef SUPPORT_UNICODE
3899
        if ((options & PCRE2_UCP) != 0 &&
3900
            (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
3901
            !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
3902
              (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
3903
          {
3904
          int ptype = posix_substitutes[2*posix_class];
3905
          int pvalue = posix_substitutes[2*posix_class + 1];
3906

3907
          if (ptype >= 0)
3908
            {
3909
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3910
            *parsed_pattern++ = (ptype << 16) | pvalue;
3911
            goto CLASS_CONTINUE;
3912
            }
3913

3914
          if (pvalue != 0)
3915
            {
3916
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3917
            goto CLASS_CONTINUE;
3918
            }
3919

3920
          /* Fall through */
3921
          }
3922
#endif  /* SUPPORT_UNICODE */
3923

3924
        /* Non-UCP POSIX class */
3925

3926
        *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3927
        *parsed_pattern++ = posix_class;
3928
        }
3929

3930
      /* Check for the start of the outermost class, or the start of a nested class. */
3931

3932
      else if ((c == CHAR_LEFT_SQUARE_BRACKET &&
3933
                (class_depth_m1 < 0 || class_mode_state == CLASS_MODE_ALT_EXT ||
3934
                 class_mode_state == CLASS_MODE_PERL_EXT)) ||
3935
               (c == CHAR_LEFT_PARENTHESIS &&
3936
                class_mode_state == CLASS_MODE_PERL_EXT))
3937
        {
3938
        uint32_t start_c = c;
3939
        uint32_t new_class_mode_state;
3940

3941
        /* Update the class mode, if moving into a 'leaf' inside a Perl extended
3942
        class. */
3943

3944
        if (start_c == CHAR_LEFT_SQUARE_BRACKET &&
3945
            class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 >= 0)
3946
          new_class_mode_state = CLASS_MODE_PERL_EXT_LEAF;
3947
        else
3948
          new_class_mode_state = class_mode_state;
3949

3950
        /* Tidy up the other class before starting the nested class. */
3951
        /* -[ beginning a nested class is a literal '-' */
3952

3953
        if (class_range_state == RANGE_STARTED)
3954
          parsed_pattern[-1] = CHAR_MINUS;
3955

3956
        /* Disallow implicit union in Perl extended classes. */
3957

3958
        if (class_op_state == CLASS_OP_OPERAND &&
3959
            class_mode_state == CLASS_MODE_PERL_EXT)
3960
          {
3961
          errorcode = ERR113;
3962
          goto FAILED;
3963
          }
3964

3965
        /* Validate nesting depth */
3966
        if (class_depth_m1 >= ECLASS_NEST_LIMIT - 1)
3967
          {
3968
          errorcode = ERR107;
3969
          goto FAILED;        /* Classes too deeply nested */
3970
          }
3971

3972
        /* Process the character class start. If the first character is '^', set
3973
        the negation flag. If the first few characters (either before or after ^)
3974
        are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3975
        This makes for compatibility with Perl. */
3976

3977
        negate_class = FALSE;
3978
        for (;;)
3979
          {
3980
          if (ptr >= ptrend)
3981
            {
3982
            if (start_c == CHAR_LEFT_PARENTHESIS)
3983
              errorcode = ERR14;  /* Missing terminating ')' */
3984
            else
3985
              errorcode = ERR6;   /* Missing terminating ']' */
3986
            goto FAILED;
3987
            }
3988

3989
          GETCHARINCTEST(c, ptr);
3990
          if (new_class_mode_state == CLASS_MODE_PERL_EXT) break;
3991
          else if (c == CHAR_BACKSLASH)
3992
            {
3993
            if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3994
            else if (ptrend - ptr >= 3 &&
3995
                PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3996
              ptr += 3;
3997
            else
3998
              break;
3999
            }
4000
          else if ((c == CHAR_SPACE || c == CHAR_HT) &&  /* Note: just these two */
4001
                   ((options & PCRE2_EXTENDED_MORE) != 0 ||
4002
                    new_class_mode_state >= CLASS_MODE_PERL_EXT))
4003
            continue;
4004
          else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4005
            negate_class = TRUE;
4006
          else break;
4007
          }
4008

4009
        /* Now the real contents of the class; c has the first "real" character.
4010
        Empty classes are permitted only if the option is set, and if it's not
4011
        a Perl-extended class. */
4012

4013
        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4014
            (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0 &&
4015
            new_class_mode_state < CLASS_MODE_PERL_EXT)
4016
          {
4017
          PCRE2_ASSERT(start_c == CHAR_LEFT_SQUARE_BRACKET);
4018

4019
          if (class_start != NULL)
4020
            {
4021
            PCRE2_ASSERT(class_depth_m1 >= 0);
4022
            /* Represents that the class is an extended class. */
4023
            *class_start |= CLASS_IS_ECLASS;
4024
            class_start = NULL;
4025
            }
4026

4027
          *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
4028

4029
          /* Leave nesting depth unchanged; but check for zero depth to handle the
4030
          very first (top-level) class being empty. */
4031
          if (class_depth_m1 < 0) break;
4032

4033
          class_range_state = RANGE_NO; /* for processing the containing class */
4034
          class_op_state = CLASS_OP_OPERAND;
4035
          goto CLASS_CONTINUE;
4036
          }
4037

4038
        /* Enter a non-empty class. */
4039

4040
        if (class_start != NULL)
4041
          {
4042
          PCRE2_ASSERT(class_depth_m1 >= 0);
4043
          /* Represents that the class is an extended class. */
4044
          *class_start |= CLASS_IS_ECLASS;
4045
          class_start = NULL;
4046
          }
4047

4048
        class_start = parsed_pattern;
4049
        *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
4050
        class_range_state = RANGE_NO;
4051
        class_op_state = CLASS_OP_EMPTY;
4052
        class_mode_state = new_class_mode_state;
4053
        ++class_depth_m1;
4054
        if (class_maxdepth_m1 < class_depth_m1)
4055
          class_maxdepth_m1 = class_depth_m1;
4056
        /* Reset; no op seen yet at new depth. */
4057
        cb->class_op_used[class_depth_m1] = 0;
4058

4059
        /* Implement the special start-of-class literal meaning of ']'. */
4060
        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4061
            new_class_mode_state != CLASS_MODE_PERL_EXT)
4062
          {
4063
          class_range_state = RANGE_OK_LITERAL;
4064
          class_op_state = CLASS_OP_OPERAND;
4065
          PARSED_LITERAL(c, parsed_pattern);
4066
          goto CLASS_CONTINUE;
4067
          }
4068

4069
        continue;  /* We have already loaded c with the next character */
4070
        }
4071

4072
      /* Check for the end of the class. */
4073

4074
      else if (c == CHAR_RIGHT_SQUARE_BRACKET ||
4075
               (c == CHAR_RIGHT_PARENTHESIS && class_mode_state == CLASS_MODE_PERL_EXT))
4076
        {
4077
        /* In Perl extended mode, the ']' can only be used to match the
4078
        opening '[', and ')' must match an opening parenthesis. */
4079
        if (class_mode_state == CLASS_MODE_PERL_EXT)
4080
          {
4081
          if (c == CHAR_RIGHT_SQUARE_BRACKET && class_depth_m1 != 0)
4082
            {
4083
            errorcode = ERR14;
4084
            goto FAILED_BACK;
4085
            }
4086
          if (c == CHAR_RIGHT_PARENTHESIS && class_depth_m1 < 1)
4087
            {
4088
            errorcode = ERR22;
4089
            goto FAILED;
4090
            }
4091
          }
4092

4093
        /* Check no trailing operator. */
4094
        if (class_op_state == CLASS_OP_OPERATOR)
4095
          {
4096
          errorcode = ERR110;
4097
          goto FAILED;
4098
          }
4099

4100
        /* Check no empty expression for Perl extended expressions. */
4101
        if (class_mode_state == CLASS_MODE_PERL_EXT &&
4102
            class_op_state == CLASS_OP_EMPTY)
4103
          {
4104
          errorcode = ERR114;
4105
          goto FAILED;
4106
          }
4107

4108
        /* -] at the end of a class is a literal '-' */
4109
        if (class_range_state == RANGE_STARTED)
4110
          parsed_pattern[-1] = CHAR_MINUS;
4111

4112
        *parsed_pattern++ = META_CLASS_END;
4113

4114
        if (--class_depth_m1 < 0)
4115
          {
4116
          /* Check for and consume ')' after '(?[...]'. */
4117
          PCRE2_ASSERT(class_mode_state != CLASS_MODE_PERL_EXT_LEAF);
4118
          if (class_mode_state == CLASS_MODE_PERL_EXT)
4119
            {
4120
            if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4121
              {
4122
              errorcode = ERR115;
4123
              goto FAILED;
4124
              }
4125

4126
            ptr++;
4127
            }
4128

4129
          break;
4130
          }
4131

4132
        class_range_state = RANGE_NO; /* for processing the containing class */
4133
        class_op_state = CLASS_OP_OPERAND;
4134
        if (class_mode_state == CLASS_MODE_PERL_EXT_LEAF)
4135
          class_mode_state = CLASS_MODE_PERL_EXT;
4136
        /* The extended class flag has already
4137
        been set for the parent class. */
4138
        class_start = NULL;
4139
        }
4140

4141
      /* Handle a Perl set binary operator */
4142

4143
      else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4144
               (c == CHAR_PLUS || c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4145
                c == CHAR_AMPERSAND || c == CHAR_CIRCUMFLEX_ACCENT))
4146
        {
4147
        /* Check that there was a preceding operand. */
4148
        if (class_op_state != CLASS_OP_OPERAND)
4149
          {
4150
          errorcode = ERR109;
4151
          goto FAILED;
4152
          }
4153

4154
        if (class_start != NULL)
4155
          {
4156
          PCRE2_ASSERT(class_depth_m1 >= 0);
4157
          /* Represents that the class is an extended class. */
4158
          *class_start |= CLASS_IS_ECLASS;
4159
          class_start = NULL;
4160
          }
4161

4162
        PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4163
                     class_range_state != RANGE_FORBID_STARTED);
4164

4165
        *parsed_pattern++ = c == CHAR_PLUS? META_ECLASS_OR :
4166
                            c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4167
                            c == CHAR_MINUS? META_ECLASS_SUB :
4168
                            c == CHAR_AMPERSAND? META_ECLASS_AND :
4169
                            META_ECLASS_XOR;
4170
        class_range_state = RANGE_NO;
4171
        class_op_state = CLASS_OP_OPERATOR;
4172
        }
4173

4174
      /* Handle a Perl set unary operator */
4175

4176
      else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4177
               c == CHAR_EXCLAMATION_MARK)
4178
        {
4179
        /* Check that the "!" has not got a preceding operand (i.e. it's the
4180
        start of the class, or follows an operator). */
4181
        if (class_op_state == CLASS_OP_OPERAND)
4182
          {
4183
          errorcode = ERR113;
4184
          goto FAILED;
4185
          }
4186

4187
        if (class_start != NULL)
4188
          {
4189
          PCRE2_ASSERT(class_depth_m1 >= 0);
4190
          /* Represents that the class is an extended class. */
4191
          *class_start |= CLASS_IS_ECLASS;
4192
          class_start = NULL;
4193
          }
4194

4195
        PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4196
                     class_range_state != RANGE_FORBID_STARTED);
4197

4198
        *parsed_pattern++ = META_ECLASS_NOT;
4199
        class_range_state = RANGE_NO;
4200
        class_op_state = CLASS_OP_OPERATOR;
4201
        }
4202

4203
      /* Handle a UTS#18 set operator */
4204

4205
      else if (class_mode_state == CLASS_MODE_ALT_EXT &&
4206
               (c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4207
                c == CHAR_AMPERSAND || c == CHAR_TILDE) &&
4208
               ptr < ptrend && *ptr == c)
4209
        {
4210
        ++ptr;
4211

4212
        /* Check there isn't a triple-repetition. */
4213
        if (ptr < ptrend && *ptr == c)
4214
          {
4215
          while (ptr < ptrend && *ptr == c) ++ptr;  /* Improve error offset. */
4216
          errorcode = ERR108;
4217
          goto FAILED;
4218
          }
4219

4220
        /* Check for a preceding operand. */
4221
        if (class_op_state != CLASS_OP_OPERAND)
4222
          {
4223
          errorcode = ERR109;
4224
          goto FAILED;
4225
          }
4226

4227
        /* Check for mixed precedence. Forbid [A--B&&C]. */
4228
        if (cb->class_op_used[class_depth_m1] != 0 &&
4229
            cb->class_op_used[class_depth_m1] != (uint8_t)c)
4230
          {
4231
          errorcode = ERR111;
4232
          goto FAILED;
4233
          }
4234

4235
        if (class_start != NULL)
4236
          {
4237
          PCRE2_ASSERT(class_depth_m1 >= 0);
4238
          /* Represents that the class is an extended class. */
4239
          *class_start |= CLASS_IS_ECLASS;
4240
          class_start = NULL;
4241
          }
4242

4243
        /* Dangling '-' before an operator is a literal */
4244
        if (class_range_state == RANGE_STARTED)
4245
          parsed_pattern[-1] = CHAR_MINUS;
4246

4247
        *parsed_pattern++ = c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4248
                            c == CHAR_MINUS? META_ECLASS_SUB :
4249
                            c == CHAR_AMPERSAND? META_ECLASS_AND :
4250
                            META_ECLASS_XOR;
4251
        class_range_state = RANGE_NO;
4252
        class_op_state = CLASS_OP_OPERATOR;
4253
        cb->class_op_used[class_depth_m1] = (uint8_t)c;
4254
        }
4255

4256
      /* Handle escapes in a class */
4257

4258
      else if (c == CHAR_BACKSLASH)
4259
        {
4260
        tempptr = ptr;
4261
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
4262
          xoptions, cb->bracount, TRUE, cb);
4263

4264
        if (errorcode != 0)
4265
          {
4266
          if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0 ||
4267
              class_mode_state >= CLASS_MODE_PERL_EXT)
4268
            goto FAILED;
4269
          ptr = tempptr;
4270
          if (ptr >= ptrend) c = CHAR_BACKSLASH; else
4271
            {
4272
            GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
4273
            }
4274
          escape = 0;                 /* Treat as literal character */
4275
          }
4276

4277
        switch(escape)
4278
          {
4279
          case 0:  /* Escaped character code point is in c */
4280
          char_is_literal = FALSE;
4281
          goto CLASS_LITERAL;      /* (a few lines above) */
4282

4283
          case ESC_b:
4284
          c = CHAR_BS;    /* \b is backspace in a class */
4285
          char_is_literal = FALSE;
4286
          goto CLASS_LITERAL;
4287

4288
          case ESC_k:
4289
          c = CHAR_k;     /* \k is not special in a class, just like \g */
4290
          char_is_literal = FALSE;
4291
          goto CLASS_LITERAL;
4292

4293
          case ESC_Q:
4294
          inescq = TRUE;  /* Enter literal mode */
4295
          goto CLASS_CONTINUE;
4296

4297
          case ESC_E:     /* Ignore orphan \E */
4298
          goto CLASS_CONTINUE;
4299

4300
          case ESC_B:     /* Always an error in a class */
4301
          case ESC_R:
4302
          case ESC_X:
4303
          errorcode = ERR7;
4304
          ptr--;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
4305
          goto FAILED;
4306

4307
          case ESC_N:     /* Not permitted by Perl either */
4308
          errorcode = ERR71;
4309
          goto FAILED;
4310

4311
          case ESC_H:
4312
          case ESC_h:
4313
          case ESC_V:
4314
          case ESC_v:
4315
          *parsed_pattern++ = META_ESCAPE + escape;
4316
          break;
4317

4318
          /* These escapes may be converted to Unicode property tests when
4319
          PCRE2_UCP is set. */
4320

4321
          case ESC_d:
4322
          case ESC_D:
4323
          case ESC_s:
4324
          case ESC_S:
4325
          case ESC_w:
4326
          case ESC_W:
4327
          parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
4328
            xoptions);
4329
          break;
4330

4331
          /* Explicit Unicode property matching */
4332

4333
          case ESC_P:
4334
          case ESC_p:
4335
#ifdef SUPPORT_UNICODE
4336
            {
4337
            BOOL negated;
4338
            uint16_t ptype = 0, pdata = 0;
4339
            if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
4340
              goto FAILED;
4341

4342
            /* In caseless matching, particular characteristics Lu, Ll, and Lt
4343
            get converted to the general characteristic L&. That is, upper,
4344
            lower, and title case letters are all conflated. */
4345

4346
            if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
4347
                (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
4348
              {
4349
              ptype = PT_LAMP;
4350
              pdata = 0;
4351
              }
4352

4353
            if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
4354
            *parsed_pattern++ = META_ESCAPE + escape;
4355
            *parsed_pattern++ = (ptype << 16) | pdata;
4356
            }
4357
#else
4358
          errorcode = ERR45;
4359
          goto FAILED;
4360
#endif
4361
          break;  /* End \P and \p */
4362

4363
          /* All others are not allowed in a class */
4364

4365
          default:
4366
          PCRE2_DEBUG_UNREACHABLE();
4367
          /* Fall through */
4368

4369
          case ESC_A:
4370
          case ESC_Z:
4371
          case ESC_z:
4372
          case ESC_G:
4373
          case ESC_K:
4374
          case ESC_C:
4375
          errorcode = ERR7;
4376
          ptr--;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
4377
          goto FAILED;
4378
          }
4379

4380
        /* All the switch-cases above which end in "break" describe a set
4381
        of characters. None may start a range. */
4382

4383
        /* The second part of a range can be a single-character escape
4384
        sequence (detected above), but not any of the other escapes. Perl
4385
        treats a hyphen as a literal in such circumstances. However, in Perl's
4386
        warning mode, a warning is given, so PCRE now faults it, as it is
4387
        almost certainly a mistake on the user's part. */
4388

4389
        if (class_range_state == RANGE_STARTED)
4390
          {
4391
          errorcode = ERR50;
4392
          goto FAILED;
4393
          }
4394

4395
        /* Perl gives a warning unless the hyphen following a multi-character
4396
        escape is the last character in the class. PCRE throws an error. */
4397

4398
        if (class_range_state == RANGE_FORBID_STARTED)
4399
          {
4400
          ptr = class_range_forbid_ptr;
4401
          errorcode = ERR50;
4402
          goto FAILED;
4403
          }
4404

4405
        /* Disallow implicit union in Perl extended classes. */
4406

4407
        if (class_op_state == CLASS_OP_OPERAND &&
4408
            class_mode_state == CLASS_MODE_PERL_EXT)
4409
          {
4410
          errorcode = ERR113;
4411
          goto FAILED;
4412
          }
4413

4414
        class_range_state = RANGE_FORBID_NO;
4415
        class_op_state = CLASS_OP_OPERAND;
4416
        }
4417

4418
      /* Forbid unescaped literals, and the special meaning of '-', inside a
4419
      Perl extended class. */
4420

4421
      else if (class_mode_state == CLASS_MODE_PERL_EXT)
4422
        {
4423
        errorcode = ERR116;
4424
        goto FAILED;
4425
        }
4426

4427
      /* Handle potential start of range */
4428

4429
      else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
4430
        {
4431
        *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
4432
          META_RANGE_LITERAL : META_RANGE_ESCAPED;
4433
        class_range_state = RANGE_STARTED;
4434
        }
4435

4436
      /* Handle forbidden start of range */
4437

4438
      else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)
4439
        {
4440
        *parsed_pattern++ = CHAR_MINUS;
4441
        class_range_state = RANGE_FORBID_STARTED;
4442
        class_range_forbid_ptr = ptr;
4443
        }
4444

4445
      /* Handle a literal character */
4446

4447
      else
4448
        {
4449
        CLASS_LITERAL:
4450

4451
        /* Disallow implicit union in Perl extended classes. */
4452

4453
        if (class_op_state == CLASS_OP_OPERAND &&
4454
            class_mode_state == CLASS_MODE_PERL_EXT)
4455
          {
4456
          errorcode = ERR113;
4457
          goto FAILED;
4458
          }
4459

4460
        if (class_range_state == RANGE_STARTED)
4461
          {
4462
          if (c == parsed_pattern[-2])       /* Optimize one-char range */
4463
            parsed_pattern--;
4464
          else if (parsed_pattern[-2] > c)   /* Check range is in order */
4465
            {
4466
            errorcode = ERR8;
4467
            goto FAILED_BACK;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
4468
            }
4469
          else
4470
            {
4471
            if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
4472
              parsed_pattern[-1] = META_RANGE_ESCAPED;
4473
            PARSED_LITERAL(c, parsed_pattern);
4474
            }
4475
          class_range_state = RANGE_NO;
4476
          class_op_state = CLASS_OP_OPERAND;
4477
          }
4478
        else if (class_range_state == RANGE_FORBID_STARTED)
4479
          {
4480
          ptr = class_range_forbid_ptr;
4481
          errorcode = ERR50;
4482
          goto FAILED;
4483
          }
4484
        else  /* Potential start of range */
4485
          {
4486
          class_range_state = char_is_literal?
4487
            RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
4488
          class_op_state = CLASS_OP_OPERAND;
4489
          PARSED_LITERAL(c, parsed_pattern);
4490
          }
4491
        }
4492

4493
      /* Proceed to next thing in the class. */
4494

4495
      CLASS_CONTINUE:
4496
      if (ptr >= ptrend)
4497
        {
4498
        if (class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 > 0)
4499
          errorcode = ERR14;   /* Missing terminating ')' */
4500
        if (class_mode_state == CLASS_MODE_ALT_EXT &&
4501
            class_depth_m1 == 0 && class_maxdepth_m1 == 1)
4502
          errorcode = ERR112;  /* Missing terminating ']', but we saw '[ [ ]...' */
4503
        else
4504
          errorcode = ERR6;    /* Missing terminating ']' */
4505
        goto FAILED;
4506
        }
4507
      GETCHARINCTEST(c, ptr);
4508
      }     /* End of class-processing loop */
4509

4510
    break;  /* End of character class */
4511

4512

4513
    /* ---- Opening parenthesis ---- */
4514

4515
    case CHAR_LEFT_PARENTHESIS:
4516
    if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4517

4518
    /* If ( is not followed by ? it is either a capture or a special verb or an
4519
    alpha assertion or a positive non-atomic lookahead. */
4520

4521
    if (*ptr != CHAR_QUESTION_MARK)
4522
      {
4523
      const char *vn;
4524

4525
      /* Handle capturing brackets (or non-capturing if auto-capture is turned
4526
      off). */
4527

4528
      if (*ptr != CHAR_ASTERISK)
4529
        {
4530
        nest_depth++;
4531
        if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
4532
          {
4533
          if (cb->bracount >= MAX_GROUP_NUMBER)
4534
            {
4535
            errorcode = ERR97;
4536
            goto FAILED;
4537
            }
4538
          cb->bracount++;
4539
          *parsed_pattern++ = META_CAPTURE | cb->bracount;
4540
          }
4541
        else *parsed_pattern++ = META_NOCAPTURE;
4542
        }
4543

4544
      /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
4545
      quantifier" error rather than "(*MARK) must have an argument". */
4546

4547
      else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
4548
        break;
4549

4550
      /* Handle "alpha assertions" such as (*pla:...). Most of these are
4551
      synonyms for the historical symbolic assertions, but the script run and
4552
      non-atomic lookaround ones are new. They are distinguished by starting
4553
      with a lower case letter. Checking both ends of the alphabet makes this
4554
      work in all character codes. */
4555

4556
      else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
4557
        {
4558
        uint32_t meta;
4559

4560
        vn = alasnames;
4561
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4562
          &errorcode, cb)) goto FAILED;
4563
        if (ptr >= ptrend || *ptr != CHAR_COLON)
4564
          {
4565
          errorcode = ERR95;  /* Malformed */
4566
          goto FAILED;
4567
          }
4568

4569
        /* Scan the table of alpha assertion names */
4570

4571
        for (i = 0; i < alascount; i++)
4572
          {
4573
          if (namelen == alasmeta[i].len &&
4574
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4575
            break;
4576
          vn += alasmeta[i].len + 1;
4577
          }
4578

4579
        if (i >= alascount)
4580
          {
4581
          errorcode = ERR95;  /* Alpha assertion not recognized */
4582
          goto FAILED;
4583
          }
4584

4585
        /* Check for expecting an assertion condition. If so, only atomic
4586
        lookaround assertions are valid. */
4587

4588
        meta = alasmeta[i].meta;
4589
        if (prev_expect_cond_assert > 0 &&
4590
            (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
4591
          {
4592
          errorcode = ERR28;  /* Atomic assertion expected */
4593
          goto FAILED;
4594
          }
4595

4596
        /* The lookaround alphabetic synonyms can mostly be handled by jumping
4597
        to the code that handles the traditional symbolic forms. */
4598

4599
        switch(meta)
4600
          {
4601
          default:
4602
          PCRE2_DEBUG_UNREACHABLE();
4603
          errorcode = ERR89;  /* Unknown code; should never occur because */
4604
          goto FAILED;        /* the meta values come from a table above. */
4605

4606
          case META_ATOMIC:
4607
          goto ATOMIC_GROUP;
4608

4609
          case META_LOOKAHEAD:
4610
          goto POSITIVE_LOOK_AHEAD;
4611

4612
          case META_LOOKAHEAD_NA:
4613
          goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4614

4615
          case META_LOOKAHEADNOT:
4616
          goto NEGATIVE_LOOK_AHEAD;
4617

4618
          case META_SCS:
4619
          if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4620

4621
          if (*ptr != CHAR_LEFT_PARENTHESIS)
4622
            {
4623
            errorcode = ERR15;
4624
            goto FAILED;
4625
            }
4626

4627
          ptr++;
4628
          *parsed_pattern++ = META_SCS;
4629
          /* Temporary variable, zero in the first iteration. */
4630
          offset = 0;
4631

4632
          for (;;)
4633
            {
4634
            PCRE2_SIZE next_offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4635

4636
            /* Handle (*scan_substring:([+-]number)... */
4637
            if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,
4638
                &i, &errorcode))
4639
              {
4640
              PCRE2_ASSERT(i >= 0);
4641
              if (i <= 0)
4642
                {
4643
                errorcode = ERR15;
4644
                goto FAILED;
4645
                }
4646
              meta = META_SCS_NUMBER;
4647
              namelen = (uint32_t)i;
4648
              }
4649
            else if (errorcode != 0) goto FAILED;   /* Number too big */
4650
            else
4651
              {
4652
              if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4653

4654
              /* Handle (*scan_substring:('name') or (*scan_substring:(<name>) */
4655
              if (*ptr == CHAR_LESS_THAN_SIGN)
4656
                terminator = CHAR_GREATER_THAN_SIGN;
4657
              else if (*ptr == CHAR_APOSTROPHE)
4658
                terminator = CHAR_APOSTROPHE;
4659
              else
4660
                {
4661
                errorcode = ERR15;
4662
                goto FAILED;
4663
                }
4664

4665
              if (!read_name(&ptr, ptrend, utf, terminator, &next_offset,
4666
                  &name, &namelen, &errorcode, cb)) goto FAILED;
4667

4668
              meta = META_SCS_NAME;
4669
              }
4670

4671
            PCRE2_ASSERT(next_offset > 0);
4672
            if (offset == 0 || (next_offset - offset) >= 0x10000)
4673
              {
4674
              *parsed_pattern++ = META_OFFSET;
4675
              PUTOFFSET(next_offset, parsed_pattern);
4676
              offset = next_offset;
4677
              }
4678

4679
            /* The offset is encoded as a relative offset, because for some
4680
            inputs such as ",2" in (*scs:(1,2,3)...), we only have space for
4681
            two uint32_t values, and an opcode and absolute offset may require
4682
            three uint32_t values. */
4683
            *parsed_pattern++ = meta | (uint32_t)(next_offset - offset);
4684
            *parsed_pattern++ = namelen;
4685
            offset = next_offset;
4686

4687
            if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4688

4689
            if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
4690

4691
            if (*ptr != CHAR_COMMA)
4692
              {
4693
              errorcode = ERR24;
4694
              goto FAILED;
4695
              }
4696

4697
            ptr++;
4698
            }
4699
          ptr++;
4700
          goto POST_ASSERTION;
4701

4702
          case META_LOOKBEHIND:
4703
          case META_LOOKBEHINDNOT:
4704
          case META_LOOKBEHIND_NA:
4705
          *parsed_pattern++ = meta;
4706
          ptr--;
4707
          goto POST_LOOKBEHIND;
4708

4709
          /* The script run facilities are handled here. Unicode support is
4710
          required (give an error if not, as this is a security issue). Always
4711
          record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4712
          META_ATOMIC and remember that we need two META_KETs at the end. */
4713

4714
          case META_SCRIPT_RUN:
4715
          case META_ATOMIC_SCRIPT_RUN:
4716
#ifdef SUPPORT_UNICODE
4717
          *parsed_pattern++ = META_SCRIPT_RUN;
4718
          nest_depth++;
4719
          ptr++;
4720
          if (meta == META_ATOMIC_SCRIPT_RUN)
4721
            {
4722
            *parsed_pattern++ = META_ATOMIC;
4723
            if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4724
            else if (++top_nest >= end_nests)
4725
              {
4726
              errorcode = ERR84;
4727
              goto FAILED;
4728
              }
4729
            top_nest->nest_depth = nest_depth;
4730
            top_nest->flags = NSF_ATOMICSR;
4731
            top_nest->options = options & PARSE_TRACKED_OPTIONS;
4732
            top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4733

4734
#ifdef PCRE2_DEBUG
4735
            /* We'll write out two META_KETs for a single ")" in the input
4736
            pattern, so we reserve space for that in our bounds check. */
4737
            parsed_pattern_extra++;
4738
#endif
4739
            }
4740
          break;
4741
#else  /* SUPPORT_UNICODE */
4742
          errorcode = ERR96;
4743
          goto FAILED;
4744
#endif
4745
          }
4746
        }
4747

4748

4749
      /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4750

4751
      else
4752
        {
4753
        vn = verbnames;
4754
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4755
          &errorcode, cb)) goto FAILED;
4756
        if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4757
                              *ptr != CHAR_RIGHT_PARENTHESIS))
4758
          {
4759
          errorcode = ERR60;  /* Malformed */
4760
          goto FAILED;
4761
          }
4762

4763
        /* Scan the table of verb names */
4764

4765
        for (i = 0; i < verbcount; i++)
4766
          {
4767
          if (namelen == verbs[i].len &&
4768
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4769
            break;
4770
          vn += verbs[i].len + 1;
4771
          }
4772

4773
        if (i >= verbcount)
4774
          {
4775
          errorcode = ERR60;  /* Verb not recognized */
4776
          goto FAILED;
4777
          }
4778

4779
        /* An empty argument is treated as no argument. */
4780

4781
        if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4782
             ptr[1] == CHAR_RIGHT_PARENTHESIS)
4783
          ptr++;    /* Advance to the closing parens */
4784

4785
        /* Check for mandatory non-empty argument; this is (*MARK) */
4786

4787
        if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4788
          {
4789
          errorcode = ERR66;
4790
          goto FAILED;
4791
          }
4792

4793
        /* Remember where this verb, possibly with a preceding (*MARK), starts,
4794
        for handling quantified (*ACCEPT). */
4795

4796
        verbstartptr = parsed_pattern;
4797
        okquantifier = (verbs[i].meta == META_ACCEPT);
4798
#ifdef PCRE2_DEBUG
4799
        /* Reserve space in our bounds check for optionally wrapping the (*ACCEPT)
4800
        with a non-capturing bracket, if there is a following quantifier. */
4801
        if (okquantifier) parsed_pattern_extra += 2;
4802
#endif
4803

4804
        /* It appears that Perl allows any characters whatsoever, other than a
4805
        closing parenthesis, to appear in arguments ("names"), so we no longer
4806
        insist on letters, digits, and underscores. Perl does not, however, do
4807
        any interpretation within arguments, and has no means of including a
4808
        closing parenthesis. PCRE supports escape processing but only when it
4809
        is requested by an option. We set inverbname TRUE here, and let the
4810
        main loop take care of this so that escape and \x processing is done by
4811
        the main code above. */
4812

4813
        if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
4814
          {
4815
          /* Some optional arguments can be treated as a preceding (*MARK) */
4816

4817
          if (verbs[i].has_arg < 0)
4818
            {
4819
            add_after_mark = verbs[i].meta;
4820
            *parsed_pattern++ = META_MARK;
4821
            }
4822

4823
          /* The remaining verbs with arguments (except *MARK) need a different
4824
          opcode. */
4825

4826
          else
4827
            {
4828
            *parsed_pattern++ = verbs[i].meta +
4829
              ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4830
            }
4831

4832
          /* Set up for reading the name in the main loop. */
4833

4834
          verblengthptr = parsed_pattern++;
4835
          verbnamestart = ptr;
4836
          inverbname = TRUE;
4837
          }
4838
        else  /* No verb "name" argument */
4839
          {
4840
          *parsed_pattern++ = verbs[i].meta;
4841
          }
4842
        }     /* End of (*VERB) handling */
4843
      break;  /* Done with this parenthesis */
4844
      }       /* End of groups that don't start with (? */
4845

4846

4847
    /* ---- Items starting (? ---- */
4848

4849
    /* The type of item is determined by what follows (?. Handle (?| and option
4850
    changes under "default" because both need a new block on the nest stack.
4851
    Comments starting with (?# are handled above. Note that there is some
4852
    ambiguity about the sequence (?- because if a digit follows it's a relative
4853
    recursion or subroutine call whereas otherwise it's an option unsetting. */
4854

4855
    if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4856

4857
    switch(*ptr)
4858
      {
4859
      default:
4860
      if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4861
        goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
4862

4863
      /* We now have either (?| or a (possibly empty) option setting,
4864
      optionally followed by a non-capturing group. */
4865

4866
      nest_depth++;
4867
      if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4868
      else if (++top_nest >= end_nests)
4869
        {
4870
        errorcode = ERR84;
4871
        goto FAILED;
4872
        }
4873
      top_nest->nest_depth = nest_depth;
4874
      top_nest->flags = 0;
4875
      top_nest->options = options & PARSE_TRACKED_OPTIONS;
4876
      top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4877

4878
      /* Start of non-capturing group that resets the capture count for each
4879
      branch. */
4880

4881
      if (*ptr == CHAR_VERTICAL_LINE)
4882
        {
4883
        top_nest->reset_group = (uint16_t)cb->bracount;
4884
        top_nest->max_group = (uint16_t)cb->bracount;
4885
        top_nest->flags |= NSF_RESET;
4886
        cb->external_flags |= PCRE2_DUPCAPUSED;
4887
        *parsed_pattern++ = META_NOCAPTURE;
4888
        ptr++;
4889
        }
4890

4891
      /* Scan for options imnrsxJU to be set or unset. */
4892

4893
      else
4894
        {
4895
        BOOL hyphenok = TRUE;
4896
        uint32_t oldoptions = options;
4897
        uint32_t oldxoptions = xoptions;
4898

4899
        top_nest->reset_group = 0;
4900
        top_nest->max_group = 0;
4901
        set = unset = 0;
4902
        optset = &set;
4903
        xset = xunset = 0;
4904
        xoptset = &xset;
4905

4906
        /* ^ at the start unsets irmnsx and disables the subsequent use of - */
4907

4908
        if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4909
          {
4910
          options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4911
                       PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4912
          xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
4913
          hyphenok = FALSE;
4914
          ptr++;
4915
          }
4916

4917
        while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4918
                               *ptr != CHAR_COLON)
4919
          {
4920
          switch (*ptr++)
4921
            {
4922
            case CHAR_MINUS:
4923
            if (!hyphenok)
4924
              {
4925
              errorcode = ERR94;
4926
              ptr--;  /* Correct the offset */
4927
              goto FAILED;
4928
              }
4929
            optset = &unset;
4930
            xoptset = &xunset;
4931
            hyphenok = FALSE;
4932
            break;
4933

4934
            /* There are some two-character sequences that start with 'a'. */
4935

4936
            case CHAR_a:
4937
            if (ptr < ptrend)
4938
              {
4939
              if (*ptr == CHAR_D)
4940
                {
4941
                *xoptset |= PCRE2_EXTRA_ASCII_BSD;
4942
                ptr++;
4943
                break;
4944
                }
4945
              if (*ptr == CHAR_P)
4946
                {
4947
                *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
4948
                ptr++;
4949
                break;
4950
                }
4951
              if (*ptr == CHAR_S)
4952
                {
4953
                *xoptset |= PCRE2_EXTRA_ASCII_BSS;
4954
                ptr++;
4955
                break;
4956
                }
4957
              if (*ptr == CHAR_T)
4958
                {
4959
                *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
4960
                ptr++;
4961
                break;
4962
                }
4963
              if (*ptr == CHAR_W)
4964
                {
4965
                *xoptset |= PCRE2_EXTRA_ASCII_BSW;
4966
                ptr++;
4967
                break;
4968
                }
4969
              }
4970
            *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
4971
                        PCRE2_EXTRA_ASCII_BSW|
4972
                        PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
4973
            break;
4974

4975
            case CHAR_J:  /* Record that it changed in the external options */
4976
            *optset |= PCRE2_DUPNAMES;
4977
            cb->external_flags |= PCRE2_JCHANGED;
4978
            break;
4979

4980
            case CHAR_i: *optset |= PCRE2_CASELESS; break;
4981
            case CHAR_m: *optset |= PCRE2_MULTILINE; break;
4982
            case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
4983
            case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
4984
            case CHAR_s: *optset |= PCRE2_DOTALL; break;
4985
            case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4986

4987
            /* If x appears twice it sets the extended extended option. */
4988

4989
            case CHAR_x:
4990
            *optset |= PCRE2_EXTENDED;
4991
            if (ptr < ptrend && *ptr == CHAR_x)
4992
              {
4993
              *optset |= PCRE2_EXTENDED_MORE;
4994
              ptr++;
4995
              }
4996
            break;
4997

4998
            default:
4999
            errorcode = ERR11;
5000
            ptr--;    /* Correct the offset */
5001
            goto FAILED;
5002
            }
5003
          }
5004

5005
        /* If we are setting extended without extended-more, ensure that any
5006
        existing extended-more gets unset. Also, unsetting extended must also
5007
        unset extended-more. */
5008

5009
        if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
5010
            (unset & PCRE2_EXTENDED) != 0)
5011
          unset |= PCRE2_EXTENDED_MORE;
5012

5013
        options = (options | set) & (~unset);
5014
        xoptions = (xoptions | xset) & (~xunset);
5015

5016
        /* If the options ended with ')' this is not the start of a nested
5017
        group with option changes, so the options change at this level.
5018
        In this case, if the previous level set up a nest block, discard the
5019
        one we have just created. Otherwise adjust it for the previous level.
5020
        If the options ended with ':' we are starting a non-capturing group,
5021
        possibly with an options setting. */
5022

5023
        if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5024
        if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
5025
          {
5026
          nest_depth--;  /* This is not a nested group after all. */
5027
          if (top_nest > (nest_save *)(cb->start_workspace) &&
5028
              (top_nest-1)->nest_depth == nest_depth) top_nest--;
5029
          else top_nest->nest_depth = nest_depth;
5030
          }
5031
        else *parsed_pattern++ = META_NOCAPTURE;
5032

5033
        /* If nothing changed, no need to record. */
5034

5035
        if (options != oldoptions || xoptions != oldxoptions)
5036
          {
5037
          *parsed_pattern++ = META_OPTIONS;
5038
          *parsed_pattern++ = options;
5039
          *parsed_pattern++ = xoptions;
5040
          }
5041
        }     /* End options processing */
5042
      break;  /* End default case after (? */
5043

5044

5045
      /* ---- Python syntax support ---- */
5046

5047
      case CHAR_P:
5048
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5049

5050
      /* (?P<name> is the same as (?<name>, which defines a named group. */
5051

5052
      if (*ptr == CHAR_LESS_THAN_SIGN)
5053
        {
5054
        terminator = CHAR_GREATER_THAN_SIGN;
5055
        goto DEFINE_NAME;
5056
        }
5057

5058
      /* (?P>name) is the same as (?&name), which is a recursion or subroutine
5059
      call. */
5060

5061
      if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
5062

5063
      /* (?P=name) is the same as \k<name>, a back reference by name. Anything
5064
      else after (?P is an error. */
5065

5066
      if (*ptr != CHAR_EQUALS_SIGN)
5067
        {
5068
        errorcode = ERR41;
5069
        goto FAILED;
5070
        }
5071
      if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
5072
          &namelen, &errorcode, cb)) goto FAILED;
5073
      *parsed_pattern++ = META_BACKREF_BYNAME;
5074
      *parsed_pattern++ = namelen;
5075
      PUTOFFSET(offset, parsed_pattern);
5076
      okquantifier = TRUE;
5077
      break;   /* End of (?P processing */
5078

5079

5080
      /* ---- Recursion/subroutine calls by number ---- */
5081

5082
      case CHAR_R:
5083
      i = 0;         /* (?R) == (?R0) */
5084
      ptr++;
5085
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5086
        {
5087
        errorcode = ERR58;
5088
        goto FAILED;
5089
        }
5090
      goto SET_RECURSION;
5091

5092
      /* An item starting (?- followed by a digit comes here via the "default"
5093
      case because (?- followed by a non-digit is an options setting. */
5094

5095
      case CHAR_PLUS:
5096
      if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
5097
        {
5098
        errorcode = ERR29;   /* Missing number */
5099
        goto FAILED;
5100
        }
5101
      /* Fall through */
5102

5103
      case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5104
      case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5105
      RECURSION_BYNUMBER:
5106
      if (!read_number(&ptr, ptrend,
5107
          (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
5108
          MAX_GROUP_NUMBER, ERR61,
5109
          &i, &errorcode)) goto FAILED;
5110
      PCRE2_ASSERT(i >= 0);  /* NB (?0) is permitted, represented by i=0 */
5111
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5112
        goto UNCLOSED_PARENTHESIS;
5113

5114
      SET_RECURSION:
5115
      *parsed_pattern++ = META_RECURSE | (uint32_t)i;
5116
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5117
      ptr++;
5118
      PUTOFFSET(offset, parsed_pattern);
5119
      okquantifier = TRUE;
5120
      break;  /* End of recursive call by number handling */
5121

5122

5123
      /* ---- Recursion/subroutine calls by name ---- */
5124

5125
      case CHAR_AMPERSAND:
5126
      RECURSE_BY_NAME:
5127
      if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
5128
          &namelen, &errorcode, cb)) goto FAILED;
5129
      *parsed_pattern++ = META_RECURSE_BYNAME;
5130
      *parsed_pattern++ = namelen;
5131
      PUTOFFSET(offset, parsed_pattern);
5132
      okquantifier = TRUE;
5133
      break;
5134

5135
      /* ---- Callout with numerical or string argument ---- */
5136

5137
      case CHAR_C:
5138
      if ((xoptions & PCRE2_EXTRA_NEVER_CALLOUT) != 0)
5139
        {
5140
        errorcode = ERR103;
5141
        goto FAILED;
5142
        }
5143

5144
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5145

5146
      /* If the previous item was a condition starting (?(? an assertion,
5147
      optionally preceded by a callout, is expected. This is checked later on,
5148
      during actual compilation. However we need to identify this kind of
5149
      assertion in this pass because it must not be qualified. The value of
5150
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5151
      for a callout - still leaving a positive value that identifies the
5152
      assertion. Multiple callouts or any other items will make it zero or
5153
      less, which doesn't matter because they will cause an error later. */
5154

5155
      expect_cond_assert = prev_expect_cond_assert - 1;
5156

5157
      /* If previous_callout is not NULL, it means this follows a previous
5158
      callout. If it was a manual callout, do nothing; this means its "length
5159
      of next pattern item" field will remain zero. If it was an automatic
5160
      callout, abolish it. */
5161

5162
      if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
5163
          previous_callout == parsed_pattern - 4 &&
5164
          parsed_pattern[-1] == 255)
5165
        parsed_pattern = previous_callout;
5166

5167
      /* Save for updating next pattern item length, and skip one item before
5168
      completing. */
5169

5170
      previous_callout = parsed_pattern;
5171
      after_manual_callout = 1;
5172

5173
      /* Handle a string argument; specific delimiter is required. */
5174

5175
      if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
5176
        {
5177
        PCRE2_SIZE calloutlength;
5178
        PCRE2_SPTR startptr = ptr;
5179

5180
        delimiter = 0;
5181
        for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
5182
          {
5183
          if (*ptr == PRIV(callout_start_delims)[i])
5184
            {
5185
            delimiter = PRIV(callout_end_delims)[i];
5186
            break;
5187
            }
5188
          }
5189
        if (delimiter == 0)
5190
          {
5191
          errorcode = ERR82;
5192
          goto FAILED;
5193
          }
5194

5195
        *parsed_pattern = META_CALLOUT_STRING;
5196
        parsed_pattern += 3;   /* Skip pattern info */
5197

5198
        for (;;)
5199
          {
5200
          if (++ptr >= ptrend)
5201
            {
5202
            errorcode = ERR81;
5203
            ptr = startptr;   /* To give a more useful message */
5204
            goto FAILED;
5205
            }
5206
          if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
5207
            break;
5208
          }
5209

5210
        calloutlength = (PCRE2_SIZE)(ptr - startptr);
5211
        if (calloutlength > UINT32_MAX)
5212
          {
5213
          errorcode = ERR72;
5214
          goto FAILED;
5215
          }
5216
        *parsed_pattern++ = (uint32_t)calloutlength;
5217
        offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
5218
        PUTOFFSET(offset, parsed_pattern);
5219
        }
5220

5221
      /* Handle a callout with an optional numerical argument, which must be
5222
      less than or equal to 255. A missing argument gives 0. */
5223

5224
      else
5225
        {
5226
        int n = 0;
5227
        *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
5228
        parsed_pattern += 3;                       /* Skip pattern info */
5229
        while (ptr < ptrend && IS_DIGIT(*ptr))
5230
          {
5231
          n = n * 10 + (*ptr++ - CHAR_0);
5232
          if (n > 255)
5233
            {
5234
            errorcode = ERR38;
5235
            goto FAILED;
5236
            }
5237
          }
5238
        *parsed_pattern++ = n;
5239
        }
5240

5241
      /* Both formats must have a closing parenthesis */
5242

5243
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5244
        {
5245
        errorcode = ERR39;
5246
        goto FAILED;
5247
        }
5248
      ptr++;
5249

5250
      /* Remember the offset to the next item in the pattern, and set a default
5251
      length. This should get updated after the next item is read. */
5252

5253
      previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
5254
      previous_callout[2] = 0;
5255
      break;                  /* End callout */
5256

5257

5258
      /* ---- Conditional group ---- */
5259

5260
      /* A condition can be an assertion, a number (referring to a numbered
5261
      group's having been set), a name (referring to a named group), or 'R',
5262
      referring to overall recursion. R<digits> and R&name are also permitted
5263
      for recursion state tests. Numbers may be preceded by + or - to specify a
5264
      relative group number.
5265

5266
      There are several syntaxes for testing a named group: (?(name)) is used
5267
      by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5268

5269
      There are two unfortunate ambiguities. 'R' can be the recursive thing or
5270
      the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
5271
      the Perl DEFINE feature or the Python named test. We look for a name
5272
      first; if not found, we try the other case.
5273

5274
      For compatibility with auto-callouts, we allow a callout to be specified
5275
      before a condition that is an assertion. */
5276

5277
      case CHAR_LEFT_PARENTHESIS:
5278
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5279
      nest_depth++;
5280

5281
      /* If the next character is ? or * there must be an assertion next
5282
      (optionally preceded by a callout). We do not check this here, but
5283
      instead we set expect_cond_assert to 2. If this is still greater than
5284
      zero (callouts decrement it) when the next assertion is read, it will be
5285
      marked as a condition that must not be repeated. A value greater than
5286
      zero also causes checking that an assertion (possibly with callout)
5287
      follows. */
5288

5289
      if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
5290
        {
5291
        *parsed_pattern++ = META_COND_ASSERT;
5292
        ptr--;   /* Pull pointer back to the opening parenthesis. */
5293
        expect_cond_assert = 2;
5294
        break;  /* End of conditional */
5295
        }
5296

5297
      /* Handle (?([+-]number)... */
5298

5299
      if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
5300
          &errorcode))
5301
        {
5302
        PCRE2_ASSERT(i >= 0);
5303
        if (i <= 0)
5304
          {
5305
          errorcode = ERR15;
5306
          goto FAILED;
5307
          }
5308
        *parsed_pattern++ = META_COND_NUMBER;
5309
        offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5310
        PUTOFFSET(offset, parsed_pattern);
5311
        *parsed_pattern++ = i;
5312
        }
5313
      else if (errorcode != 0) goto FAILED;   /* Number too big */
5314

5315
      /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
5316

5317
      else if (ptrend - ptr >= 10 &&
5318
               PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
5319
               ptr[7] != CHAR_RIGHT_PARENTHESIS)
5320
        {
5321
        uint32_t ge = 0;
5322
        int major = 0;
5323
        int minor = 0;
5324

5325
        ptr += 7;
5326
        if (*ptr == CHAR_GREATER_THAN_SIGN)
5327
          {
5328
          ge = 1;
5329
          ptr++;
5330
          }
5331

5332
        /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
5333
        references its argument twice. */
5334

5335
        if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
5336
          goto BAD_VERSION_CONDITION;
5337

5338
        if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
5339
          goto FAILED;
5340

5341
        if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
5342
        if (*ptr == CHAR_DOT)
5343
          {
5344
          if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
5345
          minor = (*ptr++ - CHAR_0) * 10;
5346
          if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
5347
          if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
5348
          if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5349
            goto BAD_VERSION_CONDITION;
5350
          }
5351

5352
        *parsed_pattern++ = META_COND_VERSION;
5353
        *parsed_pattern++ = ge;
5354
        *parsed_pattern++ = major;
5355
        *parsed_pattern++ = minor;
5356
        }
5357

5358
      /* All the remaining cases now require us to read a name. We cannot at
5359
      this stage distinguish ambiguous cases such as (?(R12) which might be a
5360
      recursion test by number or a name, because the named groups have not yet
5361
      all been identified. Those cases are treated as names, but given a
5362
      different META code. */
5363

5364
      else
5365
        {
5366
        BOOL was_r_ampersand = FALSE;
5367

5368
        if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
5369
          {
5370
          terminator = CHAR_RIGHT_PARENTHESIS;
5371
          was_r_ampersand = TRUE;
5372
          ptr++;
5373
          }
5374
        else if (*ptr == CHAR_LESS_THAN_SIGN)
5375
          terminator = CHAR_GREATER_THAN_SIGN;
5376
        else if (*ptr == CHAR_APOSTROPHE)
5377
          terminator = CHAR_APOSTROPHE;
5378
        else
5379
          {
5380
          terminator = CHAR_RIGHT_PARENTHESIS;
5381
          ptr--;   /* Point to char before name */
5382
          }
5383
        if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5384
            &errorcode, cb)) goto FAILED;
5385

5386
        /* Handle (?(R&name) */
5387

5388
        if (was_r_ampersand)
5389
          {
5390
          *parsed_pattern = META_COND_RNAME;
5391
          ptr--;   /* Back to closing parens */
5392
          }
5393

5394
        /* Handle (?(name). If the name is "DEFINE" we identify it with a
5395
        special code. Likewise if the name consists of R followed only by
5396
        digits. Otherwise, handle it like a quoted name. */
5397

5398
        else if (terminator == CHAR_RIGHT_PARENTHESIS)
5399
          {
5400
          if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
5401
            *parsed_pattern = META_COND_DEFINE;
5402
          else
5403
            {
5404
            for (i = 1; i < (int)namelen; i++)
5405
              if (!IS_DIGIT(name[i])) break;
5406
            *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
5407
              META_COND_RNUMBER : META_COND_NAME;
5408
            }
5409
          ptr--;   /* Back to closing parens */
5410
          }
5411

5412
        /* Handle (?('name') or (?(<name>) */
5413

5414
        else *parsed_pattern = META_COND_NAME;
5415

5416
        /* All these cases except DEFINE end with the name length and offset;
5417
        DEFINE just has an offset (for the "too many branches" error). */
5418

5419
        if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
5420
        PUTOFFSET(offset, parsed_pattern);
5421
        }  /* End cases that read a name */
5422

5423
      /* Check the closing parenthesis of the condition */
5424

5425
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5426
        {
5427
        errorcode = ERR24;
5428
        goto FAILED;
5429
        }
5430
      ptr++;
5431
      break;  /* End of condition processing */
5432

5433

5434
      /* ---- Atomic group ---- */
5435

5436
      case CHAR_GREATER_THAN_SIGN:
5437
      ATOMIC_GROUP:                          /* Come from (*atomic: */
5438
      *parsed_pattern++ = META_ATOMIC;
5439
      nest_depth++;
5440
      ptr++;
5441
      break;
5442

5443

5444
      /* ---- Lookahead assertions ---- */
5445

5446
      case CHAR_EQUALS_SIGN:
5447
      POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
5448
      *parsed_pattern++ = META_LOOKAHEAD;
5449
      ptr++;
5450
      goto POST_ASSERTION;
5451

5452
      case CHAR_ASTERISK:
5453
      POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (*napla: */
5454
      *parsed_pattern++ = META_LOOKAHEAD_NA;
5455
      ptr++;
5456
      goto POST_ASSERTION;
5457

5458
      case CHAR_EXCLAMATION_MARK:
5459
      NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
5460
      *parsed_pattern++ = META_LOOKAHEADNOT;
5461
      ptr++;
5462
      goto POST_ASSERTION;
5463

5464

5465
      /* ---- Lookbehind assertions ---- */
5466

5467
      /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
5468
      is the start of the name of a capturing group. */
5469

5470
      case CHAR_LESS_THAN_SIGN:
5471
      if (ptrend - ptr <= 1 ||
5472
         (ptr[1] != CHAR_EQUALS_SIGN &&
5473
          ptr[1] != CHAR_EXCLAMATION_MARK &&
5474
          ptr[1] != CHAR_ASTERISK))
5475
        {
5476
        terminator = CHAR_GREATER_THAN_SIGN;
5477
        goto DEFINE_NAME;
5478
        }
5479
      *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
5480
        META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
5481
        META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
5482

5483
      POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
5484
      *has_lookbehind = TRUE;
5485
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5486
      PUTOFFSET(offset, parsed_pattern);
5487
      ptr += 2;
5488
      /* Fall through */
5489

5490
      /* If the previous item was a condition starting (?(? an assertion,
5491
      optionally preceded by a callout, is expected. This is checked later on,
5492
      during actual compilation. However we need to identify this kind of
5493
      assertion in this pass because it must not be qualified. The value of
5494
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5495
      for a callout - still leaving a positive value that identifies the
5496
      assertion. Multiple callouts or any other items will make it zero or
5497
      less, which doesn't matter because they will cause an error later. */
5498

5499
      POST_ASSERTION:
5500
      nest_depth++;
5501
      if (prev_expect_cond_assert > 0)
5502
        {
5503
        if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
5504
        else if (++top_nest >= end_nests)
5505
          {
5506
          errorcode = ERR84;
5507
          goto FAILED;
5508
          }
5509
        top_nest->nest_depth = nest_depth;
5510
        top_nest->flags = NSF_CONDASSERT;
5511
        top_nest->options = options & PARSE_TRACKED_OPTIONS;
5512
        top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
5513
        }
5514
      break;
5515

5516

5517
      /* ---- Define a named group ---- */
5518

5519
      /* A named group may be defined as (?'name') or (?<name>). In the latter
5520
      case we jump to DEFINE_NAME from the disambiguation of (?< above with the
5521
      terminator set to '>'. */
5522

5523
      case CHAR_APOSTROPHE:
5524
      terminator = CHAR_APOSTROPHE;    /* Terminator */
5525

5526
      DEFINE_NAME:
5527
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5528
          &errorcode, cb)) goto FAILED;
5529

5530
      /* We have a name for this capturing group. It is also assigned a number,
5531
      which is its primary means of identification. */
5532

5533
      if (cb->bracount >= MAX_GROUP_NUMBER)
5534
        {
5535
        errorcode = ERR97;
5536
        goto FAILED;
5537
        }
5538
      cb->bracount++;
5539
      *parsed_pattern++ = META_CAPTURE | cb->bracount;
5540
      nest_depth++;
5541

5542
      /* Check not too many names */
5543

5544
      if (cb->names_found >= MAX_NAME_COUNT)
5545
        {
5546
        errorcode = ERR49;
5547
        goto FAILED;
5548
        }
5549

5550
      /* Adjust the entry size to accommodate the longest name found. */
5551

5552
      if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
5553
        cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
5554

5555
      /* Scan the list to check for duplicates. For duplicate names, if the
5556
      number is the same, break the loop, which causes the name to be
5557
      discarded; otherwise, if DUPNAMES is not set, give an error.
5558
      If it is set, allow the name with a different number, but continue
5559
      scanning in case this is a duplicate with the same number. For
5560
      non-duplicate names, give an error if the number is duplicated. */
5561

5562
      isdupname = FALSE;
5563
      ng = cb->named_groups;
5564
      for (i = 0; i < cb->names_found; i++, ng++)
5565
        {
5566
        if (namelen == ng->length &&
5567
            PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
5568
          {
5569
          if (ng->number == cb->bracount) break;
5570
          if ((options & PCRE2_DUPNAMES) == 0)
5571
            {
5572
            errorcode = ERR43;
5573
            goto FAILED;
5574
            }
5575
          isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
5576
          cb->dupnames = TRUE;              /* Duplicate names exist */
5577
          }
5578
        else if (ng->number == cb->bracount)
5579
          {
5580
          errorcode = ERR65;
5581
          goto FAILED;
5582
          }
5583
        }
5584

5585
      if (i < cb->names_found) break;   /* Ignore duplicate with same number */
5586

5587
      /* Increase the list size if necessary */
5588

5589
      if (cb->names_found >= cb->named_group_list_size)
5590
        {
5591
        uint32_t newsize = cb->named_group_list_size * 2;
5592
        named_group *newspace =
5593
          cb->cx->memctl.malloc(newsize * sizeof(named_group),
5594
          cb->cx->memctl.memory_data);
5595
        if (newspace == NULL)
5596
          {
5597
          errorcode = ERR21;
5598
          goto FAILED;
5599
          }
5600

5601
        memcpy(newspace, cb->named_groups,
5602
          cb->named_group_list_size * sizeof(named_group));
5603
        if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
5604
          cb->cx->memctl.free((void *)cb->named_groups,
5605
          cb->cx->memctl.memory_data);
5606
        cb->named_groups = newspace;
5607
        cb->named_group_list_size = newsize;
5608
        }
5609

5610
      /* Add this name to the list */
5611

5612
      cb->named_groups[cb->names_found].name = name;
5613
      cb->named_groups[cb->names_found].length = (uint16_t)namelen;
5614
      cb->named_groups[cb->names_found].number = cb->bracount;
5615
      cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
5616
      cb->names_found++;
5617
      break;
5618

5619

5620
      /* ---- Perl extended character class ---- */
5621

5622
      /* These are of the form '(?[...])'. We handle these via the same parser
5623
      that consumes ordinary '[...]' classes, but with a flag set to activate
5624
      the extended behaviour. */
5625

5626
      case CHAR_LEFT_SQUARE_BRACKET:
5627
      class_mode_state = CLASS_MODE_PERL_EXT;
5628
      c = *ptr++;
5629
      goto FROM_PERL_EXTENDED_CLASS;
5630
      }        /* End of (? switch */
5631
    break;     /* End of ( handling */
5632

5633

5634
    /* ---- Branch terminators ---- */
5635

5636
    /* Alternation: reset the capture count if we are in a (?| group. */
5637

5638
    case CHAR_VERTICAL_LINE:
5639
    if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
5640
        (top_nest->flags & NSF_RESET) != 0)
5641
      {
5642
      if (cb->bracount > top_nest->max_group)
5643
        top_nest->max_group = (uint16_t)cb->bracount;
5644
      cb->bracount = top_nest->reset_group;
5645
      }
5646
    *parsed_pattern++ = META_ALT;
5647
    break;
5648

5649
    /* End of group; reset the capture count to the maximum if we are in a (?|
5650
    group and/or reset the options that are tracked during parsing. Disallow
5651
    quantifier for a condition that is an assertion. */
5652

5653
    case CHAR_RIGHT_PARENTHESIS:
5654
    okquantifier = TRUE;
5655
    if (top_nest != NULL && top_nest->nest_depth == nest_depth)
5656
      {
5657
      options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
5658
      xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
5659
      if ((top_nest->flags & NSF_RESET) != 0 &&
5660
          top_nest->max_group > cb->bracount)
5661
        cb->bracount = top_nest->max_group;
5662
      if ((top_nest->flags & NSF_CONDASSERT) != 0)
5663
        okquantifier = FALSE;
5664

5665
      if ((top_nest->flags & NSF_ATOMICSR) != 0)
5666
        {
5667
        *parsed_pattern++ = META_KET;
5668

5669
#ifdef PCRE2_DEBUG
5670
        PCRE2_ASSERT(parsed_pattern_extra > 0);
5671
        parsed_pattern_extra--;
5672
#endif
5673
        }
5674

5675
      if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
5676
        else top_nest--;
5677
      }
5678
    if (nest_depth == 0)    /* Unmatched closing parenthesis */
5679
      {
5680
      errorcode = ERR22;
5681
      goto FAILED_BACK;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
5682
      }
5683
    nest_depth--;
5684
    *parsed_pattern++ = META_KET;
5685
    break;
5686
    }  /* End of switch on pattern character */
5687
  }    /* End of main character scan loop */
5688

5689
/* End of pattern reached. Check for missing ) at the end of a verb name. */
5690

5691
if (inverbname && ptr >= ptrend)
5692
  {
5693
  errorcode = ERR60;
5694
  goto FAILED;
5695
  }
5696

5697

5698
PARSED_END:
5699

5700
PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
5701
             (parsed_pattern_extra - parsed_pattern_extra_check) <=
5702
               max_parsed_pattern(ptr_check, ptr, utf, options));
5703

5704
/* Manage callout for the final item */
5705

5706
parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
5707
  parsed_pattern, cb);
5708

5709
/* Insert trailing items for word and line matching (features provided for the
5710
benefit of pcre2grep). */
5711

5712
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
5713
  {
5714
  *parsed_pattern++ = META_KET;
5715
  *parsed_pattern++ = META_DOLLAR;
5716
  }
5717
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5718
  {
5719
  *parsed_pattern++ = META_KET;
5720
  *parsed_pattern++ = META_ESCAPE + ESC_b;
5721
  }
5722

5723
/* Terminate the parsed pattern, then return success if all groups are closed.
5724
Otherwise we have unclosed parentheses. */
5725

5726
if (parsed_pattern >= parsed_pattern_end)
5727
  {
5728
  PCRE2_DEBUG_UNREACHABLE();
5729
  errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
5730
  goto FAILED;
5731
  }
5732

5733
*parsed_pattern = META_END;
5734
if (nest_depth == 0) return 0;
5735

5736
UNCLOSED_PARENTHESIS:
5737
errorcode = ERR14;
5738

5739
/* Come here for all failures. */
5740

5741
FAILED:
5742
cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5743
return errorcode;
5744

5745
/* Some errors need to indicate the previous character. */
5746

5747
FAILED_BACK:
5748
ptr--;
5749
goto FAILED;
5750

5751
/* This failure happens several times. */
5752

5753
BAD_VERSION_CONDITION:
5754
errorcode = ERR79;
5755
goto FAILED;
5756
}
5757

5758

5759

5760
/*************************************************
*       Find first significant opcode            *
*************************************************/

/* This is called by several functions that scan a compiled expression looking
for a fixed first character, or an anchoring opcode etc. It skips over things
that do not influence this. For some calls, it makes sense to skip negative
forward and all backward assertions, and also the \b assertion; for others it
does not.

Arguments:
  code         pointer to the start of the group
  skipassert   TRUE if certain assertions are to be skipped

Returns:       pointer to the first significant opcode
*/
5776

5777
static const PCRE2_UCHAR*
5778
first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5779
{
5780
for (;;)
5781
  {
5782
  switch ((int)*code)
5783
    {
5784
    case OP_ASSERT_NOT:
5785
    case OP_ASSERTBACK:
5786
    case OP_ASSERTBACK_NOT:
5787
    case OP_ASSERTBACK_NA:
5788
    if (!skipassert) return code;
5789
    do code += GET(code, 1); while (*code == OP_ALT);
5790
    code += PRIV(OP_lengths)[*code];
5791
    break;
5792

5793
    case OP_WORD_BOUNDARY:
5794
    case OP_NOT_WORD_BOUNDARY:
5795
    case OP_UCP_WORD_BOUNDARY:
5796
    case OP_NOT_UCP_WORD_BOUNDARY:
5797
    if (!skipassert) return code;
5798
    /* Fall through */
5799

5800
    case OP_CALLOUT:
5801
    case OP_CREF:
5802
    case OP_DNCREF:
5803
    case OP_RREF:
5804
    case OP_DNRREF:
5805
    case OP_FALSE:
5806
    case OP_TRUE:
5807
    code += PRIV(OP_lengths)[*code];
5808
    break;
5809

5810
    case OP_CALLOUT_STR:
5811
    code += GET(code, 1 + 2*LINK_SIZE);
5812
    break;
5813

5814
    case OP_SKIPZERO:
5815
    code += 2 + GET(code, 2) + LINK_SIZE;
5816
    break;
5817

5818
    case OP_COND:
5819
    case OP_SCOND:
5820
    if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
5821
        code[GET(code, 1)] != OP_KET)      /* More than one branch */
5822
      return code;
5823
    code += GET(code, 1) + 1 + LINK_SIZE;
5824
    break;
5825

5826
    case OP_MARK:
5827
    case OP_COMMIT_ARG:
5828
    case OP_PRUNE_ARG:
5829
    case OP_SKIP_ARG:
5830
    case OP_THEN_ARG:
5831
    code += code[1] + PRIV(OP_lengths)[*code];
5832
    break;
5833

5834
    default:
5835
    return code;
5836
    }
5837
  }
5838

5839
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
5840
}
5841

5842

5843

5844
/*************************************************
*    Find details of duplicate group names       *
*************************************************/

/* This is called from compile_branch() when it needs to know the index and
count of duplicates in the names table when processing named backreferences,
either directly, or as conditions.

Arguments:
  name          points to the name
  length        the length of the name
  indexptr      where to put the index
  countptr      where to put the count of duplicates
  errorcodeptr  where to put an error code
  cb            the compile block

Returns:        TRUE if OK, FALSE if not, error code set
*/
5862

5863
static BOOL
5864
find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5865
  int *countptr, int *errorcodeptr, compile_block *cb)
5866
{
5867
uint32_t i, groupnumber;
5868
int count;
5869
PCRE2_UCHAR *slot = cb->name_table;
5870

5871
/* Find the first entry in the table */
5872

5873
for (i = 0; i < cb->names_found; i++)
5874
  {
5875
  if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5876
      slot[IMM2_SIZE+length] == 0) break;
5877
  slot += cb->name_entry_size;
5878
  }
5879

5880
/* This should not occur, because this function is called only when we know we
5881
have duplicate names. Give an internal error. */
5882

5883
if (i >= cb->names_found)
5884
  {
5885
  PCRE2_DEBUG_UNREACHABLE();
5886
  *errorcodeptr = ERR53;
5887
  cb->erroroffset = name - cb->start_pattern;
5888
  return FALSE;
5889
  }
5890

5891
/* Record the index and then see how many duplicates there are, updating the
5892
backref map and maximum back reference as we do. */
5893

5894
*indexptr = i;
5895
count = 0;
5896

5897
for (;;)
5898
  {
5899
  count++;
5900
  groupnumber = GET2(slot,0);
5901
  cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5902
  if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5903
  if (++i >= cb->names_found) break;
5904
  slot += cb->name_entry_size;
5905
  if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5906
    (slot+IMM2_SIZE)[length] != 0) break;
5907
  }
5908

5909
*countptr = count;
5910
return TRUE;
5911
}
5912

5913

5914

5915
/*************************************************
5916
*           Compile one branch                   *
5917
*************************************************/
5918

5919
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5920
the options are changed during the branch, the pointer is used to change the
5921
external options bits. This function is used during the pre-compile phase when
5922
we are trying to find out the amount of memory needed, as well as during the
5923
real compile phase. The value of lengthptr distinguishes the two phases.
5924

5925
Arguments:
5926
  optionsptr        pointer to the option bits
5927
  xoptionsptr       pointer to the extra option bits
5928
  codeptr           points to the pointer to the current code point
5929
  pptrptr           points to the current parsed pattern pointer
5930
  errorcodeptr      points to error code variable
5931
  firstcuptr        place to put the first required code unit
5932
  firstcuflagsptr   place to put the first code unit flags
5933
  reqcuptr          place to put the last required code unit
5934
  reqcuflagsptr     place to put the last required code unit flags
5935
  bcptr             points to current branch chain
5936
  open_caps         points to current capitem
5937
  cb                contains pointers to tables etc.
5938
  lengthptr         NULL during the real compile phase
5939
                    points to length accumulator during pre-compile phase
5940

5941
Returns:            0 There's been an error, *errorcodeptr is non-zero
5942
                   +1 Success, this branch must match at least one character
5943
                   -1 Success, this branch may match an empty string
5944
*/
5945

5946
static int
5947
compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
5948
  PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
5949
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
5950
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
5951
  compile_block *cb, PCRE2_SIZE *lengthptr)
5952
{
5953
int bravalue = 0;
5954
int okreturn = -1;
5955
int group_return = 0;
5956
uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
5957
uint32_t greedy_default, greedy_non_default;
5958
uint32_t repeat_type, op_type;
5959
uint32_t options = *optionsptr;               /* May change dynamically */
5960
uint32_t xoptions = *xoptionsptr;             /* May change dynamically */
5961
uint32_t firstcu, reqcu;
5962
uint32_t zeroreqcu, zerofirstcu;
5963
uint32_t *pptr = *pptrptr;
5964
uint32_t meta, meta_arg;
5965
uint32_t firstcuflags, reqcuflags;
5966
uint32_t zeroreqcuflags, zerofirstcuflags;
5967
uint32_t req_caseopt, reqvary, tempreqvary;
5968
/* Some opcodes, such as META_SCS_NUMBER or META_SCS_NAME,
5969
depend on the previous value of offset. */
5970
PCRE2_SIZE offset = 0;
5971
PCRE2_SIZE length_prevgroup = 0;
5972
PCRE2_UCHAR *code = *codeptr;
5973
PCRE2_UCHAR *last_code = code;
5974
PCRE2_UCHAR *orig_code = code;
5975
PCRE2_UCHAR *tempcode;
5976
PCRE2_UCHAR *previous = NULL;
5977
PCRE2_UCHAR op_previous;
5978
BOOL groupsetfirstcu = FALSE;
5979
BOOL had_accept = FALSE;
5980
BOOL matched_char = FALSE;
5981
BOOL previous_matched_char = FALSE;
5982
BOOL reset_caseful = FALSE;
5983

5984
/* We can fish out the UTF setting once and for all into a BOOL, but we must
5985
not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
5986
as we process the pattern. */
5987

5988
#ifdef SUPPORT_UNICODE
5989
BOOL utf = (options & PCRE2_UTF) != 0;
5990
BOOL ucp = (options & PCRE2_UCP) != 0;
5991
#else  /* No Unicode support */
5992
BOOL utf = FALSE;
5993
#endif
5994

5995
/* Set up the default and non-default settings for greediness */
5996

5997
greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5998
greedy_non_default = greedy_default ^ 1;
5999

6000
/* Initialize no first unit, no required unit. REQ_UNSET means "no char
6001
matching encountered yet". It gets changed to REQ_NONE if we hit something that
6002
matches a non-fixed first unit; reqcu just remains unset if we never find one.
6003

6004
When we hit a repeat whose minimum is zero, we may have to adjust these values
6005
to take the zero repeat into account. This is implemented by setting them to
6006
zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
6007
item types that can be repeated set these backoff variables appropriately. */
6008

6009
firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
6010
firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
6011

6012
/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
6013
according to the current setting of the caseless flag. The REQ_CASELESS value
6014
leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
6015
to record the case status of the value. This is used only for ASCII characters.
6016
*/
6017

6018
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6019

6020
/* Switch on next META item until the end of the branch */
6021

6022
for (;; pptr++)
6023
  {
6024
  BOOL possessive_quantifier;
6025
  BOOL note_group_empty;
6026
  uint32_t mclength;
6027
  uint32_t skipunits;
6028
  uint32_t subreqcu, subfirstcu;
6029
  uint32_t groupnumber;
6030
  uint32_t verbarglen, verbculen;
6031
  uint32_t subreqcuflags, subfirstcuflags;
6032
  open_capitem *oc;
6033
  PCRE2_UCHAR mcbuffer[8];
6034

6035
  /* Get next META item in the pattern and its potential argument. */
6036

6037
  meta = META_CODE(*pptr);
6038
  meta_arg = META_DATA(*pptr);
6039

6040
  /* If we are in the pre-compile phase, accumulate the length used for the
6041
  previous cycle of this loop, unless the next item is a quantifier. */
6042

6043
  if (lengthptr != NULL)
6044
    {
6045
    if (code > cb->start_workspace + cb->workspace_size -
6046
        WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
6047
      {
6048
      if (code >= cb->start_workspace + cb->workspace_size)
6049
        {
6050
        PCRE2_DEBUG_UNREACHABLE();
6051
        *errorcodeptr = ERR52;  /* Over-ran workspace - internal error */
6052
        }
6053
      else
6054
        *errorcodeptr = ERR86;
6055
      return 0;
6056
      }
6057

6058
    /* There is at least one situation where code goes backwards: this is the
6059
    case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
6060
    is processed, the whole class is eliminated. However, it is created first,
6061
    so we have to allow memory for it. Therefore, don't ever reduce the length
6062
    at this point. */
6063

6064
    if (code < last_code) code = last_code;
6065

6066
    /* If the next thing is not a quantifier, we add the length of the previous
6067
    item into the total, and reset the code pointer to the start of the
6068
    workspace. Otherwise leave the previous item available to be quantified. */
6069

6070
    if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6071
      {
6072
      if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
6073
        {
6074
        *errorcodeptr = ERR20;   /* Integer overflow */
6075
        return 0;
6076
        }
6077
      *lengthptr += (PCRE2_SIZE)(code - orig_code);
6078
      if (*lengthptr > MAX_PATTERN_SIZE)
6079
        {
6080
        *errorcodeptr = ERR20;   /* Pattern is too large */
6081
        return 0;
6082
        }
6083
      code = orig_code;
6084
      }
6085

6086
    /* Remember where this code item starts so we can catch the "backwards"
6087
    case above next time round. */
6088

6089
    last_code = code;
6090
    }
6091

6092
  /* Process the next parsed pattern item. If it is not a quantifier, remember
6093
  where it starts so that it can be quantified when a quantifier follows.
6094
  Checking for the legality of quantifiers happens in parse_regex(), except for
6095
  a quantifier after an assertion that is a condition. */
6096

6097
  if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6098
    {
6099
    previous = code;
6100
    if (matched_char && !had_accept) okreturn = 1;
6101
    }
6102

6103
  previous_matched_char = matched_char;
6104
  matched_char = FALSE;
6105
  note_group_empty = FALSE;
6106
  skipunits = 0;         /* Default value for most subgroups */
6107

6108
  switch(meta)
6109
    {
6110
    /* ===================================================================*/
6111
    /* The branch terminates at pattern end or | or ) */
6112

6113
    case META_END:
6114
    case META_ALT:
6115
    case META_KET:
6116
    *firstcuptr = firstcu;
6117
    *firstcuflagsptr = firstcuflags;
6118
    *reqcuptr = reqcu;
6119
    *reqcuflagsptr = reqcuflags;
6120
    *codeptr = code;
6121
    *pptrptr = pptr;
6122
    return okreturn;
6123

6124

6125
    /* ===================================================================*/
6126
    /* Handle single-character metacharacters. In multiline mode, ^ disables
6127
    the setting of any following char as a first character. */
6128

6129
    case META_CIRCUMFLEX:
6130
    if ((options & PCRE2_MULTILINE) != 0)
6131
      {
6132
      if (firstcuflags == REQ_UNSET)
6133
        zerofirstcuflags = firstcuflags = REQ_NONE;
6134
      *code++ = OP_CIRCM;
6135
      }
6136
    else *code++ = OP_CIRC;
6137
    break;
6138

6139
    case META_DOLLAR:
6140
    *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
6141
    break;
6142

6143
    /* There can never be a first char if '.' is first, whatever happens about
6144
    repeats. The value of reqcu doesn't change either. */
6145

6146
    case META_DOT:
6147
    matched_char = TRUE;
6148
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6149
    zerofirstcu = firstcu;
6150
    zerofirstcuflags = firstcuflags;
6151
    zeroreqcu = reqcu;
6152
    zeroreqcuflags = reqcuflags;
6153
    *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
6154
    break;
6155

6156

6157
    /* ===================================================================*/
6158
    /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
6159
    Otherwise, an initial ']' is taken as a data character. When empty classes
6160
    are allowed, [] must generate an empty class - we have no dedicated opcode
6161
    to optimise the representation, but it's a rare case (the '(*FAIL)'
6162
    construct would be a clearer way for a pattern author to represent a
6163
    non-matching branch, but it does have different semantics to '[]' if both
6164
    are followed by a quantifier). The empty-negated [^] matches any character,
6165
    so is useful: generate OP_ALLANY for this. */
6166

6167
    case META_CLASS_EMPTY:
6168
    case META_CLASS_EMPTY_NOT:
6169
    matched_char = TRUE;
6170
    if (meta == META_CLASS_EMPTY_NOT) *code++ = OP_ALLANY;
6171
    else
6172
      {
6173
      *code++ = OP_CLASS;
6174
      memset(code, 0, 32);
6175
      code += 32 / sizeof(PCRE2_UCHAR);
6176
      }
6177

6178
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6179
    zerofirstcu = firstcu;
6180
    zerofirstcuflags = firstcuflags;
6181
    break;
6182

6183

6184
    /* ===================================================================*/
6185
    /* Non-empty character class. If the included characters are all < 256, we
6186
    build a 32-byte bitmap of the permitted characters, except in the special
6187
    case where there is only one such character. For negated classes, we build
6188
    the map as usual, then invert it at the end. However, we use a different
6189
    opcode so that data characters > 255 can be handled correctly.
6190

6191
    If the class contains characters outside the 0-255 range, a different
6192
    opcode is compiled. It may optionally have a bit map for characters < 256,
6193
    but those above are explicitly listed afterwards. A flag code unit tells
6194
    whether the bitmap is present, and whether this is a negated class or
6195
    not. */
6196

6197
    case META_CLASS_NOT:
6198
    case META_CLASS:
6199
    matched_char = TRUE;
6200

6201
    /* Check for complex extended classes and handle them separately. */
6202

6203
    if ((*pptr & CLASS_IS_ECLASS) != 0)
6204
      {
6205
      if (!PRIV(compile_class_nested)(options, xoptions, &pptr, &code,
6206
                                      errorcodeptr, cb, lengthptr))
6207
        return 0;
6208
      goto CLASS_END_PROCESSING;
6209
      }
6210

6211
    /* We can optimize the case of a single character in a class by generating
6212
    OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
6213
    negative. In the negative case there can be no first char if this item is
6214
    first, whatever repeat count may follow. In the case of reqcu, save the
6215
    previous value for reinstating. */
6216

6217
    /* NOTE: at present this optimization is not effective if the only
6218
    character in a class in 32-bit, non-UCP mode has its top bit set. */
6219

6220
    if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
6221
      {
6222
      uint32_t c = pptr[1];
6223

6224
      pptr += 2;                 /* Move on to class end */
6225
      if (meta == META_CLASS)    /* A positive one-char class can be */
6226
        {                        /* handled as a normal literal character. */
6227
        meta = c;                /* Set up the character */
6228
        goto NORMAL_CHAR_SET;
6229
        }
6230

6231
      /* Handle a negative one-character class */
6232

6233
      zeroreqcu = reqcu;
6234
      zeroreqcuflags = reqcuflags;
6235
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6236
      zerofirstcu = firstcu;
6237
      zerofirstcuflags = firstcuflags;
6238

6239
      /* For caseless UTF or UCP mode, check whether this character has more
6240
      than one other case. If so, generate a special OP_NOTPROP item instead of
6241
      OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
6242
      caseless set that starts with an ASCII character. If the character is
6243
      affected by the special Turkish rules, hardcode the not-matching
6244
      characters using a caseset. */
6245

6246
#ifdef SUPPORT_UNICODE
6247
      if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
6248
        {
6249
        uint32_t caseset;
6250

6251
        if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6252
              PCRE2_EXTRA_TURKISH_CASING &&
6253
            UCD_ANY_I(c))
6254
          {
6255
          caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
6256
          }
6257
        else if ((caseset = UCD_CASESET(c)) != 0 &&
6258
                 (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6259
                 PRIV(ucd_caseless_sets)[caseset] < 128)
6260
          {
6261
          caseset = 0;  /* Ignore the caseless set if it's restricted. */
6262
          }
6263

6264
        if (caseset != 0)
6265
          {
6266
          *code++ = OP_NOTPROP;
6267
          *code++ = PT_CLIST;
6268
          *code++ = caseset;
6269
          break;   /* We are finished with this class */
6270
          }
6271
        }
6272
#endif
6273
      /* Char has only one other (usable) case, or UCP not available */
6274

6275
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
6276
      code += PUTCHAR(c, code);
6277
      break;   /* We are finished with this class */
6278
      }        /* End of 1-char optimization */
6279

6280
    /* Handle character classes that contain more than just one literal
6281
    character. If there are exactly two characters in a positive class, see if
6282
    they are case partners. This can be optimized to generate a caseless single
6283
    character match (which also sets first/required code units if relevant).
6284
    When casing restrictions apply, ignore a caseless set if both characters
6285
    are ASCII. When Turkish casing applies, an 'i' does not match its normal
6286
    Unicode "othercase". */
6287

6288
    if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
6289
        pptr[3] == META_CLASS_END)
6290
      {
6291
      uint32_t c = pptr[1];
6292

6293
#ifdef SUPPORT_UNICODE
6294
      if ((UCD_CASESET(c) == 0 ||
6295
           ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6296
            c < 128 && pptr[2] < 128)) &&
6297
          !((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6298
              PCRE2_EXTRA_TURKISH_CASING &&
6299
            UCD_ANY_I(c)))
6300
#endif
6301
        {
6302
        uint32_t d;
6303

6304
#ifdef SUPPORT_UNICODE
6305
        if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
6306
#endif
6307
          {
6308
#if PCRE2_CODE_UNIT_WIDTH != 8
6309
          if (c > 255) d = c; else
6310
#endif
6311
          d = TABLE_GET(c, cb->fcc, c);
6312
          }
6313

6314
        if (c != d && pptr[2] == d)
6315
          {
6316
          pptr += 3;                 /* Move on to class end */
6317
          meta = c;
6318
          if ((options & PCRE2_CASELESS) == 0)
6319
            {
6320
            reset_caseful = TRUE;
6321
            options |= PCRE2_CASELESS;
6322
            req_caseopt = REQ_CASELESS;
6323
            }
6324
          goto CLASS_CASELESS_CHAR;
6325
          }
6326
        }
6327
      }
6328

6329
    /* Now emit the OP_CLASS/OP_NCLASS/OP_XCLASS/OP_ALLANY opcode. */
6330

6331
    pptr = PRIV(compile_class_not_nested)(options, xoptions, pptr + 1,
6332
                                          &code, meta == META_CLASS_NOT, NULL,
6333
                                          errorcodeptr, cb, lengthptr);
6334
    if (pptr == NULL) return 0;
6335
    PCRE2_ASSERT(*pptr == META_CLASS_END);
6336

6337
    CLASS_END_PROCESSING:
6338

6339
    /* If this class is the first thing in the branch, there can be no first
6340
    char setting, whatever the repeat count. Any reqcu setting must remain
6341
    unchanged after any kind of repeat. */
6342

6343
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6344
    zerofirstcu = firstcu;
6345
    zerofirstcuflags = firstcuflags;
6346
    zeroreqcu = reqcu;
6347
    zeroreqcuflags = reqcuflags;
6348
    break;  /* End of class processing */
6349

6350

6351
    /* ===================================================================*/
6352
    /* Deal with (*VERB)s. */
6353

6354
    /* Check for open captures before ACCEPT and close those that are within
6355
    the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6356
    assertion. In the first pass, just accumulate the length required;
6357
    otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6358
    workspace overflow. Do not set firstcu after *ACCEPT. */
6359

6360
    case META_ACCEPT:
6361
    cb->had_accept = had_accept = TRUE;
6362
    for (oc = open_caps;
6363
         oc != NULL && oc->assert_depth >= cb->assert_depth;
6364
         oc = oc->next)
6365
      {
6366
      if (lengthptr != NULL)
6367
        {
6368
        *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6369
        }
6370
      else
6371
        {
6372
        *code++ = OP_CLOSE;
6373
        PUT2INC(code, 0, oc->number);
6374
        }
6375
      }
6376
    *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6377
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6378
    break;
6379

6380
    case META_PRUNE:
6381
    case META_SKIP:
6382
    cb->had_pruneorskip = TRUE;
6383
    /* Fall through */
6384
    case META_COMMIT:
6385
    case META_FAIL:
6386
    *code++ = verbops[(meta - META_MARK) >> 16];
6387
    break;
6388

6389
    case META_THEN:
6390
    cb->external_flags |= PCRE2_HASTHEN;
6391
    *code++ = OP_THEN;
6392
    break;
6393

6394
    /* Handle verbs with arguments. Arguments can be very long, especially in
6395
    16- and 32-bit modes, and can overflow the workspace in the first pass.
6396
    However, the argument length is constrained to be small enough to fit in
6397
    one code unit. This check happens in parse_regex(). In the first pass,
6398
    instead of putting the argument into memory, we just update the length
6399
    counter and set up an empty argument. */
6400

6401
    case META_THEN_ARG:
6402
    cb->external_flags |= PCRE2_HASTHEN;
6403
    goto VERB_ARG;
6404

6405
    case META_PRUNE_ARG:
6406
    case META_SKIP_ARG:
6407
    cb->had_pruneorskip = TRUE;
6408
    /* Fall through */
6409
    case META_MARK:
6410
    case META_COMMIT_ARG:
6411
    VERB_ARG:
6412
    *code++ = verbops[(meta - META_MARK) >> 16];
6413
    /* The length is in characters. */
6414
    verbarglen = *(++pptr);
6415
    verbculen = 0;
6416
    tempcode = code++;
6417
    for (int i = 0; i < (int)verbarglen; i++)
6418
      {
6419
      meta = *(++pptr);
6420
#ifdef SUPPORT_UNICODE
6421
      if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6422
#endif
6423
        {
6424
        mclength = 1;
6425
        mcbuffer[0] = meta;
6426
        }
6427
      if (lengthptr != NULL) *lengthptr += mclength; else
6428
        {
6429
        memcpy(code, mcbuffer, CU2BYTES(mclength));
6430
        code += mclength;
6431
        verbculen += mclength;
6432
        }
6433
      }
6434

6435
    *tempcode = verbculen;   /* Fill in the code unit length */
6436
    *code++ = 0;             /* Terminating zero */
6437
    break;
6438

6439

6440
    /* ===================================================================*/
6441
    /* Handle options change. The new setting must be passed back for use in
6442
    subsequent branches. Reset the greedy defaults and the case value for
6443
    firstcu and reqcu. */
6444

6445
    case META_OPTIONS:
6446
    *optionsptr = options = *(++pptr);
6447
    *xoptionsptr = xoptions = *(++pptr);
6448
    greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6449
    greedy_non_default = greedy_default ^ 1;
6450
    req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6451
    break;
6452

6453
    case META_OFFSET:
6454
    GETPLUSOFFSET(offset, pptr);
6455
    break;
6456

6457
    case META_SCS:
6458
    bravalue = OP_ASSERT_SCS;
6459
    cb->assert_depth += 1;
6460
    goto GROUP_PROCESS;
6461

6462

6463
    /* ===================================================================*/
6464
    /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6465
    because it could be a numerical check on recursion, or a name check on a
6466
    group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6467
    we can handle it either way. We first try for a name; if not found, process
6468
    the number. */
6469

6470
    case META_COND_RNUMBER:   /* (?(Rdigits) */
6471
    case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6472
    case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6473
    case META_SCS_NAME:       /* Name of scan substring */
6474
    bravalue = OP_COND;
6475
      {
6476
      int count, index;
6477
      unsigned int i;
6478
      PCRE2_SPTR name;
6479
      named_group *ng = cb->named_groups;
6480
      uint32_t length = *(++pptr);
6481

6482
      if (meta == META_SCS_NAME)
6483
        offset += meta_arg;
6484
      else
6485
        GETPLUSOFFSET(offset, pptr);
6486
      name = cb->start_pattern + offset;
6487

6488
      /* In the first pass, the names generated in the pre-pass are available,
6489
      but the main name table has not yet been created. Scan the list of names
6490
      generated in the pre-pass in order to get a number and whether or not
6491
      this name is duplicated. If it is not duplicated, we can handle it as a
6492
      numerical group. */
6493

6494
      for (i = 0; i < cb->names_found; i++, ng++)
6495
        if (length == ng->length &&
6496
            PRIV(strncmp)(name, ng->name, length) == 0) break;
6497

6498
      if (i >= cb->names_found)
6499
        {
6500
        /* If the name was not found we have a bad reference, unless we are
6501
        dealing with R<digits>, which is treated as a recursion test by
6502
        number. */
6503

6504
        groupnumber = 0;
6505
        if (meta == META_COND_RNUMBER)
6506
          {
6507
          for (i = 1; i < length; i++)
6508
            {
6509
            groupnumber = groupnumber * 10 + (name[i] - CHAR_0);
6510
            if (groupnumber > MAX_GROUP_NUMBER)
6511
              {
6512
              *errorcodeptr = ERR61;
6513
              cb->erroroffset = offset + i;
6514
              return 0;
6515
              }
6516
            }
6517
          }
6518

6519
        if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6520
          {
6521
          *errorcodeptr = ERR15;
6522
          cb->erroroffset = offset;
6523
          return 0;
6524
          }
6525

6526
        /* (?Rdigits) treated as a recursion reference by number. A value of
6527
        zero (which is the result of both (?R) and (?R0)) means "any", and is
6528
        translated into RREF_ANY (which is 0xffff). */
6529

6530
        if (groupnumber == 0) groupnumber = RREF_ANY;
6531
        code[1+LINK_SIZE] = OP_RREF;
6532
        PUT2(code, 2+LINK_SIZE, groupnumber);
6533
        skipunits = 1+IMM2_SIZE;
6534
        goto GROUP_PROCESS_NOTE_EMPTY;
6535
        }
6536
      else if (!ng->isdup)
6537
        {
6538
        /* Found a non-duplicated name: handle it as a numerical group. */
6539
        if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6540

6541
        if (meta == META_SCS_NAME)
6542
          {
6543
          code[0] = OP_CREF;
6544
          PUT2(code, 1, ng->number);
6545
          code += 1+IMM2_SIZE;
6546
          break;
6547
          }
6548

6549
        code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6550
        PUT2(code, 2+LINK_SIZE, ng->number);
6551
        skipunits = 1+IMM2_SIZE;
6552
        if (meta != META_SCS_NAME) goto GROUP_PROCESS_NOTE_EMPTY;
6553
        cb->assert_depth += 1;
6554
        goto GROUP_PROCESS;
6555
        }
6556

6557
      /* We have a duplicated name. In the compile pass we have to search the
6558
      main table in order to get the index and count values. */
6559

6560
      count = 0;  /* Values for first pass (avoids compiler warning) */
6561
      index = 0;
6562
      if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6563
            &count, errorcodeptr, cb)) return 0;
6564

6565
      if (meta == META_SCS_NAME)
6566
        {
6567
        code[0] = OP_DNCREF;
6568
        PUT2(code, 1, index);
6569
        PUT2(code, 1+IMM2_SIZE, count);
6570
        code += 1+2*IMM2_SIZE;
6571
        break;
6572
        }
6573

6574
      /* A duplicated name was found. Note that if an R<digits> name is found
6575
      (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6576

6577
      code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_DNRREF : OP_DNCREF;
6578

6579
      /* Insert appropriate data values. */
6580
      skipunits = 1+2*IMM2_SIZE;
6581
      PUT2(code, 2+LINK_SIZE, index);
6582
      PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6583
      }
6584

6585
    PCRE2_ASSERT(meta != META_SCS_NAME);
6586
    goto GROUP_PROCESS_NOTE_EMPTY;
6587

6588
    /* The DEFINE condition is always false. Its internal groups may never
6589
    be called, so matched_char must remain false, hence the jump to
6590
    GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6591

6592
    case META_COND_DEFINE:
6593
    bravalue = OP_COND;
6594
    GETPLUSOFFSET(offset, pptr);
6595
    code[1+LINK_SIZE] = OP_DEFINE;
6596
    skipunits = 1;
6597
    goto GROUP_PROCESS;
6598

6599
    /* Conditional test of a group's being set. */
6600

6601
    case META_COND_NUMBER:
6602
    case META_SCS_NUMBER:
6603
    bravalue = OP_COND;
6604
    if (meta == META_SCS_NUMBER)
6605
      offset += meta_arg;
6606
    else
6607
      GETPLUSOFFSET(offset, pptr);
6608

6609
    groupnumber = *(++pptr);
6610
    if (groupnumber > cb->bracount)
6611
      {
6612
      *errorcodeptr = ERR15;
6613
      cb->erroroffset = offset;
6614
      return 0;
6615
      }
6616
    if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6617

6618
    if (meta == META_SCS_NUMBER)
6619
      {
6620
      code[0] = OP_CREF;
6621
      PUT2(code, 1, groupnumber);
6622
      code += 1+IMM2_SIZE;
6623
      break;
6624
      }
6625

6626
    /* Point at initial ( for too many branches error */
6627
    offset -= 2;
6628
    code[1+LINK_SIZE] = OP_CREF;
6629
    skipunits = 1+IMM2_SIZE;
6630
    PUT2(code, 2+LINK_SIZE, groupnumber);
6631
    goto GROUP_PROCESS_NOTE_EMPTY;
6632

6633
    /* Test for the PCRE2 version. */
6634

6635
    case META_COND_VERSION:
6636
    bravalue = OP_COND;
6637
    if (pptr[1] > 0)
6638
      code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6639
        (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6640
          OP_TRUE : OP_FALSE;
6641
    else
6642
      code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6643
        OP_TRUE : OP_FALSE;
6644
    skipunits = 1;
6645
    pptr += 3;
6646
    goto GROUP_PROCESS_NOTE_EMPTY;
6647

6648
    /* The condition is an assertion, possibly preceded by a callout. */
6649

6650
    case META_COND_ASSERT:
6651
    bravalue = OP_COND;
6652
    goto GROUP_PROCESS_NOTE_EMPTY;
6653

6654

6655
    /* ===================================================================*/
6656
    /* Handle all kinds of nested bracketed groups. The non-capturing,
6657
    non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6658

6659
    case META_LOOKAHEAD:
6660
    bravalue = OP_ASSERT;
6661
    cb->assert_depth += 1;
6662
    goto GROUP_PROCESS;
6663

6664
    case META_LOOKAHEAD_NA:
6665
    bravalue = OP_ASSERT_NA;
6666
    cb->assert_depth += 1;
6667
    goto GROUP_PROCESS;
6668

6669
    /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6670
    thing to do, but Perl allows all assertions to be quantified, and when
6671
    they contain capturing parentheses there may be a potential use for
6672
    this feature. Not that that applies to a quantified (?!) but we allow
6673
    it for uniformity. */
6674

6675
    case META_LOOKAHEADNOT:
6676
    if (pptr[1] == META_KET &&
6677
         (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6678
      {
6679
      *code++ = OP_FAIL;
6680
      pptr++;
6681
      }
6682
    else
6683
      {
6684
      bravalue = OP_ASSERT_NOT;
6685
      cb->assert_depth += 1;
6686
      goto GROUP_PROCESS;
6687
      }
6688
    break;
6689

6690
    case META_LOOKBEHIND:
6691
    bravalue = OP_ASSERTBACK;
6692
    cb->assert_depth += 1;
6693
    goto GROUP_PROCESS;
6694

6695
    case META_LOOKBEHINDNOT:
6696
    bravalue = OP_ASSERTBACK_NOT;
6697
    cb->assert_depth += 1;
6698
    goto GROUP_PROCESS;
6699

6700
    case META_LOOKBEHIND_NA:
6701
    bravalue = OP_ASSERTBACK_NA;
6702
    cb->assert_depth += 1;
6703
    goto GROUP_PROCESS;
6704

6705
    case META_ATOMIC:
6706
    bravalue = OP_ONCE;
6707
    goto GROUP_PROCESS_NOTE_EMPTY;
6708

6709
    case META_SCRIPT_RUN:
6710
    bravalue = OP_SCRIPT_RUN;
6711
    goto GROUP_PROCESS_NOTE_EMPTY;
6712

6713
    case META_NOCAPTURE:
6714
    bravalue = OP_BRA;
6715
    /* Fall through */
6716

6717
    /* Process nested bracketed regex. The nesting depth is maintained for the
6718
    benefit of the stackguard function. The test for too deep nesting is now
6719
    done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6720
    others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6721
    note of whether or not they may match an empty string. */
6722

6723
    GROUP_PROCESS_NOTE_EMPTY:
6724
    note_group_empty = TRUE;
6725

6726
    GROUP_PROCESS:
6727
    cb->parens_depth += 1;
6728
    *code = bravalue;
6729
    pptr++;
6730
    tempcode = code;
6731
    tempreqvary = cb->req_varyopt;        /* Save value before group */
6732
    length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6733

6734
    if ((group_return =
6735
         compile_regex(
6736
         options,                         /* The options state */
6737
         xoptions,                        /* The extra options state */
6738
         &tempcode,                       /* Where to put code (updated) */
6739
         &pptr,                           /* Input pointer (updated) */
6740
         errorcodeptr,                    /* Where to put an error message */
6741
         skipunits,                       /* Skip over bracket number */
6742
         &subfirstcu,                     /* For possible first char */
6743
         &subfirstcuflags,
6744
         &subreqcu,                       /* For possible last char */
6745
         &subreqcuflags,
6746
         bcptr,                           /* Current branch chain */
6747
         open_caps,                       /* Pointer to capture stack */
6748
         cb,                              /* Compile data block */
6749
         (lengthptr == NULL)? NULL :      /* Actual compile phase */
6750
           &length_prevgroup              /* Pre-compile phase */
6751
         )) == 0)
6752
      return 0;  /* Error */
6753

6754
    cb->parens_depth -= 1;
6755

6756
    /* If that was a non-conditional significant group (not an assertion, not a
6757
    DEFINE) that matches at least one character, then the current item matches
6758
    a character. Conditionals are handled below. */
6759

6760
    if (note_group_empty && bravalue != OP_COND && group_return > 0)
6761
      matched_char = TRUE;
6762

6763
    /* If we've just compiled an assertion, pop the assert depth. */
6764

6765
    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERT_SCS)
6766
      cb->assert_depth -= 1;
6767

6768
    /* At the end of compiling, code is still pointing to the start of the
6769
    group, while tempcode has been updated to point past the end of the group.
6770
    The parsed pattern pointer (pptr) is on the closing META_KET.
6771

6772
    If this is a conditional bracket, check that there are no more than
6773
    two branches in the group, or just one if it's a DEFINE group. We do this
6774
    in the real compile phase, not in the pre-pass, where the whole group may
6775
    not be available. */
6776

6777
    if (bravalue == OP_COND && lengthptr == NULL)
6778
      {
6779
      PCRE2_UCHAR *tc = code;
6780
      int condcount = 0;
6781

6782
      do {
6783
         condcount++;
6784
         tc += GET(tc,1);
6785
         }
6786
      while (*tc != OP_KET);
6787

6788
      /* A DEFINE group is never obeyed inline (the "condition" is always
6789
      false). It must have only one branch. Having checked this, change the
6790
      opcode to OP_FALSE. */
6791

6792
      if (code[LINK_SIZE+1] == OP_DEFINE)
6793
        {
6794
        if (condcount > 1)
6795
          {
6796
          cb->erroroffset = offset;
6797
          *errorcodeptr = ERR54;
6798
          return 0;
6799
          }
6800
        code[LINK_SIZE+1] = OP_FALSE;
6801
        bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6802
        }
6803

6804
      /* A "normal" conditional group. If there is just one branch, we must not
6805
      make use of its firstcu or reqcu, because this is equivalent to an
6806
      empty second branch. Also, it may match an empty string. If there are two
6807
      branches, this item must match a character if the group must. */
6808

6809
      else
6810
        {
6811
        if (condcount > 2)
6812
          {
6813
          cb->erroroffset = offset;
6814
          *errorcodeptr = ERR27;
6815
          return 0;
6816
          }
6817
        if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6818
          else if (group_return > 0) matched_char = TRUE;
6819
        }
6820
      }
6821

6822
    /* In the pre-compile phase, update the length by the length of the group,
6823
    less the brackets at either end. Then reduce the compiled code to just a
6824
    set of non-capturing brackets so that it doesn't use much memory if it is
6825
    duplicated by a quantifier. */
6826

6827
    if (lengthptr != NULL)
6828
      {
6829
      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6830
        {
6831
        *errorcodeptr = ERR20;
6832
        return 0;
6833
        }
6834
      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6835
      code++;   /* This already contains bravalue */
6836
      PUTINC(code, 0, 1 + LINK_SIZE);
6837
      *code++ = OP_KET;
6838
      PUTINC(code, 0, 1 + LINK_SIZE);
6839
      break;    /* No need to waste time with special character handling */
6840
      }
6841

6842
    /* Otherwise update the main code pointer to the end of the group. */
6843

6844
    code = tempcode;
6845

6846
    /* For a DEFINE group, required and first character settings are not
6847
    relevant. */
6848

6849
    if (bravalue == OP_DEFINE) break;
6850

6851
    /* Handle updating of the required and first code units for other types of
6852
    group. Update for normal brackets of all kinds, and conditions with two
6853
    branches (see code above). If the bracket is followed by a quantifier with
6854
    zero repeat, we have to back off. Hence the definition of zeroreqcu and
6855
    zerofirstcu outside the main loop so that they can be accessed for the back
6856
    off. */
6857

6858
    zeroreqcu = reqcu;
6859
    zeroreqcuflags = reqcuflags;
6860
    zerofirstcu = firstcu;
6861
    zerofirstcuflags = firstcuflags;
6862
    groupsetfirstcu = FALSE;
6863

6864
    if (bravalue >= OP_ONCE)  /* Not an assertion */
6865
      {
6866
      /* If we have not yet set a firstcu in this branch, take it from the
6867
      subpattern, remembering that it was set here so that a repeat of more
6868
      than one can replicate it as reqcu if necessary. If the subpattern has
6869
      no firstcu, set "none" for the whole branch. In both cases, a zero
6870
      repeat forces firstcu to "none". */
6871

6872
      if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6873
        {
6874
        if (subfirstcuflags < REQ_NONE)
6875
          {
6876
          firstcu = subfirstcu;
6877
          firstcuflags = subfirstcuflags;
6878
          groupsetfirstcu = TRUE;
6879
          }
6880
        else firstcuflags = REQ_NONE;
6881
        zerofirstcuflags = REQ_NONE;
6882
        }
6883

6884
      /* If firstcu was previously set, convert the subpattern's firstcu
6885
      into reqcu if there wasn't one, using the vary flag that was in
6886
      existence beforehand. */
6887

6888
      else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6889
        {
6890
        subreqcu = subfirstcu;
6891
        subreqcuflags = subfirstcuflags | tempreqvary;
6892
        }
6893

6894
      /* If the subpattern set a required code unit (or set a first code unit
6895
      that isn't really the first code unit - see above), set it. */
6896

6897
      if (subreqcuflags < REQ_NONE)
6898
        {
6899
        reqcu = subreqcu;
6900
        reqcuflags = subreqcuflags;
6901
        }
6902
      }
6903

6904
    /* For a forward assertion, we take the reqcu, if set, provided that the
6905
    group has also set a firstcu. This can be helpful if the pattern that
6906
    follows the assertion doesn't set a different char. For example, it's
6907
    useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
6908
    because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
6909
    the "real" "a" would then become a reqcu instead of a firstcu. This is
6910
    overcome by a scan at the end if there's no firstcu, looking for an
6911
    asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6912
    we must only take the reqcu when the group also set a firstcu. Otherwise,
6913
    in that example, 'X' ends up set for both. */
6914

6915
    else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
6916
             subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
6917
      {
6918
      reqcu = subreqcu;
6919
      reqcuflags = subreqcuflags;
6920
      }
6921

6922
    break;  /* End of nested group handling */
6923

6924

6925
    /* ===================================================================*/
6926
    /* Handle named backreferences and recursions. */
6927

6928
    case META_BACKREF_BYNAME:
6929
    case META_RECURSE_BYNAME:
6930
      {
6931
      int count, index;
6932
      PCRE2_SPTR name;
6933
      BOOL is_dupname = FALSE;
6934
      named_group *ng = cb->named_groups;
6935
      uint32_t length = *(++pptr);
6936

6937
      GETPLUSOFFSET(offset, pptr);
6938
      name = cb->start_pattern + offset;
6939

6940
      /* In the first pass, the names generated in the pre-pass are available,
6941
      but the main name table has not yet been created. Scan the list of names
6942
      generated in the pre-pass in order to get a number and whether or not
6943
      this name is duplicated. */
6944

6945
      groupnumber = 0;
6946
      for (unsigned int i = 0; i < cb->names_found; i++, ng++)
6947
        {
6948
        if (length == ng->length &&
6949
            PRIV(strncmp)(name, ng->name, length) == 0)
6950
          {
6951
          is_dupname = ng->isdup;
6952
          groupnumber = ng->number;
6953

6954
          /* For a recursion, that's all that is needed. We can now go to
6955
          the code that handles numerical recursion, applying it to the first
6956
          group with the given name. */
6957

6958
          if (meta == META_RECURSE_BYNAME)
6959
            {
6960
            meta_arg = groupnumber;
6961
            goto HANDLE_NUMERICAL_RECURSION;
6962
            }
6963

6964
          /* For a back reference, update the back reference map and the
6965
          maximum back reference. */
6966

6967
          cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6968
          if (groupnumber > cb->top_backref)
6969
            cb->top_backref = groupnumber;
6970
          }
6971
        }
6972

6973
      /* If the name was not found we have a bad reference. */
6974

6975
      if (groupnumber == 0)
6976
        {
6977
        *errorcodeptr = ERR15;
6978
        cb->erroroffset = offset;
6979
        return 0;
6980
        }
6981

6982
      /* If a back reference name is not duplicated, we can handle it as
6983
      a numerical reference. */
6984

6985
      if (!is_dupname)
6986
        {
6987
        meta_arg = groupnumber;
6988
        goto HANDLE_SINGLE_REFERENCE;
6989
        }
6990

6991
      /* If a back reference name is duplicated, we generate a different
6992
      opcode to a numerical back reference. In the second pass we must
6993
      search for the index and count in the final name table. */
6994

6995
      count = 0;  /* Values for first pass (avoids compiler warning) */
6996
      index = 0;
6997
      if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6998
            &count, errorcodeptr, cb)) return 0;
6999

7000
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7001
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7002
      PUT2INC(code, 0, index);
7003
      PUT2INC(code, 0, count);
7004
      if ((options & PCRE2_CASELESS) != 0)
7005
        *code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
7006
                   REFI_FLAG_CASELESS_RESTRICT : 0) |
7007
                  (((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
7008
                   REFI_FLAG_TURKISH_CASING : 0);
7009
      }
7010
    break;
7011

7012

7013
    /* ===================================================================*/
7014
    /* Handle a numerical callout. */
7015

7016
    case META_CALLOUT_NUMBER:
7017
    code[0] = OP_CALLOUT;
7018
    PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7019
    PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7020
    code[1 + 2*LINK_SIZE] = pptr[3];
7021
    pptr += 3;
7022
    code += PRIV(OP_lengths)[OP_CALLOUT];
7023
    break;
7024

7025

7026
    /* ===================================================================*/
7027
    /* Handle a callout with a string argument. In the pre-pass we just compute
7028
    the length without generating anything. The length in pptr[3] includes both
7029
    delimiters; in the actual compile only the first one is copied, but a
7030
    terminating zero is added. Any doubled delimiters within the string make
7031
    this an overestimate, but it is not worth bothering about. */
7032

7033
    case META_CALLOUT_STRING:
7034
    if (lengthptr != NULL)
7035
      {
7036
      *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7037
      pptr += 3;
7038
      SKIPOFFSET(pptr);
7039
      }
7040

7041
    /* In the real compile we can copy the string. The starting delimiter is
7042
     included so that the client can discover it if they want. We also pass the
7043
     start offset to help a script language give better error messages. */
7044

7045
    else
7046
      {
7047
      PCRE2_SPTR pp;
7048
      uint32_t delimiter;
7049
      uint32_t length = pptr[3];
7050
      PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7051

7052
      code[0] = OP_CALLOUT_STR;
7053
      PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7054
      PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7055

7056
      pptr += 3;
7057
      GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
7058
      pp = cb->start_pattern + offset;
7059
      delimiter = *callout_string++ = *pp++;
7060
      if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7061
        delimiter = CHAR_RIGHT_CURLY_BRACKET;
7062
      PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
7063

7064
      /* The syntax of the pattern was checked in the parsing scan. The length
7065
      includes both delimiters, but we have passed the opening one just above,
7066
      so we reduce length before testing it. The test is for > 1 because we do
7067
      not want to copy the final delimiter. This also ensures that pp[1] is
7068
      accessible. */
7069

7070
      while (--length > 1)
7071
        {
7072
        if (*pp == delimiter && pp[1] == delimiter)
7073
          {
7074
          *callout_string++ = delimiter;
7075
          pp += 2;
7076
          length--;
7077
          }
7078
        else *callout_string++ = *pp++;
7079
        }
7080
      *callout_string++ = CHAR_NUL;
7081

7082
      /* Set the length of the entire item, then advance to its end. */
7083

7084
      PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7085
      code = callout_string;
7086
      }
7087
    break;
7088

7089

7090
    /* ===================================================================*/
7091
    /* Handle repetition. The different types are all sorted out in the parsing
7092
    pass. */
7093

7094
    case META_MINMAX_PLUS:
7095
    case META_MINMAX_QUERY:
7096
    case META_MINMAX:
7097
    repeat_min = *(++pptr);
7098
    repeat_max = *(++pptr);
7099
    goto REPEAT;
7100

7101
    case META_ASTERISK:
7102
    case META_ASTERISK_PLUS:
7103
    case META_ASTERISK_QUERY:
7104
    repeat_min = 0;
7105
    repeat_max = REPEAT_UNLIMITED;
7106
    goto REPEAT;
7107

7108
    case META_PLUS:
7109
    case META_PLUS_PLUS:
7110
    case META_PLUS_QUERY:
7111
    repeat_min = 1;
7112
    repeat_max = REPEAT_UNLIMITED;
7113
    goto REPEAT;
7114

7115
    case META_QUERY:
7116
    case META_QUERY_PLUS:
7117
    case META_QUERY_QUERY:
7118
    repeat_min = 0;
7119
    repeat_max = 1;
7120

7121
    REPEAT:
7122
    if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7123

7124
    /* Remember whether this is a variable length repeat, and default to
7125
    single-char opcodes. */
7126

7127
    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7128

7129
    /* Adjust first and required code units for a zero repeat. */
7130

7131
    if (repeat_min == 0)
7132
      {
7133
      firstcu = zerofirstcu;
7134
      firstcuflags = zerofirstcuflags;
7135
      reqcu = zeroreqcu;
7136
      reqcuflags = zeroreqcuflags;
7137
      }
7138

7139
    /* Note the greediness and possessiveness. */
7140

7141
    switch (meta)
7142
      {
7143
      case META_MINMAX_PLUS:
7144
      case META_ASTERISK_PLUS:
7145
      case META_PLUS_PLUS:
7146
      case META_QUERY_PLUS:
7147
      repeat_type = 0;                  /* Force greedy */
7148
      possessive_quantifier = TRUE;
7149
      break;
7150

7151
      case META_MINMAX_QUERY:
7152
      case META_ASTERISK_QUERY:
7153
      case META_PLUS_QUERY:
7154
      case META_QUERY_QUERY:
7155
      repeat_type = greedy_non_default;
7156
      possessive_quantifier = FALSE;
7157
      break;
7158

7159
      default:
7160
      repeat_type = greedy_default;
7161
      possessive_quantifier = FALSE;
7162
      break;
7163
      }
7164

7165
    /* Save start of previous item, in case we have to move it up in order to
7166
    insert something before it, and remember what it was. */
7167

7168
    PCRE2_ASSERT(previous != NULL);
7169
    tempcode = previous;
7170
    op_previous = *previous;
7171

7172
    /* Now handle repetition for the different types of item. If the repeat
7173
    minimum and the repeat maximum are both 1, we can ignore the quantifier for
7174
    non-parenthesized items, as they have only one alternative. For anything in
7175
    parentheses, we must not ignore if {1} is possessive. */
7176

7177
    switch (op_previous)
7178
      {
7179
      /* If previous was a character or negated character match, abolish the
7180
      item and generate a repeat item instead. If a char item has a minimum of
7181
      more than one, ensure that it is set in reqcu - it might not be if a
7182
      sequence such as x{3} is the first thing in a branch because the x will
7183
      have gone into firstcu instead.  */
7184

7185
      case OP_CHAR:
7186
      case OP_CHARI:
7187
      case OP_NOT:
7188
      case OP_NOTI:
7189
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7190
      op_type = chartypeoffset[op_previous - OP_CHAR];
7191

7192
      /* Deal with UTF characters that take up more than one code unit. */
7193

7194
#ifdef MAYBE_UTF_MULTI
7195
      if (utf && NOT_FIRSTCU(code[-1]))
7196
        {
7197
        PCRE2_UCHAR *lastchar = code - 1;
7198
        BACKCHAR(lastchar);
7199
        mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
7200
        memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
7201
        }
7202
      else
7203
#endif  /* MAYBE_UTF_MULTI */
7204

7205
      /* Handle the case of a single code unit - either with no UTF support, or
7206
      with UTF disabled, or for a single-code-unit UTF character. In the latter
7207
      case, for a repeated positive match, get the caseless flag for the
7208
      required code unit from the previous character, because a class like [Aa]
7209
      sets a caseless A but by now the req_caseopt flag has been reset. */
7210

7211
        {
7212
        mcbuffer[0] = code[-1];
7213
        mclength = 1;
7214
        if (op_previous <= OP_CHARI && repeat_min > 1)
7215
          {
7216
          reqcu = mcbuffer[0];
7217
          reqcuflags = cb->req_varyopt;
7218
          if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7219
          }
7220
        }
7221
      goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
7222

7223
      /* If previous was a character class or a back reference, we put the
7224
      repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7225

7226
#ifdef SUPPORT_WIDE_CHARS
7227
      case OP_XCLASS:
7228
      case OP_ECLASS:
7229
#endif
7230
      case OP_CLASS:
7231
      case OP_NCLASS:
7232
      case OP_REF:
7233
      case OP_REFI:
7234
      case OP_DNREF:
7235
      case OP_DNREFI:
7236

7237
      if (repeat_max == 0)
7238
        {
7239
        code = previous;
7240
        goto END_REPEAT;
7241
        }
7242
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7243

7244
      if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7245
        *code++ = OP_CRSTAR + repeat_type;
7246
      else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7247
        *code++ = OP_CRPLUS + repeat_type;
7248
      else if (repeat_min == 0 && repeat_max == 1)
7249
        *code++ = OP_CRQUERY + repeat_type;
7250
      else
7251
        {
7252
        *code++ = OP_CRRANGE + repeat_type;
7253
        PUT2INC(code, 0, repeat_min);
7254
        if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
7255
        PUT2INC(code, 0, repeat_max);
7256
        }
7257
      break;
7258

7259
      /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7260
      because pcre2_match() could not handle backtracking into recursively
7261
      called groups. Now that this backtracking is available, we no longer need
7262
      to do this. However, we still need to replicate recursions as we do for
7263
      groups so as to have independent backtracking points. We can replicate
7264
      for the minimum number of repeats directly. For optional repeats we now
7265
      wrap the recursion in OP_BRA brackets and make use of the bracket
7266
      repetition. */
7267

7268
      case OP_RECURSE:
7269
      if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7270
        goto END_REPEAT;
7271

7272
      /* Generate unwrapped repeats for a non-zero minimum, except when the
7273
      minimum is 1 and the maximum unlimited, because that can be handled with
7274
      OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7275
      minimum, we just need to generate the appropriate additional copies.
7276
      Otherwise we need to generate one more, to simulate the situation when
7277
      the minimum is zero. */
7278

7279
      if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7280
        {
7281
        int replicate = repeat_min;
7282
        if (repeat_min == repeat_max) replicate--;
7283

7284
        /* In the pre-compile phase, we don't actually do the replication. We
7285
        just adjust the length as if we had. Do some paranoid checks for
7286
        potential integer overflow. */
7287

7288
        if (lengthptr != NULL)
7289
          {
7290
          PCRE2_SIZE delta;
7291
          if (PRIV(ckd_smul)(&delta, replicate, 1 + LINK_SIZE) ||
7292
              OFLOW_MAX - *lengthptr < delta)
7293
            {
7294
            *errorcodeptr = ERR20;
7295
            return 0;
7296
            }
7297
          *lengthptr += delta;
7298
          }
7299

7300
        else for (int i = 0; i < replicate; i++)
7301
          {
7302
          memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7303
          previous = code;
7304
          code += 1 + LINK_SIZE;
7305
          }
7306

7307
        /* If the number of repeats is fixed, we are done. Otherwise, adjust
7308
        the counts and fall through. */
7309

7310
        if (repeat_min == repeat_max) break;
7311
        if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7312
        repeat_min = 0;
7313
        }
7314

7315
      /* Wrap the recursion call in OP_BRA brackets. */
7316

7317
      (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7318
      op_previous = *previous = OP_BRA;
7319
      PUT(previous, 1, 2 + 2*LINK_SIZE);
7320
      previous[2 + 2*LINK_SIZE] = OP_KET;
7321
      PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7322
      code += 2 + 2 * LINK_SIZE;
7323
      length_prevgroup = 3 + 3*LINK_SIZE;
7324
      group_return = -1;  /* Set "may match empty string" */
7325

7326
      /* Now treat as a repeated OP_BRA. */
7327
      /* Fall through */
7328

7329
      /* If previous was a bracket group, we may have to replicate it in
7330
      certain cases. Note that at this point we can encounter only the "basic"
7331
      bracket opcodes such as BRA and CBRA, as this is the place where they get
7332
      converted into the more special varieties such as BRAPOS and SBRA.
7333
      Originally, PCRE did not allow repetition of assertions, but now it does,
7334
      for Perl compatibility. */
7335

7336
      case OP_ASSERT:
7337
      case OP_ASSERT_NOT:
7338
      case OP_ASSERT_NA:
7339
      case OP_ASSERTBACK:
7340
      case OP_ASSERTBACK_NOT:
7341
      case OP_ASSERTBACK_NA:
7342
      case OP_ASSERT_SCS:
7343
      case OP_ONCE:
7344
      case OP_SCRIPT_RUN:
7345
      case OP_BRA:
7346
      case OP_CBRA:
7347
      case OP_COND:
7348
        {
7349
        int len = (int)(code - previous);
7350
        PCRE2_UCHAR *bralink = NULL;
7351
        PCRE2_UCHAR *brazeroptr = NULL;
7352

7353
        if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7354
          goto END_REPEAT;
7355

7356
        /* Repeating a DEFINE group (or any group where the condition is always
7357
        FALSE and there is only one branch) is pointless, but Perl allows the
7358
        syntax, so we just ignore the repeat. */
7359

7360
        if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7361
            previous[GET(previous, 1)] != OP_ALT)
7362
          goto END_REPEAT;
7363

7364
        /* Perl allows all assertions to be quantified, and when they contain
7365
        capturing parentheses and/or are optional there are potential uses for
7366
        this feature. PCRE2 used to force the maximum quantifier to 1 on the
7367
        invalid grounds that further repetition was never useful. This was
7368
        always a bit pointless, since an assertion could be wrapped with a
7369
        repeated group to achieve the effect. General repetition is now
7370
        permitted, but if the maximum is unlimited it is set to one more than
7371
        the minimum. */
7372

7373
        if (op_previous < OP_ONCE)    /* Assertion */
7374
          {
7375
          if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7376
          }
7377

7378
        /* The case of a zero minimum is special because of the need to stick
7379
        OP_BRAZERO in front of it, and because the group appears once in the
7380
        data, whereas in other cases it appears the minimum number of times. For
7381
        this reason, it is simplest to treat this case separately, as otherwise
7382
        the code gets far too messy. There are several special subcases when the
7383
        minimum is zero. */
7384

7385
        if (repeat_min == 0)
7386
          {
7387
          /* If the maximum is also zero, we used to just omit the group from
7388
          the output altogether, like this:
7389

7390
          ** if (repeat_max == 0)
7391
          **   {
7392
          **   code = previous;
7393
          **   goto END_REPEAT;
7394
          **   }
7395

7396
          However, that fails when a group or a subgroup within it is
7397
          referenced as a subroutine from elsewhere in the pattern, so now we
7398
          stick in OP_SKIPZERO in front of it so that it is skipped on
7399
          execution. As we don't have a list of which groups are referenced, we
7400
          cannot do this selectively.
7401

7402
          If the maximum is 1 or unlimited, we just have to stick in the
7403
          BRAZERO and do no more at this point. */
7404

7405
          if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7406
            {
7407
            (void)memmove(previous + 1, previous, CU2BYTES(len));
7408
            code++;
7409
            if (repeat_max == 0)
7410
              {
7411
              *previous++ = OP_SKIPZERO;
7412
              goto END_REPEAT;
7413
              }
7414
            brazeroptr = previous;    /* Save for possessive optimizing */
7415
            *previous++ = OP_BRAZERO + repeat_type;
7416
            }
7417

7418
          /* If the maximum is greater than 1 and limited, we have to replicate
7419
          in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7420
          The first one has to be handled carefully because it's the original
7421
          copy, which has to be moved up. The remainder can be handled by code
7422
          that is common with the non-zero minimum case below. We have to
7423
          adjust the value of repeat_max, since one less copy is required. */
7424

7425
          else
7426
            {
7427
            int linkoffset;
7428
            (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7429
            code += 2 + LINK_SIZE;
7430
            *previous++ = OP_BRAZERO + repeat_type;
7431
            *previous++ = OP_BRA;
7432

7433
            /* We chain together the bracket link offset fields that have to be
7434
            filled in later when the ends of the brackets are reached. */
7435

7436
            linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7437
            bralink = previous;
7438
            PUTINC(previous, 0, linkoffset);
7439
            }
7440

7441
          if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7442
          }
7443

7444
        /* If the minimum is greater than zero, replicate the group as many
7445
        times as necessary, and adjust the maximum to the number of subsequent
7446
        copies that we need. */
7447

7448
        else
7449
          {
7450
          if (repeat_min > 1)
7451
            {
7452
            /* In the pre-compile phase, we don't actually do the replication.
7453
            We just adjust the length as if we had. Do some paranoid checks for
7454
            potential integer overflow. */
7455

7456
            if (lengthptr != NULL)
7457
              {
7458
              PCRE2_SIZE delta;
7459
              if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7460
                                 (int)length_prevgroup) ||
7461
                  OFLOW_MAX - *lengthptr < delta)
7462
                {
7463
                *errorcodeptr = ERR20;
7464
                return 0;
7465
                }
7466
              *lengthptr += delta;
7467
              }
7468

7469
            /* This is compiling for real. If there is a set first code unit
7470
            for the group, and we have not yet set a "required code unit", set
7471
            it. */
7472

7473
            else
7474
              {
7475
              if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7476
                {
7477
                reqcu = firstcu;
7478
                reqcuflags = firstcuflags;
7479
                }
7480
              for (uint32_t i = 1; i < repeat_min; i++)
7481
                {
7482
                memcpy(code, previous, CU2BYTES(len));
7483
                code += len;
7484
                }
7485
              }
7486
            }
7487

7488
          if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7489
          }
7490

7491
        /* This code is common to both the zero and non-zero minimum cases. If
7492
        the maximum is limited, it replicates the group in a nested fashion,
7493
        remembering the bracket starts on a stack. In the case of a zero
7494
        minimum, the first one was set up above. In all cases the repeat_max
7495
        now specifies the number of additional copies needed. Again, we must
7496
        remember to replicate entries on the forward reference list. */
7497

7498
        if (repeat_max != REPEAT_UNLIMITED)
7499
          {
7500
          /* In the pre-compile phase, we don't actually do the replication. We
7501
          just adjust the length as if we had. For each repetition we must add
7502
          1 to the length for BRAZERO and for all but the last repetition we
7503
          must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7504
          paranoid checks to avoid integer overflow. */
7505

7506
          if (lengthptr != NULL && repeat_max > 0)
7507
            {
7508
            PCRE2_SIZE delta;
7509
            if (PRIV(ckd_smul)(&delta, repeat_max,
7510
                               (int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7511
                OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7512
              {
7513
              *errorcodeptr = ERR20;
7514
              return 0;
7515
              }
7516
            delta -= (2 + 2*LINK_SIZE);   /* Last one doesn't nest */
7517
            *lengthptr += delta;
7518
            }
7519

7520
          /* This is compiling for real */
7521

7522
          else for (uint32_t i = repeat_max; i >= 1; i--)
7523
            {
7524
            *code++ = OP_BRAZERO + repeat_type;
7525

7526
            /* All but the final copy start a new nesting, maintaining the
7527
            chain of brackets outstanding. */
7528

7529
            if (i != 1)
7530
              {
7531
              int linkoffset;
7532
              *code++ = OP_BRA;
7533
              linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7534
              bralink = code;
7535
              PUTINC(code, 0, linkoffset);
7536
              }
7537

7538
            memcpy(code, previous, CU2BYTES(len));
7539
            code += len;
7540
            }
7541

7542
          /* Now chain through the pending brackets, and fill in their length
7543
          fields (which are holding the chain links pro tem). */
7544

7545
          while (bralink != NULL)
7546
            {
7547
            int oldlinkoffset;
7548
            int linkoffset = (int)(code - bralink + 1);
7549
            PCRE2_UCHAR *bra = code - linkoffset;
7550
            oldlinkoffset = GET(bra, 1);
7551
            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7552
            *code++ = OP_KET;
7553
            PUTINC(code, 0, linkoffset);
7554
            PUT(bra, 1, linkoffset);
7555
            }
7556
          }
7557

7558
        /* If the maximum is unlimited, set a repeater in the final copy. For
7559
        SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7560
        possessively repeated ONCE brackets can be converted into non-capturing
7561
        brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7562
        saves having to deal with possessive ONCEs specially.
7563

7564
        Otherwise, when we are doing the actual compile phase, check to see
7565
        whether this group is one that could match an empty string. If so,
7566
        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7567
        that runtime checking can be done. [This check is also applied to ONCE
7568
        and SCRIPT_RUN groups at runtime, but in a different way.]
7569

7570
        Then, if the quantifier was possessive and the bracket is not a
7571
        conditional, we convert the BRA code to the POS form, and the KET code
7572
        to KETRPOS. (It turns out to be convenient at runtime to detect this
7573
        kind of subpattern at both the start and at the end.) The use of
7574
        special opcodes makes it possible to reduce greatly the stack usage in
7575
        pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7576
        OP_BRAPOSZERO.
7577

7578
        Then, if the minimum number of matches is 1 or 0, cancel the possessive
7579
        flag so that the default action below, of wrapping everything inside
7580
        atomic brackets, does not happen. When the minimum is greater than 1,
7581
        there will be earlier copies of the group, and so we still have to wrap
7582
        the whole thing. */
7583

7584
        else
7585
          {
7586
          PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7587
          PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7588

7589
          /* Convert possessive ONCE brackets to non-capturing */
7590

7591
          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7592

7593
          /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7594
          to do is to set the KET. */
7595

7596
          if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7597
            *ketcode = OP_KETRMAX + repeat_type;
7598

7599
          /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7600
          (which have been converted to non-capturing above). */
7601

7602
          else
7603
            {
7604
            /* In the compile phase, adjust the opcode if the group can match
7605
            an empty string. For a conditional group with only one branch, the
7606
            value of group_return will not show "could be empty", so we must
7607
            check that separately. */
7608

7609
            if (lengthptr == NULL)
7610
              {
7611
              if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7612
              if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7613
                *bracode = OP_SCOND;
7614
              }
7615

7616
            /* Handle possessive quantifiers. */
7617

7618
            if (possessive_quantifier)
7619
              {
7620
              /* For COND brackets, we wrap the whole thing in a possessively
7621
              repeated non-capturing bracket, because we have not invented POS
7622
              versions of the COND opcodes. */
7623

7624
              if (*bracode == OP_COND || *bracode == OP_SCOND)
7625
                {
7626
                int nlen = (int)(code - bracode);
7627
                (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7628
                code += 1 + LINK_SIZE;
7629
                nlen += 1 + LINK_SIZE;
7630
                *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7631
                *code++ = OP_KETRPOS;
7632
                PUTINC(code, 0, nlen);
7633
                PUT(bracode, 1, nlen);
7634
                }
7635

7636
              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7637

7638
              else
7639
                {
7640
                *bracode += 1;              /* Switch to xxxPOS opcodes */
7641
                *ketcode = OP_KETRPOS;
7642
                }
7643

7644
              /* If the minimum is zero, mark it as possessive, then unset the
7645
              possessive flag when the minimum is 0 or 1. */
7646

7647
              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7648
              if (repeat_min < 2) possessive_quantifier = FALSE;
7649
              }
7650

7651
            /* Non-possessive quantifier */
7652

7653
            else *ketcode = OP_KETRMAX + repeat_type;
7654
            }
7655
          }
7656
        }
7657
      break;
7658

7659
      /* If previous was a character type match (\d or similar), abolish it and
7660
      create a suitable repeat item. The code is shared with single-character
7661
      repeats by setting op_type to add a suitable offset into repeat_type.
7662
      Note that the Unicode property types will be present only when
7663
      SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7664
      here because it just makes it horribly messy. */
7665

7666
      default:
7667
      if (op_previous >= OP_EODN || op_previous <= OP_WORD_BOUNDARY)
7668
        {
7669
        PCRE2_DEBUG_UNREACHABLE();
7670
        *errorcodeptr = ERR10;  /* Not a character type - internal error */
7671
        return 0;
7672
        }
7673
      else
7674
        {
7675
        int prop_type, prop_value;
7676
        PCRE2_UCHAR *oldcode;
7677

7678
        if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7679

7680
        op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7681
        mclength = 0;                         /* Not a character */
7682

7683
        if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7684
          {
7685
          prop_type = previous[1];
7686
          prop_value = previous[2];
7687
          }
7688
        else
7689
          {
7690
          /* Come here from just above with a character in mcbuffer/mclength.
7691
          You must also set op_type before the jump. */
7692
          OUTPUT_SINGLE_REPEAT:
7693
          prop_type = prop_value = -1;
7694
          }
7695

7696
        /* At this point, if prop_type == prop_value == -1 we either have a
7697
        character in mcbuffer when mclength is greater than zero, or we have
7698
        mclength zero, in which case there is a non-property character type in
7699
        op_previous. If prop_type/value are not negative, we have a property
7700
        character type in op_previous. */
7701

7702
        oldcode = code;                   /* Save where we were */
7703
        code = previous;                  /* Usually overwrite previous item */
7704

7705
        /* If the maximum is zero then the minimum must also be zero; Perl allows
7706
        this case, so we do too - by simply omitting the item altogether. */
7707

7708
        if (repeat_max == 0) goto END_REPEAT;
7709

7710
        /* Combine the op_type with the repeat_type */
7711

7712
        repeat_type += op_type;
7713

7714
        /* A minimum of zero is handled either as the special case * or ?, or as
7715
        an UPTO, with the maximum given. */
7716

7717
        if (repeat_min == 0)
7718
          {
7719
          if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7720
            else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7721
          else
7722
            {
7723
            *code++ = OP_UPTO + repeat_type;
7724
            PUT2INC(code, 0, repeat_max);
7725
            }
7726
          }
7727

7728
        /* A repeat minimum of 1 is optimized into some special cases. If the
7729
        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7730
        left in place and, if the maximum is greater than 1, we use OP_UPTO with
7731
        one less than the maximum. */
7732

7733
        else if (repeat_min == 1)
7734
          {
7735
          if (repeat_max == REPEAT_UNLIMITED)
7736
            *code++ = OP_PLUS + repeat_type;
7737
          else
7738
            {
7739
            code = oldcode;  /* Leave previous item in place */
7740
            if (repeat_max == 1) goto END_REPEAT;
7741
            *code++ = OP_UPTO + repeat_type;
7742
            PUT2INC(code, 0, repeat_max - 1);
7743
            }
7744
          }
7745

7746
        /* The case {n,n} is just an EXACT, while the general case {n,m} is
7747
        handled as an EXACT followed by an UPTO or STAR or QUERY. */
7748

7749
        else
7750
          {
7751
          *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7752
          PUT2INC(code, 0, repeat_min);
7753

7754
          /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7755
          and then generate the second opcode. For a repeated Unicode property
7756
          match, there are two extra values that define the required property,
7757
          and mclength is set zero to indicate this. */
7758

7759
          if (repeat_max != repeat_min)
7760
            {
7761
            if (mclength > 0)
7762
              {
7763
              memcpy(code, mcbuffer, CU2BYTES(mclength));
7764
              code += mclength;
7765
              }
7766
            else
7767
              {
7768
              *code++ = op_previous;
7769
              if (prop_type >= 0)
7770
                {
7771
                *code++ = prop_type;
7772
                *code++ = prop_value;
7773
                }
7774
              }
7775

7776
            /* Now set up the following opcode */
7777

7778
            if (repeat_max == REPEAT_UNLIMITED)
7779
              *code++ = OP_STAR + repeat_type;
7780
            else
7781
              {
7782
              repeat_max -= repeat_min;
7783
              if (repeat_max == 1)
7784
                {
7785
                *code++ = OP_QUERY + repeat_type;
7786
                }
7787
              else
7788
                {
7789
                *code++ = OP_UPTO + repeat_type;
7790
                PUT2INC(code, 0, repeat_max);
7791
                }
7792
              }
7793
            }
7794
          }
7795

7796
        /* Fill in the character or character type for the final opcode. */
7797

7798
        if (mclength > 0)
7799
          {
7800
          memcpy(code, mcbuffer, CU2BYTES(mclength));
7801
          code += mclength;
7802
          }
7803
        else
7804
          {
7805
          *code++ = op_previous;
7806
          if (prop_type >= 0)
7807
            {
7808
            *code++ = prop_type;
7809
            *code++ = prop_value;
7810
            }
7811
          }
7812
        }
7813
      break;
7814
      }  /* End of switch on different op_previous values */
7815

7816

7817
    /* If the character following a repeat is '+', possessive_quantifier is
7818
    TRUE. For some opcodes, there are special alternative opcodes for this
7819
    case. For anything else, we wrap the entire repeated item inside OP_ONCE
7820
    brackets. Logically, the '+' notation is just syntactic sugar, taken from
7821
    Sun's Java package, but the special opcodes can optimize it.
7822

7823
    Some (but not all) possessively repeated subpatterns have already been
7824
    completely handled in the code just above. For them, possessive_quantifier
7825
    is always FALSE at this stage. Note that the repeated item starts at
7826
    tempcode, not at previous, which might be the first part of a string whose
7827
    (former) last char we repeated. */
7828

7829
    if (possessive_quantifier)
7830
      {
7831
      int len;
7832

7833
      /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7834
      However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7835
      {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7836
      remains is greater than zero, there's a further opcode that can be
7837
      handled. If not, do nothing, leaving the EXACT alone. */
7838

7839
      switch(*tempcode)
7840
        {
7841
        case OP_TYPEEXACT:
7842
        tempcode += PRIV(OP_lengths)[*tempcode] +
7843
          ((tempcode[1 + IMM2_SIZE] == OP_PROP
7844
          || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7845
        break;
7846

7847
        /* CHAR opcodes are used for exacts whose count is 1. */
7848

7849
        case OP_CHAR:
7850
        case OP_CHARI:
7851
        case OP_NOT:
7852
        case OP_NOTI:
7853
        case OP_EXACT:
7854
        case OP_EXACTI:
7855
        case OP_NOTEXACT:
7856
        case OP_NOTEXACTI:
7857
        tempcode += PRIV(OP_lengths)[*tempcode];
7858
#ifdef SUPPORT_UNICODE
7859
        if (utf && HAS_EXTRALEN(tempcode[-1]))
7860
          tempcode += GET_EXTRALEN(tempcode[-1]);
7861
#endif
7862
        break;
7863

7864
        /* For the class opcodes, the repeat operator appears at the end;
7865
        adjust tempcode to point to it. */
7866

7867
        case OP_CLASS:
7868
        case OP_NCLASS:
7869
        tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7870
        break;
7871

7872
#ifdef SUPPORT_WIDE_CHARS
7873
        case OP_XCLASS:
7874
        case OP_ECLASS:
7875
        tempcode += GET(tempcode, 1);
7876
        break;
7877
#endif
7878
        }
7879

7880
      /* If tempcode is equal to code (which points to the end of the repeated
7881
      item), it means we have skipped an EXACT item but there is no following
7882
      QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7883
      all other cases, tempcode will be pointing to the repeat opcode, and will
7884
      be less than code, so the value of len will be greater than 0. */
7885

7886
      len = (int)(code - tempcode);
7887
      if (len > 0)
7888
        {
7889
        unsigned int repcode = *tempcode;
7890

7891
        /* There is a table for possessifying opcodes, all of which are less
7892
        than OP_CALLOUT. A zero entry means there is no possessified version.
7893
        */
7894

7895
        if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7896
          *tempcode = opcode_possessify[repcode];
7897

7898
        /* For opcode without a special possessified version, wrap the item in
7899
        ONCE brackets. */
7900

7901
        else
7902
          {
7903
          (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7904
          code += 1 + LINK_SIZE;
7905
          len += 1 + LINK_SIZE;
7906
          tempcode[0] = OP_ONCE;
7907
          *code++ = OP_KET;
7908
          PUTINC(code, 0, len);
7909
          PUT(tempcode, 1, len);
7910
          }
7911
        }
7912
      }
7913

7914
    /* We set the "follows varying string" flag for subsequently encountered
7915
    reqcus if it isn't already set and we have just passed a varying length
7916
    item. */
7917

7918
    END_REPEAT:
7919
    cb->req_varyopt |= reqvary;
7920
    break;
7921

7922

7923
    /* ===================================================================*/
7924
    /* Handle a 32-bit data character with a value greater than META_END. */
7925

7926
    case META_BIGVALUE:
7927
    pptr++;
7928
    goto NORMAL_CHAR;
7929

7930

7931
    /* ===============================================================*/
7932
    /* Handle a back reference by number, which is the meta argument. The
7933
    pattern offsets for back references to group numbers less than 10 are held
7934
    in a special vector, to avoid using more than two parsed pattern elements
7935
    in 64-bit environments. We only need the offset to the first occurrence,
7936
    because if that doesn't fail, subsequent ones will also be OK. */
7937

7938
    case META_BACKREF:
7939
    if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7940
      else GETPLUSOFFSET(offset, pptr);
7941

7942
    if (meta_arg > cb->bracount)
7943
      {
7944
      cb->erroroffset = offset;
7945
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
7946
      return 0;
7947
      }
7948

7949
    /* Come here from named backref handling when the reference is to a
7950
    single group (that is, not to a duplicated name). The back reference
7951
    data will have already been updated. We must disable firstcu if not
7952
    set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7953
    later. */
7954

7955
    HANDLE_SINGLE_REFERENCE:
7956
    if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7957
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7958
    PUT2INC(code, 0, meta_arg);
7959
    if ((options & PCRE2_CASELESS) != 0)
7960
      *code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
7961
                 REFI_FLAG_CASELESS_RESTRICT : 0) |
7962
                (((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
7963
                 REFI_FLAG_TURKISH_CASING : 0);
7964

7965
    /* Update the map of back references, and keep the highest one. We
7966
    could do this in parse_regex() for numerical back references, but not
7967
    for named back references, because we don't know the numbers to which
7968
    named back references refer. So we do it all in this function. */
7969

7970
    cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7971
    if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7972
    break;
7973

7974

7975
    /* ===============================================================*/
7976
    /* Handle recursion by inserting the number of the called group (which is
7977
    the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7978
    scanned and these numbers are replaced by offsets within the pattern. It is
7979
    done like this to avoid problems with forward references and adjusting
7980
    offsets when groups are duplicated and moved (as discovered in previous
7981
    implementations). Note that a recursion does not have a set first
7982
    character. */
7983

7984
    case META_RECURSE:
7985
    GETPLUSOFFSET(offset, pptr);
7986
    if (meta_arg > cb->bracount)
7987
      {
7988
      cb->erroroffset = offset;
7989
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
7990
      return 0;
7991
      }
7992
    HANDLE_NUMERICAL_RECURSION:
7993
    *code = OP_RECURSE;
7994
    PUT(code, 1, meta_arg);
7995
    code += 1 + LINK_SIZE;
7996
    groupsetfirstcu = FALSE;
7997
    cb->had_recurse = TRUE;
7998
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7999
    zerofirstcu = firstcu;
8000
    zerofirstcuflags = firstcuflags;
8001
    break;
8002

8003

8004
    /* ===============================================================*/
8005
    /* Handle capturing parentheses; the number is the meta argument. */
8006

8007
    case META_CAPTURE:
8008
    bravalue = OP_CBRA;
8009
    skipunits = IMM2_SIZE;
8010
    PUT2(code, 1+LINK_SIZE, meta_arg);
8011
    cb->lastcapture = meta_arg;
8012
    goto GROUP_PROCESS_NOTE_EMPTY;
8013

8014

8015
    /* ===============================================================*/
8016
    /* Handle escape sequence items. For ones like \d, the ESC_values are
8017
    arranged to be the same as the corresponding OP_values in the default case
8018
    when PCRE2_UCP is not set (which is the only case in which they will appear
8019
    here).
8020

8021
    Note: \Q and \E are never seen here, as they were dealt with in
8022
    parse_regex(). Neither are numerical back references or recursions, which
8023
    were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8024
    \g, when followed by names, are turned into META_BACKREF_BYNAME or
8025
    META_RECURSE_BYNAME. */
8026

8027
    case META_ESCAPE:
8028

8029
    /* We can test for escape sequences that consume a character because their
8030
    values lie between ESC_b and ESC_Z; this may have to change if any new ones
8031
    are ever created. For these sequences, we disable the setting of a first
8032
    character if it hasn't already been set. */
8033

8034
    if (meta_arg > ESC_b && meta_arg < ESC_Z)
8035
      {
8036
      matched_char = TRUE;
8037
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8038
      }
8039

8040
    /* Set values to reset to if this is followed by a zero repeat. */
8041

8042
    zerofirstcu = firstcu;
8043
    zerofirstcuflags = firstcuflags;
8044
    zeroreqcu = reqcu;
8045
    zeroreqcuflags = reqcuflags;
8046

8047
    /* If Unicode is not supported, \P and \p are not allowed and are
8048
    faulted at parse time, so will never appear here. */
8049

8050
#ifdef SUPPORT_UNICODE
8051
    if (meta_arg == ESC_P || meta_arg == ESC_p)
8052
      {
8053
      uint32_t ptype = *(++pptr) >> 16;
8054
      uint32_t pdata = *pptr & 0xffff;
8055

8056
      /* In caseless matching, particular characteristics Lu, Ll, and Lt get
8057
      converted to the general characteristic L&. That is, upper, lower, and
8058
      title case letters are all conflated. */
8059

8060
      if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
8061
          (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
8062
        {
8063
        ptype = PT_LAMP;
8064
        pdata = 0;
8065
        }
8066

8067
      /* The special case of \p{Any} is compiled to OP_ALLANY and \P{Any}
8068
      is compiled to [] so as to benefit from the auto-anchoring code. */
8069

8070
      if (ptype == PT_ANY)
8071
        {
8072
        if (meta_arg == ESC_P)
8073
          {
8074
          *code++ = OP_CLASS;
8075
          memset(code, 0, 32);
8076
          code += 32 / sizeof(PCRE2_UCHAR);
8077
          }
8078
        else
8079
          *code++ = OP_ALLANY;
8080
        }
8081
      else
8082
        {
8083
        *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8084
        *code++ = ptype;
8085
        *code++ = pdata;
8086
        }
8087
      break;  /* End META_ESCAPE */
8088
      }
8089
#endif
8090

8091
    /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8092
    done. However, there's an option, in case anyone was relying on it. */
8093

8094
    if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8095
        (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8096
      {
8097
      *errorcodeptr = ERR99;
8098
      return 0;
8099
      }
8100

8101
    /* For the rest (including \X when Unicode is supported - if not it's
8102
    faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8103
    not set; if it is set, most of them do not show up here because they are
8104
    converted into Unicode property tests in parse_regex().
8105

8106
    In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8107
    instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8108
    There are special UCP codes for \B and \b which are used in UCP mode unless
8109
    "word" matching is being forced to ASCII.
8110

8111
    Note that \b and \B do a one-character lookbehind, and \A also behaves as
8112
    if it does. */
8113

8114
    switch(meta_arg)
8115
      {
8116
      case ESC_C:
8117
      cb->external_flags |= PCRE2_HASBKC;  /* Record */
8118
#if PCRE2_CODE_UNIT_WIDTH == 32
8119
      meta_arg = OP_ALLANY;
8120
#else
8121
      if (!utf) meta_arg = OP_ALLANY;
8122
#endif
8123
      break;
8124

8125
      case ESC_B:
8126
      case ESC_b:
8127
      if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8128
        meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8129
          OP_UCP_WORD_BOUNDARY;
8130
      /* Fall through */
8131

8132
      case ESC_A:
8133
      if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8134
      break;
8135
      }
8136

8137
    *code++ = meta_arg;
8138
    break;  /* End META_ESCAPE */
8139

8140

8141
    /* ===================================================================*/
8142
    /* Handle an unrecognized meta value. A parsed pattern value less than
8143
    META_END is a literal. Otherwise we have a problem. */
8144

8145
    default:
8146
    if (meta >= META_END)
8147
      {
8148
      PCRE2_DEBUG_UNREACHABLE();
8149
      *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
8150
      return 0;
8151
      }
8152

8153
    /* Handle a literal character. We come here by goto in the case of a
8154
    32-bit, non-UTF character whose value is greater than META_END. */
8155

8156
    NORMAL_CHAR:
8157
    meta = *pptr;     /* Get the full 32 bits */
8158
    NORMAL_CHAR_SET:  /* Character is already in meta */
8159
    matched_char = TRUE;
8160

8161
    /* For caseless UTF or UCP mode, check whether this character has more than
8162
    one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8163
    When casing restrictions apply, ignore caseless sets that start with an
8164
    ASCII character. If the character is affected by the special Turkish rules,
8165
    hardcode the matching characters using a caseset. */
8166

8167
#ifdef SUPPORT_UNICODE
8168
    if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8169
      {
8170
      uint32_t caseset;
8171

8172
      if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
8173
            PCRE2_EXTRA_TURKISH_CASING &&
8174
          UCD_ANY_I(meta))
8175
        {
8176
        caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(meta)? 0 : 3);
8177
        }
8178
      else if ((caseset = UCD_CASESET(meta)) != 0 &&
8179
               (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
8180
               PRIV(ucd_caseless_sets)[caseset] < 128)
8181
        {
8182
        caseset = 0;  /* Ignore the caseless set if it's restricted. */
8183
        }
8184

8185
      if (caseset != 0)
8186
        {
8187
        *code++ = OP_PROP;
8188
        *code++ = PT_CLIST;
8189
        *code++ = caseset;
8190
        if (firstcuflags == REQ_UNSET)
8191
          firstcuflags = zerofirstcuflags = REQ_NONE;
8192
        break;  /* End handling this meta item */
8193
        }
8194
      }
8195
#endif
8196

8197
    /* Caseful matches, or caseless and not one of the multicase characters. We
8198
    come here by goto in the case of a positive class that contains only
8199
    case-partners of a character with just two cases; matched_char has already
8200
    been set TRUE and options fudged if necessary. */
8201

8202
    CLASS_CASELESS_CHAR:
8203

8204
    /* Get the character's code units into mcbuffer, with the length in
8205
    mclength. When not in UTF mode, the length is always 1. */
8206

8207
#ifdef SUPPORT_UNICODE
8208
    if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8209
#endif
8210
      {
8211
      mclength = 1;
8212
      mcbuffer[0] = meta;
8213
      }
8214

8215
    /* Generate the appropriate code */
8216

8217
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8218
    memcpy(code, mcbuffer, CU2BYTES(mclength));
8219
    code += mclength;
8220

8221
    /* Remember if \r or \n were seen */
8222

8223
    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8224
      cb->external_flags |= PCRE2_HASCRORLF;
8225

8226
    /* Set the first and required code units appropriately. If no previous
8227
    first code unit, set it from this character, but revert to none on a zero
8228
    repeat. Otherwise, leave the firstcu value alone, and don't change it on
8229
    a zero repeat. */
8230

8231
    if (firstcuflags == REQ_UNSET)
8232
      {
8233
      zerofirstcuflags = REQ_NONE;
8234
      zeroreqcu = reqcu;
8235
      zeroreqcuflags = reqcuflags;
8236

8237
      /* If the character is more than one code unit long, we can set a single
8238
      firstcu only if it is not to be matched caselessly. Multiple possible
8239
      starting code units may be picked up later in the studying code. */
8240

8241
      if (mclength == 1 || req_caseopt == 0)
8242
        {
8243
        firstcu = mcbuffer[0];
8244
        firstcuflags = req_caseopt;
8245
        if (mclength != 1)
8246
          {
8247
          reqcu = code[-1];
8248
          reqcuflags = cb->req_varyopt;
8249
          }
8250
        }
8251
      else firstcuflags = reqcuflags = REQ_NONE;
8252
      }
8253

8254
    /* firstcu was previously set; we can set reqcu only if the length is
8255
    1 or the matching is caseful. */
8256

8257
    else
8258
      {
8259
      zerofirstcu = firstcu;
8260
      zerofirstcuflags = firstcuflags;
8261
      zeroreqcu = reqcu;
8262
      zeroreqcuflags = reqcuflags;
8263
      if (mclength == 1 || req_caseopt == 0)
8264
        {
8265
        reqcu = code[-1];
8266
        reqcuflags = req_caseopt | cb->req_varyopt;
8267
        }
8268
      }
8269

8270
    /* If caselessness was temporarily instated, reset it. */
8271

8272
    if (reset_caseful)
8273
      {
8274
      options &= ~PCRE2_CASELESS;
8275
      req_caseopt = 0;
8276
      reset_caseful = FALSE;
8277
      }
8278

8279
    break;    /* End literal character handling */
8280
    }         /* End of big switch */
8281
  }           /* End of big loop */
8282

8283
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8284
return 0;                  /* Avoid compiler warnings */
8285
}
8286

8287

8288

8289
/*************************************************
8290
*   Compile regex: a sequence of alternatives    *
8291
*************************************************/
8292

8293
/* On entry, pptr is pointing past the bracket meta, but on return it points to
8294
the closing bracket or META_END. The code variable is pointing at the code unit
8295
into which the BRA operator has been stored. This function is used during the
8296
pre-compile phase when we are trying to find out the amount of memory needed,
8297
as well as during the real compile phase. The value of lengthptr distinguishes
8298
the two phases.
8299

8300
Arguments:
8301
  options           option bits, including any changes for this subpattern
8302
  xoptions          extra option bits, ditto
8303
  codeptr           -> the address of the current code pointer
8304
  pptrptr           -> the address of the current parsed pattern pointer
8305
  errorcodeptr      -> pointer to error code variable
8306
  skipunits         skip this many code units at start (for brackets and OP_COND)
8307
  firstcuptr        place to put the first required code unit
8308
  firstcuflagsptr   place to put the first code unit flags
8309
  reqcuptr          place to put the last required code unit
8310
  reqcuflagsptr     place to put the last required code unit flags
8311
  bcptr             pointer to the chain of currently open branches
8312
  cb                points to the data block with tables pointers etc.
8313
  lengthptr         NULL during the real compile phase
8314
                    points to length accumulator during pre-compile phase
8315

8316
Returns:            0 There has been an error
8317
                   +1 Success, this group must match at least one character
8318
                   -1 Success, this group may match an empty string
8319
*/
8320

8321
static int
8322
compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8323
  uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8324
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8325
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8326
  compile_block *cb, PCRE2_SIZE *lengthptr)
8327
{
8328
PCRE2_UCHAR *code = *codeptr;
8329
PCRE2_UCHAR *last_branch = code;
8330
PCRE2_UCHAR *start_bracket = code;
8331
BOOL lookbehind;
8332
open_capitem capitem;
8333
int capnumber = 0;
8334
int okreturn = 1;
8335
uint32_t *pptr = *pptrptr;
8336
uint32_t firstcu, reqcu;
8337
uint32_t lookbehindlength;
8338
uint32_t lookbehindminlength;
8339
uint32_t firstcuflags, reqcuflags;
8340
PCRE2_SIZE length;
8341
branch_chain bc;
8342

8343
/* If set, call the external function that checks for stack availability. */
8344

8345
if (cb->cx->stack_guard != NULL &&
8346
    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8347
  {
8348
  *errorcodeptr= ERR33;
8349
  return 0;
8350
  }
8351

8352
/* Miscellaneous initialization */
8353

8354
bc.outer = bcptr;
8355
bc.current_branch = code;
8356

8357
firstcu = reqcu = 0;
8358
firstcuflags = reqcuflags = REQ_UNSET;
8359

8360
/* Accumulate the length for use in the pre-compile phase. Start with the
8361
length of the BRA and KET and any extra code units that are required at the
8362
beginning. We accumulate in a local variable to save frequent testing of
8363
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8364
start and end of each alternative, because compiled items are discarded during
8365
the pre-compile phase so that the workspace is not exceeded. */
8366

8367
length = 2 + 2*LINK_SIZE + skipunits;
8368

8369
/* Remember if this is a lookbehind assertion, and if it is, save its length
8370
and skip over the pattern offset. */
8371

8372
lookbehind = *code == OP_ASSERTBACK ||
8373
             *code == OP_ASSERTBACK_NOT ||
8374
             *code == OP_ASSERTBACK_NA;
8375

8376
if (lookbehind)
8377
  {
8378
  lookbehindlength = META_DATA(pptr[-1]);
8379
  lookbehindminlength = *pptr;
8380
  pptr += SIZEOFFSET;
8381
  }
8382
else lookbehindlength = lookbehindminlength = 0;
8383

8384
/* If this is a capturing subpattern, add to the chain of open capturing items
8385
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8386
need be tested here; changing this opcode to one of its variants, e.g.
8387
OP_SCBRAPOS, happens later, after the group has been compiled. */
8388

8389
if (*code == OP_CBRA)
8390
  {
8391
  capnumber = GET2(code, 1 + LINK_SIZE);
8392
  capitem.number = capnumber;
8393
  capitem.next = open_caps;
8394
  capitem.assert_depth = cb->assert_depth;
8395
  open_caps = &capitem;
8396
  }
8397

8398
/* Offset is set zero to mark that this bracket is still open */
8399

8400
PUT(code, 1, 0);
8401
code += 1 + LINK_SIZE + skipunits;
8402

8403
/* Loop for each alternative branch */
8404

8405
for (;;)
8406
  {
8407
  int branch_return;
8408
  uint32_t branchfirstcu = 0, branchreqcu = 0;
8409
  uint32_t branchfirstcuflags = REQ_UNSET, branchreqcuflags = REQ_UNSET;
8410

8411
  /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8412
  is only a single minimum length for the whole assertion. When the minimum
8413
  length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8414
  though not necessarily the same length. In this case, the original OP_REVERSE
8415
  can be used. It can also be used if a branch in a variable length lookbehind
8416
  has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8417
  maximum and minimum values. */
8418

8419
  if (lookbehind && lookbehindlength > 0)
8420
    {
8421
    if (lookbehindminlength == LOOKBEHIND_MAX ||
8422
        lookbehindminlength == lookbehindlength)
8423
      {
8424
      *code++ = OP_REVERSE;
8425
      PUT2INC(code, 0, lookbehindlength);
8426
      length += 1 + IMM2_SIZE;
8427
      }
8428
    else
8429
      {
8430
      *code++ = OP_VREVERSE;
8431
      PUT2INC(code, 0, lookbehindminlength);
8432
      PUT2INC(code, 0, lookbehindlength);
8433
      length += 1 + 2*IMM2_SIZE;
8434
      }
8435
    }
8436

8437
  /* Now compile the branch; in the pre-compile phase its length gets added
8438
  into the length. */
8439

8440
  if ((branch_return =
8441
        compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8442
          &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8443
          &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8444
    return 0;
8445

8446
  /* If a branch can match an empty string, so can the whole group. */
8447

8448
  if (branch_return < 0) okreturn = -1;
8449

8450
  /* In the real compile phase, there is some post-processing to be done. */
8451

8452
  if (lengthptr == NULL)
8453
    {
8454
    /* If this is the first branch, the firstcu and reqcu values for the
8455
    branch become the values for the regex. */
8456

8457
    if (*last_branch != OP_ALT)
8458
      {
8459
      firstcu = branchfirstcu;
8460
      firstcuflags = branchfirstcuflags;
8461
      reqcu = branchreqcu;
8462
      reqcuflags = branchreqcuflags;
8463
      }
8464

8465
    /* If this is not the first branch, the first char and reqcu have to
8466
    match the values from all the previous branches, except that if the
8467
    previous value for reqcu didn't have REQ_VARY set, it can still match,
8468
    and we set REQ_VARY for the group from this branch's value. */
8469

8470
    else
8471
      {
8472
      /* If we previously had a firstcu, but it doesn't match the new branch,
8473
      we have to abandon the firstcu for the regex, but if there was
8474
      previously no reqcu, it takes on the value of the old firstcu. */
8475

8476
      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8477
        {
8478
        if (firstcuflags < REQ_NONE)
8479
          {
8480
          if (reqcuflags >= REQ_NONE)
8481
            {
8482
            reqcu = firstcu;
8483
            reqcuflags = firstcuflags;
8484
            }
8485
          }
8486
        firstcuflags = REQ_NONE;
8487
        }
8488

8489
      /* If we (now or from before) have no firstcu, a firstcu from the
8490
      branch becomes a reqcu if there isn't a branch reqcu. */
8491

8492
      if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8493
          branchreqcuflags >= REQ_NONE)
8494
        {
8495
        branchreqcu = branchfirstcu;
8496
        branchreqcuflags = branchfirstcuflags;
8497
        }
8498

8499
      /* Now ensure that the reqcus match */
8500

8501
      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8502
          reqcu != branchreqcu)
8503
        reqcuflags = REQ_NONE;
8504
      else
8505
        {
8506
        reqcu = branchreqcu;
8507
        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8508
        }
8509
      }
8510
    }
8511

8512
  /* Handle reaching the end of the expression, either ')' or end of pattern.
8513
  In the real compile phase, go back through the alternative branches and
8514
  reverse the chain of offsets, with the field in the BRA item now becoming an
8515
  offset to the first alternative. If there are no alternatives, it points to
8516
  the end of the group. The length in the terminating ket is always the length
8517
  of the whole bracketed item. Return leaving the pointer at the terminating
8518
  char. */
8519

8520
  if (META_CODE(*pptr) != META_ALT)
8521
    {
8522
    if (lengthptr == NULL)
8523
      {
8524
      uint32_t branch_length = (uint32_t)(code - last_branch);
8525
      do
8526
        {
8527
        uint32_t prev_length = GET(last_branch, 1);
8528
        PUT(last_branch, 1, branch_length);
8529
        branch_length = prev_length;
8530
        last_branch -= branch_length;
8531
        }
8532
      while (branch_length > 0);
8533
      }
8534

8535
    /* Fill in the ket */
8536

8537
    *code = OP_KET;
8538
    PUT(code, 1, (uint32_t)(code - start_bracket));
8539
    code += 1 + LINK_SIZE;
8540

8541
    /* Set values to pass back */
8542

8543
    *codeptr = code;
8544
    *pptrptr = pptr;
8545
    *firstcuptr = firstcu;
8546
    *firstcuflagsptr = firstcuflags;
8547
    *reqcuptr = reqcu;
8548
    *reqcuflagsptr = reqcuflags;
8549
    if (lengthptr != NULL)
8550
      {
8551
      if (OFLOW_MAX - *lengthptr < length)
8552
        {
8553
        *errorcodeptr = ERR20;
8554
        return 0;
8555
        }
8556
      *lengthptr += length;
8557
      }
8558
    return okreturn;
8559
    }
8560

8561
  /* Another branch follows. In the pre-compile phase, we can move the code
8562
  pointer back to where it was for the start of the first branch. (That is,
8563
  pretend that each branch is the only one.)
8564

8565
  In the real compile phase, insert an ALT node. Its length field points back
8566
  to the previous branch while the bracket remains open. At the end the chain
8567
  is reversed. It's done like this so that the start of the bracket has a
8568
  zero offset until it is closed, making it possible to detect recursion. */
8569

8570
  if (lengthptr != NULL)
8571
    {
8572
    code = *codeptr + 1 + LINK_SIZE + skipunits;
8573
    length += 1 + LINK_SIZE;
8574
    }
8575
  else
8576
    {
8577
    *code = OP_ALT;
8578
    PUT(code, 1, (int)(code - last_branch));
8579
    bc.current_branch = last_branch = code;
8580
    code += 1 + LINK_SIZE;
8581
    }
8582

8583
  /* Set the maximum lookbehind length for the next branch (if not in a
8584
  lookbehind the value will be zero) and then advance past the vertical bar. */
8585

8586
  lookbehindlength = META_DATA(*pptr);
8587
  pptr++;
8588
  }
8589

8590
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8591
return 0;                  /* Avoid compiler warnings */
8592
}
8593

8594

8595

8596
/*************************************************
8597
*          Check for anchored pattern            *
8598
*************************************************/
8599

8600
/* Try to find out if this is an anchored regular expression. Consider each
8601
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8602
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8603
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8604
be found, because ^ generates OP_CIRCM in that mode.
8605

8606
We can also consider a regex to be anchored if OP_SOM starts all its branches.
8607
This is the code for \G, which means "match at start of match position, taking
8608
into account the match offset".
8609

8610
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8611
because that will try the rest of the pattern at all possible matching points,
8612
so there is no point trying again.... er ....
8613

8614
.... except when the .* appears inside capturing parentheses, and there is a
8615
subsequent back reference to those parentheses. We haven't enough information
8616
to catch that case precisely.
8617

8618
At first, the best we could do was to detect when .* was in capturing brackets
8619
and the highest back reference was greater than or equal to that level.
8620
However, by keeping a bitmap of the first 31 back references, we can catch some
8621
of the more common cases more precisely.
8622

8623
... A second exception is when the .* appears inside an atomic group, because
8624
this prevents the number of characters it matches from being adjusted.
8625

8626
Arguments:
8627
  code           points to start of the compiled pattern
8628
  bracket_map    a bitmap of which brackets we are inside while testing; this
8629
                   handles up to substring 31; after that we just have to take
8630
                   the less precise approach
8631
  cb             points to the compile data block
8632
  atomcount      atomic group level
8633
  inassert       TRUE if in an assertion
8634
  dotstar_anchor TRUE if automatic anchoring optimization is enabled
8635

8636
Returns:     TRUE or FALSE
8637
*/
8638

8639
static BOOL
8640
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8641
  int atomcount, BOOL inassert, BOOL dotstar_anchor)
8642
{
8643
do {
8644
   PCRE2_SPTR scode = first_significant_code(
8645
     code + PRIV(OP_lengths)[*code], FALSE);
8646
   int op = *scode;
8647

8648
   /* Non-capturing brackets */
8649

8650
   if (op == OP_BRA  || op == OP_BRAPOS ||
8651
       op == OP_SBRA || op == OP_SBRAPOS)
8652
     {
8653
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8654
       return FALSE;
8655
     }
8656

8657
   /* Capturing brackets */
8658

8659
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8660
            op == OP_SCBRA || op == OP_SCBRAPOS)
8661
     {
8662
     int n = GET2(scode, 1+LINK_SIZE);
8663
     uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8664
     if (!is_anchored(scode, new_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE;
8665
     }
8666

8667
   /* Positive forward assertion */
8668

8669
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8670
     {
8671
     if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) return FALSE;
8672
     }
8673

8674
   /* Condition. If there is no second branch, it can't be anchored. */
8675

8676
   else if (op == OP_COND || op == OP_SCOND)
8677
     {
8678
     if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8679
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8680
       return FALSE;
8681
     }
8682

8683
   /* Atomic groups */
8684

8685
   else if (op == OP_ONCE)
8686
     {
8687
     if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
8688
       return FALSE;
8689
     }
8690

8691
   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8692
   it isn't in brackets that are or may be referenced or inside an atomic
8693
   group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8694
   because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8695
   with the subject "aab", which matches "b", i.e. not at the start of a line.
8696
   There is also an option that disables auto-anchoring. */
8697

8698
   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8699
             op == OP_TYPEPOSSTAR))
8700
     {
8701
     if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8702
         atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
8703
       return FALSE;
8704
     }
8705

8706
   /* Check for explicit anchoring */
8707

8708
   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8709

8710
   code += GET(code, 1);
8711
   }
8712
while (*code == OP_ALT);   /* Loop for each alternative */
8713
return TRUE;
8714
}
8715

8716

8717

8718
/*************************************************
8719
*         Check for starting with ^ or .*        *
8720
*************************************************/
8721

8722
/* This is called to find out if every branch starts with ^ or .* so that
8723
"first char" processing can be done to speed things up in multiline
8724
matching and for non-DOTALL patterns that start with .* (which must start at
8725
the beginning or after \n). As in the case of is_anchored() (see above), we
8726
have to take account of back references to capturing brackets that contain .*
8727
because in that case we can't make the assumption. Also, the appearance of .*
8728
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8729
or *SKIP does not count, because once again the assumption no longer holds.
8730

8731
Arguments:
8732
  code           points to start of the compiled pattern or a group
8733
  bracket_map    a bitmap of which brackets we are inside while testing; this
8734
                   handles up to substring 31; after that we just have to take
8735
                   the less precise approach
8736
  cb             points to the compile data
8737
  atomcount      atomic group level
8738
  inassert       TRUE if in an assertion
8739
  dotstar_anchor TRUE if automatic anchoring optimization is enabled
8740

8741
Returns:         TRUE or FALSE
8742
*/
8743

8744
static BOOL
8745
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8746
  int atomcount, BOOL inassert, BOOL dotstar_anchor)
8747
{
8748
do {
8749
   PCRE2_SPTR scode = first_significant_code(
8750
     code + PRIV(OP_lengths)[*code], FALSE);
8751
   int op = *scode;
8752

8753
   /* If we are at the start of a conditional assertion group, *both* the
8754
   conditional assertion *and* what follows the condition must satisfy the test
8755
   for start of line. Other kinds of condition fail. Note that there may be an
8756
   auto-callout at the start of a condition. */
8757

8758
   if (op == OP_COND)
8759
     {
8760
     scode += 1 + LINK_SIZE;
8761

8762
     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8763
       else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8764

8765
     switch (*scode)
8766
       {
8767
       case OP_CREF:
8768
       case OP_DNCREF:
8769
       case OP_RREF:
8770
       case OP_DNRREF:
8771
       case OP_FAIL:
8772
       case OP_FALSE:
8773
       case OP_TRUE:
8774
       return FALSE;
8775

8776
       default:     /* Assertion */
8777
       if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
8778
         return FALSE;
8779
       do scode += GET(scode, 1); while (*scode == OP_ALT);
8780
       scode += 1 + LINK_SIZE;
8781
       break;
8782
       }
8783
     scode = first_significant_code(scode, FALSE);
8784
     op = *scode;
8785
     }
8786

8787
   /* Non-capturing brackets */
8788

8789
   if (op == OP_BRA  || op == OP_BRAPOS ||
8790
       op == OP_SBRA || op == OP_SBRAPOS)
8791
     {
8792
     if (!is_startline(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8793
       return FALSE;
8794
     }
8795

8796
   /* Capturing brackets */
8797

8798
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8799
            op == OP_SCBRA || op == OP_SCBRAPOS)
8800
     {
8801
     int n = GET2(scode, 1+LINK_SIZE);
8802
     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8803
     if (!is_startline(scode, new_map, cb, atomcount, inassert, dotstar_anchor))
8804
       return FALSE;
8805
     }
8806

8807
   /* Positive forward assertions */
8808

8809
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8810
     {
8811
     if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
8812
       return FALSE;
8813
     }
8814

8815
   /* Atomic brackets */
8816

8817
   else if (op == OP_ONCE)
8818
     {
8819
     if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
8820
       return FALSE;
8821
     }
8822

8823
   /* .* means "start at start or after \n" if it isn't in atomic brackets or
8824
   brackets that may be referenced or an assertion, and as long as the pattern
8825
   does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8826
   for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8827
   i.e. not at the start of a line. There is also an option that disables this
8828
   optimization. */
8829

8830
   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8831
     {
8832
     if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8833
         atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
8834
       return FALSE;
8835
     }
8836

8837
   /* Check for explicit circumflex; anything else gives a FALSE result. Note
8838
   in particular that this includes atomic brackets OP_ONCE because the number
8839
   of characters matched by .* cannot be adjusted inside them. */
8840

8841
   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8842

8843
   /* Move on to the next alternative */
8844

8845
   code += GET(code, 1);
8846
   }
8847
while (*code == OP_ALT);  /* Loop for each alternative */
8848
return TRUE;
8849
}
8850

8851

8852

8853
/*************************************************
8854
*   Scan compiled regex for recursion reference  *
8855
*************************************************/
8856

8857
/* This function scans through a compiled pattern until it finds an instance of
8858
OP_RECURSE.
8859

8860
Arguments:
8861
  code        points to start of expression
8862
  utf         TRUE in UTF mode
8863

8864
Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
8865
*/
8866

8867
static PCRE2_UCHAR *
8868
find_recurse(PCRE2_UCHAR *code, BOOL utf)
8869
{
8870
for (;;)
8871
  {
8872
  PCRE2_UCHAR c = *code;
8873
  if (c == OP_END) return NULL;
8874
  if (c == OP_RECURSE) return code;
8875

8876
  /* XCLASS is used for classes that cannot be represented just by a bit map.
8877
  This includes negated single high-valued characters. ECLASS is used for
8878
  classes that use set operations internally. CALLOUT_STR is used for
8879
  callouts with string arguments. In each case the length in the table is
8880
  zero; the actual length is stored in the compiled code. */
8881

8882
  if (c == OP_XCLASS || c == OP_ECLASS) code += GET(code, 1);
8883
  else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8884

8885
  /* Otherwise, we can get the item's length from the table, except that for
8886
  repeated character types, we have to test for \p and \P, which have an extra
8887
  two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8888
  we must add in its length. */
8889

8890
  else
8891
    {
8892
    switch(c)
8893
      {
8894
      case OP_TYPESTAR:
8895
      case OP_TYPEMINSTAR:
8896
      case OP_TYPEPLUS:
8897
      case OP_TYPEMINPLUS:
8898
      case OP_TYPEQUERY:
8899
      case OP_TYPEMINQUERY:
8900
      case OP_TYPEPOSSTAR:
8901
      case OP_TYPEPOSPLUS:
8902
      case OP_TYPEPOSQUERY:
8903
      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8904
      break;
8905

8906
      case OP_TYPEPOSUPTO:
8907
      case OP_TYPEUPTO:
8908
      case OP_TYPEMINUPTO:
8909
      case OP_TYPEEXACT:
8910
      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8911
        code += 2;
8912
      break;
8913

8914
      case OP_MARK:
8915
      case OP_COMMIT_ARG:
8916
      case OP_PRUNE_ARG:
8917
      case OP_SKIP_ARG:
8918
      case OP_THEN_ARG:
8919
      code += code[1];
8920
      break;
8921
      }
8922

8923
    /* Add in the fixed length from the table */
8924

8925
    code += PRIV(OP_lengths)[c];
8926

8927
    /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8928
    be followed by a multi-unit character. The length in the table is a
8929
    minimum, so we have to arrange to skip the extra units. */
8930

8931
#ifdef MAYBE_UTF_MULTI
8932
    if (utf) switch(c)
8933
      {
8934
      case OP_CHAR:
8935
      case OP_CHARI:
8936
      case OP_NOT:
8937
      case OP_NOTI:
8938
      case OP_EXACT:
8939
      case OP_EXACTI:
8940
      case OP_NOTEXACT:
8941
      case OP_NOTEXACTI:
8942
      case OP_UPTO:
8943
      case OP_UPTOI:
8944
      case OP_NOTUPTO:
8945
      case OP_NOTUPTOI:
8946
      case OP_MINUPTO:
8947
      case OP_MINUPTOI:
8948
      case OP_NOTMINUPTO:
8949
      case OP_NOTMINUPTOI:
8950
      case OP_POSUPTO:
8951
      case OP_POSUPTOI:
8952
      case OP_NOTPOSUPTO:
8953
      case OP_NOTPOSUPTOI:
8954
      case OP_STAR:
8955
      case OP_STARI:
8956
      case OP_NOTSTAR:
8957
      case OP_NOTSTARI:
8958
      case OP_MINSTAR:
8959
      case OP_MINSTARI:
8960
      case OP_NOTMINSTAR:
8961
      case OP_NOTMINSTARI:
8962
      case OP_POSSTAR:
8963
      case OP_POSSTARI:
8964
      case OP_NOTPOSSTAR:
8965
      case OP_NOTPOSSTARI:
8966
      case OP_PLUS:
8967
      case OP_PLUSI:
8968
      case OP_NOTPLUS:
8969
      case OP_NOTPLUSI:
8970
      case OP_MINPLUS:
8971
      case OP_MINPLUSI:
8972
      case OP_NOTMINPLUS:
8973
      case OP_NOTMINPLUSI:
8974
      case OP_POSPLUS:
8975
      case OP_POSPLUSI:
8976
      case OP_NOTPOSPLUS:
8977
      case OP_NOTPOSPLUSI:
8978
      case OP_QUERY:
8979
      case OP_QUERYI:
8980
      case OP_NOTQUERY:
8981
      case OP_NOTQUERYI:
8982
      case OP_MINQUERY:
8983
      case OP_MINQUERYI:
8984
      case OP_NOTMINQUERY:
8985
      case OP_NOTMINQUERYI:
8986
      case OP_POSQUERY:
8987
      case OP_POSQUERYI:
8988
      case OP_NOTPOSQUERY:
8989
      case OP_NOTPOSQUERYI:
8990
      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8991
      break;
8992
      }
8993
#else
8994
    (void)(utf);  /* Keep compiler happy by referencing function argument */
8995
#endif  /* MAYBE_UTF_MULTI */
8996
    }
8997
  }
8998
}
8999

9000

9001

9002
/*************************************************
9003
*    Check for asserted fixed first code unit    *
9004
*************************************************/
9005

9006
/* During compilation, the "first code unit" settings from forward assertions
9007
are discarded, because they can cause conflicts with actual literals that
9008
follow. However, if we end up without a first code unit setting for an
9009
unanchored pattern, it is worth scanning the regex to see if there is an
9010
initial asserted first code unit. If all branches start with the same asserted
9011
code unit, or with a non-conditional bracket all of whose alternatives start
9012
with the same asserted code unit (recurse ad lib), then we return that code
9013
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9014
REQ_NONE in the flags.
9015

9016
Arguments:
9017
  code       points to start of compiled pattern
9018
  flags      points to the first code unit flags
9019
  inassert   non-zero if in an assertion
9020

9021
Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
9022
*/
9023

9024
static uint32_t
9025
find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9026
{
9027
uint32_t c = 0;
9028
uint32_t cflags = REQ_NONE;
9029

9030
*flags = REQ_NONE;
9031
do {
9032
   uint32_t d;
9033
   uint32_t dflags;
9034
   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9035
             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9036
   PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9037
   PCRE2_UCHAR op = *scode;
9038

9039
   switch(op)
9040
     {
9041
     default:
9042
     return 0;
9043

9044
     case OP_BRA:
9045
     case OP_BRAPOS:
9046
     case OP_CBRA:
9047
     case OP_SCBRA:
9048
     case OP_CBRAPOS:
9049
     case OP_SCBRAPOS:
9050
     case OP_ASSERT:
9051
     case OP_ASSERT_NA:
9052
     case OP_ONCE:
9053
     case OP_SCRIPT_RUN:
9054
     d = find_firstassertedcu(scode, &dflags, inassert +
9055
       ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9056
     if (dflags >= REQ_NONE) return 0;
9057
     if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9058
       else if (c != d || cflags != dflags) return 0;
9059
     break;
9060

9061
     case OP_EXACT:
9062
     scode += IMM2_SIZE;
9063
     /* Fall through */
9064

9065
     case OP_CHAR:
9066
     case OP_PLUS:
9067
     case OP_MINPLUS:
9068
     case OP_POSPLUS:
9069
     if (inassert == 0) return 0;
9070
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9071
       else if (c != scode[1]) return 0;
9072
     break;
9073

9074
     case OP_EXACTI:
9075
     scode += IMM2_SIZE;
9076
     /* Fall through */
9077

9078
     case OP_CHARI:
9079
     case OP_PLUSI:
9080
     case OP_MINPLUSI:
9081
     case OP_POSPLUSI:
9082
     if (inassert == 0) return 0;
9083

9084
     /* If the character is more than one code unit long, we cannot set its
9085
     first code unit when matching caselessly. Later scanning may pick up
9086
     multiple code units. */
9087

9088
#ifdef SUPPORT_UNICODE
9089
#if PCRE2_CODE_UNIT_WIDTH == 8
9090
     if (scode[1] >= 0x80) return 0;
9091
#elif PCRE2_CODE_UNIT_WIDTH == 16
9092
     if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9093
#endif
9094
#endif
9095

9096
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9097
       else if (c != scode[1]) return 0;
9098
     break;
9099
     }
9100

9101
   code += GET(code, 1);
9102
   }
9103
while (*code == OP_ALT);
9104

9105
*flags = cflags;
9106
return c;
9107
}
9108

9109

9110

9111
/*************************************************
9112
*     Add an entry to the name/number table      *
9113
*************************************************/
9114

9115
/* This function is called between compiling passes to add an entry to the
9116
name/number table, maintaining alphabetical order. Checking for permitted
9117
and forbidden duplicates has already been done.
9118

9119
Arguments:
9120
  cb           the compile data block
9121
  name         the name to add
9122
  length       the length of the name
9123
  groupno      the group number
9124
  tablecount   the count of names in the table so far
9125

9126
Returns:       nothing
9127
*/
9128

9129
static void
9130
add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
9131
  unsigned int groupno, uint32_t tablecount)
9132
{
9133
uint32_t i;
9134
PCRE2_UCHAR *slot = cb->name_table;
9135

9136
for (i = 0; i < tablecount; i++)
9137
  {
9138
  int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
9139
  if (crc == 0 && slot[IMM2_SIZE+length] != 0)
9140
    crc = -1; /* Current name is a substring */
9141

9142
  /* Make space in the table and break the loop for an earlier name. For a
9143
  duplicate or later name, carry on. We do this for duplicates so that in the
9144
  simple case (when ?(| is not used) they are in order of their numbers. In all
9145
  cases they are in the order in which they appear in the pattern. */
9146

9147
  if (crc < 0)
9148
    {
9149
    (void)memmove(slot + cb->name_entry_size, slot,
9150
      CU2BYTES((tablecount - i) * cb->name_entry_size));
9151
    break;
9152
    }
9153

9154
  /* Continue the loop for a later or duplicate name */
9155

9156
  slot += cb->name_entry_size;
9157
  }
9158

9159
PUT2(slot, 0, groupno);
9160
memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
9161

9162
/* Add a terminating zero and fill the rest of the slot with zeroes so that
9163
the memory is all initialized. Otherwise valgrind moans about uninitialized
9164
memory when saving serialized compiled patterns. */
9165

9166
memset(slot + IMM2_SIZE + length, 0,
9167
  CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
9168
}
9169

9170

9171

9172
/*************************************************
9173
*             Skip in parsed pattern             *
9174
*************************************************/
9175

9176
/* This function is called to skip parts of the parsed pattern when finding the
9177
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9178
the end of the branch, it is called to skip over an internal lookaround or
9179
(DEFINE) group, and it is also called to skip to the end of a class, during
9180
which it will never encounter nested groups (but there's no need to have
9181
special code for that).
9182

9183
When called to find the end of a branch or group, pptr must point to the first
9184
meta code inside the branch, not the branch-starting code. In other cases it
9185
can point to the item that causes the function to be called.
9186

9187
Arguments:
9188
  pptr       current pointer to skip from
9189
  skiptype   PSKIP_CLASS when skipping to end of class
9190
             PSKIP_ALT when META_ALT ends the skip
9191
             PSKIP_KET when only META_KET ends the skip
9192

9193
Returns:     new value of pptr
9194
             NULL if META_END is reached - should never occur
9195
               or for an unknown meta value - likewise
9196
*/
9197

9198
static uint32_t *
9199
parsed_skip(uint32_t *pptr, uint32_t skiptype)
9200
{
9201
uint32_t nestlevel = 0;
9202

9203
for (;; pptr++)
9204
  {
9205
  uint32_t meta = META_CODE(*pptr);
9206

9207
  switch(meta)
9208
    {
9209
    default:  /* Just skip over most items */
9210
    if (meta < META_END) continue;  /* Literal */
9211
    break;
9212

9213
    case META_END:
9214

9215
    /* The parsed regex is malformed; we have reached the end and did
9216
    not find the end of the construct which we are skipping over. */
9217

9218
    PCRE2_DEBUG_UNREACHABLE();
9219
    return NULL;
9220

9221
    /* The data for these items is variable in length. */
9222

9223
    case META_BACKREF:  /* Offset is present only if group >= 10 */
9224
    if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9225
    break;
9226

9227
    case META_ESCAPE:
9228
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9229
      pptr += 1;     /* Skip prop data */
9230
    break;
9231

9232
    case META_MARK:     /* Add the length of the name. */
9233
    case META_COMMIT_ARG:
9234
    case META_PRUNE_ARG:
9235
    case META_SKIP_ARG:
9236
    case META_THEN_ARG:
9237
    pptr += pptr[1];
9238
    break;
9239

9240
    /* These are the "active" items in this loop. */
9241

9242
    case META_CLASS_END:
9243
    if (skiptype == PSKIP_CLASS) return pptr;
9244
    break;
9245

9246
    case META_ATOMIC:
9247
    case META_CAPTURE:
9248
    case META_COND_ASSERT:
9249
    case META_COND_DEFINE:
9250
    case META_COND_NAME:
9251
    case META_COND_NUMBER:
9252
    case META_COND_RNAME:
9253
    case META_COND_RNUMBER:
9254
    case META_COND_VERSION:
9255
    case META_SCS:
9256
    case META_LOOKAHEAD:
9257
    case META_LOOKAHEADNOT:
9258
    case META_LOOKAHEAD_NA:
9259
    case META_LOOKBEHIND:
9260
    case META_LOOKBEHINDNOT:
9261
    case META_LOOKBEHIND_NA:
9262
    case META_NOCAPTURE:
9263
    case META_SCRIPT_RUN:
9264
    nestlevel++;
9265
    break;
9266

9267
    case META_ALT:
9268
    if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9269
    break;
9270

9271
    case META_KET:
9272
    if (nestlevel == 0) return pptr;
9273
    nestlevel--;
9274
    break;
9275
    }
9276

9277
  /* The extra data item length for each meta is in a table. */
9278

9279
  meta = (meta >> 16) & 0x7fff;
9280
  if (meta >= sizeof(meta_extra_lengths)) return NULL;
9281
  pptr += meta_extra_lengths[meta];
9282
  }
9283

9284
PCRE2_UNREACHABLE(); /* Control never reaches here */
9285
}
9286

9287

9288

9289
/*************************************************
9290
*       Find length of a parsed group            *
9291
*************************************************/
9292

9293
/* This is called for nested groups within a branch of a lookbehind whose
9294
length is being computed. On entry, the pointer must be at the first element
9295
after the group initializing code. On exit it points to META_KET. Caching is used
9296
to improve processing speed when the same capturing group occurs many times.
9297

9298
Arguments:
9299
  pptrptr     pointer to pointer in the parsed pattern
9300
  minptr      where to return the minimum length
9301
  isinline    FALSE if a reference or recursion; TRUE for inline group
9302
  errcodeptr  pointer to the errorcode
9303
  lcptr       pointer to the loop counter
9304
  group       number of captured group or -1 for a non-capturing group
9305
  recurses    chain of recurse_check to catch mutual recursion
9306
  cb          pointer to the compile data
9307

9308
Returns:      the maximum group length or a negative number
9309
*/
9310

9311
static int
9312
get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9313
  int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9314
{
9315
uint32_t *gi = cb->groupinfo + 2 * group;
9316
int branchlength, branchminlength;
9317
int grouplength = -1;
9318
int groupminlength = INT_MAX;
9319

9320
/* The cache can be used only if there is no possibility of there being two
9321
groups with the same number. We do not need to set the end pointer for a group
9322
that is being processed as a back reference or recursion, but we must do so for
9323
an inline group. */
9324

9325
if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9326
  {
9327
  uint32_t groupinfo = gi[0];
9328
  if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9329
  if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9330
    {
9331
    if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9332
    *minptr = gi[1];
9333
    return groupinfo & GI_FIXED_LENGTH_MASK;
9334
    }
9335
  }
9336

9337
/* Scan the group. In this case we find the end pointer of necessity. */
9338

9339
for(;;)
9340
  {
9341
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9342
    recurses, cb);
9343
  if (branchlength < 0) goto ISNOTFIXED;
9344
  if (branchlength > grouplength) grouplength = branchlength;
9345
  if (branchminlength < groupminlength) groupminlength = branchminlength;
9346
  if (**pptrptr == META_KET) break;
9347
  *pptrptr += 1;   /* Skip META_ALT */
9348
  }
9349

9350
if (group > 0)
9351
  {
9352
  gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9353
  gi[1] = groupminlength;
9354
  }
9355

9356
*minptr = groupminlength;
9357
return grouplength;
9358

9359
ISNOTFIXED:
9360
if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9361
return -1;
9362
}
9363

9364

9365

9366
/*************************************************
9367
*        Find length of a parsed branch          *
9368
*************************************************/
9369

9370
/* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9371
giving an error if the length is not limited. On entry, *pptrptr points to the
9372
first element inside the branch. On exit it is set to point to the ALT or KET.
9373

9374
Arguments:
9375
  pptrptr     pointer to pointer in the parsed pattern
9376
  minptr      where to return the minimum length
9377
  errcodeptr  pointer to error code
9378
  lcptr       pointer to loop counter
9379
  recurses    chain of recurse_check to catch mutual recursion
9380
  cb          pointer to compile block
9381

9382
Returns:      the maximum length, or a negative value on error
9383
*/
9384

9385
static int
9386
get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9387
  parsed_recurse_check *recurses, compile_block *cb)
9388
{
9389
int branchlength = 0;
9390
int branchminlength = 0;
9391
int grouplength, groupminlength;
9392
uint32_t lastitemlength = 0;
9393
uint32_t lastitemminlength = 0;
9394
uint32_t *pptr = *pptrptr;
9395
PCRE2_SIZE offset;
9396
parsed_recurse_check this_recurse;
9397

9398
/* A large and/or complex regex can take too long to process. This can happen
9399
more often when (?| groups are present in the pattern because their length
9400
cannot be cached. */
9401

9402
if ((*lcptr)++ > 2000)
9403
  {
9404
  *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9405
  return -1;
9406
  }
9407

9408
/* Scan the branch, accumulating the length. */
9409

9410
for (;; pptr++)
9411
  {
9412
  parsed_recurse_check *r;
9413
  uint32_t *gptr, *gptrend;
9414
  uint32_t escape;
9415
  uint32_t min, max;
9416
  uint32_t group = 0;
9417
  uint32_t itemlength = 0;
9418
  uint32_t itemminlength = 0;
9419

9420
  if (*pptr < META_END)
9421
    {
9422
    itemlength = itemminlength = 1;
9423
    }
9424

9425
  else switch (META_CODE(*pptr))
9426
    {
9427
    case META_KET:
9428
    case META_ALT:
9429
    goto EXIT;
9430

9431
    /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9432
    actual termination. */
9433

9434
    case META_ACCEPT:
9435
    case META_FAIL:
9436
    pptr = parsed_skip(pptr, PSKIP_ALT);
9437
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9438
    goto EXIT;
9439

9440
    case META_MARK:
9441
    case META_COMMIT_ARG:
9442
    case META_PRUNE_ARG:
9443
    case META_SKIP_ARG:
9444
    case META_THEN_ARG:
9445
    pptr += pptr[1] + 1;
9446
    break;
9447

9448
    case META_CIRCUMFLEX:
9449
    case META_COMMIT:
9450
    case META_DOLLAR:
9451
    case META_PRUNE:
9452
    case META_SKIP:
9453
    case META_THEN:
9454
    break;
9455

9456
    case META_OPTIONS:
9457
    pptr += 2;
9458
    break;
9459

9460
    case META_BIGVALUE:
9461
    itemlength = itemminlength = 1;
9462
    pptr += 1;
9463
    break;
9464

9465
    case META_CLASS:
9466
    case META_CLASS_NOT:
9467
    itemlength = itemminlength = 1;
9468
    pptr = parsed_skip(pptr, PSKIP_CLASS);
9469
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9470
    break;
9471

9472
    case META_CLASS_EMPTY_NOT:
9473
    case META_DOT:
9474
    itemlength = itemminlength = 1;
9475
    break;
9476

9477
    case META_CALLOUT_NUMBER:
9478
    pptr += 3;
9479
    break;
9480

9481
    case META_CALLOUT_STRING:
9482
    pptr += 3 + SIZEOFFSET;
9483
    break;
9484

9485
    /* Only some escapes consume a character. Of those, \R can match one or two
9486
    characters, but \X is never allowed because it matches an unknown number of
9487
    characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9488

9489
    case META_ESCAPE:
9490
    escape = META_DATA(*pptr);
9491
    if (escape == ESC_X) return -1;
9492
    if (escape == ESC_R)
9493
      {
9494
      itemminlength = 1;
9495
      itemlength = 2;
9496
      }
9497
    else if (escape > ESC_b && escape < ESC_Z)
9498
      {
9499
#if PCRE2_CODE_UNIT_WIDTH != 32
9500
      if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9501
        {
9502
        *errcodeptr = ERR36;
9503
        return -1;
9504
        }
9505
#endif
9506
      itemlength = itemminlength = 1;
9507
      if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9508
      }
9509
    break;
9510

9511
    /* Lookaheads do not contribute to the length of this branch, but they may
9512
    contain lookbehinds within them whose lengths need to be set. */
9513

9514
    case META_LOOKAHEAD:
9515
    case META_LOOKAHEADNOT:
9516
    case META_LOOKAHEAD_NA:
9517
    case META_SCS:
9518
    *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9519
    if (*errcodeptr != 0) return -1;
9520

9521
    /* Ignore any qualifiers that follow a lookahead assertion. */
9522

9523
    switch (pptr[1])
9524
      {
9525
      case META_ASTERISK:
9526
      case META_ASTERISK_PLUS:
9527
      case META_ASTERISK_QUERY:
9528
      case META_PLUS:
9529
      case META_PLUS_PLUS:
9530
      case META_PLUS_QUERY:
9531
      case META_QUERY:
9532
      case META_QUERY_PLUS:
9533
      case META_QUERY_QUERY:
9534
      pptr++;
9535
      break;
9536

9537
      case META_MINMAX:
9538
      case META_MINMAX_PLUS:
9539
      case META_MINMAX_QUERY:
9540
      pptr += 3;
9541
      break;
9542

9543
      default:
9544
      break;
9545
      }
9546
    break;
9547

9548
    /* A nested lookbehind does not contribute any length to this lookbehind,
9549
    but must itself be checked and have its lengths set. Note that
9550
    set_lookbehind_lengths() updates pptr, leaving it pointing to the final ket
9551
    of the group, so no need to update it here. */
9552

9553
    case META_LOOKBEHIND:
9554
    case META_LOOKBEHINDNOT:
9555
    case META_LOOKBEHIND_NA:
9556
    if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9557
      return -1;
9558
    break;
9559

9560
    /* Back references and recursions are handled by very similar code. At this
9561
    stage, the names generated in the parsing pass are available, but the main
9562
    name table has not yet been created. So for the named varieties, scan the
9563
    list of names in order to get the number of the first one in the pattern,
9564
    and whether or not this name is duplicated. */
9565

9566
    case META_BACKREF_BYNAME:
9567
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9568
      goto ISNOTFIXED;
9569
    /* Fall through */
9570

9571
    case META_RECURSE_BYNAME:
9572
      {
9573
      int i;
9574
      PCRE2_SPTR name;
9575
      BOOL is_dupname = FALSE;
9576
      named_group *ng = cb->named_groups;
9577
      uint32_t meta_code = META_CODE(*pptr);
9578
      uint32_t length = *(++pptr);
9579

9580
      GETPLUSOFFSET(offset, pptr);
9581
      name = cb->start_pattern + offset;
9582
      for (i = 0; i < cb->names_found; i++, ng++)
9583
        {
9584
        if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9585
          {
9586
          group = ng->number;
9587
          is_dupname = ng->isdup;
9588
          break;
9589
          }
9590
        }
9591

9592
      if (group == 0)
9593
        {
9594
        *errcodeptr = ERR15;  /* Non-existent subpattern */
9595
        cb->erroroffset = offset;
9596
        return -1;
9597
        }
9598

9599
      /* A numerical back reference can be fixed length if duplicate capturing
9600
      groups are not being used. A non-duplicate named back reference can also
9601
      be handled. */
9602

9603
      if (meta_code == META_RECURSE_BYNAME ||
9604
          (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9605
        goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9606
      }
9607
    goto ISNOTFIXED;                     /* Duplicate name or number */
9608

9609
    /* The offset values for back references < 10 are in a separate vector
9610
    because otherwise they would use more than two parsed pattern elements on
9611
    64-bit systems. */
9612

9613
    case META_BACKREF:
9614
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9615
        (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9616
      goto ISNOTFIXED;
9617
    group = META_DATA(*pptr);
9618
    if (group < 10)
9619
      {
9620
      offset = cb->small_ref_offset[group];
9621
      goto RECURSE_OR_BACKREF_LENGTH;
9622
      }
9623

9624
    /* Fall through */
9625
    /* For groups >= 10 - picking up group twice does no harm. */
9626

9627
    /* A true recursion implies not fixed length, but a subroutine call may
9628
    be OK. Back reference "recursions" are also failed. */
9629

9630
    case META_RECURSE:
9631
    group = META_DATA(*pptr);
9632
    GETPLUSOFFSET(offset, pptr);
9633

9634
    RECURSE_OR_BACKREF_LENGTH:
9635
    if (group > cb->bracount)
9636
      {
9637
      cb->erroroffset = offset;
9638
      *errcodeptr = ERR15;  /* Non-existent subpattern */
9639
      return -1;
9640
      }
9641
    if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9642
    for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9643
      {
9644
      if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9645
        else if (*gptr == (META_CAPTURE | group)) break;
9646
      }
9647

9648
    /* We must start the search for the end of the group at the first meta code
9649
    inside the group. Otherwise it will be treated as an enclosed group. */
9650

9651
    gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9652
    if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9653
    if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9654
    for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9655
    if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9656
    this_recurse.prev = recurses;
9657
    this_recurse.groupptr = gptr;
9658

9659
    /* We do not need to know the position of the end of the group, that is,
9660
    gptr is not used after the call to get_grouplength(). Setting the second
9661
    argument FALSE stops it scanning for the end when the length can be found
9662
    in the cache. */
9663

9664
    gptr++;
9665
    grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9666
      lcptr, group, &this_recurse, cb);
9667
    if (grouplength < 0)
9668
      {
9669
      if (*errcodeptr == 0) goto ISNOTFIXED;
9670
      return -1;  /* Error already set */
9671
      }
9672
    itemlength = grouplength;
9673
    itemminlength = groupminlength;
9674
    break;
9675

9676
    /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9677
    the length of this branch. Skip from the following item to the next
9678
    unpaired ket. */
9679

9680
    case META_COND_DEFINE:
9681
    pptr = parsed_skip(pptr + 1, PSKIP_KET);
9682
    break;
9683

9684
    /* Check other nested groups - advance past the initial data for each type
9685
    and then seek a fixed length with get_grouplength(). */
9686

9687
    case META_COND_NAME:
9688
    case META_COND_NUMBER:
9689
    case META_COND_RNAME:
9690
    case META_COND_RNUMBER:
9691
    pptr += 2 + SIZEOFFSET;
9692
    goto CHECK_GROUP;
9693

9694
    case META_COND_ASSERT:
9695
    pptr += 1;
9696
    goto CHECK_GROUP;
9697

9698
    case META_COND_VERSION:
9699
    pptr += 4;
9700
    goto CHECK_GROUP;
9701

9702
    case META_CAPTURE:
9703
    group = META_DATA(*pptr);
9704
    /* Fall through */
9705

9706
    case META_ATOMIC:
9707
    case META_NOCAPTURE:
9708
    case META_SCRIPT_RUN:
9709
    pptr++;
9710
    CHECK_GROUP:
9711
    grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9712
      lcptr, group, recurses, cb);
9713
    if (grouplength < 0) return -1;
9714
    itemlength = grouplength;
9715
    itemminlength = groupminlength;
9716
    break;
9717

9718
    case META_QUERY:
9719
    case META_QUERY_PLUS:
9720
    case META_QUERY_QUERY:
9721
    min = 0;
9722
    max = 1;
9723
    goto REPETITION;
9724

9725
    /* Exact repetition is OK; variable repetition is not. A repetition of zero
9726
    must subtract the length that has already been added. */
9727

9728
    case META_MINMAX:
9729
    case META_MINMAX_PLUS:
9730
    case META_MINMAX_QUERY:
9731
    min = pptr[1];
9732
    max = pptr[2];
9733
    pptr += 2;
9734

9735
    REPETITION:
9736
    if (max != REPEAT_UNLIMITED)
9737
      {
9738
      if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9739
          max != 0 &&
9740
          (INT_MAX - branchlength)/lastitemlength < max - 1)
9741
        {
9742
        *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9743
        return -1;
9744
        }
9745
      if (min == 0) branchminlength -= lastitemminlength;
9746
        else itemminlength = (min - 1) * lastitemminlength;
9747
      if (max == 0) branchlength -= lastitemlength;
9748
        else itemlength = (max - 1) * lastitemlength;
9749
      break;
9750
      }
9751
    /* Fall through */
9752

9753
    /* Any other item means this branch does not have a fixed length. */
9754

9755
    default:
9756
    ISNOTFIXED:
9757
    *errcodeptr = ERR25;   /* Not fixed length */
9758
    return -1;
9759
    }
9760

9761
  /* Add the item length to the branchlength, checking for integer overflow and
9762
  for the branch length exceeding the overall limit. Later, if there is at
9763
  least one variable-length branch in the group, there is a test for the
9764
  (smaller) variable-length branch length limit. */
9765

9766
  if (INT_MAX - branchlength < (int)itemlength ||
9767
      (branchlength += itemlength) > LOOKBEHIND_MAX)
9768
    {
9769
    *errcodeptr = ERR87;
9770
    return -1;
9771
    }
9772

9773
  branchminlength += itemminlength;
9774

9775
  /* Save this item length for use if the next item is a quantifier. */
9776

9777
  lastitemlength = itemlength;
9778
  lastitemminlength = itemminlength;
9779
  }
9780

9781
EXIT:
9782
*pptrptr = pptr;
9783
*minptr = branchminlength;
9784
return branchlength;
9785

9786
PARSED_SKIP_FAILED:
9787
PCRE2_DEBUG_UNREACHABLE();
9788
*errcodeptr = ERR90;  /* Unhandled META code - internal error */
9789
return -1;
9790
}
9791

9792

9793

9794
/*************************************************
9795
*        Set lengths in a lookbehind             *
9796
*************************************************/
9797

9798
/* This function is called for each lookbehind, to set the lengths in its
9799
branches. An error occurs if any branch does not have a limited maximum length
9800
that is less than the limit (65535). On exit, the pointer must be left on the
9801
final ket.
9802

9803
The function also maintains the max_lookbehind value. Any lookbehind branch
9804
that contains a nested lookbehind may actually look further back than the
9805
length of the branch. The additional amount is passed back from
9806
get_branchlength() as an "extra" value.
9807

9808
Arguments:
9809
  pptrptr     pointer to pointer in the parsed pattern
9810
  errcodeptr  pointer to error code
9811
  lcptr       pointer to loop counter
9812
  recurses    chain of recurse_check to catch mutual recursion
9813
  cb          pointer to compile block
9814

9815
Returns:      TRUE if all is well
9816
              FALSE otherwise, with error code and offset set
9817
*/
9818

9819
static BOOL
9820
set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9821
  parsed_recurse_check *recurses, compile_block *cb)
9822
{
9823
PCRE2_SIZE offset;
9824
uint32_t *bptr = *pptrptr;
9825
uint32_t *gbptr = bptr;
9826
int maxlength = 0;
9827
int minlength = INT_MAX;
9828
BOOL variable = FALSE;
9829

9830
READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
9831
*pptrptr += SIZEOFFSET;
9832

9833
/* Each branch can have a different maximum length, but we can keep only a
9834
single minimum for the whole group, because there's nowhere to save individual
9835
values in the META_ALT item. */
9836

9837
do
9838
  {
9839
  int branchlength, branchminlength;
9840

9841
  *pptrptr += 1;
9842
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9843
    recurses, cb);
9844

9845
  if (branchlength < 0)
9846
    {
9847
    /* The errorcode and offset may already be set from a nested lookbehind. */
9848
    if (*errcodeptr == 0) *errcodeptr = ERR25;
9849
    if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9850
    return FALSE;
9851
    }
9852

9853
  if (branchlength != branchminlength) variable = TRUE;
9854
  if (branchminlength < minlength) minlength = branchminlength;
9855
  if (branchlength > maxlength) maxlength = branchlength;
9856
  if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9857
  *bptr |= branchlength;  /* branchlength never more than 65535 */
9858
  bptr = *pptrptr;
9859
  }
9860
while (META_CODE(*bptr) == META_ALT);
9861

9862
/* If any branch is of variable length, the whole lookbehind is of variable
9863
length. If the maximum length of any branch exceeds the maximum for variable
9864
lookbehinds, give an error. Otherwise, the minimum length is set in the word
9865
that follows the original group META value. For a fixed-length lookbehind, this
9866
is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
9867
possibly different) length. */
9868

9869
if (variable)
9870
  {
9871
  gbptr[1] = minlength;
9872
  if ((PCRE2_SIZE)maxlength > cb->max_varlookbehind)
9873
    {
9874
    *errcodeptr = ERR100;
9875
    cb->erroroffset = offset;
9876
    return FALSE;
9877
    }
9878
  }
9879
else gbptr[1] = LOOKBEHIND_MAX;
9880

9881
return TRUE;
9882
}
9883

9884

9885

9886
/*************************************************
9887
*         Check parsed pattern lookbehinds       *
9888
*************************************************/
9889

9890
/* This function is called at the end of parsing a pattern if any lookbehinds
9891
were encountered. It scans the parsed pattern for them, calling
9892
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9893
the error offset is marked unset. This enables the functions above not to
9894
override settings from deeper nestings.
9895

9896
This function is called recursively from get_branchlength() for lookaheads in
9897
order to process any lookbehinds that they may contain. It stops when it hits a
9898
non-nested closing parenthesis in this case, returning a pointer to it.
9899

9900
Arguments
9901
  pptr      points to where to start (start of pattern or start of lookahead)
9902
  retptr    if not NULL, return the ket pointer here
9903
  recurses  chain of recurse_check to catch mutual recursion
9904
  cb        points to the compile block
9905
  lcptr     points to loop counter
9906

9907
Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
9908
*/
9909

9910
static int
9911
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9912
  parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9913
{
9914
int errorcode = 0;
9915
int nestlevel = 0;
9916

9917
cb->erroroffset = PCRE2_UNSET;
9918

9919
for (; *pptr != META_END; pptr++)
9920
  {
9921
  if (*pptr < META_END) continue;  /* Literal */
9922

9923
  switch (META_CODE(*pptr))
9924
    {
9925
    default:
9926

9927
    /* The following erroroffset is a bogus but safe value. This branch should
9928
    be avoided by providing a proper implementation for all supported cases
9929
    below. */
9930

9931
    PCRE2_DEBUG_UNREACHABLE();
9932
    cb->erroroffset = 0;
9933
    return ERR70;  /* Unrecognized meta code */
9934

9935
    case META_ESCAPE:
9936
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9937
      pptr += 1;    /* Skip prop data */
9938
    break;
9939

9940
    case META_KET:
9941
    if (--nestlevel < 0)
9942
      {
9943
      if (retptr != NULL) *retptr = pptr;
9944
      return 0;
9945
      }
9946
    break;
9947

9948
    case META_ATOMIC:
9949
    case META_CAPTURE:
9950
    case META_COND_ASSERT:
9951
    case META_SCS:
9952
    case META_LOOKAHEAD:
9953
    case META_LOOKAHEADNOT:
9954
    case META_LOOKAHEAD_NA:
9955
    case META_NOCAPTURE:
9956
    case META_SCRIPT_RUN:
9957
    nestlevel++;
9958
    break;
9959

9960
    case META_ACCEPT:
9961
    case META_ALT:
9962
    case META_ASTERISK:
9963
    case META_ASTERISK_PLUS:
9964
    case META_ASTERISK_QUERY:
9965
    case META_BACKREF:
9966
    case META_CIRCUMFLEX:
9967
    case META_CLASS:
9968
    case META_CLASS_EMPTY:
9969
    case META_CLASS_EMPTY_NOT:
9970
    case META_CLASS_END:
9971
    case META_CLASS_NOT:
9972
    case META_COMMIT:
9973
    case META_DOLLAR:
9974
    case META_DOT:
9975
    case META_FAIL:
9976
    case META_PLUS:
9977
    case META_PLUS_PLUS:
9978
    case META_PLUS_QUERY:
9979
    case META_PRUNE:
9980
    case META_QUERY:
9981
    case META_QUERY_PLUS:
9982
    case META_QUERY_QUERY:
9983
    case META_RANGE_ESCAPED:
9984
    case META_RANGE_LITERAL:
9985
    case META_SKIP:
9986
    case META_THEN:
9987
    break;
9988

9989
    case META_OFFSET:
9990
    case META_RECURSE:
9991
    pptr += SIZEOFFSET;
9992
    break;
9993

9994
    case META_BACKREF_BYNAME:
9995
    case META_RECURSE_BYNAME:
9996
    pptr += 1 + SIZEOFFSET;
9997
    break;
9998

9999
    case META_COND_DEFINE:
10000
    pptr += SIZEOFFSET;
10001
    nestlevel++;
10002
    break;
10003

10004
    case META_COND_NAME:
10005
    case META_COND_NUMBER:
10006
    case META_COND_RNAME:
10007
    case META_COND_RNUMBER:
10008
    pptr += 1 + SIZEOFFSET;
10009
    nestlevel++;
10010
    break;
10011

10012
    case META_COND_VERSION:
10013
    pptr += 3;
10014
    nestlevel++;
10015
    break;
10016

10017
    case META_CALLOUT_STRING:
10018
    pptr += 3 + SIZEOFFSET;
10019
    break;
10020

10021
    case META_BIGVALUE:
10022
    case META_POSIX:
10023
    case META_POSIX_NEG:
10024
    case META_SCS_NAME:
10025
    case META_SCS_NUMBER:
10026
    pptr += 1;
10027
    break;
10028

10029
    case META_MINMAX:
10030
    case META_MINMAX_QUERY:
10031
    case META_MINMAX_PLUS:
10032
    case META_OPTIONS:
10033
    pptr += 2;
10034
    break;
10035

10036
    case META_CALLOUT_NUMBER:
10037
    pptr += 3;
10038
    break;
10039

10040
    case META_MARK:
10041
    case META_COMMIT_ARG:
10042
    case META_PRUNE_ARG:
10043
    case META_SKIP_ARG:
10044
    case META_THEN_ARG:
10045
    pptr += 1 + pptr[1];
10046
    break;
10047

10048
    /* Note that set_lookbehind_lengths() updates pptr, leaving it pointing to
10049
    the final ket of the group, so no need to update it here. */
10050

10051
    case META_LOOKBEHIND:
10052
    case META_LOOKBEHINDNOT:
10053
    case META_LOOKBEHIND_NA:
10054
    if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10055
      return errorcode;
10056
    break;
10057
    }
10058
  }
10059

10060
return 0;
10061
}
10062

10063

10064

10065
/*************************************************
10066
*     External function to compile a pattern     *
10067
*************************************************/
10068

10069
/* This function reads a regular expression in the form of a string and returns
10070
a pointer to a block of store holding a compiled version of the expression.
10071

10072
Arguments:
10073
  pattern       the regular expression
10074
  patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
10075
  options       option bits
10076
  errorptr      pointer to errorcode
10077
  erroroffset   pointer to error offset
10078
  ccontext      points to a compile context or is NULL
10079

10080
Returns:        pointer to compiled data block, or NULL on error,
10081
                with errorcode and erroroffset set
10082
*/
10083

10084
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
10085
pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10086
   int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10087
{
10088
BOOL utf;                             /* Set TRUE for UTF mode */
10089
BOOL ucp;                             /* Set TRUE for UCP mode */
10090
BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
10091
BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
10092
pcre2_real_code *re = NULL;           /* What we will return */
10093
compile_block cb;                     /* "Static" compile-time data */
10094
const uint8_t *tables;                /* Char tables base pointer */
10095

10096
PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
10097
PCRE2_UCHAR * codestart;              /* Start of compiled code */
10098
PCRE2_SPTR ptr;                       /* Current pointer in pattern */
10099
uint32_t *pptr;                       /* Current pointer in parsed pattern */
10100

10101
PCRE2_SIZE length = 1;                /* Allow for final END opcode */
10102
PCRE2_SIZE usedlength;                /* Actual length used */
10103
PCRE2_SIZE re_blocksize;              /* Size of memory block */
10104
PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
10105

10106
uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
10107
uint32_t firstcu, reqcu;              /* Value of first/req code unit */
10108
uint32_t setflags = 0;                /* NL and BSR set flags */
10109
uint32_t xoptions;                    /* Flags from context, modified */
10110

10111
uint32_t skipatstart;                 /* When checking (*UTF) etc */
10112
uint32_t limit_heap  = UINT32_MAX;
10113
uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
10114
uint32_t limit_depth = UINT32_MAX;
10115

10116
int newline = 0;                      /* Unset; can be set by the pattern */
10117
int bsr = 0;                          /* Unset; can be set by the pattern */
10118
int errorcode = 0;                    /* Initialize to avoid compiler warn */
10119
int regexrc;                          /* Return from compile */
10120

10121
uint32_t i;                           /* Local loop counter */
10122

10123
/* Enable all optimizations by default. */
10124
uint32_t optim_flags = ccontext != NULL ? ccontext->optimization_flags :
10125
                                          PCRE2_OPTIMIZATION_ALL;
10126

10127
/* Comments at the head of this file explain about these variables. */
10128

10129
uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10130
uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10131
named_group named_groups[NAMED_GROUP_LIST_SIZE];
10132

10133
/* The workspace is used in different ways in the different compiling phases.
10134
It needs to be 16-bit aligned for the preliminary parsing scan. */
10135

10136
uint32_t c16workspace[C16_WORK_SIZE];
10137
PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10138

10139

10140
/* -------------- Check arguments and set up the pattern ----------------- */
10141

10142
/* There must be error code and offset pointers. */
10143

10144
if (errorptr == NULL || erroroffset == NULL) return NULL;
10145
*errorptr = ERR0;
10146
*erroroffset = 0;
10147

10148
/* There must be a pattern, but NULL is allowed with zero length. */
10149

10150
if (pattern == NULL)
10151
  {
10152
  if (patlen == 0) pattern = (PCRE2_SPTR)""; else
10153
    {
10154
    *errorptr = ERR16;
10155
    return NULL;
10156
    }
10157
  }
10158

10159
/* A NULL compile context means "use a default context" */
10160

10161
if (ccontext == NULL)
10162
  ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10163

10164
/* PCRE2_MATCH_INVALID_UTF implies UTF */
10165

10166
if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10167

10168
/* Check that all undefined public option bits are zero. */
10169

10170
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10171
    (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10172
  {
10173
  *errorptr = ERR17;
10174
  return NULL;
10175
  }
10176

10177
if ((options & PCRE2_LITERAL) != 0 &&
10178
    ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10179
     (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10180
  {
10181
  *errorptr = ERR92;
10182
  return NULL;
10183
  }
10184

10185
/* A zero-terminated pattern is indicated by the special length value
10186
PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10187

10188
if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10189
  patlen = PRIV(strlen)(pattern);
10190
(void)zero_terminated; /* Silence compiler; only used if Valgrind enabled */
10191

10192
if (patlen > ccontext->max_pattern_length)
10193
  {
10194
  *errorptr = ERR88;
10195
  return NULL;
10196
  }
10197

10198
/* Optimization flags in 'options' can override those in the compile context.
10199
This is because some options to disable optimizations were added before the
10200
optimization flags word existed, and we need to continue supporting them
10201
for backwards compatibility. */
10202

10203
if ((options & PCRE2_NO_AUTO_POSSESS) != 0)
10204
  optim_flags &= ~PCRE2_OPTIM_AUTO_POSSESS;
10205
if ((options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
10206
  optim_flags &= ~PCRE2_OPTIM_DOTSTAR_ANCHOR;
10207
if ((options & PCRE2_NO_START_OPTIMIZE) != 0)
10208
  optim_flags &= ~PCRE2_OPTIM_START_OPTIMIZE;
10209

10210
/* From here on, all returns from this function should end up going via the
10211
EXIT label. */
10212

10213

10214
/* ------------ Initialize the "static" compile data -------------- */
10215

10216
tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10217

10218
cb.lcc = tables + lcc_offset;          /* Individual */
10219
cb.fcc = tables + fcc_offset;          /*   character */
10220
cb.cbits = tables + cbits_offset;      /*      tables */
10221
cb.ctypes = tables + ctypes_offset;
10222

10223
cb.assert_depth = 0;
10224
cb.bracount = 0;
10225
cb.cx = ccontext;
10226
cb.dupnames = FALSE;
10227
cb.end_pattern = pattern + patlen;
10228
cb.erroroffset = 0;
10229
cb.external_flags = 0;
10230
cb.external_options = options;
10231
cb.groupinfo = stack_groupinfo;
10232
cb.had_recurse = FALSE;
10233
cb.lastcapture = 0;
10234
cb.max_lookbehind = 0;                               /* Max encountered */
10235
cb.max_varlookbehind = ccontext->max_varlookbehind;  /* Limit */
10236
cb.name_entry_size = 0;
10237
cb.name_table = NULL;
10238
cb.named_groups = named_groups;
10239
cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10240
cb.names_found = 0;
10241
cb.parens_depth = 0;
10242
cb.parsed_pattern = stack_parsed_pattern;
10243
cb.req_varyopt = 0;
10244
cb.start_code = cworkspace;
10245
cb.start_pattern = pattern;
10246
cb.start_workspace = cworkspace;
10247
cb.workspace_size = COMPILE_WORK_SIZE;
10248
#ifdef SUPPORT_WIDE_CHARS
10249
cb.cranges = NULL;
10250
cb.next_cranges = NULL;
10251
cb.char_lists_size = 0;
10252
#endif
10253

10254
/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10255
references to help in deciding whether (.*) can be treated as anchored or not.
10256
*/
10257

10258
cb.top_backref = 0;
10259
cb.backref_map = 0;
10260

10261
/* Escape sequences \1 to \9 are always back references, but as they are only
10262
two characters long, only two elements can be used in the parsed_pattern
10263
vector. The first contains the reference, and we'd like to use the second to
10264
record the offset in the pattern, so that forward references to non-existent
10265
groups can be diagnosed later with an offset. However, on 64-bit systems,
10266
PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10267
occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10268
references have enough space for the offset to be put into the parsed pattern.
10269
*/
10270

10271
for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10272

10273

10274
/* --------------- Start looking at the pattern --------------- */
10275

10276
/* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10277
the start of the pattern, and remember the offset to the actual regex. With
10278
valgrind support, make the terminator of a zero-terminated pattern
10279
inaccessible. This catches bugs that would otherwise only show up for
10280
non-zero-terminated patterns. */
10281

10282
#ifdef SUPPORT_VALGRIND
10283
if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10284
#endif
10285

10286
xoptions = ccontext->extra_options;
10287
ptr = pattern;
10288
skipatstart = 0;
10289

10290
if ((options & PCRE2_LITERAL) == 0)
10291
  {
10292
  while (patlen - skipatstart >= 2 &&
10293
         ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10294
         ptr[skipatstart+1] == CHAR_ASTERISK)
10295
    {
10296
    for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10297
      {
10298
      const pso *p = pso_list + i;
10299

10300
      if (patlen - skipatstart - 2 >= p->length &&
10301
          PRIV(strncmp_c8)(ptr + skipatstart + 2, p->name, p->length) == 0)
10302
        {
10303
        uint32_t c, pp;
10304

10305
        skipatstart += p->length + 2;
10306
        switch(p->type)
10307
          {
10308
          case PSO_OPT:
10309
          cb.external_options |= p->value;
10310
          break;
10311

10312
          case PSO_XOPT:
10313
          xoptions |= p->value;
10314
          break;
10315

10316
          case PSO_FLG:
10317
          setflags |= p->value;
10318
          break;
10319

10320
          case PSO_NL:
10321
          newline = p->value;
10322
          setflags |= PCRE2_NL_SET;
10323
          break;
10324

10325
          case PSO_BSR:
10326
          bsr = p->value;
10327
          setflags |= PCRE2_BSR_SET;
10328
          break;
10329

10330
          case PSO_LIMM:
10331
          case PSO_LIMD:
10332
          case PSO_LIMH:
10333
          c = 0;
10334
          pp = skipatstart;
10335
          while (pp < patlen && IS_DIGIT(ptr[pp]))
10336
            {
10337
            if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
10338
            c = c*10 + (ptr[pp++] - CHAR_0);
10339
            }
10340
          if (pp >= patlen || pp == skipatstart || ptr[pp] != CHAR_RIGHT_PARENTHESIS)
10341
            {
10342
            errorcode = ERR60;
10343
            ptr += pp;
10344
            goto HAD_EARLY_ERROR;
10345
            }
10346
          if (p->type == PSO_LIMH) limit_heap = c;
10347
            else if (p->type == PSO_LIMM) limit_match = c;
10348
            else limit_depth = c;
10349
          skipatstart = ++pp;
10350
          break;
10351

10352
          case PSO_OPTMZ:
10353
          optim_flags &= ~(p->value);
10354

10355
          /* For backward compatibility the three original VERBs to disable
10356
          optimizations need to also update the corresponding bit in the
10357
          external options. */
10358

10359
          switch(p->value)
10360
            {
10361
            case PCRE2_OPTIM_AUTO_POSSESS:
10362
            cb.external_options |= PCRE2_NO_AUTO_POSSESS;
10363
            break;
10364

10365
            case PCRE2_OPTIM_DOTSTAR_ANCHOR:
10366
            cb.external_options |= PCRE2_NO_DOTSTAR_ANCHOR;
10367
            break;
10368

10369
            case PCRE2_OPTIM_START_OPTIMIZE:
10370
            cb.external_options |= PCRE2_NO_START_OPTIMIZE;
10371
            break;
10372
            }
10373

10374
          break;
10375

10376
          default:
10377
          /* All values in the enum need an explicit entry for this switch
10378
          but until a better way to prevent coding mistakes is invented keep
10379
          a catch all that triggers a debug build assert as a failsafe */
10380
          PCRE2_DEBUG_UNREACHABLE();
10381
          }
10382
        break;   /* Out of the table scan loop */
10383
        }
10384
      }
10385
    if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
10386
    }
10387
    PCRE2_ASSERT(skipatstart <= patlen);
10388
  }
10389

10390
/* End of pattern-start options; advance to start of real regex. */
10391

10392
ptr += skipatstart;
10393

10394
/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10395

10396
#ifndef SUPPORT_UNICODE
10397
if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10398
  {
10399
  errorcode = ERR32;
10400
  goto HAD_EARLY_ERROR;
10401
  }
10402
#endif
10403

10404
/* Check UTF. We have the original options in 'options', with that value as
10405
modified by (*UTF) etc in cb->external_options. The extra option
10406
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10407
surrogate code points cannot be represented in UTF-16. */
10408

10409
utf = (cb.external_options & PCRE2_UTF) != 0;
10410
if (utf)
10411
  {
10412
  if ((options & PCRE2_NEVER_UTF) != 0)
10413
    {
10414
    errorcode = ERR74;
10415
    goto HAD_EARLY_ERROR;
10416
    }
10417
  if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10418
       (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10419
    goto HAD_ERROR;  /* Offset was set by valid_utf() */
10420

10421
#if PCRE2_CODE_UNIT_WIDTH == 16
10422
  if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10423
    {
10424
    errorcode = ERR91;
10425
    goto HAD_EARLY_ERROR;
10426
    }
10427
#endif
10428
  }
10429

10430
/* Check UCP lockout. */
10431

10432
ucp = (cb.external_options & PCRE2_UCP) != 0;
10433
if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10434
  {
10435
  errorcode = ERR75;
10436
  goto HAD_EARLY_ERROR;
10437
  }
10438

10439
/* PCRE2_EXTRA_TURKISH_CASING checks */
10440

10441
if ((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)
10442
  {
10443
  if (!utf && !ucp)
10444
    {
10445
    errorcode = ERR104;
10446
    goto HAD_EARLY_ERROR;
10447
    }
10448

10449
#if PCRE2_CODE_UNIT_WIDTH == 8
10450
  if (!utf)
10451
    {
10452
    errorcode = ERR105;
10453
    goto HAD_EARLY_ERROR;
10454
    }
10455
#endif
10456

10457
  if ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)
10458
    {
10459
    errorcode = ERR106;
10460
    goto HAD_EARLY_ERROR;
10461
    }
10462
  }
10463

10464
/* Process the BSR setting. */
10465

10466
if (bsr == 0) bsr = ccontext->bsr_convention;
10467

10468
/* Process the newline setting. */
10469

10470
if (newline == 0) newline = ccontext->newline_convention;
10471
cb.nltype = NLTYPE_FIXED;
10472
switch(newline)
10473
  {
10474
  case PCRE2_NEWLINE_CR:
10475
  cb.nllen = 1;
10476
  cb.nl[0] = CHAR_CR;
10477
  break;
10478

10479
  case PCRE2_NEWLINE_LF:
10480
  cb.nllen = 1;
10481
  cb.nl[0] = CHAR_NL;
10482
  break;
10483

10484
  case PCRE2_NEWLINE_NUL:
10485
  cb.nllen = 1;
10486
  cb.nl[0] = CHAR_NUL;
10487
  break;
10488

10489
  case PCRE2_NEWLINE_CRLF:
10490
  cb.nllen = 2;
10491
  cb.nl[0] = CHAR_CR;
10492
  cb.nl[1] = CHAR_NL;
10493
  break;
10494

10495
  case PCRE2_NEWLINE_ANY:
10496
  cb.nltype = NLTYPE_ANY;
10497
  break;
10498

10499
  case PCRE2_NEWLINE_ANYCRLF:
10500
  cb.nltype = NLTYPE_ANYCRLF;
10501
  break;
10502

10503
  default:
10504
  PCRE2_DEBUG_UNREACHABLE();
10505
  errorcode = ERR56;
10506
  goto HAD_EARLY_ERROR;
10507
  }
10508

10509
/* Pre-scan the pattern to do two things: (1) Discover the named groups and
10510
their numerical equivalents, so that this information is always available for
10511
the remaining processing. (2) At the same time, parse the pattern and put a
10512
processed version into the parsed_pattern vector. This has escapes interpreted
10513
and comments removed (amongst other things). */
10514

10515
/* Ensure that the parsed pattern buffer is big enough. For many smaller
10516
patterns the vector on the stack (which was set up above) can be used. */
10517

10518
parsed_size_needed = max_parsed_pattern(ptr, cb.end_pattern, utf, options);
10519

10520
/* Allow for 2x uint32_t at the start and 2 at the end, for
10521
PCRE2_EXTRA_MATCH_WORD or PCRE2_EXTRA_MATCH_LINE (which are exclusive). */
10522

10523
if ((ccontext->extra_options &
10524
     (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10525
  parsed_size_needed += 4;
10526

10527
/* When PCRE2_AUTO_CALLOUT is set we allow for one callout at the end. */
10528

10529
if ((options & PCRE2_AUTO_CALLOUT) != 0)
10530
  parsed_size_needed += 4;
10531

10532
parsed_size_needed += 1;  /* For the final META_END */
10533

10534
if (parsed_size_needed > PARSED_PATTERN_DEFAULT_SIZE)
10535
  {
10536
  uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10537
    parsed_size_needed * sizeof(uint32_t), ccontext->memctl.memory_data);
10538
  if (heap_parsed_pattern == NULL)
10539
    {
10540
    *errorptr = ERR21;
10541
    goto EXIT;
10542
    }
10543
  cb.parsed_pattern = heap_parsed_pattern;
10544
  }
10545
cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed;
10546

10547
/* Do the parsing scan. */
10548

10549
errorcode = parse_regex(ptr, cb.external_options, xoptions, &has_lookbehind, &cb);
10550
if (errorcode != 0) goto HAD_CB_ERROR;
10551

10552
/* If there are any lookbehinds, scan the parsed pattern to figure out their
10553
lengths. Workspace is needed to remember whether numbered groups are or are not
10554
of limited length, and if limited, what the minimum and maximum lengths are.
10555
This caching saves re-computing the length of any group that is referenced more
10556
than once, which is particularly relevant when recursion is involved.
10557
Unnumbered groups do not have this exposure because they cannot be referenced.
10558
If there are sufficiently few groups, the default index vector on the stack, as
10559
set up above, can be used. Otherwise we have to get/free some heap memory. The
10560
vector must be initialized to zero. */
10561

10562
if (has_lookbehind)
10563
  {
10564
  int loopcount = 0;
10565
  if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10566
    {
10567
    cb.groupinfo = ccontext->memctl.malloc(
10568
      (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10569
    if (cb.groupinfo == NULL)
10570
      {
10571
      errorcode = ERR21;
10572
      cb.erroroffset = 0;
10573
      goto HAD_CB_ERROR;
10574
      }
10575
    }
10576
  memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10577
  errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10578
  if (errorcode != 0) goto HAD_CB_ERROR;
10579
  }
10580

10581
/* For debugging, there is a function that shows the parsed pattern vector. */
10582

10583
#ifdef DEBUG_SHOW_PARSED
10584
fprintf(stderr, "+++ Pre-scan complete:\n");
10585
show_parsed(&cb);
10586
#endif
10587

10588
/* For debugging capturing information this code can be enabled. */
10589

10590
#ifdef DEBUG_SHOW_CAPTURES
10591
  {
10592
  named_group *ng = cb.named_groups;
10593
  fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10594
  for (i = 0; i < cb.names_found; i++, ng++)
10595
    {
10596
    fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10597
    }
10598
  }
10599
#endif
10600

10601
/* Pretend to compile the pattern while actually just accumulating the amount
10602
of memory required in the 'length' variable. This behaviour is triggered by
10603
passing a non-NULL final argument to compile_regex(). We pass a block of
10604
workspace (cworkspace) for it to compile parts of the pattern into; the
10605
compiled code is discarded when it is no longer needed, so hopefully this
10606
workspace will never overflow, though there is a test for its doing so.
10607

10608
On error, errorcode will be set non-zero, so we don't need to look at the
10609
result of the function. The initial options have been put into the cb block,
10610
but we still have to pass a separate options variable (the first argument)
10611
because the options may change as the pattern is processed. */
10612

10613
cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10614
pptr = cb.parsed_pattern;
10615
code = cworkspace;
10616
*code = OP_BRA;
10617

10618
(void)compile_regex(cb.external_options, xoptions, &code, &pptr,
10619
   &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10620
   &cb, &length);
10621

10622
if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10623

10624
/* This should be caught in compile_regex(), but just in case... */
10625

10626
#if defined SUPPORT_WIDE_CHARS
10627
PCRE2_ASSERT((cb.char_lists_size & 0x3) == 0);
10628
if (length > MAX_PATTERN_SIZE ||
10629
    MAX_PATTERN_SIZE - length < (cb.char_lists_size / sizeof(PCRE2_UCHAR)))
10630
#else
10631
if (length > MAX_PATTERN_SIZE)
10632
#endif
10633
  {
10634
  errorcode = ERR20;
10635
  goto HAD_CB_ERROR;
10636
  }
10637

10638
/* Compute the size of, then, if not too large, get and initialize the data
10639
block for storing the compiled pattern and names table. Integer overflow should
10640
no longer be possible because nowadays we limit the maximum value of
10641
cb.names_found and cb.name_entry_size. */
10642

10643
re_blocksize =
10644
  CU2BYTES((PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10645

10646
#if defined SUPPORT_WIDE_CHARS
10647
if (cb.char_lists_size != 0)
10648
  {
10649
#if PCRE2_CODE_UNIT_WIDTH != 32
10650
  /* Align to 32 bit first. This ensures the
10651
  allocated area will also be 32 bit aligned. */
10652
  re_blocksize = (PCRE2_SIZE)CLIST_ALIGN_TO(re_blocksize, sizeof(uint32_t));
10653
#endif
10654
  re_blocksize += cb.char_lists_size;
10655
  }
10656
#endif
10657

10658
re_blocksize += CU2BYTES(length);
10659

10660
if (re_blocksize > ccontext->max_pattern_compiled_length)
10661
  {
10662
  errorcode = ERR101;
10663
  goto HAD_CB_ERROR;
10664
  }
10665

10666
re_blocksize += sizeof(pcre2_real_code);
10667
re = (pcre2_real_code *)
10668
  ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10669
if (re == NULL)
10670
  {
10671
  errorcode = ERR21;
10672
  goto HAD_CB_ERROR;
10673
  }
10674

10675
/* The compiler may put padding at the end of the pcre2_real_code structure in
10676
order to round it up to a multiple of 4 or 8 bytes. This means that when a
10677
compiled pattern is copied (for example, when serialized) undefined bytes are
10678
read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10679
write to the last 8 bytes of the structure before setting the fields. */
10680

10681
memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10682
re->memctl = ccontext->memctl;
10683
re->tables = tables;
10684
re->executable_jit = NULL;
10685
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10686
re->blocksize = re_blocksize;
10687
re->code_start = re_blocksize - CU2BYTES(length);
10688
re->magic_number = MAGIC_NUMBER;
10689
re->compile_options = options;
10690
re->overall_options = cb.external_options;
10691
re->extra_options = xoptions;
10692
re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10693
re->limit_heap = limit_heap;
10694
re->limit_match = limit_match;
10695
re->limit_depth = limit_depth;
10696
re->first_codeunit = 0;
10697
re->last_codeunit = 0;
10698
re->bsr_convention = bsr;
10699
re->newline_convention = newline;
10700
re->max_lookbehind = 0;
10701
re->minlength = 0;
10702
re->top_bracket = 0;
10703
re->top_backref = 0;
10704
re->name_entry_size = cb.name_entry_size;
10705
re->name_count = cb.names_found;
10706
re->optimization_flags = optim_flags;
10707

10708
/* The basic block is immediately followed by the name table, and the compiled
10709
code follows after that. */
10710

10711
codestart = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
10712

10713
/* Update the compile data block for the actual compile. The starting points of
10714
the name/number translation table and of the code are passed around in the
10715
compile data block. The start/end pattern and initial options are already set
10716
from the pre-compile phase, as is the name_entry_size field. */
10717

10718
cb.parens_depth = 0;
10719
cb.assert_depth = 0;
10720
cb.lastcapture = 0;
10721
cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10722
cb.start_code = codestart;
10723
cb.req_varyopt = 0;
10724
cb.had_accept = FALSE;
10725
cb.had_pruneorskip = FALSE;
10726
#ifdef SUPPORT_WIDE_CHARS
10727
cb.char_lists_size = 0;
10728
#endif
10729

10730

10731
/* If any named groups were found, create the name/number table from the list
10732
created in the pre-pass. */
10733

10734
if (cb.names_found > 0)
10735
  {
10736
  named_group *ng = cb.named_groups;
10737
  for (i = 0; i < cb.names_found; i++, ng++)
10738
    add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10739
  }
10740

10741
/* Set up a starting, non-extracting bracket, then compile the expression. On
10742
error, errorcode will be set non-zero, so we don't need to look at the result
10743
of the function here. */
10744

10745
pptr = cb.parsed_pattern;
10746
code = (PCRE2_UCHAR *)codestart;
10747
*code = OP_BRA;
10748
regexrc = compile_regex(re->overall_options, re->extra_options, &code,
10749
  &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10750
  NULL, &cb, NULL);
10751
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10752
re->top_bracket = cb.bracount;
10753
re->top_backref = cb.top_backref;
10754
re->max_lookbehind = cb.max_lookbehind;
10755

10756
if (cb.had_accept)
10757
  {
10758
  reqcu = 0;                     /* Must disable after (*ACCEPT) */
10759
  reqcuflags = REQ_NONE;
10760
  re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10761
  }
10762

10763
/* Fill in the final opcode and check for disastrous overflow. If no overflow,
10764
but the estimated length exceeds the really used length, adjust the value of
10765
re->blocksize, and if valgrind support is configured, mark the extra allocated
10766
memory as unaddressable, so that any out-of-bound reads can be detected. */
10767

10768
*code++ = OP_END;
10769
usedlength = code - codestart;
10770
if (usedlength > length)
10771
  {
10772
  PCRE2_DEBUG_UNREACHABLE();
10773
  errorcode = ERR23;  /* Overflow of code block - internal error */
10774
  }
10775
else
10776
  {
10777
  re->blocksize -= CU2BYTES(length - usedlength);
10778
#ifdef SUPPORT_VALGRIND
10779
  VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10780
#endif
10781
  }
10782

10783
/* Scan the pattern for recursion/subroutine calls and convert the group
10784
numbers into offsets. Maintain a small cache so that repeated groups containing
10785
recursions are efficiently handled. */
10786

10787
#define RSCAN_CACHE_SIZE 8
10788

10789
if (errorcode == 0 && cb.had_recurse)
10790
  {
10791
  PCRE2_UCHAR *rcode;
10792
  PCRE2_SPTR rgroup;
10793
  unsigned int ccount = 0;
10794
  int start = RSCAN_CACHE_SIZE;
10795
  recurse_cache rc[RSCAN_CACHE_SIZE];
10796

10797
  for (rcode = find_recurse(codestart, utf);
10798
       rcode != NULL;
10799
       rcode = find_recurse(rcode + 1 + LINK_SIZE, utf))
10800
    {
10801
    int p, groupnumber;
10802

10803
    groupnumber = (int)GET(rcode, 1);
10804
    if (groupnumber == 0) rgroup = codestart; else
10805
      {
10806
      PCRE2_SPTR search_from = codestart;
10807
      rgroup = NULL;
10808
      for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10809
        {
10810
        if (groupnumber == rc[p].groupnumber)
10811
          {
10812
          rgroup = rc[p].group;
10813
          break;
10814
          }
10815

10816
        /* Group n+1 must always start to the right of group n, so we can save
10817
        search time below when the new group number is greater than any of the
10818
        previously found groups. */
10819

10820
        if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10821
        }
10822

10823
      if (rgroup == NULL)
10824
        {
10825
        rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10826
        if (rgroup == NULL)
10827
          {
10828
          PCRE2_DEBUG_UNREACHABLE();
10829
          errorcode = ERR53;
10830
          break;
10831
          }
10832
        if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10833
        rc[start].groupnumber = groupnumber;
10834
        rc[start].group = rgroup;
10835
        if (ccount < RSCAN_CACHE_SIZE) ccount++;
10836
        }
10837
      }
10838

10839
    PUT(rcode, 1, (uint32_t)(rgroup - codestart));
10840
    }
10841
  }
10842

10843
/* In rare debugging situations we sometimes need to look at the compiled code
10844
at this stage. */
10845

10846
#ifdef DEBUG_CALL_PRINTINT
10847
pcre2_printint(re, stderr, TRUE);
10848
fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10849
#endif
10850

10851
/* Unless disabled, check whether any single character iterators can be
10852
auto-possessified. The function overwrites the appropriate opcode values, so
10853
the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10854
used in this code because at least one compiler gives a warning about loss of
10855
"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10856
function call. */
10857

10858
if (errorcode == 0 && (optim_flags & PCRE2_OPTIM_AUTO_POSSESS) != 0)
10859
  {
10860
  PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10861
  if (PRIV(auto_possessify)(temp, &cb) != 0)
10862
    {
10863
    PCRE2_DEBUG_UNREACHABLE();
10864
    errorcode = ERR80;
10865
    }
10866
  }
10867

10868
/* Failed to compile, or error while post-processing. */
10869

10870
if (errorcode != 0) goto HAD_CB_ERROR;
10871

10872
/* Successful compile. If the anchored option was not passed, set it if
10873
we can determine that the pattern is anchored by virtue of ^ characters or \A
10874
or anything else, such as starting with non-atomic .* when DOTALL is set and
10875
there are no occurrences of *PRUNE or *SKIP (though there is an option to
10876
disable this case). */
10877

10878
if ((re->overall_options & PCRE2_ANCHORED) == 0)
10879
  {
10880
  BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
10881
  if (is_anchored(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
10882
    re->overall_options |= PCRE2_ANCHORED;
10883
  }
10884

10885
/* Set up the first code unit or startline flag, the required code unit, and
10886
then study the pattern. This code need not be obeyed if PCRE2_OPTIM_START_OPTIMIZE
10887
is disabled, as the data it would create will not be used. Note that a first code
10888
unit (but not the startline flag) is useful for anchored patterns because it
10889
can still give a quick "no match" and also avoid searching for a last code
10890
unit. */
10891

10892
if ((optim_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)
10893
  {
10894
  int minminlength = 0;  /* For minimal minlength from first/required CU */
10895

10896
  /* If we do not have a first code unit, see if there is one that is asserted
10897
  (these are not saved during the compile because they can cause conflicts with
10898
  actual literals that follow). */
10899

10900
  if (firstcuflags >= REQ_NONE) {
10901
    uint32_t assertedcuflags = 0;
10902
    uint32_t assertedcu = find_firstassertedcu(codestart, &assertedcuflags, 0);
10903
    /* It would be wrong to use the asserted first code unit as `firstcu` for
10904
     * regexes which are able to match a 1-character string (e.g. /(?=a)b?a/)
10905
     * For that example, if we set both firstcu and reqcu to 'a', it would mean
10906
     * the subject string needs to be at least 2 characters long, which is wrong.
10907
     * With more analysis, we would be able to set firstcu in more cases. */
10908
    if (assertedcuflags < REQ_NONE && assertedcu != reqcu) {
10909
      firstcu = assertedcu;
10910
      firstcuflags = assertedcuflags;
10911
    }
10912
  }
10913

10914
  /* Save the data for a first code unit. The existence of one means the
10915
  minimum length must be at least 1. */
10916

10917
  if (firstcuflags < REQ_NONE)
10918
    {
10919
    re->first_codeunit = firstcu;
10920
    re->flags |= PCRE2_FIRSTSET;
10921
    minminlength++;
10922

10923
    /* Handle caseless first code units. */
10924

10925
    if ((firstcuflags & REQ_CASELESS) != 0)
10926
      {
10927
      if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10928
        {
10929
        if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10930
        }
10931

10932
      /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
10933
      In 8-bit UTF mode, code units in the range 128-255 are introductory code
10934
      units and cannot have another case, but if UCP is set they may do. */
10935

10936
#ifdef SUPPORT_UNICODE
10937
#if PCRE2_CODE_UNIT_WIDTH == 8
10938
      else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10939
        re->flags |= PCRE2_FIRSTCASELESS;
10940
#else
10941
      else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10942
               UCD_OTHERCASE(firstcu) != firstcu)
10943
        re->flags |= PCRE2_FIRSTCASELESS;
10944
#endif
10945
#endif  /* SUPPORT_UNICODE */
10946
      }
10947
    }
10948

10949
  /* When there is no first code unit, for non-anchored patterns, see if we can
10950
  set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10951
  branches start with ^ and also when all branches start with non-atomic .* for
10952
  non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option
10953
  that disables this case.) */
10954

10955
  else if ((re->overall_options & PCRE2_ANCHORED) == 0)
10956
    {
10957
    BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
10958
    if (is_startline(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
10959
      re->flags |= PCRE2_STARTLINE;
10960
    }
10961

10962
  /* Handle the "required code unit", if one is set. In the UTF case we can
10963
  increment the minimum minimum length only if we are sure this really is a
10964
  different character and not a non-starting code unit of the first character,
10965
  because the minimum length count is in characters, not code units. */
10966

10967
  if (reqcuflags < REQ_NONE)
10968
    {
10969
#if PCRE2_CODE_UNIT_WIDTH == 16
10970
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10971
        firstcuflags >= REQ_NONE ||                 /* First not set */
10972
        (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
10973
        (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
10974
#elif PCRE2_CODE_UNIT_WIDTH == 8
10975
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10976
        firstcuflags >= REQ_NONE ||                 /* First not set */
10977
        (firstcu & 0x80) == 0 ||                    /* First is ASCII */
10978
        (reqcu & 0x80) == 0)                        /* Req is ASCII */
10979
#endif
10980
      {
10981
      minminlength++;
10982
      }
10983

10984
    /* In the case of an anchored pattern, set up the value only if it follows
10985
    a variable length item in the pattern. */
10986

10987
    if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10988
        (reqcuflags & REQ_VARY) != 0)
10989
      {
10990
      re->last_codeunit = reqcu;
10991
      re->flags |= PCRE2_LASTSET;
10992

10993
      /* Handle caseless required code units as for first code units (above). */
10994

10995
      if ((reqcuflags & REQ_CASELESS) != 0)
10996
        {
10997
        if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10998
          {
10999
          if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
11000
          }
11001
#ifdef SUPPORT_UNICODE
11002
#if PCRE2_CODE_UNIT_WIDTH == 8
11003
      else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
11004
        re->flags |= PCRE2_LASTCASELESS;
11005
#else
11006
      else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
11007
               UCD_OTHERCASE(reqcu) != reqcu)
11008
        re->flags |= PCRE2_LASTCASELESS;
11009
#endif
11010
#endif  /* SUPPORT_UNICODE */
11011
        }
11012
      }
11013
    }
11014

11015
  /* Study the compiled pattern to set up information such as a bitmap of
11016
  starting code units and a minimum matching length. */
11017

11018
  if (PRIV(study)(re) != 0)
11019
    {
11020
    PCRE2_DEBUG_UNREACHABLE();
11021
    errorcode = ERR31;
11022
    goto HAD_CB_ERROR;
11023
    }
11024

11025
  /* If study() set a bitmap of starting code units, it implies a minimum
11026
  length of at least one. */
11027

11028
  if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
11029
    minminlength = 1;
11030

11031
  /* If the minimum length set (or not set) by study() is less than the minimum
11032
  implied by required code units, override it. */
11033

11034
  if (re->minlength < minminlength) re->minlength = minminlength;
11035
  }   /* End of start-of-match optimizations. */
11036

11037
/* Control ends up here in all cases. When running under valgrind, make a
11038
pattern's terminating zero defined again. If memory was obtained for the parsed
11039
version of the pattern, free it before returning. Also free the list of named
11040
groups if a larger one had to be obtained, and likewise the group information
11041
vector. */
11042

11043
#ifdef SUPPORT_UNICODE
11044
PCRE2_ASSERT(cb.cranges == NULL);
11045
#endif
11046

11047
EXIT:
11048
#ifdef SUPPORT_VALGRIND
11049
if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
11050
#endif
11051
if (cb.parsed_pattern != stack_parsed_pattern)
11052
  ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
11053
if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
11054
  ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
11055
if (cb.groupinfo != stack_groupinfo)
11056
  ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
11057

11058
return re;    /* Will be NULL after an error */
11059

11060
/* Errors discovered in parse_regex() set the offset value in the compile
11061
block. Errors discovered before it is called must compute it from the ptr
11062
value. After parse_regex() is called, the offset in the compile block is set to
11063
the end of the pattern, but certain errors in compile_regex() may reset it if
11064
an offset is available in the parsed pattern. */
11065

11066
HAD_CB_ERROR:
11067
ptr = pattern + cb.erroroffset;
11068

11069
HAD_EARLY_ERROR:
11070
PCRE2_ASSERT(ptr >= pattern); /* Ensure we don't return invalid erroroffset */
11071
PCRE2_ASSERT(ptr <= (pattern + patlen));
11072
*erroroffset = ptr - pattern;
11073

11074
HAD_ERROR:
11075
*errorptr = errorcode;
11076
pcre2_code_free(re);
11077
re = NULL;
11078

11079
#ifdef SUPPORT_WIDE_CHARS
11080
if (cb.cranges != NULL)
11081
  {
11082
  class_ranges* cranges = cb.cranges;
11083
  do
11084
    {
11085
    class_ranges* next_cranges = cranges->next;
11086
    cb.cx->memctl.free(cranges, cb.cx->memctl.memory_data);
11087
    cranges = next_cranges;
11088
    }
11089
  while (cranges != NULL);
11090
  }
11091
#endif
11092
goto EXIT;
11093
}
11094

11095
/* These #undefs are here to enable unity builds with CMake. In a unity build
several source files are compiled together as one translation unit, so any
macro still defined at the end of this file would remain visible in whichever
file is compiled after it and could clash with a definition made there. */
11096

11097
#undef NLBLOCK /* Block containing newline information */
11098
#undef PSSTART /* Field containing processed string start */
11099
#undef PSEND   /* Field containing processed string end */
11100

11101
/* End of pcre2_compile.c */
11102

11103
Product

Resources

Company