Path: blob/master/thirdparty/pcre2/src/pcre2_compile.c
21658 views
/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8Original API code Copyright (c) 1997-2012 University of Cambridge9New API code Copyright (c) 2016-2024 University of Cambridge1011-----------------------------------------------------------------------------12Redistribution and use in source and binary forms, with or without13modification, are permitted provided that the following conditions are met:1415* Redistributions of source code must retain the above copyright notice,16this list of conditions and the following disclaimer.1718* Redistributions in binary form must reproduce the above copyright19notice, this list of conditions and the following disclaimer in the20documentation and/or other materials provided with the distribution.2122* Neither the name of the University of Cambridge nor the names of its23contributors may be used to endorse or promote products derived from24this software without specific prior written permission.2526THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE29ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE36POSSIBILITY OF SUCH DAMAGE.37-----------------------------------------------------------------------------38*/394041#include "pcre2_compile.h"42434445#define NLBLOCK cb /* Block containing newline information */46#define PSSTART start_pattern /* Field containing processed string start */47#define PSEND end_pattern /* Field containing processed string end */4849/* In rare error cases debugging might require calling pcre2_printint(). */5051#if 052#ifdef EBCDIC53#define PRINTABLE(c) ((c) >= 64 && (c) < 255)54#else55#define PRINTABLE(c) ((c) >= 32 && (c) < 127)56#endif57#define CHAR_OUTPUT(c) (c)58#define CHAR_OUTPUT_HEX(c) (c)59#define CHAR_INPUT(c) (c)60#define CHAR_INPUT_HEX(c) (c)61#include "pcre2_printint_inc.h"62#undef PRINTABLE63#undef CHAR_OUTPUT64#undef CHAR_OUTPUT_HEX65#undef CHAR_INPUT66#define DEBUG_CALL_PRINTINT67#endif6869/* Other debugging code can be enabled by these defines. */7071/* #define DEBUG_SHOW_CAPTURES */72/* #define DEBUG_SHOW_PARSED */7374/* There are a few things that vary with different code unit sizes. Handle them75by defining macros in order to minimize #if usage. */7677#if PCRE2_CODE_UNIT_WIDTH == 878#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 579#define XDIGIT(c) xdigitab[c]8081#else /* Either 16-bit or 32-bit */82#define XDIGIT(c) (MAX_255(c)? 
xdigitab[c] : 0xff)8384#if PCRE2_CODE_UNIT_WIDTH == 1685#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 68687#else /* 32-bit */88#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 689#endif90#endif9192/* Function definitions to allow mutual recursion */9394static int95compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,96uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,97open_capitem *, compile_block *, PCRE2_SIZE *);9899static int100get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,101compile_block *);102103static BOOL104set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,105compile_block *);106107static int108check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,109compile_block *, int *);110111112/*************************************************113* Code parameters and static tables *114*************************************************/115116#define MAX_GROUP_NUMBER 65535u117#define MAX_REPEAT_COUNT 65535u118#define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1)119120/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in121different ways in the different pattern scans. The parsing and group-122identifying pre-scan uses it to handle nesting, and needs it to be 16-bit123aligned for this. Having defined the size in code units, we set up124C16_WORK_SIZE as the number of elements in the 16-bit vector.125126During the first compiling phase, when determining how much memory is required,127the regex is partly compiled into this space, but the compiled parts are128discarded as soon as they can be, so that hopefully there will never be an129overrun. The code does, however, check for an overrun, which can occur for130pathological patterns. The size of the workspace depends on LINK_SIZE because131the length of compiled items varies with this.132133In the real compile phase, this workspace is not currently used. 
*/134135#define COMPILE_WORK_SIZE (3000*LINK_SIZE) /* Size in code units */136137#define C16_WORK_SIZE \138((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))139140/* A uint32_t vector is used for caching information about the size of141capturing groups, to improve performance. A default is created on the stack of142this size. */143144#define GROUPINFO_DEFAULT_SIZE 256145146/* The overrun tests check for a slightly smaller size so that they detect the147overrun before it actually does run off the end of the data block. */148149#define WORK_SIZE_SAFETY_MARGIN (100)150151/* This value determines the size of the initial vector that is used for152remembering named groups during the pre-compile. It is allocated on the stack,153but if it is too small, it is expanded, in a similar way to the workspace. The154value is the number of slots in the list. */155156#define NAMED_GROUP_LIST_SIZE 20157158/* The pre-compiling pass over the pattern creates a parsed pattern in a vector159of uint32_t. For short patterns this lives on the stack, with this size. Heap160memory is used for longer patterns. */161162#define PARSED_PATTERN_DEFAULT_SIZE 1024163164/* Maximum length value to check against when making sure that the variable165that holds the compiled pattern length does not overflow. We make it a bit less166than INT_MAX to allow for adding in group terminating code units, so that we167don't have to check them every time. */168169#define OFLOW_MAX (INT_MAX - 20)170171/* Table of extra lengths for each of the meta codes. Must be kept in step with172the definitions above. For some items these values are a basic length to which173a variable amount has to be added. 
*/174175static unsigned char meta_extra_lengths[] = {1760, /* META_END */1770, /* META_ALT */1780, /* META_ATOMIC */1790, /* META_BACKREF - more if group is >= 10 */1801+SIZEOFFSET, /* META_BACKREF_BYNAME */1811, /* META_BIGVALUE */1823, /* META_CALLOUT_NUMBER */1833+SIZEOFFSET, /* META_CALLOUT_STRING */1840, /* META_CAPTURE */1850, /* META_CIRCUMFLEX */1860, /* META_CLASS */1870, /* META_CLASS_EMPTY */1880, /* META_CLASS_EMPTY_NOT */1890, /* META_CLASS_END */1900, /* META_CLASS_NOT */1910, /* META_COND_ASSERT */192SIZEOFFSET, /* META_COND_DEFINE */1931+SIZEOFFSET, /* META_COND_NAME */1941+SIZEOFFSET, /* META_COND_NUMBER */1951+SIZEOFFSET, /* META_COND_RNAME */1961+SIZEOFFSET, /* META_COND_RNUMBER */1973, /* META_COND_VERSION */198SIZEOFFSET, /* META_OFFSET */1990, /* META_SCS */2001, /* META_CAPTURE_NAME */2011, /* META_CAPTURE_NUMBER */2020, /* META_DOLLAR */2030, /* META_DOT */2040, /* META_ESCAPE - one more for ESC_P and ESC_p */2050, /* META_KET */2060, /* META_NOCAPTURE */2072, /* META_OPTIONS */2081, /* META_POSIX */2091, /* META_POSIX_NEG */2100, /* META_RANGE_ESCAPED */2110, /* META_RANGE_LITERAL */212SIZEOFFSET, /* META_RECURSE */2131+SIZEOFFSET, /* META_RECURSE_BYNAME */2140, /* META_SCRIPT_RUN */2150, /* META_LOOKAHEAD */2160, /* META_LOOKAHEADNOT */217SIZEOFFSET, /* META_LOOKBEHIND */218SIZEOFFSET, /* META_LOOKBEHINDNOT */2190, /* META_LOOKAHEAD_NA */220SIZEOFFSET, /* META_LOOKBEHIND_NA */2211, /* META_MARK - plus the string length */2220, /* META_ACCEPT */2230, /* META_FAIL */2240, /* META_COMMIT */2251, /* META_COMMIT_ARG - plus the string length */2260, /* META_PRUNE */2271, /* META_PRUNE_ARG - plus the string length */2280, /* META_SKIP */2291, /* META_SKIP_ARG - plus the string length */2300, /* META_THEN */2311, /* META_THEN_ARG - plus the string length */2320, /* META_ASTERISK */2330, /* META_ASTERISK_PLUS */2340, /* META_ASTERISK_QUERY */2350, /* META_PLUS */2360, /* META_PLUS_PLUS */2370, /* META_PLUS_QUERY */2380, /* META_QUERY */2390, /* 
META_QUERY_PLUS */2400, /* META_QUERY_QUERY */2412, /* META_MINMAX */2422, /* META_MINMAX_PLUS */2432, /* META_MINMAX_QUERY */2440, /* META_ECLASS_AND */2450, /* META_ECLASS_OR */2460, /* META_ECLASS_SUB */2470, /* META_ECLASS_XOR */2480 /* META_ECLASS_NOT */249};250251/* Types for skipping parts of a parsed pattern. */252253enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };254255/* Values and flags for the unsigned xxcuflags variables that accompany xxcu256variables, which are concerned with first and required code units. A value257greater than or equal to REQ_NONE means "no code unit set"; otherwise the258matching xxcu variable is set, and the low valued bits are relevant. */259260#define REQ_UNSET 0xffffffffu /* Not yet found anything */261#define REQ_NONE 0xfffffffeu /* Found not fixed character */262#define REQ_CASELESS 0x00000001u /* Code unit in xxcu is caseless */263#define REQ_VARY 0x00000002u /* Code unit is followed by non-literal */264265/* These flags are used in the groupinfo vector. */266267#define GI_SET_FIXED_LENGTH 0x80000000u268#define GI_NOT_FIXED_LENGTH 0x40000000u269#define GI_FIXED_LENGTH_MASK 0x0000ffffu270271/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC272and is fast (a good compiler can turn it into a subtraction and unsigned273comparison). */274275#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)276277/* Table to identify hex digits. The tables in chartables are dependent on the278locale, and may mark arbitrary characters as digits. We want to recognize only2790-9, a-z, and A-Z as hex digits, which is why we have a private table here. It280costs 256 bytes, but it is a lot faster than doing character value tests (at281least in some simple cases I timed), and in some applications one wants PCRE2282to compile efficiently as well as match efficiently. The value in the table is283the binary hex digit value, or 0xff for non-hex digits. 
*/284285/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in286UTF-8 mode. */287288#ifndef EBCDIC289static const uint8_t xdigitab[] =290{2910xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 */2920xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */2930xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 */2940xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */2950xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - ' */2960xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ( - / */2970x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 */2980x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 8 - ? */2990xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* @ - G */3000xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H - O */3010xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* P - W */3020xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* X - _ */3030xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* ` - g */3040xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h - o */3050xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* p - w */3060xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* x -127 */3070xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */3080xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */3090xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */3100xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */3110xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */3120xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */3130xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */3140xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */3150xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */3160xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 2ff-207 */3170xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */3180xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */3190xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */3200xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */3210xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */3220xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */323324#else325326/* This is the "abnormal" 
case, for EBCDIC systems not running in UTF-8 mode. */327328static const uint8_t xdigitab[] =329{3300xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 0 */3310xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */3320xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 10 */3330xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */3340xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 32- 39 20 */3350xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40- 47 */3360xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 48- 55 30 */3370xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 56- 63 */3380xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - 71 40 */3390xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 72- | */3400xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* & - 87 50 */3410xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 88- 95 */3420xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - -103 60 */3430xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ? */3440xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */3450xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- " */3460xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g 80 */3470xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h -143 */3480xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p 90 */3490xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* q -159 */3500xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x A0 */3510xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* y -175 */3520xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ^ -183 B0 */3530xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */3540xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* { - G C0 */3550xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H -207 */3560xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* } - P D0 */3570xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Q -223 */3580xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* \ - X E0 */3590xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Y -239 */3600x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 F0 */3610x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/* 8 -255 */362#endif /* EBCDIC */363364365/* Table for handling alphanumeric 
escaped characters. Positive returns are366simple data values; negative values are for special things like \d and so on.367Zero means further processing is needed (for things like \x), or the escape is368invalid. */369370/* This is the "normal" table for ASCII systems or for EBCDIC systems running371in UTF-8 mode. It runs from '0' to 'z'. */372373#ifndef EBCDIC374#define ESCAPES_FIRST CHAR_0375#define ESCAPES_LAST CHAR_z376#define UPPER_CASE(c) (c-32)377378static const short int escapes[] = {379/* 0 */ 0, /* 1 */ 0,380/* 2 */ 0, /* 3 */ 0,381/* 4 */ 0, /* 5 */ 0,382/* 6 */ 0, /* 7 */ 0,383/* 8 */ 0, /* 9 */ 0,384/* : */ ESCAPES_FIRST+0x0a, /* ; */ ESCAPES_FIRST+0x0b,385/* < */ ESCAPES_FIRST+0x0c, /* = */ ESCAPES_FIRST+0x0d,386/* > */ ESCAPES_FIRST+0x0e, /* ? */ ESCAPES_FIRST+0x0f,387/* @ */ ESCAPES_FIRST+0x10, /* A */ -ESC_A,388/* B */ -ESC_B, /* C */ -ESC_C,389/* D */ -ESC_D, /* E */ -ESC_E,390/* F */ 0, /* G */ -ESC_G,391/* H */ -ESC_H, /* I */ 0,392/* J */ 0, /* K */ -ESC_K,393/* L */ 0, /* M */ 0,394/* N */ -ESC_N, /* O */ 0,395/* P */ -ESC_P, /* Q */ -ESC_Q,396/* R */ -ESC_R, /* S */ -ESC_S,397/* T */ 0, /* U */ 0,398/* V */ -ESC_V, /* W */ -ESC_W,399/* X */ -ESC_X, /* Y */ 0,400/* Z */ -ESC_Z, /* [ */ ESCAPES_FIRST+0x2b,401/* \ */ ESCAPES_FIRST+0x2c, /* ] */ ESCAPES_FIRST+0x2d,402/* ^ */ ESCAPES_FIRST+0x2e, /* _ */ ESCAPES_FIRST+0x2f,403/* ` */ ESCAPES_FIRST+0x30, /* a */ CHAR_BEL,404/* b */ -ESC_b, /* c */ 0,405/* d */ -ESC_d, /* e */ CHAR_ESC,406/* f */ CHAR_FF, /* g */ 0,407/* h */ -ESC_h, /* i */ 0,408/* j */ 0, /* k */ -ESC_k,409/* l */ 0, /* m */ 0,410/* n */ CHAR_LF, /* o */ 0,411/* p */ -ESC_p, /* q */ 0,412/* r */ CHAR_CR, /* s */ -ESC_s,413/* t */ CHAR_HT, /* u */ 0,414/* v */ -ESC_v, /* w */ -ESC_w,415/* x */ 0, /* y */ 0,416/* z */ -ESC_z417};418419#else420421/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.422It runs from 'a' to '9'. 
Our EBCDIC support can be provided via the compiler,423which can interpret character literals like 'a' or '[' in an EBCDIC codepage;424in this case, there is wide variance between codepages on the interpretation of425characters between the letters ('[' and '{' and so on are placed in all sorts of426different positions in the table). Thankfully however, all EBCDIC codepages427place the letters and digits in the same location, so we hardcode that here.428Our EBCDIC support can also be provided via numeric literals instead of429character literals, so either way, 'CHAR_a' will be 0x81 when PCRE2 is compiled430in EBCDIC mode. */431432#define ESCAPES_FIRST CHAR_a433#define ESCAPES_LAST CHAR_9434#define UPPER_CASE(c) (c+64)435436static const short int escapes[] = {437/* 0x81 a */ CHAR_BEL, /* 0x82 b */ -ESC_b,438/* 0x83 c */ 0, /* 0x84 d */ -ESC_d,439/* 0x85 e */ CHAR_ESC, /* 0x86 f */ CHAR_FF,440/* 0x87 g */ 0, /* 0x88 h */ -ESC_h,441/* 0x89 i */ 0, /* 0x8a */ ESCAPES_FIRST+0x09,442/* 0x8b */ ESCAPES_FIRST+0x0a, /* 0x8c */ ESCAPES_FIRST+0x0b,443/* 0x8d */ ESCAPES_FIRST+0x0c, /* 0x8e */ ESCAPES_FIRST+0x0d,444/* 0x8f */ ESCAPES_FIRST+0x0e, /* 0x90 */ ESCAPES_FIRST+0x0f,445/* 0x91 j */ 0, /* 0x92 k */ -ESC_k,446/* 0x93 l */ 0, /* 0x94 m */ 0,447/* 0x95 n */ CHAR_LF, /* 0x96 o */ 0,448/* 0x97 p */ -ESC_p, /* 0x98 q */ 0,449/* 0x99 r */ CHAR_CR, /* 0x9a */ ESCAPES_FIRST+0x19,450/* 0x9b */ ESCAPES_FIRST+0x1a, /* 0x9c */ ESCAPES_FIRST+0x1b,451/* 0x9d */ ESCAPES_FIRST+0x1c, /* 0x9e */ ESCAPES_FIRST+0x1d,452/* 0x9f */ ESCAPES_FIRST+0x1e, /* 0xa0 */ ESCAPES_FIRST+0x1f,453/* 0xa1 */ ESCAPES_FIRST+0x20, /* 0xa2 s */ -ESC_s,454/* 0xa3 t */ CHAR_HT, /* 0xa4 u */ 0,455/* 0xa5 v */ -ESC_v, /* 0xa6 w */ -ESC_w,456/* 0xa7 x */ 0, /* 0xa8 y */ 0,457/* 0xa9 z */ -ESC_z, /* 0xaa */ ESCAPES_FIRST+0x29,458/* 0xab */ ESCAPES_FIRST+0x2a, /* 0xac */ ESCAPES_FIRST+0x2b,459/* 0xad */ ESCAPES_FIRST+0x2c, /* 0xae */ ESCAPES_FIRST+0x2d,460/* 0xaf */ ESCAPES_FIRST+0x2e, /* 0xb0 */ 
ESCAPES_FIRST+0x2f,461/* 0xb1 */ ESCAPES_FIRST+0x30, /* 0xb2 */ ESCAPES_FIRST+0x31,462/* 0xb3 */ ESCAPES_FIRST+0x32, /* 0xb4 */ ESCAPES_FIRST+0x33,463/* 0xb5 */ ESCAPES_FIRST+0x34, /* 0xb6 */ ESCAPES_FIRST+0x35,464/* 0xb7 */ ESCAPES_FIRST+0x36, /* 0xb8 */ ESCAPES_FIRST+0x37,465/* 0xb9 */ ESCAPES_FIRST+0x38, /* 0xba */ ESCAPES_FIRST+0x39,466/* 0xbb */ ESCAPES_FIRST+0x3a, /* 0xbc */ ESCAPES_FIRST+0x3b,467/* 0xbd */ ESCAPES_FIRST+0x3c, /* 0xbe */ ESCAPES_FIRST+0x3d,468/* 0xbf */ ESCAPES_FIRST+0x3e, /* 0xc0 */ ESCAPES_FIRST+0x3f,469/* 0xc1 A */ -ESC_A, /* 0xc2 B */ -ESC_B,470/* 0xc3 C */ -ESC_C, /* 0xc4 D */ -ESC_D,471/* 0xc5 E */ -ESC_E, /* 0xc6 F */ 0,472/* 0xc7 G */ -ESC_G, /* 0xc8 H */ -ESC_H,473/* 0xc9 I */ 0, /* 0xca */ ESCAPES_FIRST+0x49,474/* 0xcb */ ESCAPES_FIRST+0x4a, /* 0xcc */ ESCAPES_FIRST+0x4b,475/* 0xcd */ ESCAPES_FIRST+0x4c, /* 0xce */ ESCAPES_FIRST+0x4d,476/* 0xcf */ ESCAPES_FIRST+0x4e, /* 0xd0 */ ESCAPES_FIRST+0x4f,477/* 0xd1 J */ 0, /* 0xd2 K */ -ESC_K,478/* 0xd3 L */ 0, /* 0xd4 M */ 0,479/* 0xd5 N */ -ESC_N, /* 0xd6 O */ 0,480/* 0xd7 P */ -ESC_P, /* 0xd8 Q */ -ESC_Q,481/* 0xd9 R */ -ESC_R, /* 0xda */ ESCAPES_FIRST+0x59,482/* 0xdb */ ESCAPES_FIRST+0x5a, /* 0xdc */ ESCAPES_FIRST+0x5b,483/* 0xdd */ ESCAPES_FIRST+0x5c, /* 0xde */ ESCAPES_FIRST+0x5d,484/* 0xdf */ ESCAPES_FIRST+0x5e, /* 0xe0 */ ESCAPES_FIRST+0x5f,485/* 0xe1 */ ESCAPES_FIRST+0x60, /* 0xe2 S */ -ESC_S,486/* 0xe3 T */ 0, /* 0xe4 U */ 0,487/* 0xe5 V */ -ESC_V, /* 0xe6 W */ -ESC_W,488/* 0xe7 X */ -ESC_X, /* 0xe8 Y */ 0,489/* 0xe9 Z */ -ESC_Z, /* 0xea */ ESCAPES_FIRST+0x69,490/* 0xeb */ ESCAPES_FIRST+0x6a, /* 0xec */ ESCAPES_FIRST+0x6b,491/* 0xed */ ESCAPES_FIRST+0x6c, /* 0xee */ ESCAPES_FIRST+0x6d,492/* 0xef */ ESCAPES_FIRST+0x6e, /* 0xf0 0 */ 0,493/* 0xf1 1 */ 0, /* 0xf2 2 */ 0,494/* 0xf3 3 */ 0, /* 0xf4 4 */ 0,495/* 0xf5 5 */ 0, /* 0xf6 6 */ 0,496/* 0xf7 7 */ 0, /* 0xf8 8 */ 0,497/* 0xf9 9 */ 0,498};499500/* We also need a table of characters that may follow \c in an EBCDIC501environment for 
characters 0-31. */502503static unsigned char ebcdic_escape_c[] = {504CHAR_COMMERCIAL_AT, CHAR_A, CHAR_B, CHAR_C, CHAR_D, CHAR_E, CHAR_F, CHAR_G,505CHAR_H, CHAR_I, CHAR_J, CHAR_K, CHAR_L, CHAR_M, CHAR_N, CHAR_O, CHAR_P,506CHAR_Q, CHAR_R, CHAR_S, CHAR_T, CHAR_U, CHAR_V, CHAR_W, CHAR_X, CHAR_Y,507CHAR_Z, CHAR_LEFT_SQUARE_BRACKET, CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,508CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE509};510511#endif /* EBCDIC */512513514/* Table of special "verbs" like (*PRUNE). This is a short table, so it is515searched linearly. Put all the names into a single string, in order to reduce516the number of relocations when a shared library is dynamically linked. The517string is built from string macros so that it works in UTF-8 mode on EBCDIC518platforms. */519520typedef struct verbitem {521unsigned int len; /* Length of verb name */522uint32_t meta; /* Base META_ code */523int has_arg; /* Argument requirement */524} verbitem;525526static const char verbnames[] =527"\0" /* Empty name is a shorthand for MARK */528STRING_MARK0529STRING_ACCEPT0530STRING_F0531STRING_FAIL0532STRING_COMMIT0533STRING_PRUNE0534STRING_SKIP0535STRING_THEN;536537static const verbitem verbs[] = {538{ 0, META_MARK, +1 }, /* > 0 => must have an argument */539{ 4, META_MARK, +1 },540{ 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */541{ 1, META_FAIL, -1 },542{ 4, META_FAIL, -1 },543{ 6, META_COMMIT, 0 },544{ 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */545{ 4, META_SKIP, 0 },546{ 4, META_THEN, 0 }547};548549static const int verbcount = sizeof(verbs)/sizeof(verbitem);550551/* Verb opcodes, indexed by their META code offset from META_MARK. */552553static const uint32_t verbops[] = {554OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,555OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };556557/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. 
*/558559typedef struct alasitem {560unsigned int len; /* Length of name */561uint32_t meta; /* Base META_ code */562} alasitem;563564static const char alasnames[] =565STRING_pla0566STRING_plb0567STRING_napla0568STRING_naplb0569STRING_nla0570STRING_nlb0571STRING_positive_lookahead0572STRING_positive_lookbehind0573STRING_non_atomic_positive_lookahead0574STRING_non_atomic_positive_lookbehind0575STRING_negative_lookahead0576STRING_negative_lookbehind0577STRING_scs0578STRING_scan_substring0579STRING_atomic0580STRING_sr0581STRING_asr0582STRING_script_run0583STRING_atomic_script_run;584585static const alasitem alasmeta[] = {586{ 3, META_LOOKAHEAD },587{ 3, META_LOOKBEHIND },588{ 5, META_LOOKAHEAD_NA },589{ 5, META_LOOKBEHIND_NA },590{ 3, META_LOOKAHEADNOT },591{ 3, META_LOOKBEHINDNOT },592{ 18, META_LOOKAHEAD },593{ 19, META_LOOKBEHIND },594{ 29, META_LOOKAHEAD_NA },595{ 30, META_LOOKBEHIND_NA },596{ 18, META_LOOKAHEADNOT },597{ 19, META_LOOKBEHINDNOT },598{ 3, META_SCS },599{ 14, META_SCS },600{ 6, META_ATOMIC },601{ 2, META_SCRIPT_RUN }, /* sr = script run */602{ 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */603{ 10, META_SCRIPT_RUN }, /* script run */604{ 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */605};606607static const int alascount = sizeof(alasmeta)/sizeof(alasitem);608609/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */610611static uint32_t chartypeoffset[] = {612OP_STAR - OP_STAR, OP_STARI - OP_STAR,613OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };614615/* Tables of names of POSIX character classes and their lengths. The names are616now all in a single string, to reduce the number of relocations when a shared617library is dynamically loaded. The list of lengths is terminated by a zero618length entry. 
The first three must be alpha, lower, upper, as this is assumed619for handling case independence.620621The indices for several classes are stored in pcre2_compile.h - these must622be kept in sync with posix_names, posix_name_lengths, posix_class_maps,623and posix_substitutes. */624625static const char posix_names[] =626STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0627STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0628STRING_graph0 STRING_print0 STRING_punct0 STRING_space0629STRING_word0 STRING_xdigit;630631static const uint8_t posix_name_lengths[] = {6325, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };633634/* Table of class bit maps for each POSIX class. Each class is formed from a635base map, with an optional addition or removal of another map. Then, for some636classes, there is some additional tweaking: for [:blank:] the vertical space637characters are removed, and for [:alpha:] and [:alnum:] the underscore638character is removed. The triples in the table consist of the base map offset,639second map offset or -1 if no second map, and a non-negative value for map640addition or a negative value for map subtraction (if there are two maps). The641absolute value of the third field has these meanings: 0 => no tweaking, 1 =>642remove vertical space characters, 2 => remove underscore. 
*/643644const int PRIV(posix_class_maps)[] = {645cbit_word, cbit_digit, -2, /* alpha */646cbit_lower, -1, 0, /* lower */647cbit_upper, -1, 0, /* upper */648cbit_word, -1, 2, /* alnum - word without underscore */649cbit_print, cbit_cntrl, 0, /* ascii */650cbit_space, -1, 1, /* blank - a GNU extension */651cbit_cntrl, -1, 0, /* cntrl */652cbit_digit, -1, 0, /* digit */653cbit_graph, -1, 0, /* graph */654cbit_print, -1, 0, /* print */655cbit_punct, -1, 0, /* punct */656cbit_space, -1, 0, /* space */657cbit_word, -1, 0, /* word - a Perl extension */658cbit_xdigit, -1, 0 /* xdigit */659};660661#ifdef SUPPORT_UNICODE662663/* The POSIX class Unicode property substitutes that are used in UCP mode must664be in the order of the POSIX class names, defined above. */665666static int posix_substitutes[] = {667PT_GC, ucp_L, /* alpha */668PT_PC, ucp_Ll, /* lower */669PT_PC, ucp_Lu, /* upper */670PT_ALNUM, 0, /* alnum */671-1, 0, /* ascii, treat as non-UCP */672-1, 1, /* blank, treat as \h */673PT_PC, ucp_Cc, /* cntrl */674PT_PC, ucp_Nd, /* digit */675PT_PXGRAPH, 0, /* graph */676PT_PXPRINT, 0, /* print */677PT_PXPUNCT, 0, /* punct */678PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */679PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */680PT_PXXDIGIT, 0 /* xdigit */ /* Perl has additional hex digits */681};682#endif /* SUPPORT_UNICODE */683684/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset685are allowed. 
*/686687#define PUBLIC_LITERAL_COMPILE_OPTIONS \688(PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \689PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \690PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)691692#define PUBLIC_COMPILE_OPTIONS \693(PUBLIC_LITERAL_COMPILE_OPTIONS| \694PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \695PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \696PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \697PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \698PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \699PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_ALT_EXTENDED_CLASS)700701#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \702(PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD| \703PCRE2_EXTRA_CASELESS_RESTRICT|PCRE2_EXTRA_TURKISH_CASING)704705#define PUBLIC_COMPILE_EXTRA_OPTIONS \706(PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \707PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \708PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \709PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \710PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \711PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_PYTHON_OCTAL|PCRE2_EXTRA_NO_BS0| \712PCRE2_EXTRA_NEVER_CALLOUT)713714/* This is a table of start-of-pattern options such as (*UTF) and settings such715as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward716compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is717generic and always supported. 
*/718719enum { PSO_OPT, /* Value is an option bit */720PSO_XOPT, /* Value is an xoption bit */721PSO_FLG, /* Value is a flag bit */722PSO_NL, /* Value is a newline type */723PSO_BSR, /* Value is a \R type */724PSO_LIMH, /* Read integer value for heap limit */725PSO_LIMM, /* Read integer value for match limit */726PSO_LIMD, /* Read integer value for depth limit */727PSO_OPTMZ /* Value is an optimization bit */728};729730typedef struct pso {731const char *name;732uint16_t length;733uint16_t type;734uint32_t value;735} pso;736737/* NB: STRING_UTFn_RIGHTPAR contains the length as well */738739static const pso pso_list[] = {740{ STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF },741{ STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF },742{ STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP },743{ STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET },744{ STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET },745{ STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPTMZ, PCRE2_OPTIM_AUTO_POSSESS },746{ STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPTMZ, PCRE2_OPTIM_DOTSTAR_ANCHOR },747{ STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },748{ STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPTMZ, PCRE2_OPTIM_START_OPTIMIZE },749{ STRING_CASELESS_RESTRICT_RIGHTPAR, 18, PSO_XOPT, PCRE2_EXTRA_CASELESS_RESTRICT },750{ STRING_TURKISH_CASING_RIGHTPAR, 15, PSO_XOPT, PCRE2_EXTRA_TURKISH_CASING },751{ STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 },752{ STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },753{ STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 },754{ STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 },755{ STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR },756{ STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF },757{ STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF },758{ STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY },759{ STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL },760{ STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF },761{ STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF },762{ 
STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE }763};764765/* This table is used when converting repeating opcodes into possessified766versions as a result of an explicit possessive quantifier such as ++. A zero767value means there is no possessified version - in those cases the item in768question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT769because all relevant opcodes are less than that. */770771static const uint8_t opcode_possessify[] = {7720, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */7730, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */7747750, /* NOTI */776OP_POSSTAR, 0, /* STAR, MINSTAR */777OP_POSPLUS, 0, /* PLUS, MINPLUS */778OP_POSQUERY, 0, /* QUERY, MINQUERY */779OP_POSUPTO, 0, /* UPTO, MINUPTO */7800, /* EXACT */7810, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */782783OP_POSSTARI, 0, /* STARI, MINSTARI */784OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */785OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */786OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */7870, /* EXACTI */7880, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */789790OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */791OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */792OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */793OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */7940, /* NOTEXACT */7950, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */796797OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */798OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */799OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */800OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */8010, /* NOTEXACTI */8020, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */803804OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */805OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */806OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */807OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */8080, /* TYPEEXACT */8090, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */810811OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */812OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */813OP_CRPOSQUERY, 0, 
/* CRQUERY, CRMINQUERY */814OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */8150, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */8168170, 0, 0, 0, /* CLASS, NCLASS, XCLASS, ECLASS */8180, 0, /* REF, REFI */8190, 0, /* DNREF, DNREFI */8200, 0, /* RECURSE, CALLOUT */821};822823/* Compile-time check that the table has the correct size. */824STATIC_ASSERT(sizeof(opcode_possessify) == OP_CALLOUT+1, opcode_possessify);825826827#ifdef DEBUG_SHOW_PARSED828/*************************************************829* Show the parsed pattern for debugging *830*************************************************/831832/* For debugging the pre-scan, this code, which outputs the parsed data vector,833can be enabled. */834835static void show_parsed(compile_block *cb)836{837uint32_t *pptr = cb->parsed_pattern;838839for (;;)840{841int max, min;842PCRE2_SIZE offset;843uint32_t i;844uint32_t length;845uint32_t meta_arg = META_DATA(*pptr);846847fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);848849if (*pptr < META_END)850{851if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr);852pptr++;853}854855else switch (META_CODE(*pptr++))856{857default:858fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");859return;860861case META_END:862fprintf(stderr, "META_END\n");863return;864865case META_CAPTURE:866fprintf(stderr, "META_CAPTURE %d", meta_arg);867break;868869case META_RECURSE:870GETOFFSET(offset, pptr);871fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);872break;873874case META_BACKREF:875if (meta_arg < 10)876offset = cb->small_ref_offset[meta_arg];877else878GETOFFSET(offset, pptr);879fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);880break;881882case META_ESCAPE:883if (meta_arg == ESC_P || meta_arg == ESC_p)884{885uint32_t ptype = *pptr >> 16;886uint32_t pvalue = *pptr++ & 0xffff;887fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 
CHAR_P:CHAR_p,888ptype, pvalue);889}890else891{892uint32_t cc;893/* There's just one escape we might have here that isn't negated in the894escapes table. */895if (meta_arg == ESC_g) cc = CHAR_g;896else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)897{898if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;899}900if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;901fprintf(stderr, "META \\%c", cc);902}903break;904905case META_MINMAX:906min = *pptr++;907max = *pptr++;908if (max != REPEAT_UNLIMITED)909fprintf(stderr, "META {%d,%d}", min, max);910else911fprintf(stderr, "META {%d,}", min);912break;913914case META_MINMAX_QUERY:915min = *pptr++;916max = *pptr++;917if (max != REPEAT_UNLIMITED)918fprintf(stderr, "META {%d,%d}?", min, max);919else920fprintf(stderr, "META {%d,}?", min);921break;922923case META_MINMAX_PLUS:924min = *pptr++;925max = *pptr++;926if (max != REPEAT_UNLIMITED)927fprintf(stderr, "META {%d,%d}+", min, max);928else929fprintf(stderr, "META {%d,}+", min);930break;931932case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;933case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;934case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;935case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;936case META_DOT: fprintf(stderr, "META_DOT"); break;937case META_ASTERISK: fprintf(stderr, "META *"); break;938case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;939case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;940case META_PLUS: fprintf(stderr, "META +"); break;941case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;942case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;943case META_QUERY: fprintf(stderr, "META ?"); break;944case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;945case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;946947case META_ATOMIC: fprintf(stderr, "META (?>"); break;948case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;949case META_LOOKAHEAD: 
fprintf(stderr, "META (?="); break;950case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;951case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;952case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;953case META_KET: fprintf(stderr, "META )"); break;954case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;955956case META_CLASS: fprintf(stderr, "META ["); break;957case META_CLASS_NOT: fprintf(stderr, "META [^"); break;958case META_CLASS_END: fprintf(stderr, "META ]"); break;959case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;960case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;961962case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;963case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;964965case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;966case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;967968case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;969case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;970case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;971case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;972case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;973case META_THEN: fprintf(stderr, "META (*THEN)"); break;974975case META_OPTIONS:976fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);977pptr += 2;978break;979980case META_LOOKBEHIND:981fprintf(stderr, "META (?<= %d %d", meta_arg, *pptr);982pptr += 2;983break;984985case META_LOOKBEHIND_NA:986fprintf(stderr, "META (*naplb: %d %d", meta_arg, *pptr);987pptr += 2;988break;989990case META_LOOKBEHINDNOT:991fprintf(stderr, "META (?<! 
%d %d", meta_arg, *pptr);992pptr += 2;993break;994995case META_CALLOUT_NUMBER:996fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],997pptr[1]);998pptr += 3;999break;10001001case META_CALLOUT_STRING:1002{1003uint32_t patoffset = *pptr++; /* Offset of next pattern item */1004uint32_t patlength = *pptr++; /* Length of next pattern item */1005fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);1006GETOFFSET(offset, pptr);1007fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);1008}1009break;10101011case META_RECURSE_BYNAME:1012fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);1013GETOFFSET(offset, pptr);1014fprintf(stderr, "%zd", offset);1015break;10161017case META_BACKREF_BYNAME:1018fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);1019GETOFFSET(offset, pptr);1020fprintf(stderr, "%zd", offset);1021break;10221023case META_COND_NUMBER:1024fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);1025GETOFFSET(offset, pptr);1026fprintf(stderr, "%zd", offset);1027pptr++;1028break;10291030case META_COND_DEFINE:1031fprintf(stderr, "META (?(DEFINE) offset=");1032GETOFFSET(offset, pptr);1033fprintf(stderr, "%zd", offset);1034break;10351036case META_COND_VERSION:1037fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");1038fprintf(stderr, "%d.", *pptr++);1039fprintf(stderr, "%d)", *pptr++);1040break;10411042case META_COND_NAME:1043fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);1044GETOFFSET(offset, pptr);1045fprintf(stderr, "%zd", offset);1046break;10471048case META_COND_RNAME:1049fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);1050GETOFFSET(offset, pptr);1051fprintf(stderr, "%zd", offset);1052break;10531054/* This is kept as a name, because it might be. 
*/10551056case META_COND_RNUMBER:1057fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);1058GETOFFSET(offset, pptr);1059fprintf(stderr, "%zd", offset);1060break;10611062case META_OFFSET:1063fprintf(stderr, "META_OFFSET offset=");1064GETOFFSET(offset, pptr);1065fprintf(stderr, "%zd", offset);1066break;10671068case META_SCS:1069fprintf(stderr, "META (*scan_substring:");1070break;10711072case META_CAPTURE_NAME:1073fprintf(stderr, "META_CAPTURE_NAME length=%d relative_offset=%d", *pptr++, (int)meta_arg);1074break;10751076case META_CAPTURE_NUMBER:1077fprintf(stderr, "META_CAPTURE_NUMBER %d relative_offset=%d", *pptr++, (int)meta_arg);1078break;10791080case META_MARK:1081fprintf(stderr, "META (*MARK:");1082goto SHOWARG;10831084case META_COMMIT_ARG:1085fprintf(stderr, "META (*COMMIT:");1086goto SHOWARG;10871088case META_PRUNE_ARG:1089fprintf(stderr, "META (*PRUNE:");1090goto SHOWARG;10911092case META_SKIP_ARG:1093fprintf(stderr, "META (*SKIP:");1094goto SHOWARG;10951096case META_THEN_ARG:1097fprintf(stderr, "META (*THEN:");1098SHOWARG:1099length = *pptr++;1100for (i = 0; i < length; i++)1101{1102uint32_t cc = *pptr++;1103if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);1104else fprintf(stderr, "\\x{%x}", cc);1105}1106fprintf(stderr, ") length=%u", length);1107break;11081109case META_ECLASS_AND: fprintf(stderr, "META_ECLASS_AND"); break;1110case META_ECLASS_OR: fprintf(stderr, "META_ECLASS_OR"); break;1111case META_ECLASS_SUB: fprintf(stderr, "META_ECLASS_SUB"); break;1112case META_ECLASS_XOR: fprintf(stderr, "META_ECLASS_XOR"); break;1113case META_ECLASS_NOT: fprintf(stderr, "META_ECLASS_NOT"); break;1114}1115fprintf(stderr, "\n");1116}1117return;1118}1119#endif /* DEBUG_SHOW_PARSED */1120112111221123/*************************************************1124* Copy compiled code *1125*************************************************/11261127/* Compiled JIT code cannot be copied, so the new compiled block has no1128associated JIT data. 
*/11291130PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION1131pcre2_code_copy(const pcre2_code *code)1132{1133PCRE2_SIZE *ref_count;1134pcre2_code *newcode;11351136if (code == NULL) return NULL;1137newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);1138if (newcode == NULL) return NULL;1139memcpy(newcode, code, code->blocksize);1140newcode->executable_jit = NULL;11411142/* If the code is one that has been deserialized, increment the reference count1143in the decoded tables. */11441145if ((code->flags & PCRE2_DEREF_TABLES) != 0)1146{1147ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);1148(*ref_count)++;1149}11501151return newcode;1152}1153115411551156/*************************************************1157* Copy compiled code and character tables *1158*************************************************/11591160/* Compiled JIT code cannot be copied, so the new compiled block has no1161associated JIT data. This version of code_copy also makes a separate copy of1162the character tables. 
*/11631164PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION1165pcre2_code_copy_with_tables(const pcre2_code *code)1166{1167PCRE2_SIZE* ref_count;1168pcre2_code *newcode;1169uint8_t *newtables;11701171if (code == NULL) return NULL;1172newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);1173if (newcode == NULL) return NULL;1174memcpy(newcode, code, code->blocksize);1175newcode->executable_jit = NULL;11761177newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),1178code->memctl.memory_data);1179if (newtables == NULL)1180{1181code->memctl.free((void *)newcode, code->memctl.memory_data);1182return NULL;1183}1184memcpy(newtables, code->tables, TABLES_LENGTH);1185ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);1186*ref_count = 1;11871188newcode->tables = newtables;1189newcode->flags |= PCRE2_DEREF_TABLES;1190return newcode;1191}1192119311941195/*************************************************1196* Free compiled code *1197*************************************************/11981199PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION1200pcre2_code_free(pcre2_code *code)1201{1202PCRE2_SIZE* ref_count;12031204if (code != NULL)1205{1206#ifdef SUPPORT_JIT1207if (code->executable_jit != NULL)1208PRIV(jit_free)(code->executable_jit, &code->memctl);1209#endif12101211if ((code->flags & PCRE2_DEREF_TABLES) != 0)1212{1213/* Decoded tables belong to the codes after deserialization, and they must1214be freed when there are no more references to them. The *ref_count should1215always be > 0. 
*/12161217ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);1218if (*ref_count > 0)1219{1220(*ref_count)--;1221if (*ref_count == 0)1222code->memctl.free((void *)code->tables, code->memctl.memory_data);1223}1224}12251226code->memctl.free(code, code->memctl.memory_data);1227}1228}1229123012311232/*************************************************1233* Read a number, possibly signed *1234*************************************************/12351236/* This function is used to read numbers in the pattern. The initial pointer1237must be at the sign or first digit of the number. When relative values1238(introduced by + or -) are allowed, they are relative group numbers, and the1239result must be greater than zero.12401241Arguments:1242ptrptr points to the character pointer variable1243ptrend points to the end of the input string1244allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this1245max_value the largest number allowed;1246you must not pass a value for max_value larger than1247INT_MAX/10 - 1 because this function relies on max_value to1248avoid integer overflow1249max_error the error to give for an over-large number1250intptr where to put the result1251errcodeptr where to put an error code12521253Returns: TRUE - a number was read1254FALSE - errorcode == 0 => no number was found1255errorcode != 0 => an error occurred1256*/12571258static BOOL1259read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,1260uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)1261{1262int sign = 0;1263uint32_t n = 0;1264PCRE2_SPTR ptr = *ptrptr;1265BOOL yield = FALSE;12661267PCRE2_ASSERT(max_value <= INT_MAX/10 - 1);12681269*errorcodeptr = 0;12701271if (allow_sign >= 0 && ptr < ptrend)1272{1273if (*ptr == CHAR_PLUS)1274{1275sign = +1;1276max_value -= allow_sign;1277ptr++;1278}1279else if (*ptr == CHAR_MINUS)1280{1281sign = -1;1282ptr++;1283}1284}12851286if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;1287while (ptr < ptrend && 
IS_DIGIT(*ptr))1288{1289n = n * 10 + (*ptr++ - CHAR_0);1290if (n > max_value)1291{1292*errorcodeptr = max_error;1293while (ptr < ptrend && IS_DIGIT(*ptr)) ptr++;1294goto EXIT;1295}1296}12971298if (allow_sign >= 0 && sign != 0)1299{1300if (n == 0)1301{1302*errorcodeptr = ERR26; /* +0 and -0 are not allowed */1303goto EXIT;1304}13051306if (sign > 0) n += allow_sign;1307else if (n > (uint32_t)allow_sign)1308{1309*errorcodeptr = ERR15; /* Non-existent subpattern */1310goto EXIT;1311}1312else n = allow_sign + 1 - n;1313}13141315yield = TRUE;13161317EXIT:1318*intptr = n;1319*ptrptr = ptr;1320return yield;1321}1322132313241325/*************************************************1326* Read repeat counts *1327*************************************************/13281329/* Read an item of the form {n,m} and return the values when non-NULL pointers1330are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a1331larger value is used for "unlimited". We have to use signed arguments for1332read_number() because it is capable of returning a signed value. As of Perl13335.34.0 either n or m may be absent, but not both. 
Perl also allows spaces and1334tabs after { and before } and between the numbers and the comma, so we do too.13351336Arguments:1337ptrptr points to pointer to character after '{'1338ptrend pointer to end of input1339minp if not NULL, pointer to int for min1340maxp if not NULL, pointer to int for max1341errorcodeptr points to error code variable13421343Returns: FALSE if not a repeat quantifier, errorcode set zero1344FALSE on error, with errorcode set non-zero1345TRUE on success, with pointer updated to point after '}'1346*/13471348static BOOL1349read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,1350uint32_t *maxp, int *errorcodeptr)1351{1352PCRE2_SPTR p = *ptrptr;1353PCRE2_SPTR pp;1354BOOL yield = FALSE;1355BOOL had_minimum = FALSE;1356int32_t min = 0;1357int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */13581359*errorcodeptr = 0;1360while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;13611362/* Check the syntax before interpreting. Otherwise, a non-quantifier sequence1363such as "X{123456ABC" would incorrectly give a "number too big in quantifier"1364error. */13651366pp = p;1367if (pp < ptrend && IS_DIGIT(*pp))1368{1369had_minimum = TRUE;1370while (++pp < ptrend && IS_DIGIT(*pp)) {}1371}13721373while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;1374if (pp >= ptrend) return FALSE;13751376if (*pp == CHAR_RIGHT_CURLY_BRACKET)1377{1378if (!had_minimum) return FALSE;1379}1380else1381{1382if (*pp++ != CHAR_COMMA) return FALSE;1383while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;1384if (pp >= ptrend) return FALSE;1385if (IS_DIGIT(*pp))1386{1387while (++pp < ptrend && IS_DIGIT(*pp)) {}1388}1389else if (!had_minimum) return FALSE;1390while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;1391if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;1392}13931394/* Now process the quantifier for real. We know it must be {n} or {n,} or {,m}1395or {n,m}. 
The only error that read_number() can return is for a number that is1396too big. If *errorcodeptr is returned as zero it means no number was found. */13971398/* Deal with {,m} or n too big. If we successfully read m there is no need to1399check m >= n because n defaults to zero. */14001401if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))1402{1403if (*errorcodeptr != 0) goto EXIT; /* n too big */1404p++; /* Skip comma and subsequent spaces */1405while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;1406if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))1407{1408if (*errorcodeptr != 0) goto EXIT; /* m too big */1409}1410}14111412/* Have read one number. Deal with {n} or {n,} or {n,m} */14131414else1415{1416while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;1417if (*p == CHAR_RIGHT_CURLY_BRACKET)1418{1419max = min;1420}1421else /* Handle {n,} or {n,m} */1422{1423p++; /* Skip comma and subsequent spaces */1424while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;1425if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))1426{1427if (*errorcodeptr != 0) goto EXIT; /* m too big */1428}14291430if (max < min)1431{1432*errorcodeptr = ERR4;1433goto EXIT;1434}1435}1436}14371438/* Valid quantifier exists */14391440while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;1441p++;1442yield = TRUE;1443if (minp != NULL) *minp = (uint32_t)min;1444if (maxp != NULL) *maxp = (uint32_t)max;14451446/* Update the pattern pointer */14471448EXIT:1449*ptrptr = p;1450return yield;1451}1452145314541455/*************************************************1456* Handle escapes *1457*************************************************/14581459/* This function is called when a \ has been encountered. It either returns a1460positive value for a simple escape such as \d, or 0 for a data character, which1461is placed in chptr. A backreference to group n is returned as -(n+1). 
On1462entry, ptr is pointing at the character after \. On exit, it points after the1463final code unit of the escape sequence.14641465This function is also called from pcre2_substitute() to handle escape sequences1466in replacement strings. In this case, the cb argument is NULL, and in the case1467of escapes that have further processing, only sequences that define a data1468character are recognised. The options argument is the final value of the1469compiled pattern's options.14701471Arguments:1472ptrptr points to the input position pointer1473ptrend points to the end of the input1474chptr points to a returned data character1475errorcodeptr points to the errorcode variable (containing zero)1476options the current options bits1477xoptions the current extra options bits1478bracount the number of capturing parentheses encountered so far1479isclass TRUE if in a character class1480cb compile data block or NULL when called from pcre2_substitute()14811482Returns: zero => a data character1483positive => a special escape sequence1484negative => a numerical back reference1485on error, errorcodeptr is set non-zero1486*/14871488int1489PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,1490int *errorcodeptr, uint32_t options, uint32_t xoptions, uint32_t bracount,1491BOOL isclass, compile_block *cb)1492{1493BOOL utf = (options & PCRE2_UTF) != 0;1494BOOL alt_bsux =1495((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;1496PCRE2_SPTR ptr = *ptrptr;1497uint32_t c, cc;1498int escape = 0;1499int i;15001501/* If backslash is at the end of the string, it's an error. */15021503if (ptr >= ptrend)1504{1505*errorcodeptr = ERR1;1506return 0;1507}15081509GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */1510*errorcodeptr = 0; /* Be optimistic */15111512/* Non-alphanumerics are literals, so we just leave the value in c. An initial1513value test saves a memory lookup for code points outside the alphanumeric1514range. 
*/15151516if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */15171518/* Otherwise, do a table lookup. Non-zero values need little processing here. A1519positive value is a literal value for something like \n. A negative value is1520the negation of one of the ESC_ macros that is passed back for handling by the1521calling function. Some extra checking is needed for \N because only \N{U+dddd}1522is supported. If the value is zero, further processing is handled below. */15231524else if ((i = escapes[c - ESCAPES_FIRST]) != 0)1525{1526if (i > 0)1527{1528c = (uint32_t)i;1529if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)1530c = CHAR_LF;1531}1532else /* Negative table entry */1533{1534escape = -i; /* Else return a special escape */1535if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))1536cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */15371538/* Perl supports \N{name} for character names and \N{U+dddd} for numerical1539Unicode code points, as well as plain \N for "not newline". PCRE does not1540support \N{name}. However, it does support quantification such as \N{2,3},1541so if \N{ is not followed by U+dddd we check for a quantifier. */15421543if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)1544{1545PCRE2_SPTR p = ptr + 1;15461547/* Perl ignores spaces and tabs after { */15481549while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;15501551/* \N{U+ can be handled by the \x{ code. However, this construction is1552not valid in EBCDIC environments because it specifies a Unicode1553character, not a codepoint in the local code. For example \N{U+0041}1554must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode1555casing semantics for the entire pattern, so allow it only in UTF (i.e.1556Unicode) mode. 
*/15571558if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)1559{1560#ifndef EBCDIC1561if (utf)1562{1563ptr = p + 2;1564escape = 0; /* Not a fancy escape after all */1565goto COME_FROM_NU;1566}1567#endif15681569/* Improve error offset. */1570ptr = p + 2;1571while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;1572while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;1573if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET) ptr++;15741575*errorcodeptr = ERR93;1576}15771578/* Give an error in contexts where quantifiers are not allowed1579(character classes; substitution strings). */15801581else if (isclass || cb == NULL)1582{1583ptr++; /* Skip over the opening brace */1584*errorcodeptr = ERR37;1585}15861587/* Give an error if what follows is not a quantifier, but don't override1588an error set by the quantifier reader (e.g. number overflow). */15891590else1591{1592if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&1593*errorcodeptr == 0)1594{1595ptr++; /* Skip over the opening brace */1596*errorcodeptr = ERR37;1597}1598}1599}1600}1601}16021603/* Escapes that need further processing, including those that are unknown, have1604a zero entry in the lookup table. When called from pcre2_substitute(), only \c,1605\o, and \x are recognized (\u and \U can never appear as they are used for case1606forcing). */16071608else1609{1610int s;1611PCRE2_SPTR oldptr;1612BOOL overflow;16131614/* Filter calls from pcre2_substitute(). */16151616if (cb == NULL)1617{1618if (!(c >= CHAR_0 && c <= CHAR_9) && c != CHAR_c && c != CHAR_o &&1619c != CHAR_x && c != CHAR_g)1620{1621*errorcodeptr = ERR3;1622goto EXIT;1623}1624alt_bsux = FALSE; /* Do not modify \x handling */1625}16261627switch (c)1628{1629/* A number of Perl escapes are not handled by PCRE. We give an explicit1630error. */16311632case CHAR_F:1633case CHAR_l:1634case CHAR_L:1635*errorcodeptr = ERR37;1636break;16371638/* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX1639is set. 
Otherwise, \u must be followed by exactly four hex digits or, if1640PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.1641Otherwise it is a lowercase u letter. This gives some compatibility with1642ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT1643allowed. When \u{ is not followed by hex digits, a special return is given1644because otherwise \u{ 12} (for example) would be treated as u{12}. */16451646case CHAR_u:1647if (!alt_bsux)1648*errorcodeptr = ERR37;1649else1650{1651uint32_t xc;16521653if (ptr >= ptrend) break;1654if (*ptr == CHAR_LEFT_CURLY_BRACKET &&1655(xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)1656{1657PCRE2_SPTR hptr = ptr + 1;16581659cc = 0;1660while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)1661{1662if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */1663{1664*errorcodeptr = ERR77;1665ptr = hptr; /* Show where */1666break; /* *hptr != } will cause another break below */1667}1668cc = (cc << 4) | xc;1669hptr++;1670}16711672if (hptr == ptr + 1 || /* No hex digits */1673hptr >= ptrend || /* Hit end of input */1674*hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */1675{1676if (isclass) break; /* In a class, just treat as '\u' literal */1677escape = ESC_ub; /* Special return */1678ptr++; /* Skip { */1679break; /* Hex escape not recognized */1680}16811682c = cc; /* Accept the code point */1683ptr = hptr + 1;1684}16851686else /* Must be exactly 4 hex digits */1687{1688if (ptrend - ptr < 4) break; /* Less than 4 chars */1689if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */1690if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */1691cc = (cc << 4) | xc;1692if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */1693cc = (cc << 4) | xc;1694if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */1695c = (cc << 4) | xc;1696ptr += 4;1697}16981699if (utf)1700{1701if (c > 0x10ffffU) *errorcodeptr = ERR77;1702else1703if (c >= 0xd800 && c <= 0xdfff &&1704(xoptions & 
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)1705*errorcodeptr = ERR73;1706}1707else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;1708}1709break;17101711/* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,1712in which case it is an upper case letter. */17131714case CHAR_U:1715if (!alt_bsux) *errorcodeptr = ERR37;1716break;17171718/* In a character class, \g is just a literal "g". Outside a character1719class, \g must be followed by one of a number of specific things:17201721(1) A number, either plain or braced. If positive, it is an absolute1722backreference. If negative, it is a relative backreference. This is a Perl17235.10 feature.17241725(2) Perl 5.10 also supports \g{name} as a reference to a named group. This1726is part of Perl's movement towards a unified syntax for back references. As1727this is synonymous with \k{name}, we fudge it up by pretending it really1728was \k{name}.17291730(3) For Oniguruma compatibility we also support \g followed by a name or a1731number either in angle brackets or in single quotes. However, these are1732(possibly recursive) subroutine calls, _not_ backreferences. We return1733the ESC_g code.17341735Summary: Return a negative number for a numerical back reference (offset1736by 1), ESC_k for a named back reference, and ESC_g for a named or1737numbered subroutine call.17381739The above describes the \g behaviour inside patterns. Inside replacement1740strings (pcre2_substitute) we support only \g<nameornum> for Python1741compatibility. 
Return ESG_g for the named case, and -(num+1) for the1742numbered case.1743*/17441745case CHAR_g:1746if (isclass) break;17471748if (ptr >= ptrend)1749{1750*errorcodeptr = ERR57;1751break;1752}17531754if (cb == NULL)1755{1756PCRE2_SPTR p;1757/* Substitution strings */1758if (*ptr != CHAR_LESS_THAN_SIGN)1759{1760*errorcodeptr = ERR57;1761break;1762}17631764p = ptr + 1;17651766if (!read_number(&p, ptrend, -1, MAX_GROUP_NUMBER, ERR61, &s,1767errorcodeptr))1768{1769if (*errorcodeptr == 0) escape = ESC_g; /* No number found */1770break;1771}17721773if (p >= ptrend || *p != CHAR_GREATER_THAN_SIGN)1774{1775ptr = p;1776*errorcodeptr = ERR119; /* Missing terminator for number */1777break;1778}17791780/* This is the reason that back references are returned as -(s+1) rather1781than just -s. In a pattern, \0 is not a back reference, but \g<0> is1782valid in a substitution string, so this must be representable. */1783ptr = p + 1;1784escape = -(s+1);1785break;1786}17871788if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)1789{1790escape = ESC_g;1791break;1792}17931794/* If there is a brace delimiter, try to read a numerical reference. If1795there isn't one, assume we have a name and treat it as \k. 
*/17961797if (*ptr == CHAR_LEFT_CURLY_BRACKET)1798{1799PCRE2_SPTR p = ptr + 1;18001801while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;1802if (!read_number(&p, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,1803errorcodeptr))1804{1805if (*errorcodeptr == 0) escape = ESC_k; /* No number found */1806break;1807}1808while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;18091810if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)1811{1812ptr = p;1813*errorcodeptr = ERR119; /* Missing terminator for number */1814break;1815}1816ptr = p + 1;1817}18181819/* Read an undelimited number */18201821else1822{1823if (!read_number(&ptr, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,1824errorcodeptr))1825{1826if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */1827break;1828}1829}18301831if (s <= 0)1832{1833*errorcodeptr = ERR15;1834break;1835}18361837escape = -(s+1);1838break;18391840/* The handling of escape sequences consisting of a string of digits1841starting with one that is not zero is not straightforward. Perl has changed1842over the years. Nowadays \g{} for backreferences and \o{} for octal are1843recommended to avoid the ambiguities in the old syntax.18441845Outside a character class, the digits are read as a decimal number. If the1846number is less than 10, or if there are that many previous extracting left1847brackets, it is a back reference. Otherwise, up to three octal digits are1848read to form an escaped character code. Thus \123 is likely to be octal 1231849(cf \0123, which is octal 012 followed by the literal 3). This is the "Perl1850style" of handling ambiguous octal/backrefences such as \12.18511852There is an alternative disambiguation strategy, selected by1853PCRE2_EXTRA_PYTHON_OCTAL, which follows Python's behaviour. An octal must1854have either a leading zero, or exactly three octal digits; otherwise it's1855a backreference. 
The disambiguation is stable, and does not depend on how1856many capture groups are defined (it's simply an invalid backreference if1857there is no corresponding capture group). Additionally, octal values above1858\377 (\xff) are rejected.18591860Inside a character class, \ followed by a digit is always either a literal18618 or 9 or an octal number. */18621863case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:1864case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:18651866if (isclass)1867{1868/* Fall through to octal handling; never a backreference inside a class. */1869}1870else if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0)1871{1872/* Python-style disambiguation. */1873if (ptr[-1] <= CHAR_7 && ptr + 1 < ptrend && ptr[0] >= CHAR_0 &&1874ptr[0] <= CHAR_7 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)1875{1876/* We peeked a three-digit octal, so fall through */1877}1878else1879{1880/* We are at a digit, so the only possible error from read_number() is1881a number that is too large. */1882ptr--; /* Back to the digit */18831884if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))1885{1886*errorcodeptr = ERR61;1887break;1888}18891890escape = -(s+1);1891break;1892}1893}1894else1895{1896/* Perl-style disambiguation. */1897oldptr = ptr;1898ptr--; /* Back to the digit */18991900/* As we know we are at a digit, the only possible error from1901read_number() is a number that is too large to be a group number. Because1902that number might be still valid if read as an octal, errorcodeptr is not1903set on failure and therefore a sentinel value of INT_MAX is used instead1904of the original value, and will be used later to properly set the error,1905if not falling through. */19061907if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))1908s = INT_MAX;19091910/* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x1911are octal escapes if there are not that many previous captures. 
*/19121913if (s < 10 || c >= CHAR_8 || (unsigned)s <= bracount)1914{1915/* s > MAX_GROUP_NUMBER should not be possible because of read_number(),1916but we keep it just to be safe and because it will also catch the1917sentinel value that was set on failure by that function. */19181919if ((unsigned)s > MAX_GROUP_NUMBER)1920{1921PCRE2_ASSERT(s == INT_MAX);1922*errorcodeptr = ERR61;1923}1924else escape = -(s+1); /* Indicates a back reference */1925break;1926}19271928ptr = oldptr; /* Put the pointer back and fall through */1929}19301931/* Handle a digit following \ when the number is not a back reference, or1932we are within a character class. If the first digit is 8 or 9, Perl used to1933generate a binary zero and then treat the digit as a following literal. At1934least by Perl 5.18 this changed so as not to insert the binary zero. */19351936if (c >= CHAR_8) break;19371938PCRE2_FALLTHROUGH /* Fall through */19391940/* \0 always starts an octal number, but we may drop through to here with a1941larger first octal digit. The original code used just to take the least1942significant 8 bits of octal numbers (I think this is what early Perls used1943to do). Nowadays we allow for larger numbers in UTF-8 mode and 16/32-bit mode,1944but no more than 3 octal digits. */19451946case CHAR_0:1947c -= CHAR_0;1948while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)1949c = c * 8 + *ptr++ - CHAR_0;1950if (c > 0xff)1951{1952if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0) *errorcodeptr = ERR102;1953#if PCRE2_CODE_UNIT_WIDTH == 81954else if (!utf) *errorcodeptr = ERR51;1955#endif1956}19571958/* PCRE2_EXTRA_NO_BS0 disables the NUL escape '\0' but doesn't affect1959two- or three-character octal escapes \00 and \000, nor \x00. */19601961if ((xoptions & PCRE2_EXTRA_NO_BS0) != 0 && c == 0 && i == 1)1962*errorcodeptr = ERR98;1963break;19641965/* \o is a relatively new Perl feature, supporting a more general way of1966specifying character codes in octal. 
The only supported form is \o{ddd},1967with optional spaces or tabs after { and before }. */19681969case CHAR_o:1970if (ptr >= ptrend || *ptr != CHAR_LEFT_CURLY_BRACKET)1971{1972*errorcodeptr = ERR55;1973break;1974}1975ptr++;19761977while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;1978if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)1979{1980*errorcodeptr = ERR78;1981break;1982}19831984c = 0;1985overflow = FALSE;1986while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)1987{1988cc = *ptr++;1989if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */1990#if PCRE2_CODE_UNIT_WIDTH == 321991if (c >= 0x20000000u) { overflow = TRUE; break; }1992#endif1993c = (c << 3) + (cc - CHAR_0);1994#if PCRE2_CODE_UNIT_WIDTH == 81995if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }1996#elif PCRE2_CODE_UNIT_WIDTH == 161997if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }1998#elif PCRE2_CODE_UNIT_WIDTH == 321999if (utf && c > 0x10ffffU) { overflow = TRUE; break; }2000#endif2001}20022003while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;20042005if (overflow)2006{2007while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;2008*errorcodeptr = ERR34;2009}2010else if (utf && c >= 0xd800 && c <= 0xdfff &&2011(xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)2012{2013*errorcodeptr = ERR73;2014}2015else if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET)2016{2017ptr++;2018}2019else2020{2021*errorcodeptr = ERR64;2022goto ESCAPE_FAILED_FORWARD;2023}2024break;20252026/* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed2027by two hexadecimal digits. Otherwise it is a lowercase x letter. 
*/20282029case CHAR_x:2030if (alt_bsux)2031{2032uint32_t xc;2033if (ptrend - ptr < 2) break; /* Less than 2 characters */2034if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */2035if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */2036c = (cc << 4) | xc;2037ptr += 2;2038}20392040/* Handle \x in Perl's style. \x{ddd} is a character code which can be2041greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex2042digits. If not, { used to be treated as a data character. However, Perl2043seems to read hex digits up to the first non-such, and ignore the rest, so2044that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE2045now gives an error. */20462047else2048{2049if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)2050{2051ptr++;2052while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;20532054#ifndef EBCDIC2055COME_FROM_NU:2056#endif2057if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)2058{2059*errorcodeptr = ERR78;2060break;2061}2062c = 0;2063overflow = FALSE;20642065while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)2066{2067ptr++;2068if (c == 0 && cc == 0) continue; /* Leading zeroes */2069#if PCRE2_CODE_UNIT_WIDTH == 322070if (c >= 0x10000000l) { overflow = TRUE; break; }2071#endif2072c = (c << 4) | cc;2073if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))2074{2075overflow = TRUE;2076break;2077}2078}20792080/* Perl ignores spaces and tabs before } */20812082while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;20832084/* On overflow, skip remaining hex digits */20852086if (overflow)2087{2088while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;2089*errorcodeptr = ERR34;2090}2091else if (utf && c >= 0xd800 && c <= 0xdfff &&2092(xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)2093{2094*errorcodeptr = ERR73;2095}2096else if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET)2097{2098ptr++;2099}21002101/* If the sequence of hex digits (followed by optional 
space) does not2102end with '}', give an error. We used just to recognize this construct2103and fall through to the normal \x handling, but nowadays Perl gives an2104error, which seems much more sensible, so we do too. */21052106else2107{2108*errorcodeptr = ERR67;2109goto ESCAPE_FAILED_FORWARD;2110}2111} /* End of \x{} processing */21122113/* Read a up to two hex digits after \x */21142115else2116{2117/* Perl has the surprising/broken behaviour that \x without following2118hex digits is treated as an escape for NUL. Their source code laments2119this but keeps it for backwards compatibility. A warning is printed2120when "use warnings" is enabled. Because we don't have warnings, we2121simply forbid it. */2122if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff)2123{2124/* Not a hex digit */2125*errorcodeptr = ERR78;2126break;2127}2128ptr++;2129c = cc;21302131/* With "use re 'strict'" Perl actually requires exactly two digits (error2132for \x, \xA and \xAAA). While \x was already rejected, this seems overly2133strict, and there seems little incentive to align with that, given the2134backwards-compatibility cost.21352136For comparison, note that other engines disagree. For example:2137- Java allows 1 or 2 hex digits. Error if 0 digits. No error if >2 digits2138- .NET requires 2 hex digits. Error if 0, 1 digits. No error if >2 digits.2139*/2140if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */2141ptr++;2142c = (c << 4) | cc;2143} /* End of \xdd handling */2144} /* End of Perl-style \x handling */2145break;21462147/* The handling of \c is different in ASCII and EBCDIC environments. In an2148ASCII (or Unicode) environment, an error is given if the character2149following \c is not a printable ASCII character. Otherwise, the following2150character is upper-cased if it is a letter, and after that the 0x40 bit is2151flipped. 
The result is the value of the escape.21522153In an EBCDIC environment the handling of \c is compatible with the2154specification in the perlebcdic document. The following character must be2155a letter or one of small number of special characters. These provide a2156means of defining the character values 0-31.21572158For testing the EBCDIC handling of \c in an ASCII environment, recognize2159the EBCDIC value of 'c' explicitly. */21602161case CHAR_c:2162if (ptr >= ptrend)2163{2164*errorcodeptr = ERR2;2165break;2166}2167c = *ptr;2168if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);21692170/* Handle \c in an ASCII/Unicode environment. */21712172#ifndef EBCDIC /* ASCII/UTF-8 coding */2173if (c < 32 || c > 126) /* Excludes all non-printable ASCII */2174{2175*errorcodeptr = ERR68;2176goto ESCAPE_FAILED_FORWARD;2177}2178c ^= 0x40;21792180/* Handle \c in an EBCDIC environment. The special case \c? is converted to2181255 (0xff) or 95 (0x5f) if other characters suggest we are using the2182POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)2183The other valid sequences correspond to a list of specific characters. */21842185#else2186if (c == CHAR_QUESTION_MARK)2187c = (CHAR_BACKSLASH == 188 && CHAR_GRAVE_ACCENT == 74)? 0x5f : 0xff;2188else2189{2190for (i = 0; i < 32; i++)2191{2192if (c == ebcdic_escape_c[i]) break;2193}2194if (i < 32)2195c = i;2196else2197{2198*errorcodeptr = ERR68;2199goto ESCAPE_FAILED_FORWARD;2200}2201}2202#endif /* EBCDIC */22032204ptr++;2205break;22062207/* Any other alphanumeric following \ is an error. Perl gives an error only2208if in warning mode, but PCRE doesn't have a warning mode. */22092210default:2211*errorcodeptr = ERR3;2212break;2213}2214}22152216/* Set the pointer to the next character before returning. */22172218EXIT:2219*ptrptr = ptr;2220*chptr = c;2221return escape;22222223/* Some errors need to indicate the next character. 
*/22242225ESCAPE_FAILED_FORWARD:2226ptr++;2227#ifdef SUPPORT_UNICODE2228if (utf) FORWARDCHARTEST(ptr, ptrend);2229#endif2230goto EXIT;2231}2232223322342235#ifdef SUPPORT_UNICODE2236/*************************************************2237* Handle \P and \p *2238*************************************************/22392240/* This function is called after \P or \p has been encountered, provided that2241PCRE2 is compiled with support for UTF and Unicode properties. On entry, the2242contents of ptrptr are pointing after the P or p. On exit, it is left pointing2243after the final code unit of the escape sequence.22442245Arguments:2246ptrptr the pattern position pointer2247utf true if the input is UTF-encoded2248negptr a boolean that is set TRUE for negation else FALSE2249ptypeptr an unsigned int that is set to the type value2250pdataptr an unsigned int that is set to the detailed property value2251errorcodeptr the error code variable2252cb the compile data22532254Returns: TRUE if the type value was found, or FALSE for an invalid type2255*/22562257static BOOL2258get_ucp(PCRE2_SPTR *ptrptr, BOOL utf, BOOL *negptr, uint16_t *ptypeptr,2259uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)2260{2261uint32_t c;2262ptrdiff_t i;2263PCRE2_SIZE bot, top;2264PCRE2_SPTR ptr = *ptrptr;2265PCRE2_UCHAR name[50];2266PCRE2_UCHAR *vptr = NULL;2267uint16_t ptscript = PT_NOTSCRIPT;22682269#ifndef MAYBE_UTF_MULTI2270(void)utf; /* Avoid compiler warning */2271#endif22722273if (ptr >= cb->end_pattern) goto ERROR_RETURN;2274GETCHARINCTEST(c, ptr);2275*negptr = FALSE;22762277/* \P or \p can be followed by a name in {}, optionally preceded by ^ for2278negation. We must be handling Unicode encoding here, though we may be compiling2279for UTF-8 input in an EBCDIC environment. (PCRE2 does not support both EBCDIC2280input and Unicode input in the same build.) In accordance with Unicode's "loose2281matching" rules, ASCII white space, hyphens, and underscores are ignored. 
We2282don't use isspace() or tolower() because (a) code points may be greater than2283255, and (b) they wouldn't work when compiling for Unicode in an EBCDIC2284environment. */22852286if (c == CHAR_LEFT_CURLY_BRACKET)2287{2288if (ptr >= cb->end_pattern) goto ERROR_RETURN;22892290for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)2291{2292REDO:22932294if (ptr >= cb->end_pattern) goto ERROR_RETURN;2295GETCHARINCTEST(c, ptr);22962297/* Skip ignorable Unicode characters. */22982299if (c == CHAR_UNDERSCORE || c == CHAR_MINUS || c == CHAR_SPACE ||2300(c >= CHAR_HT && c <= CHAR_CR))2301{2302goto REDO;2303}23042305/* The first significant character being circumflex negates the meaning of2306the item. */23072308if (i == 0 && !*negptr && c == CHAR_CIRCUMFLEX_ACCENT)2309{2310*negptr = TRUE;2311goto REDO;2312}23132314if (c == CHAR_RIGHT_CURLY_BRACKET) break;23152316/* Names consist of ASCII letters and digits, but equals and colon may also2317occur as a name/value separator. We must also allow for \p{L&}. A simple2318check for a value between '&' and 'z' suffices because anything else in a2319name or value will cause an "unknown property" error anyway. */23202321if (c < CHAR_AMPERSAND || c > CHAR_z) goto ERROR_RETURN;23222323/* Lower case a capital letter or remember where the name/value separator2324is. */23252326if (c >= CHAR_A && c <= CHAR_Z) c |= 0x20;2327else if ((c == CHAR_COLON || c == CHAR_EQUALS_SIGN) && vptr == NULL)2328vptr = name + i;23292330name[i] = c;2331}23322333/* Error if the loop didn't end with '}' - either we hit the end of the2334pattern or the name was longer than any legal property name. */23352336if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;2337name[i] = 0;2338}23392340/* If { doesn't follow \p or \P there is just one following character, which2341must be an ASCII letter. 
*/23422343else if (c >= CHAR_A && c <= CHAR_Z)2344{2345name[0] = c | 0x20; /* Lower case */2346name[1] = 0;2347}2348else if (c >= CHAR_a && c <= CHAR_z)2349{2350name[0] = c;2351name[1] = 0;2352}2353else goto ERROR_RETURN;23542355*ptrptr = ptr; /* Update pattern pointer */23562357/* If the property contains ':' or '=' we have class name and value separately2358specified. The following are supported:23592360. Bidi_Class (synonym bc), for which the property names are "bidi<name>".2361. Script (synonym sc) for which the property name is the script name2362. Script_Extensions (synonym scx), ditto23632364As this is a small number, we currently just check the names directly. If this2365grows, a sorted table and a switch will be neater.23662367For both the script properties, set a PT_xxx value so that (1) they can be2368distinguished and (2) invalid script names that happen to be the name of2369another property can be diagnosed. */23702371if (vptr != NULL)2372{2373int offset = 0;2374PCRE2_UCHAR sname[8];23752376*vptr = 0; /* Terminate property name */2377if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||2378PRIV(strcmp_c8)(name, STRING_bc) == 0)2379{2380offset = 4;2381sname[0] = CHAR_b;2382sname[1] = CHAR_i; /* There is no strcpy_c8 function */2383sname[2] = CHAR_d;2384sname[3] = CHAR_i;2385}23862387else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||2388PRIV(strcmp_c8)(name, STRING_sc) == 0)2389ptscript = PT_SC;23902391else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||2392PRIV(strcmp_c8)(name, STRING_scx) == 0)2393ptscript = PT_SCX;23942395else2396{2397*errorcodeptr = ERR47;2398return FALSE;2399}24002401/* Adjust the string in name[] as needed */24022403memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));2404if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));2405}24062407/* Search for a recognized property using binary chop. 
*/24082409bot = 0;2410top = PRIV(utt_size);24112412while (bot < top)2413{2414int r;2415i = (bot + top) >> 1;2416r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);24172418/* When a matching property is found, some extra checking is needed when the2419\p{xx:yy} syntax is used and xx is either sc or scx. */24202421if (r == 0)2422{2423*pdataptr = PRIV(utt)[i].value;2424if (vptr == NULL || ptscript == PT_NOTSCRIPT)2425{2426*ptypeptr = PRIV(utt)[i].type;2427return TRUE;2428}24292430switch (PRIV(utt)[i].type)2431{2432case PT_SC:2433*ptypeptr = PT_SC;2434return TRUE;24352436case PT_SCX:2437*ptypeptr = ptscript;2438return TRUE;2439}24402441break; /* Non-script found */2442}24432444if (r > 0) bot = i + 1; else top = i;2445}24462447*errorcodeptr = ERR47; /* Unrecognized property */2448return FALSE;24492450ERROR_RETURN: /* Malformed \P or \p */2451*errorcodeptr = ERR46;2452*ptrptr = ptr;2453return FALSE;2454}2455#endif2456245724582459/*************************************************2460* Check for POSIX class syntax *2461*************************************************/24622463/* This function is called when the sequence "[:" or "[." or "[=" is2464encountered in a character class. It checks whether this is followed by a2465sequence of characters terminated by a matching ":]" or ".]" or "=]". If we2466reach an unescaped ']' without the special preceding character, return FALSE.24672468Originally, this function only recognized a sequence of letters between the2469terminators, but it seems that Perl recognizes any sequence of characters,2470though of course unknown POSIX names are subsequently rejected. Perl gives an2471"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE2472didn't consider this to be a POSIX class. Likewise for [:1234:].24732474The problem in trying to be exactly like Perl is in the handling of escapes. 
We2475have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX2476class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code2477below handles the special cases \\ and \], but does not try to do any other2478escape processing. This makes it different from Perl for cases such as2479[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does2480not recognize "l\ower". This is a lesser evil than not diagnosing bad classes2481when Perl does, I think.24822483A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.2484It seems that the appearance of a nested POSIX class supersedes an apparent2485external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or2486a digit. This is handled by returning FALSE if the start of a new group with2487the same terminator is encountered, since the next closing sequence must close2488the nested group, not the outer one.24892490In Perl, unescaped square brackets may also appear as part of class names. For2491example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for2492[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not2493seem right at all. PCRE does not allow closing square brackets in POSIX class2494names.24952496Arguments:2497ptr pointer to the character after the initial [ (colon, dot, equals)2498ptrend pointer to the end of the pattern2499endptr where to return a pointer to the terminating ':', '.', or '='25002501Returns: TRUE or FALSE2502*/25032504static BOOL2505check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)2506{2507PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */2508terminator = *ptr++; /* compiler warns about "non-constant" initializer. 
*/25092510for (; ptrend - ptr >= 2; ptr++)2511{2512if (*ptr == CHAR_BACKSLASH &&2513(ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))2514ptr++;25152516else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||2517*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;25182519else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)2520{2521*endptr = ptr;2522return TRUE;2523}2524}25252526return FALSE;2527}2528252925302531/*************************************************2532* Check POSIX class name *2533*************************************************/25342535/* This function is called to check the name given in a POSIX-style class entry2536such as [:alnum:].25372538Arguments:2539ptr points to the first letter2540len the length of the name25412542Returns: a value representing the name, or -1 if unknown2543*/25442545static int2546check_posix_name(PCRE2_SPTR ptr, int len)2547{2548const char *pn = posix_names;2549int yield = 0;2550while (posix_name_lengths[yield] != 0)2551{2552if (len == posix_name_lengths[yield] &&2553PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;2554pn += posix_name_lengths[yield] + 1;2555yield++;2556}2557return -1;2558}2559256025612562/*************************************************2563* Read a subpattern or VERB name *2564*************************************************/25652566/* This function is called from parse_regex() below whenever it needs to read2567the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial2568pointer must be to the preceding character. If that character is '*' we are2569reading a verb or alpha assertion name. The pointer is updated to point after2570the name, for a VERB or alpha assertion name, or after the name's terminator2571for a subpattern name. Returning both the offset and the name pointer is2572redundant information, but some callers use one and some the other, so it is2573simplest just to return both. 
When the name is in braces, spaces and tabs are2574allowed (and ignored) at either end.25752576Arguments:2577ptrptr points to the character pointer variable2578ptrend points to the end of the input string2579utf true if the input is UTF-encoded2580terminator the terminator of a subpattern name must be this2581offsetptr where to put the offset from the start of the pattern2582nameptr where to put a pointer to the name in the input2583namelenptr where to put the length of the name2584errcodeptr where to put an error code2585cb pointer to the compile data block25862587Returns: TRUE if a name was read2588FALSE otherwise, with error code set2589*/25902591static BOOL2592read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,2593PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,2594int *errorcodeptr, compile_block *cb)2595{2596PCRE2_SPTR ptr = *ptrptr;2597BOOL is_group = (*ptr++ != CHAR_ASTERISK);2598BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;25992600if (is_braced)2601while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;26022603if (ptr >= ptrend) /* No characters in name */2604{2605*errorcodeptr = is_group? ERR62: /* Subpattern name expected */2606ERR60; /* Verb not recognized or malformed */2607goto FAILED;2608}26092610*nameptr = ptr;2611*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);26122613/* If this logic were ever to change, the matching function in pcre2_substitute.c2614ought to be updated to match. */26152616/* In UTF mode, a group name may contain letters and decimal digits as defined2617by Unicode properties, and underscores, but must not start with a digit. 
*/26182619#ifdef SUPPORT_UNICODE2620if (utf && is_group)2621{2622uint32_t c, type;2623PCRE2_SPTR p = ptr;26242625GETCHARINC(c, p); /* Peek at next character */2626type = UCD_CHARTYPE(c);26272628if (type == ucp_Nd)2629{2630ptr = p;2631*errorcodeptr = ERR44;2632goto FAILED;2633}26342635for(;;)2636{2637if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&2638c != CHAR_UNDERSCORE) break;2639ptr = p; /* Accept character and peek again */2640if (p >= ptrend) break;2641GETCHARINC(c, p);2642type = UCD_CHARTYPE(c);2643}2644}2645else2646#else2647(void)utf; /* Avoid compiler warning */2648#endif /* SUPPORT_UNICODE */26492650/* Handle non-group names and group names in non-UTF modes. A group name must2651not start with a digit. If either of the others start with a digit it just2652won't be recognized. */26532654{2655if (is_group && IS_DIGIT(*ptr))2656{2657++ptr;2658*errorcodeptr = ERR44;2659goto FAILED;2660}26612662while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)2663{2664ptr++;2665}2666}26672668/* Check name length */26692670if (ptr - *nameptr > MAX_NAME_SIZE)2671{2672*errorcodeptr = ERR48;2673goto FAILED;2674}2675*namelenptr = (uint32_t)(ptr - *nameptr);26762677/* Subpattern names must not be empty, and their terminator is checked here.2678(What follows a verb or alpha assertion name is checked separately.) 
*/26792680if (is_group)2681{2682if (ptr == *nameptr)2683{2684*errorcodeptr = ERR62; /* Subpattern name expected */2685goto FAILED;2686}2687if (is_braced)2688while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;2689if (terminator != 0)2690{2691if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)2692{2693*errorcodeptr = ERR42;2694goto FAILED;2695}2696ptr++;2697}2698}26992700*ptrptr = ptr;2701return TRUE;27022703FAILED:2704*ptrptr = ptr;2705return FALSE;2706}2707270827092710/**************************************************2711* Parse capturing bracket argument list *2712**************************************************/27132714/* Reads a list of capture references. The references2715can be numbers or names.27162717Arguments:2718ptrptr points to the character pointer variable2719ptrend points to the end of the input string2720utf true if the input is UTF-encoded2721parsed_pattern the parsed pattern pointer2722offset last known offset2723errcodeptr where to put an error code2724cb pointer to the compile data block27252726Returns: updated parsed_pattern pointer on success2727NULL otherwise2728*/27292730static uint32_t *2731parse_capture_list(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,2732BOOL utf, uint32_t *parsed_pattern, PCRE2_SIZE offset,2733int *errorcodeptr, compile_block *cb)2734{2735PCRE2_SIZE next_offset;2736PCRE2_SPTR ptr = *ptrptr;2737PCRE2_SPTR name;2738PCRE2_UCHAR terminator;2739uint32_t meta, namelen;2740int i;27412742if (ptr >= ptrend || *ptr != CHAR_LEFT_PARENTHESIS)2743{2744*errorcodeptr = ERR118;2745goto FAILED;2746}27472748for (;;)2749{2750ptr++;2751next_offset = (PCRE2_SIZE)(ptr - cb->start_pattern);27522753if (ptr >= ptrend)2754{2755*errorcodeptr = ERR117;2756goto FAILED;2757}27582759/* Handle [+-]number cases */2760if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,2761&i, errorcodeptr))2762{2763PCRE2_ASSERT(i >= 0);2764if (i <= 0)2765{2766*errorcodeptr = ERR15;2767goto FAILED;2768}2769meta = 
META_CAPTURE_NUMBER;2770namelen = (uint32_t)i;2771}2772else if (*errorcodeptr != 0) goto FAILED; /* Number too big */2773else2774{2775/* Handle 'name' or <name> cases. */2776if (*ptr == CHAR_LESS_THAN_SIGN)2777terminator = CHAR_GREATER_THAN_SIGN;2778else if (*ptr == CHAR_APOSTROPHE)2779terminator = CHAR_APOSTROPHE;2780else2781{2782*errorcodeptr = ERR117;2783goto FAILED;2784}27852786if (!read_name(&ptr, ptrend, utf, terminator, &next_offset,2787&name, &namelen, errorcodeptr, cb)) goto FAILED;27882789meta = META_CAPTURE_NAME;2790}27912792PCRE2_ASSERT(next_offset > 0);2793if (offset == 0 || (next_offset - offset) >= 0x10000)2794{2795*parsed_pattern++ = META_OFFSET;2796PUTOFFSET(next_offset, parsed_pattern);2797offset = next_offset;2798}27992800/* The offset is encoded as a relative offset, because for some2801inputs such as ",2" in (1,2,3), we only have space for two uint32_t2802values, and an opcode and absolute offset may require three uint32_t2803values. */2804*parsed_pattern++ = meta | (uint32_t)(next_offset - offset);2805*parsed_pattern++ = namelen;2806offset = next_offset;28072808if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;28092810if (*ptr == CHAR_RIGHT_PARENTHESIS) break;28112812if (*ptr != CHAR_COMMA)2813{2814*errorcodeptr = ERR24;2815goto FAILED;2816}2817}28182819*ptrptr = ptr + 1;2820return parsed_pattern;28212822UNCLOSED_PARENTHESIS:2823*errorcodeptr = ERR14;28242825FAILED:2826*ptrptr = ptr;2827return NULL;2828}2829283028312832/*************************************************2833* Manage callouts at start of cycle *2834*************************************************/28352836/* At the start of a new item in parse_regex() we are able to record the2837details of the previous item in a prior callout, and also to set up an2838automatic callout if enabled. 
Avoid having two adjacent automatic callouts,2839which would otherwise happen for items such as \Q that contribute nothing to2840the parsed pattern.28412842Arguments:2843ptr current pattern pointer2844pcalloutptr points to a pointer to previous callout, or NULL2845auto_callout TRUE if auto_callouts are enabled2846parsed_pattern the parsed pattern pointer2847cb compile block28482849Returns: possibly updated parsed_pattern pointer.2850*/28512852static uint32_t *2853manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,2854uint32_t *parsed_pattern, compile_block *cb)2855{2856uint32_t *previous_callout = *pcalloutptr;28572858if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -2859cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);28602861if (!auto_callout) previous_callout = NULL; else2862{2863if (previous_callout == NULL ||2864previous_callout != parsed_pattern - 4 ||2865previous_callout[3] != 255)2866{2867previous_callout = parsed_pattern; /* Set up new automatic callout */2868parsed_pattern += 4;2869previous_callout[0] = META_CALLOUT_NUMBER;2870previous_callout[2] = 0;2871previous_callout[3] = 255;2872}2873previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);2874}28752876*pcalloutptr = previous_callout;2877return parsed_pattern;2878}2879288028812882/*************************************************2883* Handle \d, \D, \s, \S, \w, \W *2884*************************************************/28852886/* This function is called from parse_regex() below, both for freestanding2887escapes, and those within classes, to handle those escapes that may change when2888Unicode property support is requested. Note that PCRE2_UCP will never be set2889without Unicode support because that is checked when pcre2_compile() is called.28902891Arguments:2892escape the ESC_... 
value2893parsed_pattern where to add the code2894options options bits2895xoptions extra options bits28962897Returns: updated value of parsed_pattern2898*/2899static uint32_t *2900handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,2901uint32_t xoptions)2902{2903uint32_t ascii_option = 0;2904uint32_t prop = ESC_p;29052906switch(escape)2907{2908case ESC_D:2909prop = ESC_P;2910PCRE2_FALLTHROUGH /* Fall through */2911case ESC_d:2912ascii_option = PCRE2_EXTRA_ASCII_BSD;2913break;29142915case ESC_S:2916prop = ESC_P;2917PCRE2_FALLTHROUGH /* Fall through */2918case ESC_s:2919ascii_option = PCRE2_EXTRA_ASCII_BSS;2920break;29212922case ESC_W:2923prop = ESC_P;2924PCRE2_FALLTHROUGH /* Fall through */2925case ESC_w:2926ascii_option = PCRE2_EXTRA_ASCII_BSW;2927break;2928}29292930if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)2931{2932*parsed_pattern++ = META_ESCAPE + escape;2933}2934else2935{2936*parsed_pattern++ = META_ESCAPE + prop;2937switch(escape)2938{2939case ESC_d:2940case ESC_D:2941*parsed_pattern++ = (PT_PC << 16) | ucp_Nd;2942break;29432944case ESC_s:2945case ESC_S:2946*parsed_pattern++ = PT_SPACE << 16;2947break;29482949case ESC_w:2950case ESC_W:2951*parsed_pattern++ = PT_WORD << 16;2952break;2953}2954}29552956return parsed_pattern;2957}2958295929602961/*************************************************2962* Maximum size of parsed_pattern for given input *2963*************************************************/29642965/* This function is called from parse_regex() below, to determine the amount2966of memory to allocate for parsed_pattern. 
It is also called to check whether2967the amount of data written respects the amount of memory allocated.29682969Arguments:2970ptr points to the start of the pattern2971ptrend points to the end of the pattern2972utf TRUE in UTF mode2973options the options bits29742975Returns: the number of uint32_t units for parsed_pattern2976*/2977static ptrdiff_t2978max_parsed_pattern(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, BOOL utf,2979uint32_t options)2980{2981PCRE2_SIZE big32count = 0;2982ptrdiff_t parsed_size_needed;29832984/* When PCRE2_AUTO_CALLOUT is not set, in all but one case the number of2985unsigned 32-bit ints written out to the parsed pattern is bounded by the length2986of the pattern. The exceptional case is when running in 32-bit, non-UTF mode,2987when literal characters greater than META_END (0x80000000) have to be coded as2988two units. In this case, therefore, we scan the pattern to check for such2989values. */29902991#if PCRE2_CODE_UNIT_WIDTH == 322992if (!utf)2993{2994PCRE2_SPTR p;2995for (p = ptr; p < ptrend; p++) if (*p >= META_END) big32count++;2996}2997#else2998(void)utf; /* Avoid compiler warning */2999#endif30003001parsed_size_needed = (ptrend - ptr) + big32count;30023003/* When PCRE2_AUTO_CALLOUT is set we have to assume a numerical callout (43004elements) for each character. This is overkill, but memory is plentiful these3005days. */30063007if ((options & PCRE2_AUTO_CALLOUT) != 0)3008parsed_size_needed += (ptrend - ptr) * 4;30093010return parsed_size_needed;3011}3012301330143015/*************************************************3016* Parse regex and identify named groups *3017*************************************************/30183019/* This function is called first of all. It scans the pattern and does two3020things: (1) It identifies capturing groups and makes a table of named capturing3021groups so that information about them is fully available to both the compiling3022scans. 
(2) It writes a parsed version of the pattern with comments omitted and3023escapes processed into the parsed_pattern vector.30243025Arguments:3026ptr points to the start of the pattern3027options compiling dynamic options (may change during the scan)3028has_lookbehind points to a boolean, set TRUE if a lookbehind is found3029cb pointer to the compile data block30303031Returns: zero on success or a non-zero error code, with the3032error offset placed in the cb field3033*/30343035/* A structure and some flags for dealing with nested groups. */30363037typedef struct nest_save {3038uint16_t nest_depth;3039uint16_t reset_group;3040uint16_t max_group;3041uint16_t flags;3042uint32_t options;3043uint32_t xoptions;3044} nest_save;30453046#define NSF_RESET 0x0001u3047#define NSF_CONDASSERT 0x0002u3048#define NSF_ATOMICSR 0x0004u30493050/* Options that are changeable within the pattern must be tracked during3051parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,3052but all must be tracked so that META_OPTIONS items set the correct values for3053the main compiling phase. */30543055#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \3056PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \3057PCRE2_UNGREEDY)30583059#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \3060PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \3061PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX)30623063/* States used for analyzing ranges in character classes. The two OK values3064must be last. 
*/30653066enum {3067RANGE_NO, /* State after '[' (initial), or '[a-z'; hyphen is literal */3068RANGE_STARTED, /* State after '[1-'; last-emitted code is META_RANGE_XYZ */3069RANGE_FORBID_NO, /* State after '[\d'; '-]' is allowed but not '-1]' */3070RANGE_FORBID_STARTED, /* State after '[\d-'*/3071RANGE_OK_ESCAPED, /* State after '[\1'; hyphen may be a range */3072RANGE_OK_LITERAL /* State after '[1'; hyphen may be a range */3073};30743075/* States used for analyzing operators and operands in extended character3076classes. */30773078enum {3079CLASS_OP_EMPTY, /* At start of an expression; empty previous contents */3080CLASS_OP_OPERAND, /* Have preceding operand; after "z" a "--" can follow */3081CLASS_OP_OPERATOR /* Have preceding operator; after "--" operand must follow */3082};30833084/* States used for determining the parse mode in character classes. The two3085PERL_EXT values must be last. */30863087enum {3088CLASS_MODE_NORMAL, /* Ordinary PCRE2 '[...]' class. */3089CLASS_MODE_ALT_EXT, /* UTS#18-style extended '[...]' class. */3090CLASS_MODE_PERL_EXT, /* Perl extended '(?[...])' class. */3091CLASS_MODE_PERL_EXT_LEAF /* Leaf within extended '(?[ [...] ])' class. */3092};30933094/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates3095the storing of literal values in the main parsed pattern, where they can always3096be quantified. */30973098#if PCRE2_CODE_UNIT_WIDTH == 323099#define PARSED_LITERAL(c, p) \3100{ \3101if (c >= META_END) *p++ = META_BIGVALUE; \3102*p++ = c; \3103okquantifier = TRUE; \3104}3105#else3106#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;3107#endif31083109/* Here's the actual function. 
*/31103111static int parse_regex(PCRE2_SPTR ptr, uint32_t options, uint32_t xoptions,3112BOOL *has_lookbehind, compile_block *cb)3113{3114uint32_t c;3115uint32_t delimiter;3116uint32_t namelen;3117uint32_t class_range_state;3118uint32_t class_op_state;3119uint32_t class_mode_state;3120uint32_t *class_start;3121uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */3122uint32_t *verbstartptr = NULL;3123uint32_t *previous_callout = NULL;3124uint32_t *parsed_pattern = cb->parsed_pattern;3125uint32_t *parsed_pattern_end = cb->parsed_pattern_end;3126uint32_t *this_parsed_item = NULL;3127uint32_t *prev_parsed_item = NULL;3128uint32_t meta_quantifier = 0;3129uint32_t add_after_mark = 0;3130uint16_t nest_depth = 0;3131int16_t class_depth_m1 = -1; /* The m1 means minus 1. */3132int16_t class_maxdepth_m1 = -1;3133uint16_t hash;3134int after_manual_callout = 0;3135int expect_cond_assert = 0;3136int errorcode = 0;3137int escape;3138int i;3139BOOL inescq = FALSE;3140BOOL inverbname = FALSE;3141BOOL utf = (options & PCRE2_UTF) != 0;3142BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;3143BOOL is_dupname;3144BOOL negate_class;3145BOOL okquantifier = FALSE;3146PCRE2_SPTR thisptr;3147PCRE2_SPTR name;3148PCRE2_SPTR ptrend = cb->end_pattern;3149PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */3150PCRE2_SPTR class_range_forbid_ptr = NULL;3151named_group *ng;3152nest_save *top_nest, *end_nests;3153#ifdef PCRE2_DEBUG3154uint32_t *parsed_pattern_check;3155ptrdiff_t parsed_pattern_extra = 0;3156ptrdiff_t parsed_pattern_extra_check = 0;3157PCRE2_SPTR ptr_check;3158#endif31593160PCRE2_ASSERT(parsed_pattern != NULL);31613162/* Insert leading items for word and line matching (features provided for the3163benefit of pcre2grep). 
*/31643165if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)3166{3167*parsed_pattern++ = META_CIRCUMFLEX;3168*parsed_pattern++ = META_NOCAPTURE;3169}3170else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)3171{3172*parsed_pattern++ = META_ESCAPE + ESC_b;3173*parsed_pattern++ = META_NOCAPTURE;3174}31753176#ifdef PCRE2_DEBUG3177parsed_pattern_check = parsed_pattern;3178ptr_check = ptr;3179#endif31803181/* If the pattern is actually a literal string, process it separately to avoid3182cluttering up the main loop. */31833184if ((options & PCRE2_LITERAL) != 0)3185{3186while (ptr < ptrend)3187{3188/* LCOV_EXCL_START */3189if (parsed_pattern >= parsed_pattern_end)3190{3191PCRE2_DEBUG_UNREACHABLE();3192errorcode = ERR63; /* Internal error (parsed pattern overflow) */3193goto FAILED;3194}3195/* LCOV_EXCL_STOP */31963197thisptr = ptr;3198GETCHARINCTEST(c, ptr);3199if (auto_callout)3200parsed_pattern = manage_callouts(thisptr, &previous_callout,3201auto_callout, parsed_pattern, cb);3202PARSED_LITERAL(c, parsed_pattern);3203}3204goto PARSED_END;3205}32063207/* Process a real regex which may contain meta-characters. */32083209top_nest = NULL;3210end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);32113212/* The size of the nest_save structure might not be a factor of the size of the3213workspace. Therefore we must round down end_nests so as to correctly avoid3214creating a nest_save that spans the end of the workspace. 
*/32153216end_nests = (nest_save *)((char *)end_nests -3217((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));32183219/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */32203221if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;32223223/* Now scan the pattern */32243225while (ptr < ptrend)3226{3227int prev_expect_cond_assert;3228uint32_t min_repeat = 0, max_repeat = 0;3229uint32_t set, unset, *optset;3230uint32_t xset, xunset, *xoptset;3231uint32_t terminator;3232uint32_t prev_meta_quantifier;3233BOOL prev_okquantifier;3234PCRE2_SPTR tempptr;3235PCRE2_SIZE offset;32363237if (nest_depth > cb->cx->parens_nest_limit)3238{3239errorcode = ERR19;3240goto FAILED; /* Parentheses too deeply nested */3241}32423243/* Check that we haven't emitted too much into parsed_pattern. We allocate3244a suitably-sized buffer upfront, then do unchecked writes to it. If we only3245write a little bit too much, everything will appear to be OK, because the3246upfront size is an overestimate... but a malicious pattern could end up3247forcing a write past the buffer end. We must catch this during3248development. */32493250#ifdef PCRE2_DEBUG3251/* Strong post-write check. Won't help in release builds - at this point3252the write has already occurred so it's too late. However, should stop us3253committing unsafe code. */3254PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +3255(parsed_pattern_extra - parsed_pattern_extra_check) <=3256max_parsed_pattern(ptr_check, ptr, utf, options));3257parsed_pattern_check = parsed_pattern;3258parsed_pattern_extra_check = parsed_pattern_extra;3259ptr_check = ptr;3260#endif32613262/* LCOV_EXCL_START */3263if (parsed_pattern >= parsed_pattern_end)3264{3265/* Weak pre-write check; only ensures parsed_pattern[0] is writeable3266(but the code below can write many chars). Better than nothing. 
*/3267PCRE2_DEBUG_UNREACHABLE();3268errorcode = ERR63; /* Internal error (parsed pattern overflow) */3269goto FAILED;3270}3271/* LCOV_EXCL_STOP */32723273/* If the last time round this loop something was added, parsed_pattern will3274no longer be equal to this_parsed_item. Remember where the previous item3275started and reset for the next item. Note that sometimes round the loop,3276nothing gets added (e.g. for ignored white space). */32773278if (this_parsed_item != parsed_pattern)3279{3280prev_parsed_item = this_parsed_item;3281this_parsed_item = parsed_pattern;3282}32833284/* Get next input character, save its position for callout handling. */32853286thisptr = ptr;3287GETCHARINCTEST(c, ptr);32883289/* Copy quoted literals until \E, allowing for the possibility of automatic3290callouts, except when processing a (*VERB) "name". */32913292if (inescq)3293{3294if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)3295{3296inescq = FALSE;3297ptr++; /* Skip E */3298}3299else3300{3301if (inverbname)3302{ /* Don't use PARSED_LITERAL() because it */3303#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */3304if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;3305#endif3306*parsed_pattern++ = c;3307}3308else3309{3310if (after_manual_callout-- <= 0)3311parsed_pattern = manage_callouts(thisptr, &previous_callout,3312auto_callout, parsed_pattern, cb);3313PARSED_LITERAL(c, parsed_pattern);3314}3315meta_quantifier = 0;3316}3317continue; /* Next character */3318}33193320/* If we are processing the "name" part of a (*VERB:NAME) item, all3321characters up to the closing parenthesis are literals except when3322PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q3323and \E and escaped characters are allowed (no character types such as \d). If3324PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do3325this by not entering the special (*VERB:NAME) processing - they are then3326picked up below. 
Note that c is a character, not a code unit, so we must not3327use MAX_255 to test its size because MAX_255 tests code units and is assumed3328TRUE in 8-bit mode. */33293330if (inverbname &&3331(3332/* EITHER: not both options set */3333((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=3334(PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||3335#ifdef SUPPORT_UNICODE3336/* OR: character > 255 AND not Unicode Pattern White Space */3337(c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||3338#endif3339/* OR: not a # comment or isspace() white space */3340(c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 03341#ifdef SUPPORT_UNICODE3342/* and not CHAR_NEL when Unicode is supported */3343&& c != CHAR_NEL3344#endif3345)))3346{3347PCRE2_SIZE verbnamelength;33483349switch(c)3350{3351default: /* Don't use PARSED_LITERAL() because it */3352#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */3353if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;3354#endif3355*parsed_pattern++ = c;3356break;33573358case CHAR_RIGHT_PARENTHESIS:3359inverbname = FALSE;3360/* This is the length in characters */3361verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);3362/* But the limit on the length is in code units */3363if (ptr - verbnamestart - 1 > (int)MAX_MARK)3364{3365ptr--;3366errorcode = ERR76;3367goto FAILED;3368}3369*verblengthptr = (uint32_t)verbnamelength;33703371/* If this name was on a verb such as (*ACCEPT) which does not continue,3372a (*MARK) was generated for the name. We now add the original verb as the3373next item. 
*/33743375if (add_after_mark != 0)3376{3377*parsed_pattern++ = add_after_mark;3378add_after_mark = 0;3379}3380break;33813382case CHAR_BACKSLASH:3383if ((options & PCRE2_ALT_VERBNAMES) != 0)3384{3385escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,3386xoptions, cb->bracount, FALSE, cb);3387if (errorcode != 0) goto FAILED;3388}3389else escape = 0; /* Treat all as literal */33903391switch(escape)3392{3393case 0: /* Don't use PARSED_LITERAL() because it */3394#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */3395if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;3396#endif3397*parsed_pattern++ = c;3398break;33993400case ESC_ub:3401*parsed_pattern++ = CHAR_u;3402PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);3403break;34043405case ESC_Q:3406inescq = TRUE;3407break;34083409case ESC_E: /* Ignore */3410break;34113412default:3413errorcode = ERR40; /* Invalid in verb name */3414goto FAILED;3415}3416}3417continue; /* Next character in pattern */3418}34193420/* Not a verb name character. At this point we must process everything that3421must not change the quantification state. This is mainly comments, but we3422handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as3423A+, as in Perl. An isolated \E is ignored. */34243425if (c == CHAR_BACKSLASH && ptr < ptrend)3426{3427if (*ptr == CHAR_Q || *ptr == CHAR_E)3428{3429/* A literal inside a \Q...\E is not allowed if we are expecting a3430conditional assertion, but an empty \Q\E sequence is OK. */3431if (expect_cond_assert > 0 && *ptr == CHAR_Q &&3432!(ptrend - ptr >= 3 && ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E))3433{3434ptr--;3435errorcode = ERR28;3436goto FAILED;3437}3438inescq = *ptr == CHAR_Q;3439ptr++;3440continue;3441}3442}34433444/* Skip over whitespace and # comments in extended mode. Note that c is a3445character, not a code unit, so we must not use MAX_255 to test its size3446because MAX_255 tests code units and is assumed TRUE in 8-bit mode. 
The3447whitespace characters are those designated as "Pattern White Space" by3448Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is3449U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a3450subset of space characters that match \h and \v. */34513452if ((options & PCRE2_EXTENDED) != 0)3453{3454if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;3455#ifdef SUPPORT_UNICODE3456if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;3457#endif3458if (c == CHAR_NUMBER_SIGN)3459{3460while (ptr < ptrend)3461{3462if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */3463{ /* IS_NEWLINE sets cb->nllen. */3464ptr += cb->nllen;3465break;3466}3467ptr++;3468#ifdef SUPPORT_UNICODE3469if (utf) FORWARDCHARTEST(ptr, ptrend);3470#endif3471}3472continue; /* Next character in pattern */3473}3474}34753476/* Skip over bracketed comments */34773478if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&3479ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)3480{3481while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);3482if (ptr >= ptrend)3483{3484errorcode = ERR18; /* A special error for missing ) in a comment */3485goto FAILED; /* to make it easier to debug. */3486}3487ptr++;3488continue; /* Next character in pattern */3489}34903491/* If the next item is not a quantifier, fill in length of any previous3492callout and create an auto callout if required. */34933494if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&3495(c != CHAR_LEFT_CURLY_BRACKET ||3496(tempptr = ptr,3497!read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))3498{3499if (after_manual_callout-- <= 0)3500{3501parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,3502parsed_pattern, cb);3503this_parsed_item = parsed_pattern; /* New start for current item */3504}3505}35063507/* If expect_cond_assert is 2, we have just passed (?( and are expecting an3508assertion, possibly preceded by a callout. 
If the value is 1, we have just3509had the callout and expect an assertion. There must be at least 3 more3510characters in all cases. When expect_cond_assert is 2, we know that the3511current character is an opening parenthesis, as otherwise we wouldn't be3512here. However, when it is 1, we need to check, and it's easiest just to check3513always. Note that expect_cond_assert may be negative, since all callouts just3514decrement it. */35153516if (expect_cond_assert > 0)3517{3518BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&3519(ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);3520if (ok)3521{3522if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */3523{3524ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;3525}3526else switch(ptr[1]) /* Traditional symbolic format */3527{3528case CHAR_C:3529ok = expect_cond_assert == 2;3530break;35313532case CHAR_EQUALS_SIGN:3533case CHAR_EXCLAMATION_MARK:3534break;35353536case CHAR_LESS_THAN_SIGN:3537ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;3538break;35393540default:3541ok = FALSE;3542}3543}35443545if (!ok)3546{3547errorcode = ERR28;3548if (expect_cond_assert == 2) goto FAILED;3549goto FAILED_BACK;3550}3551}35523553/* Remember whether we are expecting a conditional assertion, and set the3554default for this item. */35553556prev_expect_cond_assert = expect_cond_assert;3557expect_cond_assert = 0;35583559/* Remember quantification status for the previous significant item, then set3560default for this item. */35613562prev_okquantifier = okquantifier;3563prev_meta_quantifier = meta_quantifier;3564okquantifier = FALSE;3565meta_quantifier = 0;35663567/* If the previous significant item was a quantifier, adjust the parsed code3568if there is a following modifier. The base meta value is always followed by3569the PLUS and QUERY values, in that order. 
We do this here rather than after3570reading a quantifier so that intervening comments and /x whitespace can be3571ignored without having to replicate code. */35723573if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))3574{3575parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =3576prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?35770x00020000u : 0x00010000u);3578continue; /* Next character in pattern */3579}35803581/* Process the next item in the main part of a pattern. */35823583switch(c)3584{3585default: /* Non-special character */3586PARSED_LITERAL(c, parsed_pattern);3587break;358835893590/* ---- Escape sequence ---- */35913592case CHAR_BACKSLASH:3593tempptr = ptr;3594escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,3595xoptions, cb->bracount, FALSE, cb);3596if (errorcode != 0)3597{3598ESCAPE_FAILED:3599if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)3600goto FAILED;3601ptr = tempptr;3602if (ptr >= ptrend) c = CHAR_BACKSLASH; else3603{3604GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */3605}3606escape = 0; /* Treat as literal character */3607}36083609/* The escape was a data escape or literal character. */36103611if (escape == 0)3612{3613PARSED_LITERAL(c, parsed_pattern);3614}36153616/* The escape was a back (or forward) reference. We keep the offset in3617order to give a more useful diagnostic for a bad forward reference. For3618references to groups numbered less than 10 we can't use more than two items3619in parsed_pattern because they may be just two characters in the input (and3620in a 64-bit world an offset may need two elements). So for them, the offset3621of the first occurrent is held in a special vector. 
*/36223623else if (escape < 0)3624{3625offset = (PCRE2_SIZE)(ptr - cb->start_pattern);3626escape = -escape - 1;3627*parsed_pattern++ = META_BACKREF | (uint32_t)escape;3628if (escape < 10)3629{3630if (cb->small_ref_offset[escape] == PCRE2_UNSET)3631cb->small_ref_offset[escape] = offset;3632}3633else3634{3635PUTOFFSET(offset, parsed_pattern);3636}3637okquantifier = TRUE;3638}36393640/* The escape was a character class such as \d etc. or other special3641escape indicator such as \A or \X. Most of them generate just a single3642parsed item, but \P and \p are followed by a 16-bit type and a 16-bit3643value. They are supported only when Unicode is available. The type and3644value are packed into a single 32-bit value so that the whole sequences3645uses only two elements in the parsed_vector. This is because the same3646coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is3647set.36483649There are also some cases where the escape sequence is followed by a name:3650\k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>3651and \g'name' are subroutine calls by name; \g{name} is a synonym for3652\k{name}. Note that \g<number> and \g'number' are handled by check_escape()3653and returned as a negative value (handled above). A name is coded as an3654offset into the pattern and a length. */36553656else switch (escape)3657{3658case ESC_C:3659#ifdef NEVER_BACKSLASH_C3660errorcode = ERR85;3661goto ESCAPE_FAILED;3662#else3663if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)3664{3665errorcode = ERR83;3666goto ESCAPE_FAILED;3667}3668#endif3669okquantifier = TRUE;3670*parsed_pattern++ = META_ESCAPE + escape;3671break;36723673/* This is a special return that happens only in EXTRA_ALT_BSUX mode,3674when \u{ is not followed by hex digits and }. It requests two literal3675characters, u and { and we need this, as otherwise \u{ 12} (for example)3676would be treated as u{12} now that spaces are allowed in quantifiers. 
*/36773678case ESC_ub:3679*parsed_pattern++ = CHAR_u;3680PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);3681break;36823683case ESC_X:3684#ifndef SUPPORT_UNICODE3685errorcode = ERR45; /* Supported only with Unicode support */3686goto ESCAPE_FAILED;3687#endif3688case ESC_H:3689case ESC_h:3690case ESC_N:3691case ESC_R:3692case ESC_V:3693case ESC_v:3694okquantifier = TRUE;3695*parsed_pattern++ = META_ESCAPE + escape;3696break;36973698default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */3699*parsed_pattern++ = META_ESCAPE + escape;3700break;37013702/* Escapes that may change in UCP mode. */37033704case ESC_d:3705case ESC_D:3706case ESC_s:3707case ESC_S:3708case ESC_w:3709case ESC_W:3710okquantifier = TRUE;3711parsed_pattern = handle_escdsw(escape, parsed_pattern, options,3712xoptions);3713break;37143715/* Unicode property matching */37163717case ESC_P:3718case ESC_p:3719#ifdef SUPPORT_UNICODE3720{3721BOOL negated;3722uint16_t ptype = 0, pdata = 0;3723if (!get_ucp(&ptr, utf, &negated, &ptype, &pdata, &errorcode, cb))3724goto ESCAPE_FAILED;3725if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;3726*parsed_pattern++ = META_ESCAPE + escape;3727*parsed_pattern++ = (ptype << 16) | pdata;3728okquantifier = TRUE;3729}3730#else3731errorcode = ERR45;3732goto ESCAPE_FAILED;3733#endif3734break; /* End \P and \p */37353736/* When \g is used with quotes or angle brackets as delimiters, it is a3737numerical or named subroutine call, and control comes here. When used3738with brace delimiters it is a numerical back reference and does not come3739here because check_escape() returns it directly as a reference. \k is3740always a named back reference. */37413742case ESC_g:3743case ESC_k:3744if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&3745*ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))3746{3747errorcode = (escape == ESC_g)? 
ERR57 : ERR69;3748goto ESCAPE_FAILED;3749}3750terminator = (*ptr == CHAR_LESS_THAN_SIGN)?3751CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?3752CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;37533754/* For a non-braced \g, check for a numerical recursion. */37553756if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)3757{3758PCRE2_SPTR p = ptr + 1;37593760if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,3761&errorcode))3762{3763if (p >= ptrend || *p != terminator)3764{3765ptr = p;3766errorcode = ERR119; /* Missing terminator for number */3767goto ESCAPE_FAILED;3768}3769ptr = p + 1;3770goto SET_RECURSION;3771}3772if (errorcode != 0) goto ESCAPE_FAILED;3773}37743775/* Not a numerical recursion. Perl allows spaces and tabs after { and3776before } but not for other delimiters. */37773778if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,3779&errorcode, cb)) goto ESCAPE_FAILED;37803781/* \k and \g when used with braces are back references, whereas \g used3782with quotes or angle brackets is a recursion */37833784*parsed_pattern++ =3785(escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?3786META_BACKREF_BYNAME : META_RECURSE_BYNAME;3787*parsed_pattern++ = namelen;37883789PUTOFFSET(offset, parsed_pattern);3790okquantifier = TRUE;3791break; /* End special escape processing */3792}3793break; /* End escape sequence processing */379437953796/* ---- Single-character special items ---- */37973798case CHAR_CIRCUMFLEX_ACCENT:3799*parsed_pattern++ = META_CIRCUMFLEX;3800break;38013802case CHAR_DOLLAR_SIGN:3803*parsed_pattern++ = META_DOLLAR;3804break;38053806case CHAR_DOT:3807*parsed_pattern++ = META_DOT;3808okquantifier = TRUE;3809break;381038113812/* ---- Single-character quantifiers ---- */38133814case CHAR_ASTERISK:3815meta_quantifier = META_ASTERISK;3816goto CHECK_QUANTIFIER;38173818case CHAR_PLUS:3819meta_quantifier = META_PLUS;3820goto CHECK_QUANTIFIER;38213822case CHAR_QUESTION_MARK:3823meta_quantifier = 
META_QUERY;3824goto CHECK_QUANTIFIER;382538263827/* ---- Potential {n,m} quantifier ---- */38283829case CHAR_LEFT_CURLY_BRACKET:3830if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,3831&errorcode))3832{3833if (errorcode != 0) goto FAILED; /* Error in quantifier. */3834PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */3835break; /* No more quantifier processing */3836}3837meta_quantifier = META_MINMAX;3838/* Fall through */383938403841/* ---- Quantifier post-processing ---- */38423843/* Check that a quantifier is allowed after the previous item. This3844guarantees that there is a previous item. */38453846CHECK_QUANTIFIER:3847if (!prev_okquantifier)3848{3849errorcode = ERR9;3850goto FAILED;3851}38523853/* Most (*VERB)s are not allowed to be quantified, but an ungreedy3854quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a3855sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by3856wrapping it in non-capturing brackets, but we have to allow for a preceding3857(*MARK) for when (*ACCEPT) has an argument. */38583859if (*prev_parsed_item == META_ACCEPT)3860{3861uint32_t *p;3862for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];3863*verbstartptr = META_NOCAPTURE;3864parsed_pattern[1] = META_KET;3865parsed_pattern += 2;38663867#ifdef PCRE2_DEBUG3868PCRE2_ASSERT(parsed_pattern_extra >= 2);3869parsed_pattern_extra -= 2;3870#endif3871}38723873/* Now we can put the quantifier into the parsed pattern vector. At this3874stage, we have only the basic quantifier. The check for a following + or ?3875modifier happens at the top of the loop, after any intervening comments3876have been removed. 
*/38773878*parsed_pattern++ = meta_quantifier;3879if (c == CHAR_LEFT_CURLY_BRACKET)3880{3881*parsed_pattern++ = min_repeat;3882*parsed_pattern++ = max_repeat;3883}3884break;388538863887/* ---- Character class ---- */38883889case CHAR_LEFT_SQUARE_BRACKET:38903891/* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is3892used for "start of word" and "end of word". As these are otherwise illegal3893sequences, we don't break anything by recognizing them. They are replaced3894by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are3895erroneous and are handled by the normal code below. */38963897if (ptrend - ptr >= 6 &&3898(PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||3899PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))3900{3901*parsed_pattern++ = META_ESCAPE + ESC_b;39023903if (ptr[2] == CHAR_LESS_THAN_SIGN)3904{3905*parsed_pattern++ = META_LOOKAHEAD;3906}3907else3908{3909*parsed_pattern++ = META_LOOKBEHIND;3910*has_lookbehind = TRUE;39113912/* The offset is used only for the "non-fixed length" error; this won't3913occur here, so just store zero. */39143915PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);3916}39173918if ((options & PCRE2_UCP) == 0)3919*parsed_pattern++ = META_ESCAPE + ESC_w;3920else3921{3922*parsed_pattern++ = META_ESCAPE + ESC_p;3923*parsed_pattern++ = PT_WORD << 16;3924}3925*parsed_pattern++ = META_KET;3926ptr += 6;3927okquantifier = TRUE;3928break;3929}39303931/* PCRE supports POSIX class stuff inside a class. Perl gives an error if3932they are encountered at the top level, so we'll do that too. */39333934if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||3935*ptr == CHAR_EQUALS_SIGN) &&3936check_posix_syntax(ptr, ptrend, &tempptr))3937{3938errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;3939ptr = tempptr + 2;3940goto FAILED;3941}39423943class_mode_state = ((options & PCRE2_ALT_EXTENDED_CLASS) != 0)?3944CLASS_MODE_ALT_EXT : CLASS_MODE_NORMAL;39453946/* Jump here from '(?[...])'. 
That jump must initialize class_mode_state,3947set c to the '[' character, and ptr to just after the '['. */39483949FROM_PERL_EXTENDED_CLASS:3950okquantifier = TRUE;39513952/* In an EBCDIC environment, Perl treats alphabetic ranges specially3953because there are holes in the encoding, and simply using the range A-Z3954(for example) would include the characters in the holes. This applies only3955to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]3956in this respect. In order to accommodate this, we keep track of whether3957character values are literal or not, and a state variable for handling3958ranges. */39593960/* Loop for the contents of the class. Classes may be nested, if3961PCRE2_ALT_EXTENDED_CLASS is set, or the class is of the form (?[...]). */39623963/* c is still set to '[' so the loop will handle the start of the class. */39643965class_depth_m1 = -1;3966class_maxdepth_m1 = -1;3967class_range_state = RANGE_NO;3968class_op_state = CLASS_OP_EMPTY;3969class_start = NULL;39703971for (;;)3972{3973BOOL char_is_literal = TRUE;39743975/* Inside \Q...\E everything is literal except \E */39763977if (inescq)3978{3979if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)3980{3981inescq = FALSE; /* Reset literal state */3982ptr++; /* Skip the 'E' */3983goto CLASS_CONTINUE;3984}39853986/* Surprisingly, you cannot use \Q..\E to escape a character inside a3987Perl extended class. However, empty \Q\E sequences are allowed, so here3988were're only giving an error if the \Q..\E is non-empty. */39893990if (class_mode_state == CLASS_MODE_PERL_EXT)3991{3992errorcode = ERR116;3993goto FAILED;3994}39953996goto CLASS_LITERAL;3997}39983999/* Skip over space and tab (only) in extended-more mode, or anywhere4000inside a Perl extended class (which implies /xx). */40014002if ((c == CHAR_SPACE || c == CHAR_HT) &&4003((options & PCRE2_EXTENDED_MORE) != 0 ||4004class_mode_state >= CLASS_MODE_PERL_EXT))4005goto CLASS_CONTINUE;40064007/* Handle POSIX class names. 
Perl allows a negation extension of the4008form [:^name:]. A square bracket that doesn't match the syntax is4009treated as a literal. We also recognize the POSIX constructions4010[.ch.] and [=ch=] ("collating elements") and fault them, as Perl40115.6 and 5.8 do. */40124013if (class_depth_m1 >= 0 &&4014c == CHAR_LEFT_SQUARE_BRACKET &&4015ptrend - ptr >= 3 &&4016(*ptr == CHAR_COLON || *ptr == CHAR_DOT ||4017*ptr == CHAR_EQUALS_SIGN) &&4018check_posix_syntax(ptr, ptrend, &tempptr))4019{4020BOOL posix_negate = FALSE;4021int posix_class;40224023/* Perl treats a hyphen before a POSIX class as a literal, not the4024start of a range. However, it gives a warning in its warning mode. PCRE4025does not have a warning mode, so we give an error, because this is4026likely an error on the user's part. */40274028if (class_range_state == RANGE_STARTED)4029{4030ptr = tempptr + 2;4031errorcode = ERR50;4032goto FAILED;4033}40344035/* Perl treats a hyphen after a POSIX class as a literal, not the4036start of a range. However, it gives a warning in its warning mode4037unless the hyphen is the last character in the class. PCRE does not4038have a warning mode, so we give an error, because this is likely an4039error on the user's part.40404041Roll back to the hyphen for the error position. */40424043if (class_range_state == RANGE_FORBID_STARTED)4044{4045ptr = class_range_forbid_ptr;4046errorcode = ERR50;4047goto FAILED;4048}40494050/* Disallow implicit union in Perl extended classes. 
*/40514052if (class_op_state == CLASS_OP_OPERAND &&4053class_mode_state == CLASS_MODE_PERL_EXT)4054{4055ptr = tempptr + 2;4056errorcode = ERR113;4057goto FAILED;4058}40594060if (*ptr != CHAR_COLON)4061{4062ptr = tempptr + 2;4063errorcode = ERR13;4064goto FAILED;4065}40664067if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)4068{4069posix_negate = TRUE;4070ptr++;4071}40724073posix_class = check_posix_name(ptr, (int)(tempptr - ptr));4074ptr = tempptr + 2;4075if (posix_class < 0)4076{4077errorcode = ERR30;4078goto FAILED;4079}40804081/* Set "a hyphen is forbidden to be the start of a range". For the '-]'4082case, the hyphen is treated as a literal, but for '-1' it is disallowed4083(because it would be interpreted as range). */40844085class_range_state = RANGE_FORBID_NO;4086class_op_state = CLASS_OP_OPERAND;40874088/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some4089of the POSIX classes are converted to use Unicode properties \p or \P4090or, in one case, \h or \H. The substitutes table has two values per4091class, containing the type and value of a \p or \P item. The special4092cases are specified with a negative type: a non-zero value causes \h or4093\H to be used, and a zero value falls through to behave like a non-UCP4094POSIX class. There are now also some extra options that force ASCII for4095some classes. */40964097#ifdef SUPPORT_UNICODE4098if ((options & PCRE2_UCP) != 0 &&4099(xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&4100!((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&4101(posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))4102{4103int ptype = posix_substitutes[2*posix_class];4104int pvalue = posix_substitutes[2*posix_class + 1];41054106if (ptype >= 0)4107{4108*parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);4109*parsed_pattern++ = (ptype << 16) | pvalue;4110goto CLASS_CONTINUE;4111}41124113if (pvalue != 0)4114{4115*parsed_pattern++ = META_ESCAPE + (posix_negate? 
ESC_H : ESC_h);4116goto CLASS_CONTINUE;4117}41184119/* Fall through */4120}4121#endif /* SUPPORT_UNICODE */41224123/* Non-UCP POSIX class */41244125*parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;4126*parsed_pattern++ = posix_class;4127}41284129/* Check for the start of the outermost class, or the start of a nested class. */41304131else if ((c == CHAR_LEFT_SQUARE_BRACKET &&4132(class_depth_m1 < 0 || class_mode_state == CLASS_MODE_ALT_EXT ||4133class_mode_state == CLASS_MODE_PERL_EXT)) ||4134(c == CHAR_LEFT_PARENTHESIS &&4135class_mode_state == CLASS_MODE_PERL_EXT))4136{4137uint32_t start_c = c;4138uint32_t new_class_mode_state;41394140/* Update the class mode, if moving into a 'leaf' inside a Perl extended4141class. */41424143if (start_c == CHAR_LEFT_SQUARE_BRACKET &&4144class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 >= 0)4145new_class_mode_state = CLASS_MODE_PERL_EXT_LEAF;4146else4147new_class_mode_state = class_mode_state;41484149/* Tidy up the other class before starting the nested class. */4150/* -[ beginning a nested class is a literal '-' */41514152if (class_range_state == RANGE_STARTED)4153parsed_pattern[-1] = CHAR_MINUS;41544155/* Disallow implicit union in Perl extended classes. */41564157if (class_op_state == CLASS_OP_OPERAND &&4158class_mode_state == CLASS_MODE_PERL_EXT)4159{4160errorcode = ERR113;4161goto FAILED;4162}41634164/* Validate nesting depth */4165if (class_depth_m1 >= ECLASS_NEST_LIMIT - 1)4166{4167ptr--; /* Point rightwards at the paren, same as ERR19. */4168errorcode = ERR107; /* Classes too deeply nested */4169goto FAILED;4170}41714172/* Process the character class start. If the first character is '^', set4173the negation flag. If the first few characters (either before or after ^)4174are \Q\E or \E or space or tab in extended-more mode, we skip them too.4175This makes for compatibility with Perl. 
*/41764177negate_class = FALSE;4178for (;;)4179{4180if (ptr >= ptrend)4181{4182if (start_c == CHAR_LEFT_PARENTHESIS)4183errorcode = ERR14; /* Missing terminating ')' */4184else4185errorcode = ERR6; /* Missing terminating ']' */4186goto FAILED;4187}41884189GETCHARINCTEST(c, ptr);4190if (new_class_mode_state == CLASS_MODE_PERL_EXT) break;4191else if (c == CHAR_BACKSLASH)4192{4193if (ptr < ptrend && *ptr == CHAR_E) ptr++;4194else if (ptrend - ptr >= 3 &&4195PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)4196ptr += 3;4197else4198break;4199}4200else if ((c == CHAR_SPACE || c == CHAR_HT) && /* Note: just these two */4201((options & PCRE2_EXTENDED_MORE) != 0 ||4202new_class_mode_state >= CLASS_MODE_PERL_EXT))4203continue;4204else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)4205negate_class = TRUE;4206else break;4207}42084209/* Now the real contents of the class; c has the first "real" character.4210Empty classes are permitted only if the option is set, and if it's not4211a Perl-extended class. */42124213if (c == CHAR_RIGHT_SQUARE_BRACKET &&4214(cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0 &&4215new_class_mode_state < CLASS_MODE_PERL_EXT)4216{4217PCRE2_ASSERT(start_c == CHAR_LEFT_SQUARE_BRACKET);42184219if (class_start != NULL)4220{4221PCRE2_ASSERT(class_depth_m1 >= 0);4222/* Represents that the class is an extended class. */4223*class_start |= CLASS_IS_ECLASS;4224class_start = NULL;4225}42264227*parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;42284229/* Leave nesting depth unchanged; but check for zero depth to handle the4230very first (top-level) class being empty. */4231if (class_depth_m1 < 0) break;42324233class_range_state = RANGE_NO; /* for processing the containing class */4234class_op_state = CLASS_OP_OPERAND;4235goto CLASS_CONTINUE;4236}42374238/* Enter a non-empty class. */42394240if (class_start != NULL)4241{4242PCRE2_ASSERT(class_depth_m1 >= 0);4243/* Represents that the class is an extended class. 
*/4244*class_start |= CLASS_IS_ECLASS;4245class_start = NULL;4246}42474248class_start = parsed_pattern;4249*parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;4250class_range_state = RANGE_NO;4251class_op_state = CLASS_OP_EMPTY;4252class_mode_state = new_class_mode_state;4253++class_depth_m1;4254if (class_maxdepth_m1 < class_depth_m1)4255class_maxdepth_m1 = class_depth_m1;4256/* Reset; no op seen yet at new depth. */4257cb->class_op_used[class_depth_m1] = 0;42584259/* Implement the special start-of-class literal meaning of ']'. */4260if (c == CHAR_RIGHT_SQUARE_BRACKET &&4261new_class_mode_state != CLASS_MODE_PERL_EXT)4262{4263class_range_state = RANGE_OK_LITERAL;4264class_op_state = CLASS_OP_OPERAND;4265PARSED_LITERAL(c, parsed_pattern);4266goto CLASS_CONTINUE;4267}42684269continue; /* We have already loaded c with the next character */4270}42714272/* Check for the end of the class. */42734274else if (c == CHAR_RIGHT_SQUARE_BRACKET ||4275(c == CHAR_RIGHT_PARENTHESIS && class_mode_state == CLASS_MODE_PERL_EXT))4276{4277/* In Perl extended mode, the ']' can only be used to match the4278opening '[', and ')' must match an opening parenthesis. */4279if (class_mode_state == CLASS_MODE_PERL_EXT)4280{4281if (c == CHAR_RIGHT_SQUARE_BRACKET && class_depth_m1 != 0)4282{4283errorcode = ERR14;4284ptr--; /* Correct the offset */4285goto FAILED;4286}4287if (c == CHAR_RIGHT_PARENTHESIS && class_depth_m1 < 1)4288{4289errorcode = ERR22;4290goto FAILED;4291}4292}42934294/* Check no trailing operator. */4295if (class_op_state == CLASS_OP_OPERATOR)4296{4297errorcode = ERR110;4298goto FAILED;4299}43004301/* Check no empty expression for Perl extended expressions. 
*/4302if (class_mode_state == CLASS_MODE_PERL_EXT &&4303class_op_state == CLASS_OP_EMPTY)4304{4305errorcode = ERR114;4306goto FAILED;4307}43084309/* -] at the end of a class is a literal '-' */4310if (class_range_state == RANGE_STARTED)4311parsed_pattern[-1] = CHAR_MINUS;43124313*parsed_pattern++ = META_CLASS_END;43144315if (--class_depth_m1 < 0)4316{4317/* Check for and consume ')' after '(?[...]'. */4318PCRE2_ASSERT(class_mode_state != CLASS_MODE_PERL_EXT_LEAF);4319if (class_mode_state == CLASS_MODE_PERL_EXT)4320{4321if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)4322{4323errorcode = ERR115;4324goto FAILED;4325}43264327ptr++;4328}43294330break;4331}43324333class_range_state = RANGE_NO; /* for processing the containing class */4334class_op_state = CLASS_OP_OPERAND;4335if (class_mode_state == CLASS_MODE_PERL_EXT_LEAF)4336class_mode_state = CLASS_MODE_PERL_EXT;4337/* The extended class flag has already4338been set for the parent class. */4339class_start = NULL;4340}43414342/* Handle a Perl set binary operator */43434344else if (class_mode_state == CLASS_MODE_PERL_EXT &&4345(c == CHAR_PLUS || c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||4346c == CHAR_AMPERSAND || c == CHAR_CIRCUMFLEX_ACCENT))4347{4348/* Check that there was a preceding operand. */4349if (class_op_state != CLASS_OP_OPERAND)4350{4351errorcode = ERR109;4352goto FAILED;4353}43544355if (class_start != NULL)4356{4357PCRE2_ASSERT(class_depth_m1 >= 0);4358/* Represents that the class is an extended class. */4359*class_start |= CLASS_IS_ECLASS;4360class_start = NULL;4361}43624363PCRE2_ASSERT(class_range_state != RANGE_STARTED &&4364class_range_state != RANGE_FORBID_STARTED);43654366*parsed_pattern++ = c == CHAR_PLUS? META_ECLASS_OR :4367c == CHAR_VERTICAL_LINE? META_ECLASS_OR :4368c == CHAR_MINUS? META_ECLASS_SUB :4369c == CHAR_AMPERSAND? 
META_ECLASS_AND :4370META_ECLASS_XOR;4371class_range_state = RANGE_NO;4372class_op_state = CLASS_OP_OPERATOR;4373}43744375/* Handle a Perl set unary operator */43764377else if (class_mode_state == CLASS_MODE_PERL_EXT &&4378c == CHAR_EXCLAMATION_MARK)4379{4380/* Check that the "!" has not got a preceding operand (i.e. it's the4381start of the class, or follows an operator). */4382if (class_op_state == CLASS_OP_OPERAND)4383{4384errorcode = ERR113;4385goto FAILED;4386}43874388if (class_start != NULL)4389{4390PCRE2_ASSERT(class_depth_m1 >= 0);4391/* Represents that the class is an extended class. */4392*class_start |= CLASS_IS_ECLASS;4393class_start = NULL;4394}43954396PCRE2_ASSERT(class_range_state != RANGE_STARTED &&4397class_range_state != RANGE_FORBID_STARTED);43984399*parsed_pattern++ = META_ECLASS_NOT;4400class_range_state = RANGE_NO;4401class_op_state = CLASS_OP_OPERATOR;4402}44034404/* Handle a UTS#18 set operator */44054406else if (class_mode_state == CLASS_MODE_ALT_EXT &&4407(c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||4408c == CHAR_AMPERSAND || c == CHAR_TILDE) &&4409ptr < ptrend && *ptr == c)4410{4411++ptr;44124413/* Check there isn't a triple-repetition. */4414if (ptr < ptrend && *ptr == c)4415{4416while (ptr < ptrend && *ptr == c) ++ptr; /* Improve error offset. */4417errorcode = ERR108;4418goto FAILED;4419}44204421/* Check for a preceding operand. */4422if (class_op_state != CLASS_OP_OPERAND)4423{4424errorcode = ERR109;4425goto FAILED;4426}44274428/* Check for mixed precedence. Forbid [A--B&&C]. */4429if (cb->class_op_used[class_depth_m1] != 0 &&4430cb->class_op_used[class_depth_m1] != (uint8_t)c)4431{4432errorcode = ERR111;4433goto FAILED;4434}44354436if (class_start != NULL)4437{4438PCRE2_ASSERT(class_depth_m1 >= 0);4439/* Represents that the class is an extended class. 
*/4440*class_start |= CLASS_IS_ECLASS;4441class_start = NULL;4442}44434444/* Dangling '-' before an operator is a literal */4445if (class_range_state == RANGE_STARTED)4446parsed_pattern[-1] = CHAR_MINUS;44474448*parsed_pattern++ = c == CHAR_VERTICAL_LINE? META_ECLASS_OR :4449c == CHAR_MINUS? META_ECLASS_SUB :4450c == CHAR_AMPERSAND? META_ECLASS_AND :4451META_ECLASS_XOR;4452class_range_state = RANGE_NO;4453class_op_state = CLASS_OP_OPERATOR;4454cb->class_op_used[class_depth_m1] = (uint8_t)c;4455}44564457/* Handle escapes in a class */44584459else if (c == CHAR_BACKSLASH)4460{4461tempptr = ptr;4462escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,4463xoptions, cb->bracount, TRUE, cb);44644465if (errorcode != 0)4466{4467if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0 ||4468class_mode_state >= CLASS_MODE_PERL_EXT)4469goto FAILED;4470ptr = tempptr;4471if (ptr >= ptrend) c = CHAR_BACKSLASH; else4472{4473GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */4474}4475escape = 0; /* Treat as literal character */4476}44774478switch(escape)4479{4480case 0: /* Escaped character code point is in c */4481char_is_literal = FALSE;4482goto CLASS_LITERAL; /* (a few lines above) */44834484case ESC_b:4485c = CHAR_BS; /* \b is backspace in a class */4486char_is_literal = FALSE;4487goto CLASS_LITERAL;44884489case ESC_k:4490c = CHAR_k; /* \k is not special in a class, just like \g */4491char_is_literal = FALSE;4492goto CLASS_LITERAL;44934494case ESC_Q:4495inescq = TRUE; /* Enter literal mode */4496goto CLASS_CONTINUE;44974498case ESC_E: /* Ignore orphan \E */4499goto CLASS_CONTINUE;45004501case ESC_B: /* Always an error in a class */4502case ESC_R:4503case ESC_X:4504errorcode = ERR7;4505goto FAILED;45064507case ESC_N: /* Not permitted by Perl either */4508errorcode = ERR71;4509goto FAILED;45104511case ESC_H:4512case ESC_h:4513case ESC_V:4514case ESC_v:4515*parsed_pattern++ = META_ESCAPE + escape;4516break;45174518/* These escapes may be converted to 
Unicode property tests when4519PCRE2_UCP is set. */45204521case ESC_d:4522case ESC_D:4523case ESC_s:4524case ESC_S:4525case ESC_w:4526case ESC_W:4527parsed_pattern = handle_escdsw(escape, parsed_pattern, options,4528xoptions);4529break;45304531/* Explicit Unicode property matching */45324533case ESC_P:4534case ESC_p:4535#ifdef SUPPORT_UNICODE4536{4537BOOL negated;4538uint16_t ptype = 0, pdata = 0;4539if (!get_ucp(&ptr, utf, &negated, &ptype, &pdata, &errorcode, cb))4540goto FAILED;45414542/* In caseless matching, particular characteristics Lu, Ll, and Lt4543get converted to the general characteristic L&. That is, upper,4544lower, and title case letters are all conflated. */45454546if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&4547(pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))4548{4549ptype = PT_LAMP;4550pdata = 0;4551}45524553if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;4554*parsed_pattern++ = META_ESCAPE + escape;4555*parsed_pattern++ = (ptype << 16) | pdata;4556}4557#else4558errorcode = ERR45;4559goto FAILED;4560#endif4561break; /* End \P and \p */45624563/* All others are not allowed in a class */45644565/* LCOV_EXCL_START */4566default:4567PCRE2_DEBUG_UNREACHABLE();4568PCRE2_FALLTHROUGH /* Fall through */4569/* LCOV_EXCL_STOP */45704571case ESC_A:4572case ESC_Z:4573case ESC_z:4574case ESC_G:4575case ESC_K:4576case ESC_C:4577errorcode = ERR7;4578goto FAILED;4579}45804581/* All the switch-cases above which end in "break" describe a set4582of characters. None may start a range. */45834584/* The second part of a range can be a single-character escape4585sequence (detected above), but not any of the other escapes. Perl4586treats a hyphen as a literal in such circumstances. However, in Perl's4587warning mode, a warning is given, so PCRE now faults it, as it is4588almost certainly a mistake on the user's part. 
*/45894590if (class_range_state == RANGE_STARTED)4591{4592errorcode = ERR50;4593goto FAILED;4594}45954596/* Perl gives a warning unless the hyphen following a multi-character4597escape is the last character in the class. PCRE throws an error. */45984599if (class_range_state == RANGE_FORBID_STARTED)4600{4601ptr = class_range_forbid_ptr;4602errorcode = ERR50;4603goto FAILED;4604}46054606/* Disallow implicit union in Perl extended classes. */46074608if (class_op_state == CLASS_OP_OPERAND &&4609class_mode_state == CLASS_MODE_PERL_EXT)4610{4611errorcode = ERR113;4612goto FAILED;4613}46144615class_range_state = RANGE_FORBID_NO;4616class_op_state = CLASS_OP_OPERAND;4617}46184619/* Forbid unescaped literals, and the special meaning of '-', inside a4620Perl extended class. */46214622else if (class_mode_state == CLASS_MODE_PERL_EXT)4623{4624errorcode = ERR116;4625goto FAILED;4626}46274628/* Handle potential start of range */46294630else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)4631{4632*parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?4633META_RANGE_LITERAL : META_RANGE_ESCAPED;4634class_range_state = RANGE_STARTED;4635}46364637/* Handle forbidden start of range */46384639else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)4640{4641*parsed_pattern++ = CHAR_MINUS;4642class_range_state = RANGE_FORBID_STARTED;4643class_range_forbid_ptr = ptr;4644}46454646/* Handle a literal character */46474648else4649{4650CLASS_LITERAL:46514652/* Disallow implicit union in Perl extended classes. 
*/46534654if (class_op_state == CLASS_OP_OPERAND &&4655class_mode_state == CLASS_MODE_PERL_EXT)4656{4657errorcode = ERR113;4658goto FAILED;4659}46604661if (class_range_state == RANGE_STARTED)4662{4663if (c == parsed_pattern[-2]) /* Optimize one-char range */4664parsed_pattern--;4665else if (parsed_pattern[-2] > c) /* Check range is in order */4666{4667errorcode = ERR8;4668goto FAILED;4669}4670else4671{4672if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)4673parsed_pattern[-1] = META_RANGE_ESCAPED;4674PARSED_LITERAL(c, parsed_pattern);4675}4676class_range_state = RANGE_NO;4677class_op_state = CLASS_OP_OPERAND;4678}4679else if (class_range_state == RANGE_FORBID_STARTED)4680{4681ptr = class_range_forbid_ptr;4682errorcode = ERR50;4683goto FAILED;4684}4685else /* Potential start of range */4686{4687class_range_state = char_is_literal?4688RANGE_OK_LITERAL : RANGE_OK_ESCAPED;4689class_op_state = CLASS_OP_OPERAND;4690PARSED_LITERAL(c, parsed_pattern);4691}4692}46934694/* Proceed to next thing in the class. */46954696CLASS_CONTINUE:4697if (ptr >= ptrend)4698{4699if (class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 > 0)4700errorcode = ERR14; /* Missing terminating ')' */4701if (class_mode_state == CLASS_MODE_ALT_EXT &&4702class_depth_m1 == 0 && class_maxdepth_m1 == 1)4703errorcode = ERR112; /* Missing terminating ']', but we saw '[ [ ]...' */4704else4705errorcode = ERR6; /* Missing terminating ']' */4706goto FAILED;4707}4708GETCHARINCTEST(c, ptr);4709} /* End of class-processing loop */47104711break; /* End of character class */471247134714/* ---- Opening parenthesis ---- */47154716case CHAR_LEFT_PARENTHESIS:4717if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;47184719/* If ( is not followed by ? it is either a capture or a special verb or an4720alpha assertion or a positive non-atomic lookahead. */47214722if (*ptr != CHAR_QUESTION_MARK)4723{4724const char *vn;47254726/* Handle capturing brackets (or non-capturing if auto-capture is turned4727off). 
*/47284729if (*ptr != CHAR_ASTERISK)4730{4731nest_depth++;4732if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)4733{4734if (cb->bracount >= MAX_GROUP_NUMBER)4735{4736errorcode = ERR97;4737goto FAILED;4738}4739cb->bracount++;4740*parsed_pattern++ = META_CAPTURE | cb->bracount;4741}4742else *parsed_pattern++ = META_NOCAPTURE;4743}47444745/* Do nothing for (* followed by end of pattern or ) so it gives a "bad4746quantifier" error rather than "(*MARK) must have an argument". */47474748else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)4749break;47504751/* Handle "alpha assertions" such as (*pla:...). Most of these are4752synonyms for the historical symbolic assertions, but the script run and4753non-atomic lookaround ones are new. They are distinguished by starting4754with a lower case letter. Checking both ends of the alphabet makes this4755work in all character codes. */47564757else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)4758{4759uint32_t meta;47604761vn = alasnames;4762if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,4763&errorcode, cb)) goto FAILED;4764if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;4765if (*ptr != CHAR_COLON)4766{4767errorcode = ERR95; /* Malformed */4768goto FAILED_FORWARD;4769}47704771/* Scan the table of alpha assertion names */47724773for (i = 0; i < alascount; i++)4774{4775if (namelen == alasmeta[i].len &&4776PRIV(strncmp_c8)(name, vn, namelen) == 0)4777break;4778vn += alasmeta[i].len + 1;4779}47804781if (i >= alascount)4782{4783errorcode = ERR95; /* Alpha assertion not recognized */4784goto FAILED;4785}47864787/* Check for expecting an assertion condition. If so, only atomic4788lookaround assertions are valid. 
*/47894790meta = alasmeta[i].meta;4791if (prev_expect_cond_assert > 0 &&4792(meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))4793{4794errorcode = ERR28; /* Atomic assertion expected */4795goto FAILED;4796}47974798/* The lookaround alphabetic synonyms can mostly be handled by jumping4799to the code that handles the traditional symbolic forms. */48004801switch(meta)4802{4803/* LCOV_EXCL_START */4804default:4805PCRE2_DEBUG_UNREACHABLE();4806errorcode = ERR89; /* Unknown code; should never occur because */4807goto FAILED; /* the meta values come from a table above. */4808/* LCOV_EXCL_STOP */48094810case META_ATOMIC:4811goto ATOMIC_GROUP;48124813case META_LOOKAHEAD:4814goto POSITIVE_LOOK_AHEAD;48154816case META_LOOKAHEAD_NA:4817goto POSITIVE_NONATOMIC_LOOK_AHEAD;48184819case META_LOOKAHEADNOT:4820goto NEGATIVE_LOOK_AHEAD;48214822case META_SCS:4823ptr++;4824*parsed_pattern++ = META_SCS;48254826parsed_pattern = parse_capture_list(&ptr, ptrend, utf, parsed_pattern,48270, &errorcode, cb);4828if (parsed_pattern == NULL) goto FAILED;4829goto POST_ASSERTION;48304831case META_LOOKBEHIND:4832case META_LOOKBEHINDNOT:4833case META_LOOKBEHIND_NA:4834*parsed_pattern++ = meta;4835ptr--;4836goto POST_LOOKBEHIND;48374838/* The script run facilities are handled here. Unicode support is4839required (give an error if not, as this is a security issue). Always4840record a META_SCRIPT_RUN item. Then, for the atomic version, insert4841META_ATOMIC and remember that we need two META_KETs at the end. 
*/48424843case META_SCRIPT_RUN:4844case META_ATOMIC_SCRIPT_RUN:4845#ifdef SUPPORT_UNICODE4846*parsed_pattern++ = META_SCRIPT_RUN;4847nest_depth++;4848ptr++;4849if (meta == META_ATOMIC_SCRIPT_RUN)4850{4851*parsed_pattern++ = META_ATOMIC;4852if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);4853else if (++top_nest >= end_nests)4854{4855errorcode = ERR84;4856goto FAILED;4857}4858top_nest->nest_depth = nest_depth;4859top_nest->flags = NSF_ATOMICSR;4860top_nest->options = options & PARSE_TRACKED_OPTIONS;4861top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;48624863#ifdef PCRE2_DEBUG4864/* We'll write out two META_KETs for a single ")" in the input4865pattern, so we reserve space for that in our bounds check. */4866parsed_pattern_extra++;4867#endif4868}4869break;4870#else /* SUPPORT_UNICODE */4871errorcode = ERR96;4872goto FAILED;4873#endif4874}4875}487648774878/* ---- Handle (*VERB) and (*VERB:NAME) ---- */48794880else4881{4882vn = verbnames;4883if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,4884&errorcode, cb)) goto FAILED;4885if (ptr >= ptrend || (*ptr != CHAR_COLON &&4886*ptr != CHAR_RIGHT_PARENTHESIS))4887{4888errorcode = ERR60; /* Malformed */4889goto FAILED;4890}48914892/* Scan the table of verb names */48934894for (i = 0; i < verbcount; i++)4895{4896if (namelen == verbs[i].len &&4897PRIV(strncmp_c8)(name, vn, namelen) == 0)4898break;4899vn += verbs[i].len + 1;4900}49014902if (i >= verbcount)4903{4904errorcode = ERR60; /* Verb not recognized */4905goto FAILED;4906}49074908/* An empty argument is treated as no argument. 
*/49094910if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&4911ptr[1] == CHAR_RIGHT_PARENTHESIS)4912ptr++; /* Advance to the closing parens */49134914/* Check for mandatory non-empty argument; this is (*MARK) */49154916if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)4917{4918errorcode = ERR66;4919goto FAILED;4920}49214922/* Remember where this verb, possibly with a preceding (*MARK), starts,4923for handling quantified (*ACCEPT). */49244925verbstartptr = parsed_pattern;4926okquantifier = (verbs[i].meta == META_ACCEPT);4927#ifdef PCRE2_DEBUG4928/* Reserve space in our bounds check for optionally wrapping the (*ACCEPT)4929with a non-capturing bracket, if there is a following quantifier. */4930if (okquantifier) parsed_pattern_extra += 2;4931#endif49324933/* It appears that Perl allows any characters whatsoever, other than a4934closing parenthesis, to appear in arguments ("names"), so we no longer4935insist on letters, digits, and underscores. Perl does not, however, do4936any interpretation within arguments, and has no means of including a4937closing parenthesis. PCRE supports escape processing but only when it4938is requested by an option. We set inverbname TRUE here, and let the4939main loop take care of this so that escape and \x processing is done by4940the main code above. */49414942if (*ptr++ == CHAR_COLON) /* Skip past : or ) */4943{4944/* Some optional arguments can be treated as a preceding (*MARK) */49454946if (verbs[i].has_arg < 0)4947{4948add_after_mark = verbs[i].meta;4949*parsed_pattern++ = META_MARK;4950}49514952/* The remaining verbs with arguments (except *MARK) need a different4953opcode. */49544955else4956{4957*parsed_pattern++ = verbs[i].meta +4958((verbs[i].meta != META_MARK)? 0x00010000u:0);4959}49604961/* Set up for reading the name in the main loop. 
*/49624963verblengthptr = parsed_pattern++;4964verbnamestart = ptr;4965inverbname = TRUE;4966}4967else /* No verb "name" argument */4968{4969*parsed_pattern++ = verbs[i].meta;4970}4971} /* End of (*VERB) handling */4972break; /* Done with this parenthesis */4973} /* End of groups that don't start with (? */497449754976/* ---- Items starting (? ---- */49774978/* The type of item is determined by what follows (?. Handle (?| and option4979changes under "default" because both need a new block on the nest stack.4980Comments starting with (?# are handled above. Note that there is some4981ambiguity about the sequence (?- because if a digit follows it's a relative4982recursion or subroutine call whereas otherwise it's an option unsetting. */49834984if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;49854986switch(*ptr)4987{4988default:4989if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))4990goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */49914992/* We now have either (?| or a (possibly empty) option setting,4993optionally followed by a non-capturing group. */49944995nest_depth++;4996if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);4997else if (++top_nest >= end_nests)4998{4999errorcode = ERR84;5000goto FAILED;5001}5002top_nest->nest_depth = nest_depth;5003top_nest->flags = 0;5004top_nest->options = options & PARSE_TRACKED_OPTIONS;5005top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;50065007/* Start of non-capturing group that resets the capture count for each5008branch. */50095010if (*ptr == CHAR_VERTICAL_LINE)5011{5012top_nest->reset_group = (uint16_t)cb->bracount;5013top_nest->max_group = (uint16_t)cb->bracount;5014top_nest->flags |= NSF_RESET;5015cb->external_flags |= PCRE2_DUPCAPUSED;5016*parsed_pattern++ = META_NOCAPTURE;5017ptr++;5018}50195020/* Scan for options imnrsxJU to be set or unset. 
*/50215022else5023{5024BOOL hyphenok = TRUE;5025uint32_t oldoptions = options;5026uint32_t oldxoptions = xoptions;50275028top_nest->reset_group = 0;5029top_nest->max_group = 0;5030set = unset = 0;5031optset = &set;5032xset = xunset = 0;5033xoptset = &xset;50345035/* ^ at the start unsets irmnsx and disables the subsequent use of - */50365037if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)5038{5039options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|5040PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);5041xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);5042hyphenok = FALSE;5043ptr++;5044}50455046while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&5047*ptr != CHAR_COLON)5048{5049switch (*ptr++)5050{5051case CHAR_MINUS:5052if (!hyphenok)5053{5054errorcode = ERR94;5055goto FAILED;5056}5057optset = &unset;5058xoptset = &xunset;5059hyphenok = FALSE;5060break;50615062/* There are some two-character sequences that start with 'a'. */50635064case CHAR_a:5065if (ptr < ptrend)5066{5067if (*ptr == CHAR_D)5068{5069*xoptset |= PCRE2_EXTRA_ASCII_BSD;5070ptr++;5071break;5072}5073if (*ptr == CHAR_P)5074{5075*xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);5076ptr++;5077break;5078}5079if (*ptr == CHAR_S)5080{5081*xoptset |= PCRE2_EXTRA_ASCII_BSS;5082ptr++;5083break;5084}5085if (*ptr == CHAR_T)5086{5087*xoptset |= PCRE2_EXTRA_ASCII_DIGIT;5088ptr++;5089break;5090}5091if (*ptr == CHAR_W)5092{5093*xoptset |= PCRE2_EXTRA_ASCII_BSW;5094ptr++;5095break;5096}5097}5098*xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|5099PCRE2_EXTRA_ASCII_BSW|5100PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;5101break;51025103case CHAR_J: /* Record that it changed in the external options */5104*optset |= PCRE2_DUPNAMES;5105cb->external_flags |= PCRE2_JCHANGED;5106break;51075108case CHAR_i: *optset |= PCRE2_CASELESS; break;5109case CHAR_m: *optset |= PCRE2_MULTILINE; break;5110case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;5111case CHAR_r: *xoptset|= 
PCRE2_EXTRA_CASELESS_RESTRICT; break;5112case CHAR_s: *optset |= PCRE2_DOTALL; break;5113case CHAR_U: *optset |= PCRE2_UNGREEDY; break;51145115/* If x appears twice it sets the extended extended option. */51165117case CHAR_x:5118*optset |= PCRE2_EXTENDED;5119if (ptr < ptrend && *ptr == CHAR_x)5120{5121*optset |= PCRE2_EXTENDED_MORE;5122ptr++;5123}5124break;51255126default:5127errorcode = ERR11;5128goto FAILED;5129}5130}51315132/* If we are setting extended without extended-more, ensure that any5133existing extended-more gets unset. Also, unsetting extended must also5134unset extended-more. */51355136if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||5137(unset & PCRE2_EXTENDED) != 0)5138unset |= PCRE2_EXTENDED_MORE;51395140options = (options | set) & (~unset);5141xoptions = (xoptions | xset) & (~xunset);51425143/* If the options ended with ')' this is not the start of a nested5144group with option changes, so the options change at this level.5145In this case, if the previous level set up a nest block, discard the5146one we have just created. Otherwise adjust it for the previous level.5147If the options ended with ':' we are starting a non-capturing group,5148possibly with an options setting. */51495150if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;5151if (*ptr++ == CHAR_RIGHT_PARENTHESIS)5152{5153nest_depth--; /* This is not a nested group after all. */5154if (top_nest > (nest_save *)(cb->start_workspace) &&5155(top_nest-1)->nest_depth == nest_depth) top_nest--;5156else top_nest->nest_depth = nest_depth;5157}5158else *parsed_pattern++ = META_NOCAPTURE;51595160/* If nothing changed, no need to record. */51615162if (options != oldoptions || xoptions != oldxoptions)5163{5164*parsed_pattern++ = META_OPTIONS;5165*parsed_pattern++ = options;5166*parsed_pattern++ = xoptions;5167}5168} /* End options processing */5169break; /* End default case after (? 
*/517051715172/* ---- Python syntax support ---- */51735174case CHAR_P:5175if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;51765177/* (?P<name> is the same as (?<name>, which defines a named group. */51785179if (*ptr == CHAR_LESS_THAN_SIGN)5180{5181terminator = CHAR_GREATER_THAN_SIGN;5182goto DEFINE_NAME;5183}51845185/* (?P>name) is the same as (?&name), which is a recursion or subroutine5186call. */51875188if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;51895190/* (?P=name) is the same as \k<name>, a back reference by name. Anything5191else after (?P is an error. */51925193if (*ptr != CHAR_EQUALS_SIGN)5194{5195errorcode = ERR41;5196goto FAILED_FORWARD;5197}5198if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,5199&namelen, &errorcode, cb)) goto FAILED;5200*parsed_pattern++ = META_BACKREF_BYNAME;5201*parsed_pattern++ = namelen;5202PUTOFFSET(offset, parsed_pattern);5203okquantifier = TRUE;5204break; /* End of (?P processing */520552065207/* ---- Recursion/subroutine calls by number ---- */52085209case CHAR_R:5210i = 0; /* (?R) == (?R0) */5211ptr++;5212if (ptr >= ptrend || (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_LEFT_PARENTHESIS))5213{5214errorcode = ERR58;5215goto FAILED;5216}5217terminator = CHAR_NUL;5218goto SET_RECURSION;52195220/* An item starting (?- followed by a digit comes here via the "default"5221case because (?- followed by a non-digit is an options setting. */52225223case CHAR_PLUS:5224if (ptr + 1 >= ptrend)5225{5226++ptr;5227goto UNCLOSED_PARENTHESIS;5228}5229if (!IS_DIGIT(ptr[1]))5230{5231errorcode = ERR29; /* Missing number */5232++ptr;5233goto FAILED_FORWARD;5234}5235PCRE2_FALLTHROUGH /* Fall through */52365237case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:5238case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:5239RECURSION_BYNUMBER:5240if (!read_number(&ptr, ptrend,5241(IS_DIGIT(*ptr))? 
-1:(int)(cb->bracount), /* + and - are relative */5242MAX_GROUP_NUMBER, ERR61,5243&i, &errorcode)) goto FAILED;5244PCRE2_ASSERT(i >= 0); /* NB (?0) is permitted, represented by i=0 */5245terminator = CHAR_NUL;52465247SET_RECURSION:5248*parsed_pattern++ = META_RECURSE | (uint32_t)i;5249offset = (PCRE2_SIZE)(ptr - cb->start_pattern);5250/* End of recursive call by number handling */5251goto READ_RECURSION_ARGUMENTS;525252535254/* ---- Recursion/subroutine calls by name ---- */52555256case CHAR_AMPERSAND:5257RECURSE_BY_NAME:5258if (!read_name(&ptr, ptrend, utf, 0, &offset, &name,5259&namelen, &errorcode, cb)) goto FAILED;5260*parsed_pattern++ = META_RECURSE_BYNAME;5261*parsed_pattern++ = namelen;5262terminator = CHAR_NUL;52635264READ_RECURSION_ARGUMENTS:5265PUTOFFSET(offset, parsed_pattern);5266okquantifier = TRUE;52675268/* Arguments are not supported for \g construct. */5269if (terminator != CHAR_NUL) break;52705271if (ptr < ptrend && *ptr == CHAR_LEFT_PARENTHESIS)5272{5273parsed_pattern = parse_capture_list(&ptr, ptrend, utf, parsed_pattern,5274offset, &errorcode, cb);5275if (parsed_pattern == NULL) goto FAILED;5276}52775278if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)5279goto UNCLOSED_PARENTHESIS;52805281ptr++;5282break;52835284/* ---- Callout with numerical or string argument ---- */52855286case CHAR_C:5287if ((xoptions & PCRE2_EXTRA_NEVER_CALLOUT) != 0)5288{5289ptr++;5290errorcode = ERR103;5291goto FAILED;5292}52935294if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;52955296/* If the previous item was a condition starting (?(? an assertion,5297optionally preceded by a callout, is expected. This is checked later on,5298during actual compilation. However we need to identify this kind of5299assertion in this pass because it must not be qualified. The value of5300expect_cond_assert is set to 2 after (?(? is processed. We decrement it5301for a callout - still leaving a positive value that identifies the5302assertion. 
Multiple callouts or any other items will make it zero or5303less, which doesn't matter because they will cause an error later. */53045305expect_cond_assert = prev_expect_cond_assert - 1;53065307/* If previous_callout is not NULL, it means this follows a previous5308callout. If it was a manual callout, do nothing; this means its "length5309of next pattern item" field will remain zero. If it was an automatic5310callout, abolish it. */53115312if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&5313previous_callout == parsed_pattern - 4 &&5314parsed_pattern[-1] == 255)5315parsed_pattern = previous_callout;53165317/* Save for updating next pattern item length, and skip one item before5318completing. */53195320previous_callout = parsed_pattern;5321after_manual_callout = 1;53225323/* Handle a string argument; specific delimiter is required. */53245325if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))5326{5327PCRE2_SIZE calloutlength;5328PCRE2_SPTR startptr = ptr;53295330delimiter = 0;5331for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)5332{5333if (*ptr == PRIV(callout_start_delims)[i])5334{5335delimiter = PRIV(callout_end_delims)[i];5336break;5337}5338}5339if (delimiter == 0)5340{5341errorcode = ERR82;5342goto FAILED_FORWARD;5343}53445345*parsed_pattern = META_CALLOUT_STRING;5346parsed_pattern += 3; /* Skip pattern info */53475348for (;;)5349{5350if (++ptr >= ptrend)5351{5352errorcode = ERR81;5353ptr = startptr; /* To give a more useful message */5354goto FAILED;5355}5356if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))5357break;5358}53595360calloutlength = (PCRE2_SIZE)(ptr - startptr);5361if (calloutlength > UINT32_MAX)5362{5363errorcode = ERR72;5364goto FAILED;5365}5366*parsed_pattern++ = (uint32_t)calloutlength;5367offset = (PCRE2_SIZE)(startptr - cb->start_pattern);5368PUTOFFSET(offset, parsed_pattern);5369}53705371/* Handle a callout with an optional numerical argument, which must be5372less than or equal to 255. 
A missing argument gives 0. */53735374else5375{5376int n = 0;5377*parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */5378parsed_pattern += 3; /* Skip pattern info */5379while (ptr < ptrend && IS_DIGIT(*ptr))5380{5381n = n * 10 + (*ptr++ - CHAR_0);5382if (n > 255)5383{5384errorcode = ERR38;5385goto FAILED;5386}5387}5388*parsed_pattern++ = n;5389}53905391/* Both formats must have a closing parenthesis */53925393if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)5394{5395errorcode = ERR39;5396goto FAILED;5397}5398ptr++;53995400/* Remember the offset to the next item in the pattern, and set a default5401length. This should get updated after the next item is read. */54025403previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);5404previous_callout[2] = 0;5405break; /* End callout */540654075408/* ---- Conditional group ---- */54095410/* A condition can be an assertion, a number (referring to a numbered5411group's having been set), a name (referring to a named group), or 'R',5412referring to overall recursion. R<digits> and R&name are also permitted5413for recursion state tests. Numbers may be preceded by + or - to specify a5414relative group number.54155416There are several syntaxes for testing a named group: (?(name)) is used5417by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).54185419There are two unfortunate ambiguities. 'R' can be the recursive thing or5420the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be5421the Perl DEFINE feature or the Python named test. We look for a name5422first; if not found, we try the other case.54235424For compatibility with auto-callouts, we allow a callout to be specified5425before a condition that is an assertion. */54265427case CHAR_LEFT_PARENTHESIS:5428if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;5429nest_depth++;54305431/* If the next character is ? or * there must be an assertion next5432(optionally preceded by a callout). 
We do not check this here, but5433instead we set expect_cond_assert to 2. If this is still greater than5434zero (callouts decrement it) when the next assertion is read, it will be5435marked as a condition that must not be repeated. A value greater than5436zero also causes checking that an assertion (possibly with callout)5437follows. */54385439if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)5440{5441*parsed_pattern++ = META_COND_ASSERT;5442ptr--; /* Pull pointer back to the opening parenthesis. */5443expect_cond_assert = 2;5444break; /* End of conditional */5445}54465447/* Handle (?([+-]number)... */54485449if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,5450&errorcode))5451{5452PCRE2_ASSERT(i >= 0);5453if (i <= 0)5454{5455errorcode = ERR15;5456goto FAILED;5457}5458*parsed_pattern++ = META_COND_NUMBER;5459offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);5460PUTOFFSET(offset, parsed_pattern);5461*parsed_pattern++ = i;5462}5463else if (errorcode != 0) goto FAILED; /* Number too big */54645465/* No number found. Handle the special case (?(VERSION[>]=n.m)... */54665467else if (ptrend - ptr >= 10 &&5468PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&5469ptr[7] != CHAR_RIGHT_PARENTHESIS)5470{5471uint32_t ge = 0;5472int major = 0;5473int minor = 0;54745475ptr += 7;5476if (*ptr == CHAR_GREATER_THAN_SIGN)5477{5478ge = 1;5479ptr++;5480}54815482/* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT5483references its argument twice. 
*/54845485if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))5486{5487errorcode = ERR79;5488if (!ge) goto FAILED_FORWARD;5489goto FAILED;5490}54915492if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))5493goto FAILED;54945495if (ptr < ptrend && *ptr == CHAR_DOT)5496{5497if (++ptr >= ptrend || !IS_DIGIT(*ptr))5498{5499errorcode = ERR79;5500if (ptr < ptrend) goto FAILED_FORWARD;5501goto FAILED;5502}5503if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &minor, &errorcode))5504goto FAILED;5505}5506if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)5507{5508errorcode = ERR79;5509if (ptr < ptrend) goto FAILED_FORWARD;5510goto FAILED;5511}55125513*parsed_pattern++ = META_COND_VERSION;5514*parsed_pattern++ = ge;5515*parsed_pattern++ = major;5516*parsed_pattern++ = minor;5517}55185519/* All the remaining cases now require us to read a name. We cannot at5520this stage distinguish ambiguous cases such as (?(R12) which might be a5521recursion test by number or a name, because the named groups have not yet5522all been identified. Those cases are treated as names, but given a5523different META code. */55245525else5526{5527BOOL was_r_ampersand = FALSE;55285529if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)5530{5531terminator = CHAR_RIGHT_PARENTHESIS;5532was_r_ampersand = TRUE;5533ptr++;5534}5535else if (*ptr == CHAR_LESS_THAN_SIGN)5536terminator = CHAR_GREATER_THAN_SIGN;5537else if (*ptr == CHAR_APOSTROPHE)5538terminator = CHAR_APOSTROPHE;5539else5540{5541terminator = CHAR_RIGHT_PARENTHESIS;5542ptr--; /* Point to char before name */5543}55445545if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,5546&errorcode, cb)) goto FAILED;55475548/* Handle (?(R&name) */55495550if (was_r_ampersand)5551{5552*parsed_pattern = META_COND_RNAME;5553ptr--; /* Back to closing parens */5554}55555556/* Handle (?(name). If the name is "DEFINE" we identify it with a5557special code. 
Likewise if the name consists of R followed only by5558digits. Otherwise, handle it like a quoted name. */55595560else if (terminator == CHAR_RIGHT_PARENTHESIS)5561{5562if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)5563*parsed_pattern = META_COND_DEFINE;5564else5565{5566for (i = 1; i < (int)namelen; i++)5567if (!IS_DIGIT(name[i])) break;5568*parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?5569META_COND_RNUMBER : META_COND_NAME;5570}5571ptr--; /* Back to closing parens */5572}55735574/* Handle (?('name') or (?(<name>) */55755576else *parsed_pattern = META_COND_NAME;55775578/* All these cases except DEFINE end with the name length and offset;5579DEFINE just has an offset (for the "too many branches" error). */55805581if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;5582PUTOFFSET(offset, parsed_pattern);5583} /* End cases that read a name */55845585/* Check the closing parenthesis of the condition */55865587if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)5588{5589errorcode = ERR24;5590goto FAILED;5591}5592ptr++;5593break; /* End of condition processing */559455955596/* ---- Atomic group ---- */55975598case CHAR_GREATER_THAN_SIGN:5599ATOMIC_GROUP: /* Come from (*atomic: */5600*parsed_pattern++ = META_ATOMIC;5601nest_depth++;5602ptr++;5603break;560456055606/* ---- Lookahead assertions ---- */56075608case CHAR_EQUALS_SIGN:5609POSITIVE_LOOK_AHEAD: /* Come from (*pla: */5610*parsed_pattern++ = META_LOOKAHEAD;5611ptr++;5612goto POST_ASSERTION;56135614case CHAR_ASTERISK:5615POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (*napla: */5616*parsed_pattern++ = META_LOOKAHEAD_NA;5617ptr++;5618goto POST_ASSERTION;56195620case CHAR_EXCLAMATION_MARK:5621NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */5622*parsed_pattern++ = META_LOOKAHEADNOT;5623ptr++;5624goto POST_ASSERTION;562556265627/* ---- Lookbehind assertions ---- */56285629/* (?< followed by = or ! or * is a lookbehind assertion. 
Otherwise (?<5630is the start of the name of a capturing group. */56315632case CHAR_LESS_THAN_SIGN:5633if (ptrend - ptr <= 1 ||5634(ptr[1] != CHAR_EQUALS_SIGN &&5635ptr[1] != CHAR_EXCLAMATION_MARK &&5636ptr[1] != CHAR_ASTERISK))5637{5638terminator = CHAR_GREATER_THAN_SIGN;5639goto DEFINE_NAME;5640}5641*parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?5642META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?5643META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;56445645POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */5646*has_lookbehind = TRUE;5647offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);5648PUTOFFSET(offset, parsed_pattern);5649ptr += 2;5650/* Fall through */56515652/* If the previous item was a condition starting (?(? an assertion,5653optionally preceded by a callout, is expected. This is checked later on,5654during actual compilation. However we need to identify this kind of5655assertion in this pass because it must not be qualified. The value of5656expect_cond_assert is set to 2 after (?(? is processed. We decrement it5657for a callout - still leaving a positive value that identifies the5658assertion. Multiple callouts or any other items will make it zero or5659less, which doesn't matter because they will cause an error later. */56605661POST_ASSERTION:5662nest_depth++;5663if (prev_expect_cond_assert > 0)5664{5665if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);5666else if (++top_nest >= end_nests)5667{5668errorcode = ERR84;5669goto FAILED;5670}5671top_nest->nest_depth = nest_depth;5672top_nest->flags = NSF_CONDASSERT;5673top_nest->options = options & PARSE_TRACKED_OPTIONS;5674top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;5675}5676break;567756785679/* ---- Define a named group ---- */56805681/* A named group may be defined as (?'name') or (?<name>). In the latter5682case we jump to DEFINE_NAME from the disambiguation of (?< above with the5683terminator set to '>'. 
*/56845685case CHAR_APOSTROPHE:5686terminator = CHAR_APOSTROPHE; /* Terminator */56875688DEFINE_NAME:5689if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,5690&errorcode, cb)) goto FAILED;56915692/* We have a name for this capturing group. It is also assigned a number,5693which is its primary means of identification. */56945695if (cb->bracount >= MAX_GROUP_NUMBER)5696{5697errorcode = ERR97;5698goto FAILED;5699}5700cb->bracount++;5701*parsed_pattern++ = META_CAPTURE | cb->bracount;5702nest_depth++;57035704/* Check not too many names */57055706if (cb->names_found >= MAX_NAME_COUNT)5707{5708errorcode = ERR49;5709goto FAILED;5710}57115712/* Adjust the entry size to accommodate the longest name found. */57135714if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)5715cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);57165717/* Scan the list to check for duplicates. For duplicate names, if the5718number is the same, break the loop, which causes the name to be5719discarded; otherwise, if DUPNAMES is not set, give an error.5720If it is set, allow the name with a different number, but continue5721scanning in case this is a duplicate with the same number. For5722non-duplicate names, give an error if the number is duplicated. */57235724is_dupname = FALSE;5725hash = PRIV(compile_get_hash_from_name)(name, namelen);5726ng = cb->named_groups;5727for (i = 0; i < cb->names_found; i++, ng++)5728{5729if (namelen == ng->length && hash == NAMED_GROUP_GET_HASH(ng) &&5730PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)5731{5732/* When a bracket is referenced by the same name multiple5733times, is not considered as a duplicate and ignored. 
*/5734if (ng->number == cb->bracount) break;5735if ((options & PCRE2_DUPNAMES) == 0)5736{5737errorcode = ERR43;5738goto FAILED;5739}57405741ng->hash_dup |= NAMED_GROUP_IS_DUPNAME;5742is_dupname = TRUE; /* Mark as a duplicate */5743cb->dupnames = TRUE; /* Duplicate names exist */57445745/* The entry represents a duplicate. */5746name = ng->name;5747namelen = 0;57485749/* Even duplicated names may refer to the same5750capture index. These references are also ignored. */5751for (; i < cb->names_found; i++, ng++)5752if (ng->name == name && ng->number == cb->bracount)5753break;5754break;5755}5756else if (ng->number == cb->bracount)5757{5758errorcode = ERR65;5759goto FAILED;5760}5761}57625763/* Ignore duplicate with same number. */5764if (i < cb->names_found) break;57655766/* Increase the list size if necessary */57675768if (cb->names_found >= cb->named_group_list_size)5769{5770uint32_t newsize = cb->named_group_list_size * 2;5771named_group *newspace =5772cb->cx->memctl.malloc(newsize * sizeof(named_group),5773cb->cx->memctl.memory_data);5774if (newspace == NULL)5775{5776errorcode = ERR21;5777goto FAILED;5778}57795780memcpy(newspace, cb->named_groups,5781cb->named_group_list_size * sizeof(named_group));5782if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)5783cb->cx->memctl.free((void *)cb->named_groups,5784cb->cx->memctl.memory_data);5785cb->named_groups = newspace;5786cb->named_group_list_size = newsize;5787}57885789/* Add this name to the list */5790if (is_dupname)5791hash |= NAMED_GROUP_IS_DUPNAME;57925793cb->named_groups[cb->names_found].name = name;5794cb->named_groups[cb->names_found].length = (uint16_t)namelen;5795cb->named_groups[cb->names_found].number = cb->bracount;5796cb->named_groups[cb->names_found].hash_dup = hash;5797cb->names_found++;5798break;579958005801/* ---- Perl extended character class ---- */58025803/* These are of the form '(?[...])'. 
We handle these via the same parser5804that consumes ordinary '[...]' classes, but with a flag set to activate5805the extended behaviour. */58065807case CHAR_LEFT_SQUARE_BRACKET:5808class_mode_state = CLASS_MODE_PERL_EXT;5809c = *ptr++;5810goto FROM_PERL_EXTENDED_CLASS;5811} /* End of (? switch */5812break; /* End of ( handling */581358145815/* ---- Branch terminators ---- */58165817/* Alternation: reset the capture count if we are in a (?| group. */58185819case CHAR_VERTICAL_LINE:5820if (top_nest != NULL && top_nest->nest_depth == nest_depth &&5821(top_nest->flags & NSF_RESET) != 0)5822{5823if (cb->bracount > top_nest->max_group)5824top_nest->max_group = (uint16_t)cb->bracount;5825cb->bracount = top_nest->reset_group;5826}5827*parsed_pattern++ = META_ALT;5828break;58295830/* End of group; reset the capture count to the maximum if we are in a (?|5831group and/or reset the options that are tracked during parsing. Disallow5832quantifier for a condition that is an assertion. */58335834case CHAR_RIGHT_PARENTHESIS:5835okquantifier = TRUE;5836if (top_nest != NULL && top_nest->nest_depth == nest_depth)5837{5838options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;5839xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;5840if ((top_nest->flags & NSF_RESET) != 0 &&5841top_nest->max_group > cb->bracount)5842cb->bracount = top_nest->max_group;5843if ((top_nest->flags & NSF_CONDASSERT) != 0)5844okquantifier = FALSE;58455846if ((top_nest->flags & NSF_ATOMICSR) != 0)5847{5848*parsed_pattern++ = META_KET;58495850#ifdef PCRE2_DEBUG5851PCRE2_ASSERT(parsed_pattern_extra > 0);5852parsed_pattern_extra--;5853#endif5854}58555856if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;5857else top_nest--;5858}5859if (nest_depth == 0) /* Unmatched closing parenthesis */5860{5861errorcode = ERR22;5862goto FAILED;5863}5864nest_depth--;5865*parsed_pattern++ = META_KET;5866break;5867} /* End of switch on pattern character */5868} /* End of main 
character scan loop */58695870/* End of pattern reached. Check for missing ) at the end of a verb name. */58715872if (inverbname && ptr >= ptrend)5873{5874errorcode = ERR60;5875goto FAILED;5876}587758785879PARSED_END:58805881PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +5882(parsed_pattern_extra - parsed_pattern_extra_check) <=5883max_parsed_pattern(ptr_check, ptr, utf, options));58845885/* Manage callout for the final item */58865887parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,5888parsed_pattern, cb);58895890/* Insert trailing items for word and line matching (features provided for the5891benefit of pcre2grep). */58925893if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)5894{5895*parsed_pattern++ = META_KET;5896*parsed_pattern++ = META_DOLLAR;5897}5898else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)5899{5900*parsed_pattern++ = META_KET;5901*parsed_pattern++ = META_ESCAPE + ESC_b;5902}59035904/* Terminate the parsed pattern, then return success if all groups are closed.5905Otherwise we have unclosed parentheses. */59065907/* LCOV_EXCL_START */5908if (parsed_pattern >= parsed_pattern_end)5909{5910PCRE2_DEBUG_UNREACHABLE();5911errorcode = ERR63; /* Internal error (parsed pattern overflow) */5912goto FAILED;5913}5914/* LCOV_EXCL_STOP */59155916*parsed_pattern = META_END;5917if (nest_depth == 0) return 0;59185919UNCLOSED_PARENTHESIS:5920errorcode = ERR14;59215922/* Come here for all failures. */59235924FAILED:5925cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);5926return errorcode;59275928/* Some errors need to indicate the previous character. */59295930FAILED_BACK:5931ptr--;5932#ifdef SUPPORT_UNICODE5933if (utf) BACKCHAR(ptr);5934#endif5935goto FAILED;59365937/* Some errors need to indicate the next character. 
*/59385939FAILED_FORWARD:5940ptr++;5941#ifdef SUPPORT_UNICODE5942if (utf) FORWARDCHARTEST(ptr, ptrend);5943#endif5944goto FAILED;5945}5946594759485949/*************************************************5950* Find first significant opcode *5951*************************************************/59525953/* This is called by several functions that scan a compiled expression looking5954for a fixed first character, or an anchoring opcode etc. It skips over things5955that do not influence this. For some calls, it makes sense to skip negative5956forward and all backward assertions, and also the \b assertion; for others it5957does not.59585959Arguments:5960code pointer to the start of the group5961skipassert TRUE if certain assertions are to be skipped59625963Returns: pointer to the first significant opcode5964*/59655966static const PCRE2_UCHAR*5967first_significant_code(PCRE2_SPTR code, BOOL skipassert)5968{5969for (;;)5970{5971switch ((int)*code)5972{5973case OP_ASSERT_NOT:5974case OP_ASSERTBACK:5975case OP_ASSERTBACK_NOT:5976case OP_ASSERTBACK_NA:5977if (!skipassert) return code;5978do code += GET(code, 1); while (*code == OP_ALT);5979code += PRIV(OP_lengths)[*code];5980break;59815982case OP_WORD_BOUNDARY:5983case OP_NOT_WORD_BOUNDARY:5984case OP_UCP_WORD_BOUNDARY:5985case OP_NOT_UCP_WORD_BOUNDARY:5986if (!skipassert) return code;5987PCRE2_FALLTHROUGH /* Fall through */59885989case OP_CALLOUT:5990case OP_CREF:5991case OP_DNCREF:5992case OP_RREF:5993case OP_DNRREF:5994case OP_FALSE:5995case OP_TRUE:5996code += PRIV(OP_lengths)[*code];5997break;59985999case OP_CALLOUT_STR:6000code += GET(code, 1 + 2*LINK_SIZE);6001break;60026003case OP_SKIPZERO:6004code += 2 + GET(code, 2) + LINK_SIZE;6005break;60066007case OP_COND:6008case OP_SCOND:6009if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */6010code[GET(code, 1)] != OP_KET) /* More than one branch */6011return code;6012code += GET(code, 1) + 1 + LINK_SIZE;6013break;60146015case OP_MARK:6016case OP_COMMIT_ARG:6017case 
OP_PRUNE_ARG:6018case OP_SKIP_ARG:6019case OP_THEN_ARG:6020code += code[1] + PRIV(OP_lengths)[*code];6021break;60226023default:6024return code;6025}6026}60276028/* LCOV_EXCL_START */6029PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */6030/* LCOV_EXCL_STOP */6031}6032603360346035/*************************************************6036* Compile one branch *6037*************************************************/60386039/* Scan the parsed pattern, compiling it into the a vector of PCRE2_UCHAR. If6040the options are changed during the branch, the pointer is used to change the6041external options bits. This function is used during the pre-compile phase when6042we are trying to find out the amount of memory needed, as well as during the6043real compile phase. The value of lengthptr distinguishes the two phases.60446045Arguments:6046optionsptr pointer to the option bits6047xoptionsptr pointer to the extra option bits6048codeptr points to the pointer to the current code point6049pptrptr points to the current parsed pattern pointer6050errorcodeptr points to error code variable6051firstcuptr place to put the first required code unit6052firstcuflagsptr place to put the first code unit flags6053reqcuptr place to put the last required code unit6054reqcuflagsptr place to put the last required code unit flags6055bcptr points to current branch chain6056open_caps points to current capitem6057cb contains pointers to tables etc.6058lengthptr NULL during the real compile phase6059points to length accumulator during pre-compile phase60606061Returns: 0 There's been an error, *errorcodeptr is non-zero6062+1 Success, this branch must match at least one character6063-1 Success, this branch may match an empty string6064*/60656066static int6067compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,6068PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,6069uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,6070uint32_t *reqcuflagsptr, branch_chain 
*bcptr, open_capitem *open_caps,6071compile_block *cb, PCRE2_SIZE *lengthptr)6072{6073int bravalue = 0;6074int okreturn = -1;6075int group_return = 0;6076uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */6077uint32_t greedy_default, greedy_non_default;6078uint32_t repeat_type, op_type;6079uint32_t options = *optionsptr; /* May change dynamically */6080uint32_t xoptions = *xoptionsptr; /* May change dynamically */6081uint32_t firstcu, reqcu;6082uint32_t zeroreqcu, zerofirstcu;6083uint32_t *pptr = *pptrptr;6084uint32_t meta, meta_arg;6085uint32_t firstcuflags, reqcuflags;6086uint32_t zeroreqcuflags, zerofirstcuflags;6087uint32_t req_caseopt, reqvary, tempreqvary;6088/* Some opcodes, such as META_CAPTURE_NUMBER or META_CAPTURE_NAME,6089depends on the previous value of offset. */6090PCRE2_SIZE offset = 0;6091PCRE2_SIZE length_prevgroup = 0;6092PCRE2_UCHAR *code = *codeptr;6093PCRE2_UCHAR *last_code = code;6094PCRE2_UCHAR *orig_code = code;6095PCRE2_UCHAR *tempcode;6096PCRE2_UCHAR *previous = NULL;6097PCRE2_UCHAR op_previous;6098BOOL groupsetfirstcu = FALSE;6099BOOL had_accept = FALSE;6100BOOL matched_char = FALSE;6101BOOL previous_matched_char = FALSE;6102BOOL reset_caseful = FALSE;61036104/* We can fish out the UTF setting once and for all into a BOOL, but we must6105not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically6106as we process the pattern. */61076108#ifdef SUPPORT_UNICODE6109BOOL utf = (options & PCRE2_UTF) != 0;6110BOOL ucp = (options & PCRE2_UCP) != 0;6111#else /* No Unicode support */6112BOOL utf = FALSE;6113#endif61146115/* Set up the default and non-default settings for greediness */61166117greedy_default = ((options & PCRE2_UNGREEDY) != 0);6118greedy_non_default = greedy_default ^ 1;61196120/* Initialize no first unit, no required unit. REQ_UNSET means "no char6121matching encountered yet". 
It gets changed to REQ_NONE if we hit something that6122matches a non-fixed first unit; reqcu just remains unset if we never find one.61236124When we hit a repeat whose minimum is zero, we may have to adjust these values6125to take the zero repeat into account. This is implemented by setting them to6126zerofirstcu and zeroreqcu when such a repeat is encountered. The individual6127item types that can be repeated set these backoff variables appropriately. */61286129firstcu = reqcu = zerofirstcu = zeroreqcu = 0;6130firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;61316132/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,6133according to the current setting of the caseless flag. The REQ_CASELESS value6134leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables6135to record the case status of the value. This is used only for ASCII characters.6136*/61376138req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;61396140/* Switch on next META item until the end of the branch */61416142for (;; pptr++)6143{6144BOOL possessive_quantifier;6145BOOL note_group_empty;6146uint32_t mclength;6147uint32_t skipunits;6148uint32_t subreqcu, subfirstcu;6149uint32_t groupnumber;6150uint32_t verbarglen, verbculen;6151uint32_t subreqcuflags, subfirstcuflags;6152open_capitem *oc;6153PCRE2_UCHAR mcbuffer[8];61546155/* Get next META item in the pattern and its potential argument. */61566157meta = META_CODE(*pptr);6158meta_arg = META_DATA(*pptr);61596160/* If we are in the pre-compile phase, accumulate the length used for the6161previous cycle of this loop, unless the next item is a quantifier. 
*/61626163if (lengthptr != NULL)6164{6165/* LCOV_EXCL_START */6166if (code >= cb->start_workspace + cb->workspace_size)6167{6168PCRE2_DEBUG_UNREACHABLE();6169*errorcodeptr = ERR52; /* Over-ran workspace - internal error */6170cb->erroroffset = 0;6171return 0;6172}6173/* LCOV_EXCL_STOP */61746175if (code > cb->start_workspace + cb->workspace_size -6176WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */6177{6178*errorcodeptr = ERR86; /* Pattern too complicated */6179cb->erroroffset = 0;6180return 0;6181}61826183/* There is at least one situation where code goes backwards: this is the6184case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier6185is processed, the whole class is eliminated. However, it is created first,6186so we have to allow memory for it. Therefore, don't ever reduce the length6187at this point. */61886189if (code < last_code) code = last_code;61906191/* If the next thing is not a quantifier, we add the length of the previous6192item into the total, and reset the code pointer to the start of the6193workspace. Otherwise leave the previous item available to be quantified. */61946195if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)6196{6197if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))6198{6199*errorcodeptr = ERR20; /* Integer overflow */6200cb->erroroffset = 0;6201return 0;6202}6203*lengthptr += (PCRE2_SIZE)(code - orig_code);6204if (*lengthptr > MAX_PATTERN_SIZE)6205{6206*errorcodeptr = ERR20; /* Pattern is too large */6207cb->erroroffset = 0;6208return 0;6209}6210code = orig_code;6211}62126213/* Remember where this code item starts so we can catch the "backwards"6214case above next time round. */62156216last_code = code;6217}62186219/* Process the next parsed pattern item. If it is not a quantifier, remember6220where it starts so that it can be quantified when a quantifier follows.6221Checking for the legality of quantifiers happens in parse_regex(), except for6222a quantifier after an assertion that is a condition. 
*/62236224if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)6225{6226previous = code;6227if (matched_char && !had_accept) okreturn = 1;6228}62296230previous_matched_char = matched_char;6231matched_char = FALSE;6232note_group_empty = FALSE;6233skipunits = 0; /* Default value for most subgroups */62346235switch(meta)6236{6237/* ===================================================================*/6238/* The branch terminates at pattern end or | or ) */62396240case META_END:6241case META_ALT:6242case META_KET:6243*firstcuptr = firstcu;6244*firstcuflagsptr = firstcuflags;6245*reqcuptr = reqcu;6246*reqcuflagsptr = reqcuflags;6247*codeptr = code;6248*pptrptr = pptr;6249return okreturn;625062516252/* ===================================================================*/6253/* Handle single-character metacharacters. In multiline mode, ^ disables6254the setting of any following char as a first character. */62556256case META_CIRCUMFLEX:6257if ((options & PCRE2_MULTILINE) != 0)6258{6259if (firstcuflags == REQ_UNSET)6260zerofirstcuflags = firstcuflags = REQ_NONE;6261*code++ = OP_CIRCM;6262}6263else *code++ = OP_CIRC;6264break;62656266case META_DOLLAR:6267*code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;6268break;62696270/* There can never be a first char if '.' is first, whatever happens about6271repeats. The value of reqcu doesn't change either. */62726273case META_DOT:6274matched_char = TRUE;6275if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;6276zerofirstcu = firstcu;6277zerofirstcuflags = firstcuflags;6278zeroreqcu = reqcu;6279zeroreqcuflags = reqcuflags;6280*code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;6281break;628262836284/* ===================================================================*/6285/* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.6286Otherwise, an initial ']' is taken as a data character. 
When empty classes6287are allowed, [] must generate an empty class - we have no dedicated opcode6288to optimise the representation, but it's a rare case (the '(*FAIL)'6289construct would be a clearer way for a pattern author to represent a6290non-matching branch, but it does have different semantics to '[]' if both6291are followed by a quantifier). The empty-negated [^] matches any character,6292so is useful: generate OP_ALLANY for this. */62936294case META_CLASS_EMPTY:6295case META_CLASS_EMPTY_NOT:6296matched_char = TRUE;6297if (meta == META_CLASS_EMPTY_NOT) *code++ = OP_ALLANY;6298else6299{6300*code++ = OP_CLASS;6301memset(code, 0, 32);6302code += 32 / sizeof(PCRE2_UCHAR);6303}63046305if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;6306zerofirstcu = firstcu;6307zerofirstcuflags = firstcuflags;6308break;630963106311/* ===================================================================*/6312/* Non-empty character class. If the included characters are all < 256, we6313build a 32-byte bitmap of the permitted characters, except in the special6314case where there is only one such character. For negated classes, we build6315the map as usual, then invert it at the end. However, we use a different6316opcode so that data characters > 255 can be handled correctly.63176318If the class contains characters outside the 0-255 range, a different6319opcode is compiled. It may optionally have a bit map for characters < 256,6320but those above are explicitly listed afterwards. A flag code unit tells6321whether the bitmap is present, and whether this is a negated class or6322not. */63236324case META_CLASS_NOT:6325case META_CLASS:6326matched_char = TRUE;63276328/* Check for complex extended classes and handle them separately. 
*/63296330if ((*pptr & CLASS_IS_ECLASS) != 0)6331{6332if (!PRIV(compile_class_nested)(options, xoptions, &pptr, &code,6333errorcodeptr, cb, lengthptr))6334return 0;6335goto CLASS_END_PROCESSING;6336}63376338/* We can optimize the case of a single character in a class by generating6339OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's6340negative. In the negative case there can be no first char if this item is6341first, whatever repeat count may follow. In the case of reqcu, save the6342previous value for reinstating. */63436344/* NOTE: at present this optimization is not effective if the only6345character in a class in 32-bit, non-UCP mode has its top bit set. */63466347if (pptr[1] < META_END && pptr[2] == META_CLASS_END)6348{6349uint32_t c = pptr[1];63506351pptr += 2; /* Move on to class end */6352if (meta == META_CLASS) /* A positive one-char class can be */6353{ /* handled as a normal literal character. */6354meta = c; /* Set up the character */6355goto NORMAL_CHAR_SET;6356}63576358/* Handle a negative one-character class */63596360zeroreqcu = reqcu;6361zeroreqcuflags = reqcuflags;6362if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;6363zerofirstcu = firstcu;6364zerofirstcuflags = firstcuflags;63656366/* For caseless UTF or UCP mode, check whether this character has more6367than one other case. If so, generate a special OP_NOTPROP item instead of6368OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any6369caseless set that starts with an ASCII character. If the character is6370affected by the special Turkish rules, hardcode the not-matching6371characters using a caseset. */63726373#ifdef SUPPORT_UNICODE6374if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)6375{6376uint32_t caseset;63776378if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==6379PCRE2_EXTRA_TURKISH_CASING &&6380UCD_ANY_I(c))6381{6382caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 
0 : 3);6383}6384else if ((caseset = UCD_CASESET(c)) != 0 &&6385(xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&6386PRIV(ucd_caseless_sets)[caseset] < 128)6387{6388caseset = 0; /* Ignore the caseless set if it's restricted. */6389}63906391if (caseset != 0)6392{6393*code++ = OP_NOTPROP;6394*code++ = PT_CLIST;6395*code++ = caseset;6396break; /* We are finished with this class */6397}6398}6399#endif6400/* Char has only one other (usable) case, or UCP not available */64016402*code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;6403code += PUTCHAR(c, code);6404break; /* We are finished with this class */6405} /* End of 1-char optimization */64066407/* Handle character classes that contain more than just one literal6408character. If there are exactly two characters in a positive class, see if6409they are case partners. This can be optimized to generate a caseless single6410character match (which also sets first/required code units if relevant).6411When casing restrictions apply, ignore a caseless set if both characters6412are ASCII. When Turkish casing applies, an 'i' does not match its normal6413Unicode "othercase". 
*/64146415if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&6416pptr[3] == META_CLASS_END)6417{6418uint32_t c = pptr[1];64196420#ifdef SUPPORT_UNICODE6421if ((UCD_CASESET(c) == 0 ||6422((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&6423c < 128 && pptr[2] < 128)) &&6424!((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==6425PCRE2_EXTRA_TURKISH_CASING &&6426UCD_ANY_I(c)))6427#endif6428{6429uint32_t d;64306431#ifdef SUPPORT_UNICODE6432if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else6433#endif6434{6435#if PCRE2_CODE_UNIT_WIDTH != 86436if (c > 255) d = c; else6437#endif6438d = TABLE_GET(c, cb->fcc, c);6439}64406441if (c != d && pptr[2] == d)6442{6443pptr += 3; /* Move on to class end */6444meta = c;6445if ((options & PCRE2_CASELESS) == 0)6446{6447reset_caseful = TRUE;6448options |= PCRE2_CASELESS;6449req_caseopt = REQ_CASELESS;6450}6451goto CLASS_CASELESS_CHAR;6452}6453}6454}64556456/* Now emit the OP_CLASS/OP_NCLASS/OP_XCLASS/OP_ALLANY opcode. */64576458pptr = PRIV(compile_class_not_nested)(options, xoptions, pptr + 1,6459&code, meta == META_CLASS_NOT, NULL,6460errorcodeptr, cb, lengthptr);6461if (pptr == NULL) return 0;6462PCRE2_ASSERT(*pptr == META_CLASS_END);64636464CLASS_END_PROCESSING:64656466/* If this class is the first thing in the branch, there can be no first6467char setting, whatever the repeat count. Any reqcu setting must remain6468unchanged after any kind of repeat. */64696470if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;6471zerofirstcu = firstcu;6472zerofirstcuflags = firstcuflags;6473zeroreqcu = reqcu;6474zeroreqcuflags = reqcuflags;6475break; /* End of class processing */647664776478/* ===================================================================*/6479/* Deal with (*VERB)s. */64806481/* Check for open captures before ACCEPT and close those that are within6482the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an6483assertion. 
In the first pass, just accumulate the length required;6484otherwise hitting (*ACCEPT) inside many nested parentheses can cause6485workspace overflow. Do not set firstcu after *ACCEPT. */64866487case META_ACCEPT:6488cb->had_accept = had_accept = TRUE;6489for (oc = open_caps;6490oc != NULL && oc->assert_depth >= cb->assert_depth;6491oc = oc->next)6492{6493if (lengthptr != NULL)6494{6495*lengthptr += CU2BYTES(1) + IMM2_SIZE;6496}6497else6498{6499*code++ = OP_CLOSE;6500PUT2INC(code, 0, oc->number);6501}6502}6503*code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;6504if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;6505break;65066507case META_PRUNE:6508case META_SKIP:6509cb->had_pruneorskip = TRUE;6510PCRE2_FALLTHROUGH /* Fall through */6511case META_COMMIT:6512case META_FAIL:6513*code++ = verbops[(meta - META_MARK) >> 16];6514break;65156516case META_THEN:6517cb->external_flags |= PCRE2_HASTHEN;6518*code++ = OP_THEN;6519break;65206521/* Handle verbs with arguments. Arguments can be very long, especially in652216- and 32-bit modes, and can overflow the workspace in the first pass.6523However, the argument length is constrained to be small enough to fit in6524one code unit. This check happens in parse_regex(). In the first pass,6525instead of putting the argument into memory, we just update the length6526counter and set up an empty argument. */65276528case META_THEN_ARG:6529cb->external_flags |= PCRE2_HASTHEN;6530goto VERB_ARG;65316532case META_PRUNE_ARG:6533case META_SKIP_ARG:6534cb->had_pruneorskip = TRUE;6535PCRE2_FALLTHROUGH /* Fall through */6536case META_MARK:6537case META_COMMIT_ARG:6538VERB_ARG:6539*code++ = verbops[(meta - META_MARK) >> 16];6540/* The length is in characters. 
*/6541verbarglen = *(++pptr);6542verbculen = 0;6543tempcode = code++;6544for (int i = 0; i < (int)verbarglen; i++)6545{6546meta = *(++pptr);6547#ifdef SUPPORT_UNICODE6548if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else6549#endif6550{6551mclength = 1;6552mcbuffer[0] = meta;6553}6554if (lengthptr != NULL) *lengthptr += mclength; else6555{6556memcpy(code, mcbuffer, CU2BYTES(mclength));6557code += mclength;6558verbculen += mclength;6559}6560}65616562*tempcode = verbculen; /* Fill in the code unit length */6563*code++ = 0; /* Terminating zero */6564break;656565666567/* ===================================================================*/6568/* Handle options change. The new setting must be passed back for use in6569subsequent branches. Reset the greedy defaults and the case value for6570firstcu and reqcu. */65716572case META_OPTIONS:6573*optionsptr = options = *(++pptr);6574*xoptionsptr = xoptions = *(++pptr);6575greedy_default = ((options & PCRE2_UNGREEDY) != 0);6576greedy_non_default = greedy_default ^ 1;6577req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;6578break;65796580/* ===================================================================*/6581/* Handle scan substring. Scan substring assertion starts with META_SCS,6582which recursively calls compile_branch. The first opcode processed by6583this recursive call is always META_OFFSET. 
*/65846585case META_OFFSET:6586if (lengthptr != NULL)6587{6588pptr = PRIV(compile_parse_scan_substr_args)(pptr, errorcodeptr, cb, lengthptr);6589if (pptr == NULL)6590return 0;6591break;6592}65936594while (TRUE)6595{6596int count, index;6597named_group *ng;65986599switch (META_CODE(*pptr))6600{6601case META_OFFSET:6602pptr++;6603SKIPOFFSET(pptr);6604continue;66056606case META_CAPTURE_NAME:6607ng = cb->named_groups + pptr[1];6608pptr += 2;6609count = 0;6610index = 0;66116612if (!PRIV(compile_find_dupname_details)(ng->name, ng->length, &index,6613&count, errorcodeptr, cb)) return 0;66146615code[0] = OP_DNCREF;6616PUT2(code, 1, index);6617PUT2(code, 1 + IMM2_SIZE, count);6618code += 1 + 2 * IMM2_SIZE;6619continue;66206621case META_CAPTURE_NUMBER:6622pptr += 2;6623if (pptr[-1] == 0) continue;66246625code[0] = OP_CREF;6626PUT2(code, 1, pptr[-1]);6627code += 1 + IMM2_SIZE;6628continue;66296630default:6631break;6632}66336634break;6635}6636--pptr;6637break;66386639case META_SCS:6640bravalue = OP_ASSERT_SCS;6641cb->assert_depth += 1;6642goto GROUP_PROCESS;664366446645/* ===================================================================*/6646/* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous6647because it could be a numerical check on recursion, or a name check on a6648group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that6649we can handle it either way. We first try for a name; if not found, process6650the number. 
*/66516652case META_COND_RNUMBER: /* (?(Rdigits) */6653case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */6654case META_COND_RNAME: /* (?(R&name) - test for recursion */6655bravalue = OP_COND;66566657if (lengthptr != NULL)6658{6659uint32_t i;6660PCRE2_SPTR name;6661named_group *ng;6662uint32_t *start_pptr = pptr;6663uint32_t length = *(++pptr);66646665GETPLUSOFFSET(offset, pptr);6666name = cb->start_pattern + offset;66676668/* In the first pass, the names generated in the pre-pass are available,6669but the main name table has not yet been created. Scan the list of names6670generated in the pre-pass in order to get a number and whether or not6671this name is duplicated. If it is not duplicated, we can handle it as a6672numerical group. */66736674ng = PRIV(compile_find_named_group)(name, length, cb);66756676if (ng == NULL)6677{6678/* If the name was not found we have a bad reference, unless we are6679dealing with R<digits>, which is treated as a recursion test by6680number. */66816682groupnumber = 0;6683if (meta == META_COND_RNUMBER)6684{6685for (i = 1; i < length; i++)6686{6687groupnumber = groupnumber * 10 + (name[i] - CHAR_0);6688if (groupnumber > MAX_GROUP_NUMBER)6689{6690*errorcodeptr = ERR61;6691cb->erroroffset = offset + i;6692return 0;6693}6694}6695}66966697if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)6698{6699*errorcodeptr = ERR15;6700cb->erroroffset = offset;6701return 0;6702}67036704/* (?Rdigits) treated as a recursion reference by number. A value of6705zero (which is the result of both (?R) and (?R0)) means "any", and is6706translated into RREF_ANY (which is 0xffff). */67076708if (groupnumber == 0) groupnumber = RREF_ANY;6709PCRE2_ASSERT(start_pptr[0] == META_COND_RNUMBER);6710start_pptr[1] = groupnumber;6711skipunits = 1+IMM2_SIZE;6712goto GROUP_PROCESS_NOTE_EMPTY;6713}67146715/* From here on, we know we have a name (not a number),6716so treat META_COND_RNUMBER the same as META_COND_NAME. 
*/6717if (meta == META_COND_RNUMBER) meta = META_COND_NAME;67186719if ((ng->hash_dup & NAMED_GROUP_IS_DUPNAME) == 0)6720{6721/* Found a non-duplicated name. Since it is a global,6722it is enough to update it in the pre-processing phase. */6723if (ng->number > cb->top_backref) cb->top_backref = ng->number;67246725start_pptr[0] = meta;6726start_pptr[1] = ng->number;67276728skipunits = 1 + IMM2_SIZE;6729goto GROUP_PROCESS_NOTE_EMPTY;6730}67316732/* We have a duplicated name. In the compile pass we have to search the6733main table in order to get the index and count values. */67346735start_pptr[0] = meta | 1;6736start_pptr[1] = (uint32_t)(ng - cb->named_groups);67376738/* A duplicated name was found. Note that if an R<digits> name is found6739(META_COND_RNUMBER), it is a reference test, not a recursion test. */6740skipunits = 1 + 2 * IMM2_SIZE;6741}6742else6743{6744/* Otherwise lengthptr equals to NULL,6745which is the second phase of compilation. */6746int count, index;6747named_group *ng;67486749/* Generate code using the data6750collected in the pre-processing phase. */67516752if (meta == META_COND_RNUMBER)6753{6754code[1+LINK_SIZE] = OP_RREF;6755PUT2(code, 2 + LINK_SIZE, pptr[1]);6756skipunits = 1 + IMM2_SIZE;6757pptr += 1 + SIZEOFFSET;6758goto GROUP_PROCESS_NOTE_EMPTY;6759}67606761if (meta_arg == 0)6762{6763code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;6764PUT2(code, 2 + LINK_SIZE, pptr[1]);6765skipunits = 1 + IMM2_SIZE;6766pptr += 1 + SIZEOFFSET;6767goto GROUP_PROCESS_NOTE_EMPTY;6768}67696770ng = cb->named_groups + pptr[1];6771count = 0; /* Values for first pass (avoids compiler warning) */6772index = 0;67736774/* The failed case is an internal error. */6775if (!PRIV(compile_find_dupname_details)(ng->name, ng->length, &index,6776&count, errorcodeptr, cb)) return 0;67776778/* A duplicated name was found. Note that if an R<digits> name is found6779(META_COND_RNUMBER), it is a reference test, not a recursion test. 
*/67806781code[1 + LINK_SIZE] = (meta == META_COND_RNAME)? OP_DNRREF : OP_DNCREF;67826783/* Insert appropriate data values. */6784PUT2(code, 2 + LINK_SIZE, index);6785PUT2(code, 2 + LINK_SIZE + IMM2_SIZE, count);6786skipunits = 1 + 2 * IMM2_SIZE;6787pptr += 1 + SIZEOFFSET;6788}67896790PCRE2_ASSERT(meta != META_CAPTURE_NAME);6791goto GROUP_PROCESS_NOTE_EMPTY;67926793/* The DEFINE condition is always false. Its internal groups may never6794be called, so matched_char must remain false, hence the jump to6795GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */67966797case META_COND_DEFINE:6798bravalue = OP_COND;6799GETPLUSOFFSET(offset, pptr);6800code[1+LINK_SIZE] = OP_DEFINE;6801skipunits = 1;6802goto GROUP_PROCESS;68036804/* Conditional test of a group's being set. */68056806case META_COND_NUMBER:6807bravalue = OP_COND;6808GETPLUSOFFSET(offset, pptr);68096810groupnumber = *(++pptr);6811if (groupnumber > cb->bracount)6812{6813*errorcodeptr = ERR15;6814cb->erroroffset = offset;6815return 0;6816}6817if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;68186819/* Point at initial ( for too many branches error */6820offset -= 2;6821code[1+LINK_SIZE] = OP_CREF;6822skipunits = 1+IMM2_SIZE;6823PUT2(code, 2+LINK_SIZE, groupnumber);6824goto GROUP_PROCESS_NOTE_EMPTY;68256826/* Test for the PCRE2 version. */68276828case META_COND_VERSION:6829bravalue = OP_COND;6830if (pptr[1] > 0)6831code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||6832(PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?6833OP_TRUE : OP_FALSE;6834else6835code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?6836OP_TRUE : OP_FALSE;6837skipunits = 1;6838pptr += 3;6839goto GROUP_PROCESS_NOTE_EMPTY;68406841/* The condition is an assertion, possibly preceded by a callout. 
*/68426843case META_COND_ASSERT:6844bravalue = OP_COND;6845goto GROUP_PROCESS_NOTE_EMPTY;684668476848/* ===================================================================*/6849/* Handle all kinds of nested bracketed groups. The non-capturing,6850non-conditional cases are here; others come to GROUP_PROCESS via goto. */68516852case META_LOOKAHEAD:6853bravalue = OP_ASSERT;6854cb->assert_depth += 1;6855goto GROUP_PROCESS;68566857case META_LOOKAHEAD_NA:6858bravalue = OP_ASSERT_NA;6859cb->assert_depth += 1;6860goto GROUP_PROCESS;68616862/* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird6863thing to do, but Perl allows all assertions to be quantified, and when6864they contain capturing parentheses there may be a potential use for6865this feature. Not that that applies to a quantified (?!) but we allow6866it for uniformity. */68676868case META_LOOKAHEADNOT:6869if (pptr[1] == META_KET &&6870(pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))6871{6872*code++ = OP_FAIL;6873pptr++;6874}6875else6876{6877bravalue = OP_ASSERT_NOT;6878cb->assert_depth += 1;6879goto GROUP_PROCESS;6880}6881break;68826883case META_LOOKBEHIND:6884bravalue = OP_ASSERTBACK;6885cb->assert_depth += 1;6886goto GROUP_PROCESS;68876888case META_LOOKBEHINDNOT:6889bravalue = OP_ASSERTBACK_NOT;6890cb->assert_depth += 1;6891goto GROUP_PROCESS;68926893case META_LOOKBEHIND_NA:6894bravalue = OP_ASSERTBACK_NA;6895cb->assert_depth += 1;6896goto GROUP_PROCESS;68976898case META_ATOMIC:6899bravalue = OP_ONCE;6900goto GROUP_PROCESS_NOTE_EMPTY;69016902case META_SCRIPT_RUN:6903bravalue = OP_SCRIPT_RUN;6904goto GROUP_PROCESS_NOTE_EMPTY;69056906case META_NOCAPTURE:6907bravalue = OP_BRA;6908/* Fall through */69096910/* Process nested bracketed regex. The nesting depth is maintained for the6911benefit of the stackguard function. The test for too deep nesting is now6912done in parse_regex(). 
Assertion and DEFINE groups come to GROUP_PROCESS;6913others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take6914note of whether or not they may match an empty string. */69156916GROUP_PROCESS_NOTE_EMPTY:6917note_group_empty = TRUE;69186919GROUP_PROCESS:6920cb->parens_depth += 1;6921*code = bravalue;6922pptr++;6923tempcode = code;6924tempreqvary = cb->req_varyopt; /* Save value before group */6925length_prevgroup = 0; /* Initialize for pre-compile phase */69266927if ((group_return =6928compile_regex(6929options, /* The options state */6930xoptions, /* The extra options state */6931&tempcode, /* Where to put code (updated) */6932&pptr, /* Input pointer (updated) */6933errorcodeptr, /* Where to put an error message */6934skipunits, /* Skip over bracket number */6935&subfirstcu, /* For possible first char */6936&subfirstcuflags,6937&subreqcu, /* For possible last char */6938&subreqcuflags,6939bcptr, /* Current branch chain */6940open_caps, /* Pointer to capture stack */6941cb, /* Compile data block */6942(lengthptr == NULL)? NULL : /* Actual compile phase */6943&length_prevgroup /* Pre-compile phase */6944)) == 0)6945return 0; /* Error */69466947cb->parens_depth -= 1;69486949/* If that was a non-conditional significant group (not an assertion, not a6950DEFINE) that matches at least one character, then the current item matches6951a character. Conditionals are handled below. */69526953if (note_group_empty && bravalue != OP_COND && group_return > 0)6954matched_char = TRUE;69556956/* If we've just compiled an assertion, pop the assert depth. 
*/69576958if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERT_SCS)6959cb->assert_depth -= 1;69606961/* At the end of compiling, code is still pointing to the start of the6962group, while tempcode has been updated to point past the end of the group.6963The parsed pattern pointer (pptr) is on the closing META_KET.69646965If this is a conditional bracket, check that there are no more than6966two branches in the group, or just one if it's a DEFINE group. We do this6967in the real compile phase, not in the pre-pass, where the whole group may6968not be available. */69696970if (bravalue == OP_COND && lengthptr == NULL)6971{6972PCRE2_UCHAR *tc = code;6973int condcount = 0;69746975do {6976condcount++;6977tc += GET(tc,1);6978}6979while (*tc != OP_KET);69806981/* A DEFINE group is never obeyed inline (the "condition" is always6982false). It must have only one branch. Having checked this, change the6983opcode to OP_FALSE. */69846985if (code[LINK_SIZE+1] == OP_DEFINE)6986{6987if (condcount > 1)6988{6989cb->erroroffset = offset;6990*errorcodeptr = ERR54;6991return 0;6992}6993code[LINK_SIZE+1] = OP_FALSE;6994bravalue = OP_DEFINE; /* A flag to suppress char handling below */6995}69966997/* A "normal" conditional group. If there is just one branch, we must not6998make use of its firstcu or reqcu, because this is equivalent to an6999empty second branch. Also, it may match an empty string. If there are two7000branches, this item must match a character if the group must. */70017002else7003{7004if (condcount > 2)7005{7006cb->erroroffset = offset;7007*errorcodeptr = ERR27;7008return 0;7009}7010if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;7011else if (group_return > 0) matched_char = TRUE;7012}7013}70147015/* In the pre-compile phase, update the length by the length of the group,7016less the brackets at either end. 
Then reduce the compiled code to just a7017set of non-capturing brackets so that it doesn't use much memory if it is7018duplicated by a quantifier.*/70197020if (lengthptr != NULL)7021{7022if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)7023{7024*errorcodeptr = ERR20;7025return 0;7026}7027*lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;7028code++; /* This already contains bravalue */7029PUTINC(code, 0, 1 + LINK_SIZE);7030*code++ = OP_KET;7031PUTINC(code, 0, 1 + LINK_SIZE);7032break; /* No need to waste time with special character handling */7033}70347035/* Otherwise update the main code pointer to the end of the group. */70367037code = tempcode;70387039/* For a DEFINE group, required and first character settings are not7040relevant. */70417042if (bravalue == OP_DEFINE) break;70437044/* Handle updating of the required and first code units for other types of7045group. Update for normal brackets of all kinds, and conditions with two7046branches (see code above). If the bracket is followed by a quantifier with7047zero repeat, we have to back off. Hence the definition of zeroreqcu and7048zerofirstcu outside the main loop so that they can be accessed for the back7049off. */70507051zeroreqcu = reqcu;7052zeroreqcuflags = reqcuflags;7053zerofirstcu = firstcu;7054zerofirstcuflags = firstcuflags;7055groupsetfirstcu = FALSE;70567057if (bravalue >= OP_ONCE) /* Not an assertion */7058{7059/* If we have not yet set a firstcu in this branch, take it from the7060subpattern, remembering that it was set here so that a repeat of more7061than one can replicate it as reqcu if necessary. If the subpattern has7062no firstcu, set "none" for the whole branch. In both cases, a zero7063repeat forces firstcu to "none". 
*/70647065if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)7066{7067if (subfirstcuflags < REQ_NONE)7068{7069firstcu = subfirstcu;7070firstcuflags = subfirstcuflags;7071groupsetfirstcu = TRUE;7072}7073else firstcuflags = REQ_NONE;7074zerofirstcuflags = REQ_NONE;7075}70767077/* If firstcu was previously set, convert the subpattern's firstcu7078into reqcu if there wasn't one, using the vary flag that was in7079existence beforehand. */70807081else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)7082{7083subreqcu = subfirstcu;7084subreqcuflags = subfirstcuflags | tempreqvary;7085}70867087/* If the subpattern set a required code unit (or set a first code unit7088that isn't really the first code unit - see above), set it. */70897090if (subreqcuflags < REQ_NONE)7091{7092reqcu = subreqcu;7093reqcuflags = subreqcuflags;7094}7095}70967097/* For a forward assertion, we take the reqcu, if set, provided that the7098group has also set a firstcu. This can be helpful if the pattern that7099follows the assertion doesn't set a different char. For example, it's7100useful for /(?=abcde).+/. We can't set firstcu for an assertion, however7101because it leads to incorrect effect for patterns such as /(?=a)a.+/ when7102the "real" "a" would then become a reqcu instead of a firstcu. This is7103overcome by a scan at the end if there's no firstcu, looking for an7104asserted first char. A similar effect for patterns like /(?=.*X)X$/ means7105we must only take the reqcu when the group also set a firstcu. Otherwise,7106in that example, 'X' ends up set for both. */71077108else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&7109subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)7110{7111reqcu = subreqcu;7112reqcuflags = subreqcuflags;7113}71147115break; /* End of nested group handling */711671177118/* ===================================================================*/7119/* Handle named backreferences and recursions. 
*/71207121case META_BACKREF_BYNAME:7122case META_RECURSE_BYNAME:7123{7124int count, index;7125PCRE2_SPTR name;7126named_group *ng;7127uint32_t length = *(++pptr);71287129GETPLUSOFFSET(offset, pptr);7130name = cb->start_pattern + offset;71317132/* In the first pass, the names generated in the pre-pass are available,7133but the main name table has not yet been created. Scan the list of names7134generated in the pre-pass in order to get a number and whether or not7135this name is duplicated. */71367137ng = PRIV(compile_find_named_group)(name, length, cb);71387139if (ng == NULL)7140{7141/* If the name was not found we have a bad reference. */7142*errorcodeptr = ERR15;7143cb->erroroffset = offset;7144return 0;7145}71467147groupnumber = ng->number;71487149/* For a recursion, that's all that is needed. We can now go to7150the code that handles numerical recursion, applying it to the first7151group with the given name. */71527153if (meta == META_RECURSE_BYNAME)7154{7155meta_arg = groupnumber;7156goto HANDLE_NUMERICAL_RECURSION;7157}71587159/* For a back reference, update the back reference map and the7160maximum back reference. */71617162cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;7163if (groupnumber > cb->top_backref)7164cb->top_backref = groupnumber;71657166/* If a back reference name is not duplicated, we can handle it as7167a numerical reference. */71687169if ((ng->hash_dup & NAMED_GROUP_IS_DUPNAME) == 0)7170{7171meta_arg = groupnumber;7172goto HANDLE_SINGLE_REFERENCE;7173}71747175/* If a back reference name is duplicated, we generate a different7176opcode to a numerical back reference. In the second pass we must7177search for the index and count in the final name table. 
*/71787179count = 0; /* Values for first pass (avoids compiler warning) */7180index = 0;7181if (lengthptr == NULL && !PRIV(compile_find_dupname_details)(name, length,7182&index, &count, errorcodeptr, cb)) return 0;71837184if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;7185*code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;7186PUT2INC(code, 0, index);7187PUT2INC(code, 0, count);7188if ((options & PCRE2_CASELESS) != 0)7189*code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?7190REFI_FLAG_CASELESS_RESTRICT : 0) |7191(((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?7192REFI_FLAG_TURKISH_CASING : 0);7193}7194break;719571967197/* ===================================================================*/7198/* Handle a numerical callout. */71997200case META_CALLOUT_NUMBER:7201code[0] = OP_CALLOUT;7202PUT(code, 1, pptr[1]); /* Offset to next pattern item */7203PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */7204code[1 + 2*LINK_SIZE] = pptr[3];7205pptr += 3;7206code += PRIV(OP_lengths)[OP_CALLOUT];7207break;720872097210/* ===================================================================*/7211/* Handle a callout with a string argument. In the pre-pass we just compute7212the length without generating anything. The length in pptr[3] includes both7213delimiters; in the actual compile only the first one is copied, but a7214terminating zero is added. Any doubled delimiters within the string make7215this an overestimate, but it is not worth bothering about. */72167217case META_CALLOUT_STRING:7218if (lengthptr != NULL)7219{7220*lengthptr += pptr[3] + (1 + 4*LINK_SIZE);7221pptr += 3;7222SKIPOFFSET(pptr);7223}72247225/* In the real compile we can copy the string. The starting delimiter is7226included so that the client can discover it if they want. We also pass the7227start offset to help a script language give better error messages. 
*/72287229else7230{7231PCRE2_SPTR pp;7232uint32_t delimiter;7233uint32_t length = pptr[3];7234PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);72357236code[0] = OP_CALLOUT_STR;7237PUT(code, 1, pptr[1]); /* Offset to next pattern item */7238PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */72397240pptr += 3;7241GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */7242pp = cb->start_pattern + offset;7243delimiter = *callout_string++ = *pp++;7244if (delimiter == CHAR_LEFT_CURLY_BRACKET)7245delimiter = CHAR_RIGHT_CURLY_BRACKET;7246PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */72477248/* The syntax of the pattern was checked in the parsing scan. The length7249includes both delimiters, but we have passed the opening one just above,7250so we reduce length before testing it. The test is for > 1 because we do7251not want to copy the final delimiter. This also ensures that pp[1] is7252accessible. */72537254while (--length > 1)7255{7256if (*pp == delimiter && pp[1] == delimiter)7257{7258*callout_string++ = delimiter;7259pp += 2;7260length--;7261}7262else *callout_string++ = *pp++;7263}7264*callout_string++ = CHAR_NUL;72657266/* Set the length of the entire item, the advance to its end. */72677268PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));7269code = callout_string;7270}7271break;727272737274/* ===================================================================*/7275/* Handle repetition. The different types are all sorted out in the parsing7276pass. 
*/72777278case META_MINMAX_PLUS:7279case META_MINMAX_QUERY:7280case META_MINMAX:7281repeat_min = *(++pptr);7282repeat_max = *(++pptr);7283goto REPEAT;72847285case META_ASTERISK:7286case META_ASTERISK_PLUS:7287case META_ASTERISK_QUERY:7288repeat_min = 0;7289repeat_max = REPEAT_UNLIMITED;7290goto REPEAT;72917292case META_PLUS:7293case META_PLUS_PLUS:7294case META_PLUS_QUERY:7295repeat_min = 1;7296repeat_max = REPEAT_UNLIMITED;7297goto REPEAT;72987299case META_QUERY:7300case META_QUERY_PLUS:7301case META_QUERY_QUERY:7302repeat_min = 0;7303repeat_max = 1;73047305REPEAT:7306if (previous_matched_char && repeat_min > 0) matched_char = TRUE;73077308/* Remember whether this is a variable length repeat, and default to7309single-char opcodes. */73107311reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;73127313/* Adjust first and required code units for a zero repeat. */73147315if (repeat_min == 0)7316{7317firstcu = zerofirstcu;7318firstcuflags = zerofirstcuflags;7319reqcu = zeroreqcu;7320reqcuflags = zeroreqcuflags;7321}73227323/* Note the greediness and possessiveness. */73247325switch (meta)7326{7327case META_MINMAX_PLUS:7328case META_ASTERISK_PLUS:7329case META_PLUS_PLUS:7330case META_QUERY_PLUS:7331repeat_type = 0; /* Force greedy */7332possessive_quantifier = TRUE;7333break;73347335case META_MINMAX_QUERY:7336case META_ASTERISK_QUERY:7337case META_PLUS_QUERY:7338case META_QUERY_QUERY:7339repeat_type = greedy_non_default;7340possessive_quantifier = FALSE;7341break;73427343default:7344repeat_type = greedy_default;7345possessive_quantifier = FALSE;7346break;7347}73487349/* Save start of previous item, in case we have to move it up in order to7350insert something before it, and remember what it was. */73517352PCRE2_ASSERT(previous != NULL);7353tempcode = previous;7354op_previous = *previous;73557356/* Now handle repetition for the different types of item. 
If the repeat7357minimum and the repeat maximum are both 1, we can ignore the quantifier for7358non-parenthesized items, as they have only one alternative. For anything in7359parentheses, we must not ignore if {1} is possessive. */73607361switch (op_previous)7362{7363/* If previous was a character or negated character match, abolish the7364item and generate a repeat item instead. If a char item has a minimum of7365more than one, ensure that it is set in reqcu - it might not be if a7366sequence such as x{3} is the first thing in a branch because the x will7367have gone into firstcu instead. */73687369case OP_CHAR:7370case OP_CHARI:7371case OP_NOT:7372case OP_NOTI:7373if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;7374op_type = chartypeoffset[op_previous - OP_CHAR];73757376/* Deal with UTF characters that take up more than one code unit. */73777378#ifdef MAYBE_UTF_MULTI7379if (utf && NOT_FIRSTCU(code[-1]))7380{7381PCRE2_UCHAR *lastchar = code - 1;7382BACKCHAR(lastchar);7383mclength = (uint32_t)(code - lastchar); /* Length of UTF character */7384memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */7385}7386else7387#endif /* MAYBE_UTF_MULTI */73887389/* Handle the case of a single code unit - either with no UTF support, or7390with UTF disabled, or for a single-code-unit UTF character. In the latter7391case, for a repeated positive match, get the caseless flag for the7392required code unit from the previous character, because a class like [Aa]7393sets a caseless A but by now the req_caseopt flag has been reset. 
*/73947395{7396mcbuffer[0] = code[-1];7397mclength = 1;7398if (op_previous <= OP_CHARI && repeat_min > 1)7399{7400reqcu = mcbuffer[0];7401reqcuflags = cb->req_varyopt;7402if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;7403}7404}7405goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */74067407/* If previous was a character class or a back reference, we put the7408repeat stuff after it, but just skip the item if the repeat was {0,0}. */74097410#ifdef SUPPORT_WIDE_CHARS7411case OP_XCLASS:7412case OP_ECLASS:7413#endif7414case OP_CLASS:7415case OP_NCLASS:7416case OP_REF:7417case OP_REFI:7418case OP_DNREF:7419case OP_DNREFI:74207421if (repeat_max == 0)7422{7423code = previous;7424goto END_REPEAT;7425}7426if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;74277428if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)7429*code++ = OP_CRSTAR + repeat_type;7430else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)7431*code++ = OP_CRPLUS + repeat_type;7432else if (repeat_min == 0 && repeat_max == 1)7433*code++ = OP_CRQUERY + repeat_type;7434else7435{7436*code++ = OP_CRRANGE + repeat_type;7437PUT2INC(code, 0, repeat_min);7438if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */7439PUT2INC(code, 0, repeat_max);7440}7441break;74427443/* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets7444because pcre2_match() could not handle backtracking into recursively7445called groups. Now that this backtracking is available, we no longer need7446to do this. However, we still need to replicate recursions as we do for7447groups so as to have independent backtracking points. We can replicate7448for the minimum number of repeats directly. For optional repeats we now7449wrap the recursion in OP_BRA brackets and make use of the bracket7450repetition. 
*/74517452case OP_RECURSE:7453if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)7454goto END_REPEAT;74557456/* Generate unwrapped repeats for a non-zero minimum, except when the7457minimum is 1 and the maximum unlimited, because that can be handled with7458OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the7459minimum, we just need to generate the appropriate additional copies.7460Otherwise we need to generate one more, to simulate the situation when7461the minimum is zero. */74627463if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))7464{7465int replicate = repeat_min;74667467if (repeat_min == repeat_max) replicate--;74687469/* In the pre-compile phase, we don't actually do the replication. We7470just adjust the length as if we had. Do some paranoid checks for7471potential integer overflow. */74727473if (lengthptr != NULL)7474{7475PCRE2_SIZE delta;7476if (PRIV(ckd_smul)(&delta, replicate, (int)length_prevgroup) ||7477OFLOW_MAX - *lengthptr < delta)7478{7479*errorcodeptr = ERR20;7480return 0;7481}7482*lengthptr += delta;7483}7484else for (int i = 0; i < replicate; i++)7485{7486memcpy(code, previous, CU2BYTES(length_prevgroup));7487previous = code;7488code += length_prevgroup;7489}74907491/* If the number of repeats is fixed, we are done. Otherwise, adjust7492the counts and fall through. */74937494if (repeat_min == repeat_max) break;7495if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;7496repeat_min = 0;7497}74987499/* Wrap the recursion call in OP_BRA brackets. */7500{7501PCRE2_SIZE length = (lengthptr != NULL) ? 
1 + LINK_SIZE : length_prevgroup;75027503(void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(length));7504op_previous = *previous = OP_BRA;7505PUT(previous, 1, 1 + LINK_SIZE + length);7506previous[1 + LINK_SIZE + length] = OP_KET;7507PUT(previous, 2 + LINK_SIZE + length, 1 + LINK_SIZE + length);7508}7509code += 2 + 2 * LINK_SIZE;7510length_prevgroup += 2 + 2 * LINK_SIZE;7511group_return = -1; /* Set "may match empty string" */75127513/* Now treat as a repeated OP_BRA. */7514PCRE2_FALLTHROUGH /* Fall through */75157516/* If previous was a bracket group, we may have to replicate it in7517certain cases. Note that at this point we can encounter only the "basic"7518bracket opcodes such as BRA and CBRA, as this is the place where they get7519converted into the more special varieties such as BRAPOS and SBRA.7520Originally, PCRE did not allow repetition of assertions, but now it does,7521for Perl compatibility. */75227523case OP_ASSERT:7524case OP_ASSERT_NOT:7525case OP_ASSERT_NA:7526case OP_ASSERTBACK:7527case OP_ASSERTBACK_NOT:7528case OP_ASSERTBACK_NA:7529case OP_ASSERT_SCS:7530case OP_ONCE:7531case OP_SCRIPT_RUN:7532case OP_BRA:7533case OP_CBRA:7534case OP_COND:7535{7536int len = (int)(code - previous);7537PCRE2_UCHAR *bralink = NULL;7538PCRE2_UCHAR *brazeroptr = NULL;75397540if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)7541goto END_REPEAT;75427543/* Repeating a DEFINE group (or any group where the condition is always7544FALSE and there is only one branch) is pointless, but Perl allows the7545syntax, so we just ignore the repeat. */75467547if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&7548previous[GET(previous, 1)] != OP_ALT)7549goto END_REPEAT;75507551/* Perl allows all assertions to be quantified, and when they contain7552capturing parentheses and/or are optional there are potential uses for7553this feature. 
PCRE2 used to force the maximum quantifier to 1 on the7554invalid grounds that further repetition was never useful. This was7555always a bit pointless, since an assertion could be wrapped with a7556repeated group to achieve the effect. General repetition is now7557permitted, but if the maximum is unlimited it is set to one more than7558the minimum. */75597560if (op_previous < OP_ONCE) /* Assertion */7561{7562if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;7563}75647565/* The case of a zero minimum is special because of the need to stick7566OP_BRAZERO in front of it, and because the group appears once in the7567data, whereas in other cases it appears the minimum number of times. For7568this reason, it is simplest to treat this case separately, as otherwise7569the code gets far too messy. There are several special subcases when the7570minimum is zero. */75717572if (repeat_min == 0)7573{7574/* If the maximum is also zero, we used to just omit the group from7575the output altogether, like this:75767577** if (repeat_max == 0)7578** {7579** code = previous;7580** goto END_REPEAT;7581** }75827583However, that fails when a group or a subgroup within it is7584referenced as a subroutine from elsewhere in the pattern, so now we7585stick in OP_SKIPZERO in front of it so that it is skipped on7586execution. As we don't have a list of which groups are referenced, we7587cannot do this selectively.75887589If the maximum is 1 or unlimited, we just have to stick in the7590BRAZERO and do no more at this point. 
*/75917592if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)7593{7594(void)memmove(previous + 1, previous, CU2BYTES(len));7595code++;7596if (repeat_max == 0)7597{7598*previous++ = OP_SKIPZERO;7599goto END_REPEAT;7600}7601brazeroptr = previous; /* Save for possessive optimizing */7602*previous++ = OP_BRAZERO + repeat_type;7603}76047605/* If the maximum is greater than 1 and limited, we have to replicate7606in a nested fashion, sticking OP_BRAZERO before each set of brackets.7607The first one has to be handled carefully because it's the original7608copy, which has to be moved up. The remainder can be handled by code7609that is common with the non-zero minimum case below. We have to7610adjust the value or repeat_max, since one less copy is required. */76117612else7613{7614int linkoffset;7615(void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));7616code += 2 + LINK_SIZE;7617*previous++ = OP_BRAZERO + repeat_type;7618*previous++ = OP_BRA;76197620/* We chain together the bracket link offset fields that have to be7621filled in later when the ends of the brackets are reached. */76227623linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);7624bralink = previous;7625PUTINC(previous, 0, linkoffset);7626}76277628if (repeat_max != REPEAT_UNLIMITED) repeat_max--;7629}76307631/* If the minimum is greater than zero, replicate the group as many7632times as necessary, and adjust the maximum to the number of subsequent7633copies that we need. */76347635else7636{7637if (repeat_min > 1)7638{7639/* In the pre-compile phase, we don't actually do the replication.7640We just adjust the length as if we had. Do some paranoid checks for7641potential integer overflow. */76427643if (lengthptr != NULL)7644{7645PCRE2_SIZE delta;7646if (PRIV(ckd_smul)(&delta, repeat_min - 1,7647(int)length_prevgroup) ||7648OFLOW_MAX - *lengthptr < delta)7649{7650*errorcodeptr = ERR20;7651return 0;7652}7653*lengthptr += delta;7654}76557656/* This is compiling for real. 
If there is a set first code unit7657for the group, and we have not yet set a "required code unit", set7658it. */76597660else7661{7662if (groupsetfirstcu && reqcuflags >= REQ_NONE)7663{7664reqcu = firstcu;7665reqcuflags = firstcuflags;7666}7667for (uint32_t i = 1; i < repeat_min; i++)7668{7669memcpy(code, previous, CU2BYTES(len));7670code += len;7671}7672}7673}76747675if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;7676}76777678/* This code is common to both the zero and non-zero minimum cases. If7679the maximum is limited, it replicates the group in a nested fashion,7680remembering the bracket starts on a stack. In the case of a zero7681minimum, the first one was set up above. In all cases the repeat_max7682now specifies the number of additional copies needed. Again, we must7683remember to replicate entries on the forward reference list. */76847685if (repeat_max != REPEAT_UNLIMITED)7686{7687/* In the pre-compile phase, we don't actually do the replication. We7688just adjust the length as if we had. For each repetition we must add76891 to the length for BRAZERO and for all but the last repetition we7690must add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some7691paranoid checks to avoid integer overflow. */76927693if (lengthptr != NULL && repeat_max > 0)7694{7695PCRE2_SIZE delta;7696if (PRIV(ckd_smul)(&delta, repeat_max,7697(int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||7698OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)7699{7700*errorcodeptr = ERR20;7701return 0;7702}7703delta -= (2 + 2*LINK_SIZE); /* Last one doesn't nest */7704*lengthptr += delta;7705}77067707/* This is compiling for real */77087709else for (uint32_t i = repeat_max; i >= 1; i--)7710{7711*code++ = OP_BRAZERO + repeat_type;77127713/* All but the final copy start a new nesting, maintaining the7714chain of brackets outstanding. */77157716if (i != 1)7717{7718int linkoffset;7719*code++ = OP_BRA;7720linkoffset = (bralink == NULL)? 
0 : (int)(code - bralink);7721bralink = code;7722PUTINC(code, 0, linkoffset);7723}77247725memcpy(code, previous, CU2BYTES(len));7726code += len;7727}77287729/* Now chain through the pending brackets, and fill in their length7730fields (which are holding the chain links pro tem). */77317732while (bralink != NULL)7733{7734int oldlinkoffset;7735int linkoffset = (int)(code - bralink + 1);7736PCRE2_UCHAR *bra = code - linkoffset;7737oldlinkoffset = GET(bra, 1);7738bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;7739*code++ = OP_KET;7740PUTINC(code, 0, linkoffset);7741PUT(bra, 1, linkoffset);7742}7743}77447745/* If the maximum is unlimited, set a repeater in the final copy. For7746SCRIPT_RUN and ONCE brackets, that's all we need to do. However,7747possessively repeated ONCE brackets can be converted into non-capturing7748brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this7749saves having to deal with possessive ONCEs specially.77507751Otherwise, when we are doing the actual compile phase, check to see7752whether this group is one that could match an empty string. If so,7753convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so7754that runtime checking can be done. [This check is also applied to ONCE7755and SCRIPT_RUN groups at runtime, but in a different way.]77567757Then, if the quantifier was possessive and the bracket is not a7758conditional, we convert the BRA code to the POS form, and the KET code7759to KETRPOS. (It turns out to be convenient at runtime to detect this7760kind of subpattern at both the start and at the end.) The use of7761special opcodes makes it possible to reduce greatly the stack usage in7762pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to7763OP_BRAPOSZERO.77647765Then, if the minimum number of matches is 1 or 0, cancel the possessive7766flag so that the default action below, of wrapping everything inside7767atomic brackets, does not happen. 
When the minimum is greater than 1,7768there will be earlier copies of the group, and so we still have to wrap7769the whole thing. */77707771else7772{7773PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;7774PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);77757776/* Convert possessive ONCE brackets to non-capturing */77777778if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;77797780/* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need7781to do is to set the KET. */77827783if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)7784*ketcode = OP_KETRMAX + repeat_type;77857786/* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs7787(which have been converted to non-capturing above). */77887789else7790{7791/* In the compile phase, adjust the opcode if the group can match7792an empty string. For a conditional group with only one branch, the7793value of group_return will not show "could be empty", so we must7794check that separately. */77957796if (lengthptr == NULL)7797{7798if (group_return < 0) *bracode += OP_SBRA - OP_BRA;7799if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)7800*bracode = OP_SCOND;7801}78027803/* Handle possessive quantifiers. */78047805if (possessive_quantifier)7806{7807/* For COND brackets, we wrap the whole thing in a possessively7808repeated non-capturing bracket, because we have not invented POS7809versions of the COND opcodes. */78107811if (*bracode == OP_COND || *bracode == OP_SCOND)7812{7813int nlen = (int)(code - bracode);7814(void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));7815code += 1 + LINK_SIZE;7816nlen += 1 + LINK_SIZE;7817*bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;7818*code++ = OP_KETRPOS;7819PUTINC(code, 0, nlen);7820PUT(bracode, 1, nlen);7821}78227823/* For non-COND brackets, we modify the BRA code and use KETRPOS. 
*/78247825else7826{7827*bracode += 1; /* Switch to xxxPOS opcodes */7828*ketcode = OP_KETRPOS;7829}78307831/* If the minimum is zero, mark it as possessive, then unset the7832possessive flag when the minimum is 0 or 1. */78337834if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;7835if (repeat_min < 2) possessive_quantifier = FALSE;7836}78377838/* Non-possessive quantifier */78397840else *ketcode = OP_KETRMAX + repeat_type;7841}7842}7843}7844break;78457846/* If previous was a character type match (\d or similar), abolish it and7847create a suitable repeat item. The code is shared with single-character7848repeats by setting op_type to add a suitable offset into repeat_type.7849Note the the Unicode property types will be present only when7850SUPPORT_UNICODE is defined, but we don't wrap the little bits of code7851here because it just makes it horribly messy. */78527853default:78547855/* LCOV_EXCL_START */7856if (op_previous >= OP_EODN || op_previous <= OP_WORD_BOUNDARY)7857{7858PCRE2_DEBUG_UNREACHABLE();7859*errorcodeptr = ERR10; /* Not a character type - internal error */7860return 0;7861}7862/* LCOV_EXCL_STOP */78637864{7865int prop_type, prop_value;7866PCRE2_UCHAR *oldcode;78677868if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;78697870op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */7871mclength = 0; /* Not a character */78727873if (op_previous == OP_PROP || op_previous == OP_NOTPROP)7874{7875prop_type = previous[1];7876prop_value = previous[2];7877}7878else7879{7880/* Come here from just above with a character in mcbuffer/mclength.7881You must also set op_type before the jump. */7882OUTPUT_SINGLE_REPEAT:7883prop_type = prop_value = -1;7884}78857886/* At this point, if prop_type == prop_value == -1 we either have a7887character in mcbuffer when mclength is greater than zero, or we have7888mclength zero, in which case there is a non-property character type in7889op_previous. 
If prop_type/value are not negative, we have a property7890character type in op_previous. */78917892oldcode = code; /* Save where we were */7893code = previous; /* Usually overwrite previous item */78947895/* If the maximum is zero then the minimum must also be zero; Perl allows7896this case, so we do too - by simply omitting the item altogether. */78977898if (repeat_max == 0) goto END_REPEAT;78997900/* Combine the op_type with the repeat_type */79017902repeat_type += op_type;79037904/* A minimum of zero is handled either as the special case * or ?, or as7905an UPTO, with the maximum given. */79067907if (repeat_min == 0)7908{7909if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;7910else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;7911else7912{7913*code++ = OP_UPTO + repeat_type;7914PUT2INC(code, 0, repeat_max);7915}7916}79177918/* A repeat minimum of 1 is optimized into some special cases. If the7919maximum is unlimited, we use OP_PLUS. Otherwise, the original item is7920left in place and, if the maximum is greater than 1, we use OP_UPTO with7921one less than the maximum. */79227923else if (repeat_min == 1)7924{7925if (repeat_max == REPEAT_UNLIMITED)7926*code++ = OP_PLUS + repeat_type;7927else7928{7929code = oldcode; /* Leave previous item in place */7930if (repeat_max == 1) goto END_REPEAT;7931*code++ = OP_UPTO + repeat_type;7932PUT2INC(code, 0, repeat_max - 1);7933}7934}79357936/* The case {n,n} is just an EXACT, while the general case {n,m} is7937handled as an EXACT followed by an UPTO or STAR or QUERY. */79387939else7940{7941*code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */7942PUT2INC(code, 0, repeat_min);79437944/* Unless repeat_max equals repeat_min, fill in the data for EXACT,7945and then generate the second opcode. For a repeated Unicode property7946match, there are two extra values that define the required property,7947and mclength is set zero to indicate this. 
*/79487949if (repeat_max != repeat_min)7950{7951if (mclength > 0)7952{7953memcpy(code, mcbuffer, CU2BYTES(mclength));7954code += mclength;7955}7956else7957{7958*code++ = op_previous;7959if (prop_type >= 0)7960{7961*code++ = prop_type;7962*code++ = prop_value;7963}7964}79657966/* Now set up the following opcode */79677968if (repeat_max == REPEAT_UNLIMITED)7969*code++ = OP_STAR + repeat_type;7970else7971{7972repeat_max -= repeat_min;7973if (repeat_max == 1)7974{7975*code++ = OP_QUERY + repeat_type;7976}7977else7978{7979*code++ = OP_UPTO + repeat_type;7980PUT2INC(code, 0, repeat_max);7981}7982}7983}7984}79857986/* Fill in the character or character type for the final opcode. */79877988if (mclength > 0)7989{7990memcpy(code, mcbuffer, CU2BYTES(mclength));7991code += mclength;7992}7993else7994{7995*code++ = op_previous;7996if (prop_type >= 0)7997{7998*code++ = prop_type;7999*code++ = prop_value;8000}8001}8002}8003break;8004} /* End of switch on different op_previous values */800580068007/* If the character following a repeat is '+', possessive_quantifier is8008TRUE. For some opcodes, there are special alternative opcodes for this8009case. For anything else, we wrap the entire repeated item inside OP_ONCE8010brackets. Logically, the '+' notation is just syntactic sugar, taken from8011Sun's Java package, but the special opcodes can optimize it.80128013Some (but not all) possessively repeated subpatterns have already been8014completely handled in the code just above. For them, possessive_quantifier8015is always FALSE at this stage. Note that the repeated item starts at8016tempcode, not at previous, which might be the first part of a string whose8017(former) last char we repeated. */80188019if (possessive_quantifier)8020{8021int len;80228023/* Possessifying an EXACT quantifier has no effect, so we can ignore it.8024However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},8025{5,}, or {5,10}). 
We skip over an EXACT item; if the length of what8026remains is greater than zero, there's a further opcode that can be8027handled. If not, do nothing, leaving the EXACT alone. */80288029switch(*tempcode)8030{8031case OP_TYPEEXACT:8032tempcode += PRIV(OP_lengths)[*tempcode] +8033((tempcode[1 + IMM2_SIZE] == OP_PROP8034|| tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);8035break;80368037/* CHAR opcodes are used for exacts whose count is 1. */80388039case OP_CHAR:8040case OP_CHARI:8041case OP_NOT:8042case OP_NOTI:8043case OP_EXACT:8044case OP_EXACTI:8045case OP_NOTEXACT:8046case OP_NOTEXACTI:8047tempcode += PRIV(OP_lengths)[*tempcode];8048#ifdef SUPPORT_UNICODE8049if (utf && HAS_EXTRALEN(tempcode[-1]))8050tempcode += GET_EXTRALEN(tempcode[-1]);8051#endif8052break;80538054/* For the class opcodes, the repeat operator appears at the end;8055adjust tempcode to point to it. */80568057case OP_CLASS:8058case OP_NCLASS:8059tempcode += 1 + 32/sizeof(PCRE2_UCHAR);8060break;80618062#ifdef SUPPORT_WIDE_CHARS8063case OP_XCLASS:8064case OP_ECLASS:8065tempcode += GET(tempcode, 1);8066break;8067#endif8068}80698070/* If tempcode is equal to code (which points to the end of the repeated8071item), it means we have skipped an EXACT item but there is no following8072QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In8073all other cases, tempcode will be pointing to the repeat opcode, and will8074be less than code, so the value of len will be greater than 0. */80758076len = (int)(code - tempcode);8077if (len > 0)8078{8079unsigned int repcode = *tempcode;80808081/* There is a table for possessifying opcodes, all of which are less8082than OP_CALLOUT. A zero entry means there is no possessified version.8083*/80848085if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)8086*tempcode = opcode_possessify[repcode];80878088/* For opcode without a special possessified version, wrap the item in8089ONCE brackets. 
*/80908091else8092{8093(void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));8094code += 1 + LINK_SIZE;8095len += 1 + LINK_SIZE;8096tempcode[0] = OP_ONCE;8097*code++ = OP_KET;8098PUTINC(code, 0, len);8099PUT(tempcode, 1, len);8100}8101}8102}81038104/* We set the "follows varying string" flag for subsequently encountered8105reqcus if it isn't already set and we have just passed a varying length8106item. */81078108END_REPEAT:8109cb->req_varyopt |= reqvary;8110break;811181128113/* ===================================================================*/8114/* Handle a 32-bit data character with a value greater than META_END. */81158116case META_BIGVALUE:8117pptr++;8118goto NORMAL_CHAR;811981208121/* ===============================================================*/8122/* Handle a back reference by number, which is the meta argument. The8123pattern offsets for back references to group numbers less than 10 are held8124in a special vector, to avoid using more than two parsed pattern elements8125in 64-bit environments. We only need the offset to the first occurrence,8126because if that doesn't fail, subsequent ones will also be OK. */81278128case META_BACKREF:8129if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];8130else GETPLUSOFFSET(offset, pptr);81318132if (meta_arg > cb->bracount)8133{8134cb->erroroffset = offset;8135*errorcodeptr = ERR15; /* Non-existent subpattern */8136return 0;8137}81388139/* Come here from named backref handling when the reference is to a8140single group (that is, not to a duplicated name). The back reference8141data will have already been updated. We must disable firstcu if not8142set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'8143later. */81448145HANDLE_SINGLE_REFERENCE:8146if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;8147*code++ = ((options & PCRE2_CASELESS) != 0)? 
OP_REFI : OP_REF;8148PUT2INC(code, 0, meta_arg);8149if ((options & PCRE2_CASELESS) != 0)8150*code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?8151REFI_FLAG_CASELESS_RESTRICT : 0) |8152(((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?8153REFI_FLAG_TURKISH_CASING : 0);81548155/* Update the map of back references, and keep the highest one. We8156could do this in parse_regex() for numerical back references, but not8157for named back references, because we don't know the numbers to which8158named back references refer. So we do it all in this function. */81598160cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;8161if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;8162break;816381648165/* ===============================================================*/8166/* Handle recursion by inserting the number of the called group (which is8167the meta argument) after OP_RECURSE. At the end of compiling the pattern is8168scanned and these numbers are replaced by offsets within the pattern. It is8169done like this to avoid problems with forward references and adjusting8170offsets when groups are duplicated and moved (as discovered in previous8171implementations). Note that a recursion does not have a set first8172character. */81738174case META_RECURSE:8175GETPLUSOFFSET(offset, pptr);8176if (meta_arg > cb->bracount)8177{8178cb->erroroffset = offset;8179*errorcodeptr = ERR15; /* Non-existent subpattern */8180return 0;8181}8182HANDLE_NUMERICAL_RECURSION:8183*code = OP_RECURSE;8184PUT(code, 1, meta_arg);8185code += 1 + LINK_SIZE;8186/* Repeat processing requires this information to8187determine the real length in pre-compile phase. 
*/8188length_prevgroup = 1 + LINK_SIZE;81898190if (META_CODE(pptr[1]) == META_OFFSET ||8191META_CODE(pptr[1]) == META_CAPTURE_NAME ||8192META_CODE(pptr[1]) == META_CAPTURE_NUMBER)8193{8194recurse_arguments *args;81958196if (lengthptr != NULL)8197{8198if (!PRIV(compile_parse_recurse_args)(pptr, offset, errorcodeptr, cb))8199return 0;82008201args = (recurse_arguments*)cb->last_data;8202length_prevgroup += (args->size * (1 + IMM2_SIZE));8203*lengthptr += (args->size * (1 + IMM2_SIZE));8204pptr += args->skip_size;8205}8206else8207{8208uint16_t *current, *end;82098210args = (recurse_arguments*)cb->first_data;8211PCRE2_ASSERT(args != NULL && args->header.type == CDATA_RECURSE_ARGS);82128213current = (uint16_t*)(args + 1);8214end = current + args->size;8215PCRE2_ASSERT(end > current);82168217do8218{8219code[0] = OP_CREF;8220PUT2(code, 1, *current);8221code += 1 + IMM2_SIZE;8222}8223while (++current < end);82248225length_prevgroup += (args->size * (1 + IMM2_SIZE));8226pptr += args->skip_size;8227cb->first_data = args->header.next;8228cb->cx->memctl.free(args, cb->cx->memctl.memory_data);8229}8230}82318232groupsetfirstcu = FALSE;8233cb->had_recurse = TRUE;8234if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;8235zerofirstcu = firstcu;8236zerofirstcuflags = firstcuflags;8237break;823882398240/* ===============================================================*/8241/* Handle capturing parentheses; the number is the meta argument. */82428243case META_CAPTURE:8244bravalue = OP_CBRA;8245skipunits = IMM2_SIZE;8246PUT2(code, 1+LINK_SIZE, meta_arg);8247cb->lastcapture = meta_arg;8248goto GROUP_PROCESS_NOTE_EMPTY;824982508251/* ===============================================================*/8252/* Handle escape sequence items. 
For ones like \d, the ESC_values are8253arranged to be the same as the corresponding OP_values in the default case8254when PCRE2_UCP is not set (which is the only case in which they will appear8255here).82568257Note: \Q and \E are never seen here, as they were dealt with in8258parse_pattern(). Neither are numerical back references or recursions, which8259were turned into META_BACKREF or META_RECURSE items, respectively. \k and8260\g, when followed by names, are turned into META_BACKREF_BYNAME or8261META_RECURSE_BYNAME. */82628263case META_ESCAPE:82648265/* We can test for escape sequences that consume a character because their8266values lie between ESC_b and ESC_Z; this may have to change if any new ones8267are ever created. For these sequences, we disable the setting of a first8268character if it hasn't already been set. */82698270if (meta_arg > ESC_b && meta_arg < ESC_Z)8271{8272matched_char = TRUE;8273if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;8274}82758276/* Set values to reset to if this is followed by a zero repeat. */82778278zerofirstcu = firstcu;8279zerofirstcuflags = firstcuflags;8280zeroreqcu = reqcu;8281zeroreqcuflags = reqcuflags;82828283/* If Unicode is not supported, \P and \p are not allowed and are8284faulted at parse time, so will never appear here. */82858286#ifdef SUPPORT_UNICODE8287if (meta_arg == ESC_P || meta_arg == ESC_p)8288{8289uint32_t ptype = *(++pptr) >> 16;8290uint32_t pdata = *pptr & 0xffff;82918292/* In caseless matching, particular characteristics Lu, Ll, and Lt get8293converted to the general characteristic L&. That is, upper, lower, and8294title case letters are all conflated. */82958296if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&8297(pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))8298{8299ptype = PT_LAMP;8300pdata = 0;8301}83028303/* The special case of \p{Any} is compiled to OP_ALLANY and \P{Any}8304is compiled to [] so as to benefit from the auto-anchoring code. 
*/83058306if (ptype == PT_ANY)8307{8308if (meta_arg == ESC_P)8309{8310*code++ = OP_CLASS;8311memset(code, 0, 32);8312code += 32 / sizeof(PCRE2_UCHAR);8313}8314else8315*code++ = OP_ALLANY;8316}8317else8318{8319*code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;8320*code++ = ptype;8321*code++ = pdata;8322}8323break; /* End META_ESCAPE */8324}8325#endif83268327/* \K is forbidden in lookarounds since 10.38 because that's what Perl has8328done. However, there's an option, in case anyone was relying on it. */83298330if (cb->assert_depth > 0 && meta_arg == ESC_K &&8331(xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)8332{8333*errorcodeptr = ERR99;8334return 0;8335}83368337/* For the rest (including \X when Unicode is supported - if not it's8338faulted at parse time), the OP value is the escape value when PCRE2_UCP is8339not set; if it is set, most of them do not show up here because they are8340converted into Unicode property tests in parse_regex().83418342In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY8343instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.8344There are special UCP codes for \B and \b which are used in UCP mode unless8345"word" matching is being forced to ASCII.83468347Note that \b and \B do a one-character lookbehind, and \A also behaves as8348if it does. */83498350switch(meta_arg)8351{8352case ESC_C:8353cb->external_flags |= PCRE2_HASBKC; /* Record */8354#if PCRE2_CODE_UNIT_WIDTH == 328355meta_arg = OP_ALLANY;8356(void)utf; /* Avoid compiler warning. */8357#else8358if (!utf) meta_arg = OP_ALLANY;8359#endif8360break;83618362case ESC_B:8363case ESC_b:8364if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)8365meta_arg = (meta_arg == ESC_B)? 
OP_NOT_UCP_WORD_BOUNDARY :8366OP_UCP_WORD_BOUNDARY;8367PCRE2_FALLTHROUGH /* Fall through */83688369case ESC_A:8370if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;8371break;83728373case ESC_K:8374cb->external_flags |= PCRE2_HASBSK; /* Record */8375break;8376}83778378*code++ = meta_arg;8379break; /* End META_ESCAPE */838083818382/* ===================================================================*/8383/* Handle an unrecognized meta value. A parsed pattern value less than8384META_END is a literal. Otherwise we have a problem. */83858386default:8387/* LCOV_EXCL_START */8388if (meta >= META_END)8389{8390PCRE2_DEBUG_UNREACHABLE();8391*errorcodeptr = ERR89; /* Internal error - unrecognized. */8392return 0;8393}8394/* LCOV_EXCL_STOP */83958396/* Handle a literal character. We come here by goto in the case of a839732-bit, non-UTF character whose value is greater than META_END. */83988399NORMAL_CHAR:8400meta = *pptr; /* Get the full 32 bits */8401NORMAL_CHAR_SET: /* Character is already in meta */8402matched_char = TRUE;84038404/* For caseless UTF or UCP mode, check whether this character has more than8405one other case. If so, generate a special OP_PROP item instead of OP_CHARI.8406When casing restrictions apply, ignore caseless sets that start with an8407ASCII character. If the character is affected by the special Turkish rules,8408hardcode the matching characters using a caseset. */84098410#ifdef SUPPORT_UNICODE8411if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)8412{8413uint32_t caseset;84148415if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==8416PCRE2_EXTRA_TURKISH_CASING &&8417UCD_ANY_I(meta))8418{8419caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(meta)? 0 : 3);8420}8421else if ((caseset = UCD_CASESET(meta)) != 0 &&8422(xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&8423PRIV(ucd_caseless_sets)[caseset] < 128)8424{8425caseset = 0; /* Ignore the caseless set if it's restricted. 
*/8426}84278428if (caseset != 0)8429{8430*code++ = OP_PROP;8431*code++ = PT_CLIST;8432*code++ = caseset;8433if (firstcuflags == REQ_UNSET)8434firstcuflags = zerofirstcuflags = REQ_NONE;8435break; /* End handling this meta item */8436}8437}8438#endif84398440/* Caseful matches, or caseless and not one of the multicase characters. We8441come here by goto in the case of a positive class that contains only8442case-partners of a character with just two cases; matched_char has already8443been set TRUE and options fudged if necessary. */84448445CLASS_CASELESS_CHAR:84468447/* Get the character's code units into mcbuffer, with the length in8448mclength. When not in UTF mode, the length is always 1. */84498450#ifdef SUPPORT_UNICODE8451if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else8452#endif8453{8454mclength = 1;8455mcbuffer[0] = meta;8456}84578458/* Generate the appropriate code */84598460*code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;8461memcpy(code, mcbuffer, CU2BYTES(mclength));8462code += mclength;84638464/* Remember if \r or \n were seen */84658466if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)8467cb->external_flags |= PCRE2_HASCRORLF;84688469/* Set the first and required code units appropriately. If no previous8470first code unit, set it from this character, but revert to none on a zero8471repeat. Otherwise, leave the firstcu value alone, and don't change it on8472a zero repeat. */84738474if (firstcuflags == REQ_UNSET)8475{8476zerofirstcuflags = REQ_NONE;8477zeroreqcu = reqcu;8478zeroreqcuflags = reqcuflags;84798480/* If the character is more than one code unit long, we can set a single8481firstcu only if it is not to be matched caselessly. Multiple possible8482starting code units may be picked up later in the studying code. 
*/84838484if (mclength == 1 || req_caseopt == 0)8485{8486firstcu = mcbuffer[0];8487firstcuflags = req_caseopt;8488if (mclength != 1)8489{8490reqcu = code[-1];8491reqcuflags = cb->req_varyopt;8492}8493}8494else firstcuflags = reqcuflags = REQ_NONE;8495}84968497/* firstcu was previously set; we can set reqcu only if the length is84981 or the matching is caseful. */84998500else8501{8502zerofirstcu = firstcu;8503zerofirstcuflags = firstcuflags;8504zeroreqcu = reqcu;8505zeroreqcuflags = reqcuflags;8506if (mclength == 1 || req_caseopt == 0)8507{8508reqcu = code[-1];8509reqcuflags = req_caseopt | cb->req_varyopt;8510}8511}85128513/* If caselessness was temporarily instated, reset it. */85148515if (reset_caseful)8516{8517options &= ~PCRE2_CASELESS;8518req_caseopt = 0;8519reset_caseful = FALSE;8520}85218522break; /* End literal character handling */8523} /* End of big switch */8524} /* End of big loop */85258526/* LCOV_EXCL_START */8527PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */8528return 0; /* Avoid compiler warnings */8529/* LCOV_EXCL_STOP */8530}8531853285338534/*************************************************8535* Compile regex: a sequence of alternatives *8536*************************************************/85378538/* On entry, pptr is pointing past the bracket meta, but on return it points to8539the closing bracket or META_END. The code variable is pointing at the code unit8540into which the BRA operator has been stored. This function is used during the8541pre-compile phase when we are trying to find out the amount of memory needed,8542as well as during the real compile phase. 
The value of lengthptr distinguishes
the two phases.

Arguments:
  options           option bits, including any changes for this subpattern
  xoptions          extra option bits, ditto
  codeptr           -> the address of the current code pointer
  pptrptr           -> the address of the current parsed pattern pointer
  errorcodeptr      -> pointer to error code variable
  skipunits         skip this many code units at start (for brackets and OP_COND)
  firstcuptr        place to put the first required code unit
  firstcuflagsptr   place to put the first code unit flags
  reqcuptr          place to put the last required code unit
  reqcuflagsptr     place to put the last required code unit flags
  bcptr             pointer to the chain of currently open branches
  open_caps         chain of currently open capturing groups; if this group is
                      an OP_CBRA it is added to the front of the chain before
                      the branches are compiled
  cb                points to the data block with tables pointers etc.
  lengthptr         NULL during the real compile phase
                    points to length accumulator during pre-compile phase

Returns:            0 There has been an error
                   +1 Success, this group must match at least one character
                   -1 Success, this group may match an empty string
*/

static int
compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
  uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
  compile_block *cb, PCRE2_SIZE *lengthptr)
{
PCRE2_UCHAR *code = *codeptr;
PCRE2_UCHAR *last_branch = code;     /* Start of the most recent branch */
PCRE2_UCHAR *start_bracket = code;   /* The opening bracket, for the KET link */
BOOL lookbehind;
open_capitem capitem;
int capnumber = 0;
int okreturn = 1;                    /* Becomes -1 if any branch may be empty */
uint32_t *pptr = *pptrptr;
uint32_t firstcu, reqcu;
uint32_t lookbehindlength;
uint32_t lookbehindminlength;
uint32_t firstcuflags, reqcuflags;
PCRE2_SIZE length;
branch_chain bc;

/* If set, call the external function that checks for stack availability. */

if (cb->cx->stack_guard != NULL &&
    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
  {
  *errorcodeptr= ERR33;
  cb->erroroffset = 0;
  return 0;
  }

/* Miscellaneous initialization */

bc.outer = bcptr;
bc.current_branch = code;

firstcu = reqcu = 0;
firstcuflags = reqcuflags = REQ_UNSET;

/* Accumulate the length for use in the pre-compile phase. Start with the
length of the BRA and KET and any extra code units that are required at the
beginning. We accumulate in a local variable to save frequent testing of
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
start and end of each alternative, because compiled items are discarded during
the pre-compile phase so that the workspace is not exceeded. */

length = 2 + 2*LINK_SIZE + skipunits;

/* Remember if this is a lookbehind assertion, and if it is, save its length
and skip over the pattern offset. */

lookbehind = *code == OP_ASSERTBACK ||
             *code == OP_ASSERTBACK_NOT ||
             *code == OP_ASSERTBACK_NA;

if (lookbehind)
  {
  lookbehindlength = META_DATA(pptr[-1]);
  lookbehindminlength = *pptr;
  pptr += SIZEOFFSET;
  }
else lookbehindlength = lookbehindminlength = 0;

/* If this is a capturing subpattern, add to the chain of open capturing items
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
need be tested here; changing this opcode to one of its variants, e.g.
OP_SCBRAPOS, happens later, after the group has been compiled. */

if (*code == OP_CBRA)
  {
  capnumber = GET2(code, 1 + LINK_SIZE);
  capitem.number = capnumber;
  capitem.next = open_caps;
  capitem.assert_depth = cb->assert_depth;
  open_caps = &capitem;   /* capitem is a local: valid only during this call */
  }

/* Offset is set zero to mark that this bracket is still open */

PUT(code, 1, 0);
code += 1 + LINK_SIZE + skipunits;

/* Loop for each alternative branch */

for (;;)
  {
  int branch_return;
  uint32_t branchfirstcu = 0, branchreqcu = 0;
  uint32_t branchfirstcuflags = REQ_UNSET, branchreqcuflags = REQ_UNSET;

  /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
  is only a single minimum length for the whole assertion. When the minimum
  length is LOOKBEHIND_MAX it means that all branches are of fixed length,
  though not necessarily the same length. In this case, the original OP_REVERSE
  can be used. It can also be used if a branch in a variable length lookbehind
  has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
  maximum and minimum values. */

  if (lookbehind && lookbehindlength > 0)
    {
    if (lookbehindminlength == LOOKBEHIND_MAX ||
        lookbehindminlength == lookbehindlength)
      {
      *code++ = OP_REVERSE;
      PUT2INC(code, 0, lookbehindlength);
      length += 1 + IMM2_SIZE;
      }
    else
      {
      *code++ = OP_VREVERSE;
      PUT2INC(code, 0, lookbehindminlength);
      PUT2INC(code, 0, lookbehindlength);
      length += 1 + 2*IMM2_SIZE;
      }
    }

  /* Now compile the branch; in the pre-compile phase its length gets added
  into the length. A zero return is an error and is passed straight back to
  our caller; errorcodeptr was handed on to compile_branch(). */

  if ((branch_return =
        compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
          &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
          &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
    return 0;

  /* If a branch can match an empty string, so can the whole group. */

  if (branch_return < 0) okreturn = -1;

  /* In the real compile phase, there is some post-processing to be done. */

  if (lengthptr == NULL)
    {
    /* If this is the first branch, the firstcu and reqcu values for the
    branch become the values for the regex. */

    if (*last_branch != OP_ALT)
      {
      firstcu = branchfirstcu;
      firstcuflags = branchfirstcuflags;
      reqcu = branchreqcu;
      reqcuflags = branchreqcuflags;
      }

    /* If this is not the first branch, the first char and reqcu have to
    match the values from all the previous branches, except that if the
    previous value for reqcu didn't have REQ_VARY set, it can still match,
    and we set REQ_VARY for the group from this branch's value. */

    else
      {
      /* If we previously had a firstcu, but it doesn't match the new branch,
      we have to abandon the firstcu for the regex, but if there was
      previously no reqcu, it takes on the value of the old firstcu. */

      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
        {
        if (firstcuflags < REQ_NONE)
          {
          if (reqcuflags >= REQ_NONE)
            {
            reqcu = firstcu;
            reqcuflags = firstcuflags;
            }
          }
        firstcuflags = REQ_NONE;
        }

      /* If we (now or from before) have no firstcu, a firstcu from the
      branch becomes a reqcu if there isn't a branch reqcu. */

      if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
          branchreqcuflags >= REQ_NONE)
        {
        branchreqcu = branchfirstcu;
        branchreqcuflags = branchfirstcuflags;
        }

      /* Now ensure that the reqcus match */

      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
          reqcu != branchreqcu)
        reqcuflags = REQ_NONE;
      else
        {
        reqcu = branchreqcu;
        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
        }
      }
    }

  /* Handle reaching the end of the expression, either ')' or end of pattern.
  In the real compile phase, go back through the alternative branches and
  reverse the chain of offsets, with the field in the BRA item now becoming an
  offset to the first alternative. If there are no alternatives, it points to
  the end of the group. The length in the terminating ket is always the length
  of the whole bracketed item. Return leaving the pointer at the terminating
  char. */

  if (META_CODE(*pptr) != META_ALT)
    {
    if (lengthptr == NULL)
      {
      /* Each link field so far holds the distance back to the previous branch
      start, and the opening bracket holds zero. Walk backwards replacing each
      with the forward length; the zero read from the opening bracket ends the
      loop. */

      uint32_t branch_length = (uint32_t)(code - last_branch);
      do
        {
        uint32_t prev_length = GET(last_branch, 1);
        PUT(last_branch, 1, branch_length);
        branch_length = prev_length;
        last_branch -= branch_length;
        }
      while (branch_length > 0);
      }

    /* Fill in the ket */

    *code = OP_KET;
    PUT(code, 1, (uint32_t)(code - start_bracket));
    code += 1 + LINK_SIZE;

    /* Set values to pass back */

    *codeptr = code;
    *pptrptr = pptr;
    *firstcuptr = firstcu;
    *firstcuflagsptr = firstcuflags;
    *reqcuptr = reqcu;
    *reqcuflagsptr = reqcuflags;
    if (lengthptr != NULL)
      {
      /* Test before adding so that the accumulated length cannot exceed
      OFLOW_MAX. */

      if (OFLOW_MAX - *lengthptr < length)
        {
        *errorcodeptr = ERR20;
        return 0;
        }
      *lengthptr += length;
      }
    return okreturn;
    }

  /* Another branch follows. In the pre-compile phase, we can move the code
  pointer back to where it was for the start of the first branch. (That is,
  pretend that each branch is the only one.)

  In the real compile phase, insert an ALT node. Its length field points back
  to the previous branch while the bracket remains open. At the end the chain
  is reversed. It's done like this so that the start of the bracket has a
  zero offset until it is closed, making it possible to detect recursion. */

  if (lengthptr != NULL)
    {
    code = *codeptr + 1 + LINK_SIZE + skipunits;
    length += 1 + LINK_SIZE;
    }
  else
    {
    *code = OP_ALT;
    PUT(code, 1, (int)(code - last_branch));
    bc.current_branch = last_branch = code;
    code += 1 + LINK_SIZE;
    }

  /* Set the maximum lookbehind length for the next branch (if not in a
  lookbehind the value will be zero) and then advance past the vertical bar. */

  lookbehindlength = META_DATA(*pptr);
  pptr++;
  }

/* LCOV_EXCL_START */
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
return 0;                  /* Avoid compiler warnings */
/* LCOV_EXCL_STOP */
}



/*************************************************
*          Check for anchored pattern            *
*************************************************/

/* Try to find out if this is an anchored regular expression. Consider each
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
be found, because ^ generates OP_CIRCM in that mode.

We can also consider a regex to be anchored if OP_SOM starts all its branches.
This is the code for \G, which means "match at start of match position, taking
into account the match offset".

A branch is also implicitly anchored if it starts with .* and DOTALL is set,
because that will try the rest of the pattern at all possible matching points,
so there is no point trying again.... er ....

....
except when the .* appears inside capturing parentheses, and there is a8863subsequent back reference to those parentheses. We haven't enough information8864to catch that case precisely.88658866At first, the best we could do was to detect when .* was in capturing brackets8867and the highest back reference was greater than or equal to that level.8868However, by keeping a bitmap of the first 31 back references, we can catch some8869of the more common cases more precisely.88708871... A second exception is when the .* appears inside an atomic group, because8872this prevents the number of characters it matches from being adjusted.88738874Arguments:8875code points to start of the compiled pattern8876bracket_map a bitmap of which brackets we are inside while testing; this8877handles up to substring 31; after that we just have to take8878the less precise approach8879cb points to the compile data block8880atomcount atomic group level8881inassert TRUE if in an assertion8882dotstar_anchor TRUE if automatic anchoring optimization is enabled88838884Returns: TRUE or FALSE8885*/88868887static BOOL8888is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,8889int atomcount, BOOL inassert, BOOL dotstar_anchor)8890{8891do {8892PCRE2_SPTR scode = first_significant_code(8893code + PRIV(OP_lengths)[*code], FALSE);8894int op = *scode;88958896/* Non-capturing brackets */88978898if (op == OP_BRA || op == OP_BRAPOS ||8899op == OP_SBRA || op == OP_SBRAPOS)8900{8901if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))8902return FALSE;8903}89048905/* Capturing brackets */89068907else if (op == OP_CBRA || op == OP_CBRAPOS ||8908op == OP_SCBRA || op == OP_SCBRAPOS)8909{8910int n = GET2(scode, 1+LINK_SIZE);8911uint32_t new_map = bracket_map | ((n < 32)? 
(1u << n) : 1);8912if (!is_anchored(scode, new_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE;8913}89148915/* Positive forward assertion */89168917else if (op == OP_ASSERT || op == OP_ASSERT_NA)8918{8919if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) return FALSE;8920}89218922/* Condition. If there is no second branch, it can't be anchored. */89238924else if (op == OP_COND || op == OP_SCOND)8925{8926if (scode[GET(scode,1)] != OP_ALT) return FALSE;8927if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))8928return FALSE;8929}89308931/* Atomic groups */89328933else if (op == OP_ONCE)8934{8935if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))8936return FALSE;8937}89388939/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and8940it isn't in brackets that are or may be referenced or inside an atomic8941group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,8942because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/8943with the subject "aab", which matches "b", i.e. not at the start of a line.8944There is also an option that disables auto-anchoring. 
*/89458946else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||8947op == OP_TYPEPOSSTAR))8948{8949if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||8950atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)8951return FALSE;8952}89538954/* Check for explicit anchoring */89558956else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;89578958code += GET(code, 1);8959}8960while (*code == OP_ALT); /* Loop for each alternative */8961return TRUE;8962}8963896489658966/*************************************************8967* Check for starting with ^ or .* *8968*************************************************/89698970/* This is called to find out if every branch starts with ^ or .* so that8971"first char" processing can be done to speed things up in multiline8972matching and for non-DOTALL patterns that start with .* (which must start at8973the beginning or after \n). As in the case of is_anchored() (see above), we8974have to take account of back references to capturing brackets that contain .*8975because in that case we can't make the assumption. 
Also, the appearance of .*8976inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE8977or *SKIP does not count, because once again the assumption no longer holds.89788979Arguments:8980code points to start of the compiled pattern or a group8981bracket_map a bitmap of which brackets we are inside while testing; this8982handles up to substring 31; after that we just have to take8983the less precise approach8984cb points to the compile data8985atomcount atomic group level8986inassert TRUE if in an assertion8987dotstar_anchor TRUE if automatic anchoring optimization is enabled89888989Returns: TRUE or FALSE8990*/89918992static BOOL8993is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,8994int atomcount, BOOL inassert, BOOL dotstar_anchor)8995{8996do {8997PCRE2_SPTR scode = first_significant_code(8998code + PRIV(OP_lengths)[*code], FALSE);8999int op = *scode;90009001/* If we are at the start of a conditional assertion group, *both* the9002conditional assertion *and* what follows the condition must satisfy the test9003for start of line. Other kinds of condition fail. Note that there may be an9004auto-callout at the start of a condition. 
*/90059006if (op == OP_COND)9007{9008scode += 1 + LINK_SIZE;90099010if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];9011else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);90129013switch (*scode)9014{9015case OP_CREF:9016case OP_DNCREF:9017case OP_RREF:9018case OP_DNRREF:9019case OP_FAIL:9020case OP_FALSE:9021case OP_TRUE:9022return FALSE;90239024default: /* Assertion */9025if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))9026return FALSE;9027do scode += GET(scode, 1); while (*scode == OP_ALT);9028scode += 1 + LINK_SIZE;9029break;9030}9031scode = first_significant_code(scode, FALSE);9032op = *scode;9033}90349035/* Non-capturing brackets */90369037if (op == OP_BRA || op == OP_BRAPOS ||9038op == OP_SBRA || op == OP_SBRAPOS)9039{9040if (!is_startline(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))9041return FALSE;9042}90439044/* Capturing brackets */90459046else if (op == OP_CBRA || op == OP_CBRAPOS ||9047op == OP_SCBRA || op == OP_SCBRAPOS)9048{9049int n = GET2(scode, 1+LINK_SIZE);9050unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);9051if (!is_startline(scode, new_map, cb, atomcount, inassert, dotstar_anchor))9052return FALSE;9053}90549055/* Positive forward assertions */90569057else if (op == OP_ASSERT || op == OP_ASSERT_NA)9058{9059if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))9060return FALSE;9061}90629063/* Atomic brackets */90649065else if (op == OP_ONCE)9066{9067if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))9068return FALSE;9069}90709071/* .* means "start at start or after \n" if it isn't in atomic brackets or9072brackets that may be referenced or an assertion, and as long as the pattern9073does not contain *PRUNE or *SKIP, because these break the feature. Consider,9074for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",9075i.e. not at the start of a line. 
There is also an option that disables this9076optimization. */90779078else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)9079{9080if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||9081atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)9082return FALSE;9083}90849085/* Check for explicit circumflex; anything else gives a FALSE result. Note9086in particular that this includes atomic brackets OP_ONCE because the number9087of characters matched by .* cannot be adjusted inside them. */90889089else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;90909091/* Move on to the next alternative */90929093code += GET(code, 1);9094}9095while (*code == OP_ALT); /* Loop for each alternative */9096return TRUE;9097}9098909991009101/*************************************************9102* Scan compiled regex for recursion reference *9103*************************************************/91049105/* This function scans through a compiled pattern until it finds an instance of9106OP_RECURSE.91079108Arguments:9109code points to start of expression9110utf TRUE in UTF mode91119112Returns: pointer to the opcode for OP_RECURSE, or NULL if not found9113*/91149115static PCRE2_UCHAR *9116find_recurse(PCRE2_UCHAR *code, BOOL utf)9117{9118for (;;)9119{9120PCRE2_UCHAR c = *code;9121if (c == OP_END) return NULL;9122if (c == OP_RECURSE) return code;91239124/* XCLASS is used for classes that cannot be represented just by a bit map.9125This includes negated single high-valued characters. ECLASS is used for9126classes that use set operations internally. CALLOUT_STR is used for9127callouts with string arguments. In each case the length in the table is9128zero; the actual length is stored in the compiled code. 
*/91299130if (c == OP_XCLASS || c == OP_ECLASS) code += GET(code, 1);9131else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);91329133/* Otherwise, we can get the item's length from the table, except that for9134repeated character types, we have to test for \p and \P, which have an extra9135two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,9136we must add in its length. */91379138else9139{9140switch(c)9141{9142case OP_TYPESTAR:9143case OP_TYPEMINSTAR:9144case OP_TYPEPLUS:9145case OP_TYPEMINPLUS:9146case OP_TYPEQUERY:9147case OP_TYPEMINQUERY:9148case OP_TYPEPOSSTAR:9149case OP_TYPEPOSPLUS:9150case OP_TYPEPOSQUERY:9151if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;9152break;91539154case OP_TYPEPOSUPTO:9155case OP_TYPEUPTO:9156case OP_TYPEMINUPTO:9157case OP_TYPEEXACT:9158if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)9159code += 2;9160break;91619162case OP_MARK:9163case OP_COMMIT_ARG:9164case OP_PRUNE_ARG:9165case OP_SKIP_ARG:9166case OP_THEN_ARG:9167code += code[1];9168break;9169}91709171/* Add in the fixed length from the table */91729173code += PRIV(OP_lengths)[c];91749175/* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may9176be followed by a multi-unit character. The length in the table is a9177minimum, so we have to arrange to skip the extra units. 
*/91789179#ifdef MAYBE_UTF_MULTI9180if (utf) switch(c)9181{9182case OP_CHAR:9183case OP_CHARI:9184case OP_NOT:9185case OP_NOTI:9186case OP_EXACT:9187case OP_EXACTI:9188case OP_NOTEXACT:9189case OP_NOTEXACTI:9190case OP_UPTO:9191case OP_UPTOI:9192case OP_NOTUPTO:9193case OP_NOTUPTOI:9194case OP_MINUPTO:9195case OP_MINUPTOI:9196case OP_NOTMINUPTO:9197case OP_NOTMINUPTOI:9198case OP_POSUPTO:9199case OP_POSUPTOI:9200case OP_NOTPOSUPTO:9201case OP_NOTPOSUPTOI:9202case OP_STAR:9203case OP_STARI:9204case OP_NOTSTAR:9205case OP_NOTSTARI:9206case OP_MINSTAR:9207case OP_MINSTARI:9208case OP_NOTMINSTAR:9209case OP_NOTMINSTARI:9210case OP_POSSTAR:9211case OP_POSSTARI:9212case OP_NOTPOSSTAR:9213case OP_NOTPOSSTARI:9214case OP_PLUS:9215case OP_PLUSI:9216case OP_NOTPLUS:9217case OP_NOTPLUSI:9218case OP_MINPLUS:9219case OP_MINPLUSI:9220case OP_NOTMINPLUS:9221case OP_NOTMINPLUSI:9222case OP_POSPLUS:9223case OP_POSPLUSI:9224case OP_NOTPOSPLUS:9225case OP_NOTPOSPLUSI:9226case OP_QUERY:9227case OP_QUERYI:9228case OP_NOTQUERY:9229case OP_NOTQUERYI:9230case OP_MINQUERY:9231case OP_MINQUERYI:9232case OP_NOTMINQUERY:9233case OP_NOTMINQUERYI:9234case OP_POSQUERY:9235case OP_POSQUERYI:9236case OP_NOTPOSQUERY:9237case OP_NOTPOSQUERYI:9238if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);9239break;9240}9241#else9242(void)(utf); /* Keep compiler happy by referencing function argument */9243#endif /* MAYBE_UTF_MULTI */9244}9245}9246}9247924892499250/*************************************************9251* Check for asserted fixed first code unit *9252*************************************************/92539254/* During compilation, the "first code unit" settings from forward assertions9255are discarded, because they can cause conflicts with actual literals that9256follow. However, if we end up without a first code unit setting for an9257unanchored pattern, it is worth scanning the regex to see if there is an9258initial asserted first code unit. 
If all branches start with the same asserted9259code unit, or with a non-conditional bracket all of whose alternatives start9260with the same asserted code unit (recurse ad lib), then we return that code9261unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with9262REQ_NONE in the flags.92639264Arguments:9265code points to start of compiled pattern9266flags points to the first code unit flags9267inassert non-zero if in an assertion92689269Returns: the fixed first code unit, or 0 with REQ_NONE in flags9270*/92719272static uint32_t9273find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)9274{9275uint32_t c = 0;9276uint32_t cflags = REQ_NONE;92779278*flags = REQ_NONE;9279do {9280uint32_t d;9281uint32_t dflags;9282int xl = (*code == OP_CBRA || *code == OP_SCBRA ||9283*code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;9284PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);9285PCRE2_UCHAR op = *scode;92869287switch(op)9288{9289default:9290return 0;92919292case OP_BRA:9293case OP_BRAPOS:9294case OP_CBRA:9295case OP_SCBRA:9296case OP_CBRAPOS:9297case OP_SCBRAPOS:9298case OP_ASSERT:9299case OP_ASSERT_NA:9300case OP_ONCE:9301case OP_SCRIPT_RUN:9302d = find_firstassertedcu(scode, &dflags, inassert +9303((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));9304if (dflags >= REQ_NONE) return 0;9305if (cflags >= REQ_NONE) { c = d; cflags = dflags; }9306else if (c != d || cflags != dflags) return 0;9307break;93089309case OP_EXACT:9310scode += IMM2_SIZE;9311PCRE2_FALLTHROUGH /* Fall through */93129313case OP_CHAR:9314case OP_PLUS:9315case OP_MINPLUS:9316case OP_POSPLUS:9317if (inassert == 0) return 0;9318if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }9319else if (c != scode[1]) return 0;9320break;93219322case OP_EXACTI:9323scode += IMM2_SIZE;9324PCRE2_FALLTHROUGH /* Fall through */93259326case OP_CHARI:9327case OP_PLUSI:9328case OP_MINPLUSI:9329case OP_POSPLUSI:9330if (inassert == 0) return 0;93319332/* If the 
character is more than one code unit long, we cannot set its9333first code unit when matching caselessly. Later scanning may pick up9334multiple code units. */93359336#ifdef SUPPORT_UNICODE9337#if PCRE2_CODE_UNIT_WIDTH == 89338if (scode[1] >= 0x80) return 0;9339#elif PCRE2_CODE_UNIT_WIDTH == 169340if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;9341#endif9342#endif93439344if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }9345else if (c != scode[1]) return 0;9346break;9347}93489349code += GET(code, 1);9350}9351while (*code == OP_ALT);93529353*flags = cflags;9354return c;9355}9356935793589359/*************************************************9360* Skip in parsed pattern *9361*************************************************/93629363/* This function is called to skip parts of the parsed pattern when finding the9364length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find9365the end of the branch, it is called to skip over an internal lookaround or9366(DEFINE) group, and it is also called to skip to the end of a class, during9367which it will never encounter nested groups (but there's no need to have9368special code for that).93699370When called to find the end of a branch or group, pptr must point to the first9371meta code inside the branch, not the branch-starting code. 
In other cases it9372can point to the item that causes the function to be called.93739374Arguments:9375pptr current pointer to skip from9376skiptype PSKIP_CLASS when skipping to end of class9377PSKIP_ALT when META_ALT ends the skip9378PSKIP_KET when only META_KET ends the skip93799380Returns: new value of pptr9381NULL if META_END is reached - should never occur9382or for an unknown meta value - likewise9383*/93849385static uint32_t *9386parsed_skip(uint32_t *pptr, uint32_t skiptype)9387{9388uint32_t nestlevel = 0;93899390for (;; pptr++)9391{9392uint32_t meta = META_CODE(*pptr);93939394switch(meta)9395{9396default: /* Just skip over most items */9397if (meta < META_END) continue; /* Literal */9398break;93999400/* The parsed regex is malformed; we have reached the end and did9401not find the end of the construct which we are skipping over. */94029403/* LCOV_EXCL_START */9404case META_END:9405PCRE2_DEBUG_UNREACHABLE();9406return NULL;9407/* LCOV_EXCL_STOP */94089409/* The data for these items is variable in length. */94109411case META_BACKREF: /* Offset is present only if group >= 10 */9412if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;9413break;94149415case META_ESCAPE:9416if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)9417pptr += 1; /* Skip prop data */9418break;94199420case META_MARK: /* Add the length of the name. */9421case META_COMMIT_ARG:9422case META_PRUNE_ARG:9423case META_SKIP_ARG:9424case META_THEN_ARG:9425pptr += pptr[1];9426break;94279428/* These are the "active" items in this loop. 
*/94299430case META_CLASS_END:9431if (skiptype == PSKIP_CLASS) return pptr;9432break;94339434case META_ATOMIC:9435case META_CAPTURE:9436case META_COND_ASSERT:9437case META_COND_DEFINE:9438case META_COND_NAME:9439case META_COND_NUMBER:9440case META_COND_RNAME:9441case META_COND_RNUMBER:9442case META_COND_VERSION:9443case META_SCS:9444case META_LOOKAHEAD:9445case META_LOOKAHEADNOT:9446case META_LOOKAHEAD_NA:9447case META_LOOKBEHIND:9448case META_LOOKBEHINDNOT:9449case META_LOOKBEHIND_NA:9450case META_NOCAPTURE:9451case META_SCRIPT_RUN:9452nestlevel++;9453break;94549455case META_ALT:9456if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;9457break;94589459case META_KET:9460if (nestlevel == 0) return pptr;9461nestlevel--;9462break;9463}94649465/* The extra data item length for each meta is in a table. */94669467meta = (meta >> 16) & 0x7fff;9468if (meta >= sizeof(meta_extra_lengths)) return NULL;9469pptr += meta_extra_lengths[meta];9470}94719472/* LCOV_EXCL_START */9473PCRE2_UNREACHABLE(); /* Control never reaches here */9474/* LCOV_EXCL_STOP */9475}9476947794789479/*************************************************9480* Find length of a parsed group *9481*************************************************/94829483/* This is called for nested groups within a branch of a lookbehind whose9484length is being computed. On entry, the pointer must be at the first element9485after the group initializing code. On exit it points to OP_KET. 
Caching is used9486to improve processing speed when the same capturing group occurs many times.94879488Arguments:9489pptrptr pointer to pointer in the parsed pattern9490minptr where to return the minimum length9491isinline FALSE if a reference or recursion; TRUE for inline group9492errcodeptr pointer to the errorcode9493lcptr pointer to the loop counter9494group number of captured group or -1 for a non-capturing group9495recurses chain of recurse_check to catch mutual recursion9496cb pointer to the compile data94979498Returns: the maximum group length or a negative number9499*/95009501static int9502get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,9503int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)9504{9505uint32_t *gi = cb->groupinfo + 2 * group;9506int branchlength, branchminlength;9507int grouplength = -1;9508int groupminlength = INT_MAX;95099510/* The cache can be used only if there is no possibility of there being two9511groups with the same number. We do not need to set the end pointer for a group9512that is being processed as a back reference or recursion, but we must do so for9513an inline group. */95149515if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)9516{9517uint32_t groupinfo = gi[0];9518if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;9519if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)9520{9521if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);9522*minptr = gi[1];9523return groupinfo & GI_FIXED_LENGTH_MASK;9524}9525}95269527/* Scan the group. In this case we find the end pointer of necessity. 
*/95289529for(;;)9530{9531branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,9532recurses, cb);9533if (branchlength < 0) goto ISNOTFIXED;9534if (branchlength > grouplength) grouplength = branchlength;9535if (branchminlength < groupminlength) groupminlength = branchminlength;9536if (**pptrptr == META_KET) break;9537*pptrptr += 1; /* Skip META_ALT */9538}95399540if (group > 0)9541{9542gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);9543gi[1] = groupminlength;9544}95459546*minptr = groupminlength;9547return grouplength;95489549ISNOTFIXED:9550if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;9551return -1;9552}9553955495559556/*************************************************9557* Find length of a parsed branch *9558*************************************************/95599560/* Return fixed maximum and minimum lengths for a branch in a lookbehind,9561giving an error if the length is not limited. On entry, *pptrptr points to the9562first element inside the branch. On exit it is set to point to the ALT or KET.95639564Arguments:9565pptrptr pointer to pointer in the parsed pattern9566minptr where to return the minimum length9567errcodeptr pointer to error code9568lcptr pointer to loop counter9569recurses chain of recurse_check to catch mutual recursion9570cb pointer to compile block95719572Returns: the maximum length, or a negative value on error9573*/95749575static int9576get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,9577parsed_recurse_check *recurses, compile_block *cb)9578{9579int branchlength = 0;9580int branchminlength = 0;9581int grouplength, groupminlength;9582uint32_t lastitemlength = 0;9583uint32_t lastitemminlength = 0;9584uint32_t *pptr = *pptrptr;9585PCRE2_SIZE offset;9586parsed_recurse_check this_recurse;95879588/* A large and/or complex regex can take too long to process. This can happen9589more often when (?| groups are present in the pattern because their length9590cannot be cached. 
*/95919592if ((*lcptr)++ > 2000)9593{9594*errcodeptr = ERR35; /* Lookbehind is too complicated */9595return -1;9596}95979598/* Scan the branch, accumulating the length. */95999600for (;; pptr++)9601{9602parsed_recurse_check *r;9603uint32_t *gptr, *gptrend;9604uint32_t escape;9605uint32_t min, max;9606uint32_t group = 0;9607uint32_t itemlength = 0;9608uint32_t itemminlength = 0;96099610if (*pptr < META_END)9611{9612itemlength = itemminlength = 1;9613}96149615else switch (META_CODE(*pptr))9616{9617case META_KET:9618case META_ALT:9619goto EXIT;96209621/* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the9622actual termination. */96239624case META_ACCEPT:9625case META_FAIL:9626pptr = parsed_skip(pptr, PSKIP_ALT);9627if (pptr == NULL) goto PARSED_SKIP_FAILED;9628goto EXIT;96299630case META_MARK:9631case META_COMMIT_ARG:9632case META_PRUNE_ARG:9633case META_SKIP_ARG:9634case META_THEN_ARG:9635pptr += pptr[1] + 1;9636break;96379638case META_CIRCUMFLEX:9639case META_COMMIT:9640case META_DOLLAR:9641case META_PRUNE:9642case META_SKIP:9643case META_THEN:9644break;96459646case META_OPTIONS:9647pptr += 2;9648break;96499650case META_BIGVALUE:9651itemlength = itemminlength = 1;9652pptr += 1;9653break;96549655case META_CLASS:9656case META_CLASS_NOT:9657itemlength = itemminlength = 1;9658pptr = parsed_skip(pptr, PSKIP_CLASS);9659if (pptr == NULL) goto PARSED_SKIP_FAILED;9660break;96619662case META_CLASS_EMPTY_NOT:9663case META_DOT:9664itemlength = itemminlength = 1;9665break;96669667case META_CALLOUT_NUMBER:9668pptr += 3;9669break;96709671case META_CALLOUT_STRING:9672pptr += 3 + SIZEOFFSET;9673break;96749675/* Only some escapes consume a character. Of those, \R can match one or two9676characters, but \X is never allowed because it matches an unknown number of9677characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. 
*/96789679case META_ESCAPE:9680escape = META_DATA(*pptr);9681if (escape == ESC_X) return -1;9682if (escape == ESC_R)9683{9684itemminlength = 1;9685itemlength = 2;9686}9687else if (escape > ESC_b && escape < ESC_Z)9688{9689#if PCRE2_CODE_UNIT_WIDTH != 329690if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)9691{9692*errcodeptr = ERR36;9693return -1;9694}9695#endif9696itemlength = itemminlength = 1;9697if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */9698}9699break;97009701/* Lookaheads do not contribute to the length of this branch, but they may9702contain lookbehinds within them whose lengths need to be set. */97039704case META_LOOKAHEAD:9705case META_LOOKAHEADNOT:9706case META_LOOKAHEAD_NA:9707case META_SCS:9708*errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);9709if (*errcodeptr != 0) return -1;97109711/* Ignore any qualifiers that follow a lookahead assertion. */97129713switch (pptr[1])9714{9715case META_ASTERISK:9716case META_ASTERISK_PLUS:9717case META_ASTERISK_QUERY:9718case META_PLUS:9719case META_PLUS_PLUS:9720case META_PLUS_QUERY:9721case META_QUERY:9722case META_QUERY_PLUS:9723case META_QUERY_QUERY:9724pptr++;9725break;97269727case META_MINMAX:9728case META_MINMAX_PLUS:9729case META_MINMAX_QUERY:9730pptr += 3;9731break;97329733default:9734break;9735}9736break;97379738/* A nested lookbehind does not contribute any length to this lookbehind,9739but must itself be checked and have its lengths set. Note that9740set_lookbehind_lengths() updates pptr, leaving it pointing to the final ket9741of the group, so no need to update it here. */97429743case META_LOOKBEHIND:9744case META_LOOKBEHINDNOT:9745case META_LOOKBEHIND_NA:9746if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))9747return -1;9748break;97499750/* Back references and recursions are handled by very similar code. 
At this9751stage, the names generated in the parsing pass are available, but the main9752name table has not yet been created. So for the named varieties, scan the9753list of names in order to get the number of the first one in the pattern,9754and whether or not this name is duplicated. */97559756case META_BACKREF_BYNAME:9757if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)9758goto ISNOTFIXED;9759PCRE2_FALLTHROUGH /* Fall through */97609761case META_RECURSE_BYNAME:9762{9763PCRE2_SPTR name;9764BOOL is_dupname = FALSE;9765named_group *ng;9766uint32_t meta_code = META_CODE(*pptr);9767uint32_t length = *(++pptr);97689769GETPLUSOFFSET(offset, pptr);9770name = cb->start_pattern + offset;9771ng = PRIV(compile_find_named_group)(name, length, cb);97729773if (ng == NULL)9774{9775*errcodeptr = ERR15; /* Non-existent subpattern */9776cb->erroroffset = offset;9777return -1;9778}97799780group = ng->number;9781is_dupname = (ng->hash_dup & NAMED_GROUP_IS_DUPNAME) != 0;97829783/* A numerical back reference can be fixed length if duplicate capturing9784groups are not being used. A non-duplicate named back reference can also9785be handled. */97869787if (meta_code == META_RECURSE_BYNAME ||9788(!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))9789goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */9790}9791goto ISNOTFIXED; /* Duplicate name or number */97929793/* The offset values for back references < 10 are in a separate vector9794because otherwise they would use more than two parsed pattern elements on979564-bit systems. */97969797case META_BACKREF:9798if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||9799(cb->external_flags & PCRE2_DUPCAPUSED) != 0)9800goto ISNOTFIXED;9801group = META_DATA(*pptr);9802if (group < 10)9803{9804offset = cb->small_ref_offset[group];9805goto RECURSE_OR_BACKREF_LENGTH;9806}98079808PCRE2_FALLTHROUGH /* Fall through */9809/* For groups >= 10 - picking up group twice does no harm. 
*/98109811/* A true recursion implies not fixed length, but a subroutine call may9812be OK. Back reference "recursions" are also failed. */98139814case META_RECURSE:9815group = META_DATA(*pptr);9816GETPLUSOFFSET(offset, pptr);98179818RECURSE_OR_BACKREF_LENGTH:9819if (group > cb->bracount)9820{9821cb->erroroffset = offset;9822*errcodeptr = ERR15; /* Non-existent subpattern */9823return -1;9824}9825if (group == 0) goto ISNOTFIXED; /* Local recursion */9826for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)9827{9828if (META_CODE(*gptr) == META_BIGVALUE) gptr++;9829else if (*gptr == (META_CAPTURE | group)) break;9830}98319832/* We must start the search for the end of the group at the first meta code9833inside the group. Otherwise it will be treated as an enclosed group. */98349835gptrend = parsed_skip(gptr + 1, PSKIP_KET);9836if (gptrend == NULL) goto PARSED_SKIP_FAILED;9837if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */9838for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;9839if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */9840this_recurse.prev = recurses;9841this_recurse.groupptr = gptr;98429843/* We do not need to know the position of the end of the group, that is,9844gptr is not used after the call to get_grouplength(). Setting the second9845argument FALSE stops it scanning for the end when the length can be found9846in the cache. */98479848gptr++;9849grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,9850lcptr, group, &this_recurse, cb);9851if (grouplength < 0)9852{9853if (*errcodeptr == 0) goto ISNOTFIXED;9854return -1; /* Error already set */9855}9856itemlength = grouplength;9857itemminlength = groupminlength;9858break;98599860/* A (DEFINE) group is never obeyed inline and so it does not contribute to9861the length of this branch. Skip from the following item to the next9862unpaired ket. 
*/98639864case META_COND_DEFINE:9865pptr = parsed_skip(pptr + 1, PSKIP_KET);9866break;98679868/* Check other nested groups - advance past the initial data for each type9869and then seek a fixed length with get_grouplength(). */98709871case META_COND_NAME:9872case META_COND_NUMBER:9873case META_COND_RNAME:9874case META_COND_RNUMBER:9875pptr += 2 + SIZEOFFSET;9876goto CHECK_GROUP;98779878case META_COND_ASSERT:9879pptr += 1;9880goto CHECK_GROUP;98819882case META_COND_VERSION:9883pptr += 4;9884goto CHECK_GROUP;98859886case META_CAPTURE:9887group = META_DATA(*pptr);9888PCRE2_FALLTHROUGH /* Fall through */98899890case META_ATOMIC:9891case META_NOCAPTURE:9892case META_SCRIPT_RUN:9893pptr++;9894CHECK_GROUP:9895grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,9896lcptr, group, recurses, cb);9897if (grouplength < 0) return -1;9898itemlength = grouplength;9899itemminlength = groupminlength;9900break;99019902case META_QUERY:9903case META_QUERY_PLUS:9904case META_QUERY_QUERY:9905min = 0;9906max = 1;9907goto REPETITION;99089909/* Exact repetition is OK; variable repetition is not. A repetition of zero9910must subtract the length that has already been added. */99119912case META_MINMAX:9913case META_MINMAX_PLUS:9914case META_MINMAX_QUERY:9915min = pptr[1];9916max = pptr[2];9917pptr += 2;99189919REPETITION:9920if (max != REPEAT_UNLIMITED)9921{9922if (lastitemlength != 0 && /* Should not occur, but just in case */9923max != 0 &&9924(INT_MAX - branchlength)/lastitemlength < max - 1)9925{9926*errcodeptr = ERR87; /* Integer overflow; lookbehind too big */9927return -1;9928}9929if (min == 0) branchminlength -= lastitemminlength;9930else itemminlength = (min - 1) * lastitemminlength;9931if (max == 0) branchlength -= lastitemlength;9932else itemlength = (max - 1) * lastitemlength;9933break;9934}9935PCRE2_FALLTHROUGH /* Fall through */99369937/* Any other item means this branch does not have a fixed length. 
*/99389939default:9940ISNOTFIXED:9941*errcodeptr = ERR25; /* Not fixed length */9942return -1;9943}99449945/* Add the item length to the branchlength, checking for integer overflow and9946for the branch length exceeding the overall limit. Later, if there is at9947least one variable-length branch in the group, there is a test for the9948(smaller) variable-length branch length limit. */99499950if (INT_MAX - branchlength < (int)itemlength ||9951(branchlength += itemlength) > LOOKBEHIND_MAX)9952{9953*errcodeptr = ERR87;9954return -1;9955}99569957branchminlength += itemminlength;99589959/* Save this item length for use if the next item is a quantifier. */99609961lastitemlength = itemlength;9962lastitemminlength = itemminlength;9963}99649965EXIT:9966*pptrptr = pptr;9967*minptr = branchminlength;9968return branchlength;99699970/* LCOV_EXCL_START */9971PARSED_SKIP_FAILED:9972PCRE2_DEBUG_UNREACHABLE();9973*errcodeptr = ERR90; /* Unhandled META code - internal error */9974return -1;9975/* LCOV_EXCL_STOP */9976}9977997899799980/*************************************************9981* Set lengths in a lookbehind *9982*************************************************/99839984/* This function is called for each lookbehind, to set the lengths in its9985branches. An error occurs if any branch does not have a limited maximum length9986that is less than the limit (65535). On exit, the pointer must be left on the9987final ket.99889989The function also maintains the max_lookbehind value. Any lookbehind branch9990that contains a nested lookbehind may actually look further back than the9991length of the branch. 
The additional amount is passed back from9992get_branchlength() as an "extra" value.99939994Arguments:9995pptrptr pointer to pointer in the parsed pattern9996errcodeptr pointer to error code9997lcptr pointer to loop counter9998recurses chain of recurse_check to catch mutual recursion9999cb pointer to compile block1000010001Returns: TRUE if all is well10002FALSE otherwise, with error code and offset set10003*/1000410005static BOOL10006set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,10007parsed_recurse_check *recurses, compile_block *cb)10008{10009PCRE2_SIZE offset;10010uint32_t *bptr = *pptrptr;10011uint32_t *gbptr = bptr;10012int maxlength = 0;10013int minlength = INT_MAX;10014BOOL variable = FALSE;1001510016READPLUSOFFSET(offset, bptr); /* Offset for error messages */10017*pptrptr += SIZEOFFSET;1001810019/* Each branch can have a different maximum length, but we can keep only a10020single minimum for the whole group, because there's nowhere to save individual10021values in the META_ALT item. */1002210023do10024{10025int branchlength, branchminlength;1002610027*pptrptr += 1;10028branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,10029recurses, cb);1003010031if (branchlength < 0)10032{10033/* The errorcode and offset may already be set from a nested lookbehind. */10034if (*errcodeptr == 0) *errcodeptr = ERR25;10035if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;10036return FALSE;10037}1003810039if (branchlength != branchminlength) variable = TRUE;10040if (branchminlength < minlength) minlength = branchminlength;10041if (branchlength > maxlength) maxlength = branchlength;10042if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;10043*bptr |= branchlength; /* branchlength never more than 65535 */10044bptr = *pptrptr;10045}10046while (META_CODE(*bptr) == META_ALT);1004710048/* If any branch is of variable length, the whole lookbehind is of variable10049length. 
If the maximum length of any branch exceeds the maximum for variable10050lookbehinds, give an error. Otherwise, the minimum length is set in the word10051that follows the original group META value. For a fixed-length lookbehind, this10052is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but10053possibly different) length. */1005410055if (variable)10056{10057gbptr[1] = minlength;10058if ((PCRE2_SIZE)maxlength > cb->max_varlookbehind)10059{10060*errcodeptr = ERR100;10061cb->erroroffset = offset;10062return FALSE;10063}10064}10065else gbptr[1] = LOOKBEHIND_MAX;1006610067return TRUE;10068}10069100701007110072/*************************************************10073* Check parsed pattern lookbehinds *10074*************************************************/1007510076/* This function is called at the end of parsing a pattern if any lookbehinds10077were encountered. It scans the parsed pattern for them, calling10078set_lookbehind_lengths() for each one. At the start, the errorcode is zero and10079the error offset is marked unset. The enables the functions above not to10080override settings from deeper nestings.1008110082This function is called recursively from get_branchlength() for lookaheads in10083order to process any lookbehinds that they may contain. 
It stops when it hits a10084non-nested closing parenthesis in this case, returning a pointer to it.1008510086Arguments10087pptr points to where to start (start of pattern or start of lookahead)10088retptr if not NULL, return the ket pointer here10089recurses chain of recurse_check to catch mutual recursion10090cb points to the compile block10091lcptr points to loop counter1009210093Returns: 0 on success, or an errorcode (cb->erroroffset will be set)10094*/1009510096static int10097check_lookbehinds(uint32_t *pptr, uint32_t **retptr,10098parsed_recurse_check *recurses, compile_block *cb, int *lcptr)10099{10100int errorcode = 0;10101int nestlevel = 0;1010210103cb->erroroffset = PCRE2_UNSET;1010410105for (; *pptr != META_END; pptr++)10106{10107if (*pptr < META_END) continue; /* Literal */1010810109switch (META_CODE(*pptr))10110{10111/* The following erroroffset is a bogus but safe value. This branch should10112be avoided by providing a proper implementation for all supported cases10113below. 
*/1011410115/* LCOV_EXCL_START */10116default:10117PCRE2_DEBUG_UNREACHABLE();10118cb->erroroffset = 0;10119return ERR70; /* Unrecognized meta code */10120/* LCOV_EXCL_STOP */1012110122case META_ESCAPE:10123if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)10124pptr += 1; /* Skip prop data */10125break;1012610127case META_KET:10128if (--nestlevel < 0)10129{10130if (retptr != NULL) *retptr = pptr;10131return 0;10132}10133break;1013410135case META_ATOMIC:10136case META_CAPTURE:10137case META_COND_ASSERT:10138case META_SCS:10139case META_LOOKAHEAD:10140case META_LOOKAHEADNOT:10141case META_LOOKAHEAD_NA:10142case META_NOCAPTURE:10143case META_SCRIPT_RUN:10144nestlevel++;10145break;1014610147case META_ACCEPT:10148case META_ALT:10149case META_ASTERISK:10150case META_ASTERISK_PLUS:10151case META_ASTERISK_QUERY:10152case META_BACKREF:10153case META_CIRCUMFLEX:10154case META_CLASS:10155case META_CLASS_EMPTY:10156case META_CLASS_EMPTY_NOT:10157case META_CLASS_END:10158case META_CLASS_NOT:10159case META_COMMIT:10160case META_DOLLAR:10161case META_DOT:10162case META_FAIL:10163case META_PLUS:10164case META_PLUS_PLUS:10165case META_PLUS_QUERY:10166case META_PRUNE:10167case META_QUERY:10168case META_QUERY_PLUS:10169case META_QUERY_QUERY:10170case META_RANGE_ESCAPED:10171case META_RANGE_LITERAL:10172case META_SKIP:10173case META_THEN:10174break;1017510176case META_OFFSET:10177case META_RECURSE:10178pptr += SIZEOFFSET;10179break;1018010181case META_BACKREF_BYNAME:10182case META_RECURSE_BYNAME:10183pptr += 1 + SIZEOFFSET;10184break;1018510186case META_COND_DEFINE:10187pptr += SIZEOFFSET;10188nestlevel++;10189break;1019010191case META_COND_NAME:10192case META_COND_NUMBER:10193case META_COND_RNAME:10194case META_COND_RNUMBER:10195pptr += 1 + SIZEOFFSET;10196nestlevel++;10197break;1019810199case META_COND_VERSION:10200pptr += 3;10201nestlevel++;10202break;1020310204case META_CALLOUT_STRING:10205pptr += 3 + SIZEOFFSET;10206break;1020710208case META_BIGVALUE:10209case 
META_POSIX:10210case META_POSIX_NEG:10211case META_CAPTURE_NAME:10212case META_CAPTURE_NUMBER:10213pptr += 1;10214break;1021510216case META_MINMAX:10217case META_MINMAX_QUERY:10218case META_MINMAX_PLUS:10219case META_OPTIONS:10220pptr += 2;10221break;1022210223case META_CALLOUT_NUMBER:10224pptr += 3;10225break;1022610227case META_MARK:10228case META_COMMIT_ARG:10229case META_PRUNE_ARG:10230case META_SKIP_ARG:10231case META_THEN_ARG:10232pptr += 1 + pptr[1];10233break;1023410235/* Note that set_lookbehind_lengths() updates pptr, leaving it pointing to10236the final ket of the group, so no need to update it here. */1023710238case META_LOOKBEHIND:10239case META_LOOKBEHINDNOT:10240case META_LOOKBEHIND_NA:10241if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))10242return errorcode;10243break;10244}10245}1024610247return 0;10248}10249102501025110252/*************************************************10253* External function to compile a pattern *10254*************************************************/1025510256/* This function reads a regular expression in the form of a string and returns10257a pointer to a block of store holding a compiled version of the expression.1025810259Arguments:10260pattern the regular expression10261patlen the length of the pattern, or PCRE2_ZERO_TERMINATED10262options option bits10263errorptr pointer to errorcode10264erroroffset pointer to error offset10265ccontext points to a compile context or is NULL1026610267Returns: pointer to compiled data block, or NULL on error,10268with errorcode and erroroffset set10269*/1027010271PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION10272pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,10273int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)10274{10275BOOL utf; /* Set TRUE for UTF mode */10276BOOL ucp; /* Set TRUE for UCP mode */10277BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */10278BOOL zero_terminated; /* Set TRUE for 
zero-terminated pattern */10279pcre2_real_code *re = NULL; /* What we will return */10280compile_block cb; /* "Static" compile-time data */10281const uint8_t *tables; /* Char tables base pointer */1028210283PCRE2_UCHAR null_str[1] = { 0xcd }; /* Dummy for handling null inputs */10284PCRE2_UCHAR *code; /* Current pointer in compiled code */10285PCRE2_UCHAR *codestart; /* Start of compiled code */10286PCRE2_SPTR ptr; /* Current pointer in pattern */10287uint32_t *pptr; /* Current pointer in parsed pattern */1028810289PCRE2_SIZE length = 1; /* Allow for final END opcode */10290PCRE2_SIZE usedlength; /* Actual length used */10291PCRE2_SIZE re_blocksize; /* Size of memory block */10292PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */1029310294uint32_t firstcuflags, reqcuflags; /* Type of first/req code unit */10295uint32_t firstcu, reqcu; /* Value of first/req code unit */10296uint32_t setflags = 0; /* NL and BSR set flags */10297uint32_t xoptions; /* Flags from context, modified */1029810299uint32_t skipatstart; /* When checking (*UTF) etc */10300uint32_t limit_heap = UINT32_MAX;10301uint32_t limit_match = UINT32_MAX; /* Unset match limits */10302uint32_t limit_depth = UINT32_MAX;1030310304int newline = 0; /* Unset; can be set by the pattern */10305int bsr = 0; /* Unset; can be set by the pattern */10306int errorcode = 0; /* Initialize to avoid compiler warn */10307int regexrc; /* Return from compile */1030810309uint32_t i; /* Local loop counter */1031010311/* Enable all optimizations by default. */10312uint32_t optim_flags = ccontext != NULL ? ccontext->optimization_flags :10313PCRE2_OPTIMIZATION_ALL;1031410315/* Comments at the head of this file explain about these variables. 
*/1031610317uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];10318uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];10319named_group named_groups[NAMED_GROUP_LIST_SIZE];1032010321/* The workspace is used in different ways in the different compiling phases.10322It needs to be 16-bit aligned for the preliminary parsing scan. */1032310324uint32_t c16workspace[C16_WORK_SIZE];10325PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;103261032710328/* -------------- Check arguments and set up the pattern ----------------- */1032910330/* There must be error code and offset pointers. */1033110332if (errorptr == NULL)10333{10334if (erroroffset != NULL) *erroroffset = 0;10335return NULL;10336}10337if (erroroffset == NULL)10338{10339if (errorptr != NULL) *errorptr = ERR120;10340return NULL;10341}10342*errorptr = ERR0;10343*erroroffset = 0;1034410345/* There must be a pattern, but NULL is allowed with zero length. */1034610347if (pattern == NULL)10348{10349if (patlen == 0)10350pattern = null_str;10351else10352{10353*errorptr = ERR16;10354return NULL;10355}10356}1035710358/* A NULL compile context means "use a default context" */1035910360if (ccontext == NULL)10361ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));1036210363/* PCRE2_MATCH_INVALID_UTF implies UTF */1036410365if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;1036610367/* Check that all undefined public option bits are zero. */1036810369if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||10370(ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)10371{10372*errorptr = ERR17;10373return NULL;10374}1037510376if ((options & PCRE2_LITERAL) != 0 &&10377((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||10378(ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))10379{10380*errorptr = ERR92;10381return NULL;10382}1038310384/* A zero-terminated pattern is indicated by the special length value10385PCRE2_ZERO_TERMINATED. Check for an overlong pattern. 
*/1038610387if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))10388patlen = PRIV(strlen)(pattern);10389(void)zero_terminated; /* Silence compiler; only used if Valgrind enabled */1039010391if (patlen > ccontext->max_pattern_length)10392{10393*errorptr = ERR88;10394return NULL;10395}1039610397/* Optimization flags in 'options' can override those in the compile context.10398This is because some options to disable optimizations were added before the10399optimization flags word existed, and we need to continue supporting them10400for backwards compatibility. */1040110402if ((options & PCRE2_NO_AUTO_POSSESS) != 0)10403optim_flags &= ~PCRE2_OPTIM_AUTO_POSSESS;10404if ((options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)10405optim_flags &= ~PCRE2_OPTIM_DOTSTAR_ANCHOR;10406if ((options & PCRE2_NO_START_OPTIMIZE) != 0)10407optim_flags &= ~PCRE2_OPTIM_START_OPTIMIZE;1040810409/* From here on, all returns from this function should end up going via the10410EXIT label. */104111041210413/* ------------ Initialize the "static" compile data -------------- */1041410415tables = (ccontext->tables != NULL)? 
ccontext->tables : PRIV(default_tables);1041610417cb.lcc = tables + lcc_offset; /* Individual */10418cb.fcc = tables + fcc_offset; /* character */10419cb.cbits = tables + cbits_offset; /* tables */10420cb.ctypes = tables + ctypes_offset;1042110422cb.assert_depth = 0;10423cb.bracount = 0;10424cb.cx = ccontext;10425cb.dupnames = FALSE;10426cb.end_pattern = pattern + patlen;10427cb.erroroffset = 0;10428cb.external_flags = 0;10429cb.external_options = options;10430cb.groupinfo = stack_groupinfo;10431cb.had_recurse = FALSE;10432cb.lastcapture = 0;10433cb.max_lookbehind = 0; /* Max encountered */10434cb.max_varlookbehind = ccontext->max_varlookbehind; /* Limit */10435cb.name_entry_size = 0;10436cb.name_table = NULL;10437cb.named_groups = named_groups;10438cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;10439cb.names_found = 0;10440cb.parens_depth = 0;10441cb.parsed_pattern = stack_parsed_pattern;10442cb.req_varyopt = 0;10443cb.start_code = cworkspace;10444cb.start_pattern = pattern;10445cb.start_workspace = cworkspace;10446cb.workspace_size = COMPILE_WORK_SIZE;10447cb.first_data = NULL;10448cb.last_data = NULL;10449#ifdef SUPPORT_WIDE_CHARS10450cb.char_lists_size = 0;10451#endif1045210453/* Maximum back reference and backref bitmap. The bitmap records up to 31 back10454references to help in deciding whether (.*) can be treated as anchored or not.10455*/1045610457cb.top_backref = 0;10458cb.backref_map = 0;1045910460/* Escape sequences \1 to \9 are always back references, but as they are only10461two characters long, only two elements can be used in the parsed_pattern10462vector. The first contains the reference, and we'd like to use the second to10463record the offset in the pattern, so that forward references to non-existent10464groups can be diagnosed later with an offset. However, on 64-bit systems,10465PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first10466occurrence of \1 to \9, indexed by the second parsed_pattern value. 
All other10467references have enough space for the offset to be put into the parsed pattern.10468*/1046910470for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;104711047210473/* --------------- Start looking at the pattern --------------- */1047410475/* Unless PCRE2_LITERAL is set, check for global one-time option settings at10476the start of the pattern, and remember the offset to the actual regex. With10477valgrind support, make the terminator of a zero-terminated pattern10478inaccessible. This catches bugs that would otherwise only show up for10479non-zero-terminated patterns. */1048010481#ifdef SUPPORT_VALGRIND10482if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));10483#endif1048410485xoptions = ccontext->extra_options;10486ptr = pattern;10487skipatstart = 0;1048810489if ((options & PCRE2_LITERAL) == 0)10490{10491while (patlen - skipatstart >= 2 &&10492ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&10493ptr[skipatstart+1] == CHAR_ASTERISK)10494{10495for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)10496{10497const pso *p = pso_list + i;1049810499if (patlen - skipatstart - 2 >= p->length &&10500PRIV(strncmp_c8)(ptr + skipatstart + 2, p->name, p->length) == 0)10501{10502uint32_t c, pp;1050310504skipatstart += p->length + 2;10505switch(p->type)10506{10507case PSO_OPT:10508cb.external_options |= p->value;10509break;1051010511case PSO_XOPT:10512xoptions |= p->value;10513break;1051410515case PSO_FLG:10516setflags |= p->value;10517break;1051810519case PSO_NL:10520newline = p->value;10521setflags |= PCRE2_NL_SET;10522break;1052310524case PSO_BSR:10525bsr = p->value;10526setflags |= PCRE2_BSR_SET;10527break;1052810529case PSO_LIMM:10530case PSO_LIMD:10531case PSO_LIMH:10532c = 0;10533pp = skipatstart;10534while (pp < patlen && IS_DIGIT(ptr[pp]))10535{10536if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */10537c = c*10 + (ptr[pp++] - CHAR_0);10538}10539if (pp >= patlen || pp == skipatstart || ptr[pp] != 
CHAR_RIGHT_PARENTHESIS)10540{10541errorcode = ERR60;10542ptr += pp;10543utf = FALSE; /* Used by HAD_EARLY_ERROR */10544goto HAD_EARLY_ERROR;10545}10546if (p->type == PSO_LIMH) limit_heap = c;10547else if (p->type == PSO_LIMM) limit_match = c;10548else limit_depth = c;10549skipatstart = ++pp;10550break;1055110552case PSO_OPTMZ:10553optim_flags &= ~(p->value);1055410555/* For backward compatibility the three original VERBs to disable10556optimizations need to also update the corresponding bit in the10557external options. */1055810559switch(p->value)10560{10561case PCRE2_OPTIM_AUTO_POSSESS:10562cb.external_options |= PCRE2_NO_AUTO_POSSESS;10563break;1056410565case PCRE2_OPTIM_DOTSTAR_ANCHOR:10566cb.external_options |= PCRE2_NO_DOTSTAR_ANCHOR;10567break;1056810569case PCRE2_OPTIM_START_OPTIMIZE:10570cb.external_options |= PCRE2_NO_START_OPTIMIZE;10571break;10572}1057310574break;1057510576/* LCOV_EXCL_START */10577default:10578/* All values in the enum need an explicit entry for this switch10579but until a better way to prevent coding mistakes is invented keep10580a catch all that triggers a debug build assert as a failsafe */10581PCRE2_DEBUG_UNREACHABLE();10582/* LCOV_EXCL_STOP */10583}10584break; /* Out of the table scan loop */10585}10586}10587if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */10588}10589PCRE2_ASSERT(skipatstart <= patlen);10590}1059110592/* End of pattern-start options; advance to start of real regex. */1059310594ptr += skipatstart;1059510596/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */1059710598#ifndef SUPPORT_UNICODE10599if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)10600{10601errorcode = ERR32;10602goto HAD_EARLY_ERROR;10603}10604#endif1060510606/* Check UTF. We have the original options in 'options', with that value as10607modified by (*UTF) etc in cb->external_options. 
The extra option10608PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the10609surrogate code points cannot be represented in UTF-16. */1061010611utf = (cb.external_options & PCRE2_UTF) != 0;10612if (utf)10613{10614if ((options & PCRE2_NEVER_UTF) != 0)10615{10616errorcode = ERR74;10617goto HAD_EARLY_ERROR;10618}10619if ((options & PCRE2_NO_UTF_CHECK) == 0 &&10620(errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)10621goto HAD_ERROR; /* Offset was set by valid_utf() */1062210623#if PCRE2_CODE_UNIT_WIDTH == 1610624if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)10625{10626errorcode = ERR91;10627goto HAD_EARLY_ERROR;10628}10629#endif10630}1063110632/* Check UCP lockout. */1063310634ucp = (cb.external_options & PCRE2_UCP) != 0;10635if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)10636{10637errorcode = ERR75;10638goto HAD_EARLY_ERROR;10639}1064010641/* PCRE2_EXTRA_TURKISH_CASING checks */1064210643if ((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)10644{10645if (!utf && !ucp)10646{10647errorcode = ERR104;10648goto HAD_EARLY_ERROR;10649}1065010651#if PCRE2_CODE_UNIT_WIDTH == 810652if (!utf)10653{10654errorcode = ERR105;10655goto HAD_EARLY_ERROR;10656}10657#endif1065810659if ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)10660{10661errorcode = ERR106;10662goto HAD_EARLY_ERROR;10663}10664}1066510666/* Process the BSR setting. */1066710668if (bsr == 0) bsr = ccontext->bsr_convention;1066910670/* Process the newline setting. 
*/1067110672if (newline == 0) newline = ccontext->newline_convention;10673cb.nltype = NLTYPE_FIXED;10674switch(newline)10675{10676case PCRE2_NEWLINE_CR:10677cb.nllen = 1;10678cb.nl[0] = CHAR_CR;10679break;1068010681case PCRE2_NEWLINE_LF:10682cb.nllen = 1;10683cb.nl[0] = CHAR_NL;10684break;1068510686case PCRE2_NEWLINE_NUL:10687cb.nllen = 1;10688cb.nl[0] = CHAR_NUL;10689break;1069010691case PCRE2_NEWLINE_CRLF:10692cb.nllen = 2;10693cb.nl[0] = CHAR_CR;10694cb.nl[1] = CHAR_NL;10695break;1069610697case PCRE2_NEWLINE_ANY:10698cb.nltype = NLTYPE_ANY;10699break;1070010701case PCRE2_NEWLINE_ANYCRLF:10702cb.nltype = NLTYPE_ANYCRLF;10703break;1070410705/* LCOV_EXCL_START */10706default:10707PCRE2_DEBUG_UNREACHABLE();10708errorcode = ERR56;10709goto HAD_EARLY_ERROR;10710/* LCOV_EXCL_STOP */10711}1071210713/* Pre-scan the pattern to do two things: (1) Discover the named groups and10714their numerical equivalents, so that this information is always available for10715the remaining processing. (2) At the same time, parse the pattern and put a10716processed version into the parsed_pattern vector. This has escapes interpreted10717and comments removed (amongst other things). */1071810719/* Ensure that the parsed pattern buffer is big enough. For many smaller10720patterns the vector on the stack (which was set up above) can be used. */1072110722parsed_size_needed = max_parsed_pattern(ptr, cb.end_pattern, utf, options);1072310724/* Allow for 2x uint32_t at the start and 2 at the end, for10725PCRE2_EXTRA_MATCH_WORD or PCRE2_EXTRA_MATCH_LINE (which are exclusive). */1072610727if ((ccontext->extra_options &10728(PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)10729parsed_size_needed += 4;1073010731/* When PCRE2_AUTO_CALLOUT is set we allow for one callout at the end. 
*/1073210733if ((options & PCRE2_AUTO_CALLOUT) != 0)10734parsed_size_needed += 4;1073510736parsed_size_needed += 1; /* For the final META_END */1073710738if (parsed_size_needed > PARSED_PATTERN_DEFAULT_SIZE)10739{10740uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(10741parsed_size_needed * sizeof(uint32_t), ccontext->memctl.memory_data);10742if (heap_parsed_pattern == NULL)10743{10744*errorptr = ERR21;10745goto EXIT;10746}10747cb.parsed_pattern = heap_parsed_pattern;10748}10749cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed;1075010751/* Do the parsing scan. */1075210753errorcode = parse_regex(ptr, cb.external_options, xoptions, &has_lookbehind, &cb);10754if (errorcode != 0) goto HAD_CB_ERROR;1075510756/* If there are any lookbehinds, scan the parsed pattern to figure out their10757lengths. Workspace is needed to remember whether numbered groups are or are not10758of limited length, and if limited, what the minimum and maximum lengths are.10759This caching saves re-computing the length of any group that is referenced more10760than once, which is particularly relevant when recursion is involved.10761Unnumbered groups do not have this exposure because they cannot be referenced.10762If there are sufficiently few groups, the default index vector on the stack, as10763set up above, can be used. Otherwise we have to get/free some heap memory. The10764vector must be initialized to zero. 
*/1076510766if (has_lookbehind)10767{10768int loopcount = 0;10769if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)10770{10771cb.groupinfo = ccontext->memctl.malloc(10772(2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);10773if (cb.groupinfo == NULL)10774{10775errorcode = ERR21;10776cb.erroroffset = 0;10777goto HAD_CB_ERROR;10778}10779}10780memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));10781errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);10782if (errorcode != 0) goto HAD_CB_ERROR;10783}1078410785/* For debugging, there is a function that shows the parsed pattern vector. */1078610787#ifdef DEBUG_SHOW_PARSED10788fprintf(stderr, "+++ Pre-scan complete:\n");10789show_parsed(&cb);10790#endif1079110792/* For debugging capturing information this code can be enabled. */1079310794#ifdef DEBUG_SHOW_CAPTURES10795{10796named_group *ng = cb.named_groups;10797fprintf(stderr, "+++Captures: %d\n", cb.bracount);10798for (i = 0; i < cb.names_found; i++, ng++)10799{10800fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);10801}10802}10803#endif1080410805/* Pretend to compile the pattern while actually just accumulating the amount10806of memory required in the 'length' variable. This behaviour is triggered by10807passing a non-NULL final argument to compile_regex(). We pass a block of10808workspace (cworkspace) for it to compile parts of the pattern into; the10809compiled code is discarded when it is no longer needed, so hopefully this10810workspace will never overflow, though there is a test for its doing so.1081110812On error, errorcode will be set non-zero, so we don't need to look at the10813result of the function. The initial options have been put into the cb block,10814but we still have to pass a separate options variable (the first argument)10815because the options may change as the pattern is processed. 
*/1081610817cb.erroroffset = patlen; /* For any subsequent errors that do not set it */10818pptr = cb.parsed_pattern;10819code = cworkspace;10820*code = OP_BRA;1082110822(void)compile_regex(cb.external_options, xoptions, &code, &pptr,10823&errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,10824&cb, &length);1082510826if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */1082710828/* This should be caught in compile_regex(), but just in case... */1082910830#if defined SUPPORT_WIDE_CHARS10831PCRE2_ASSERT((cb.char_lists_size & 0x3) == 0);10832if (length > MAX_PATTERN_SIZE ||10833MAX_PATTERN_SIZE - length < (cb.char_lists_size / sizeof(PCRE2_UCHAR)))10834#else10835if (length > MAX_PATTERN_SIZE)10836#endif10837{10838errorcode = ERR20;10839cb.erroroffset = 0;10840goto HAD_CB_ERROR;10841}1084210843/* Compute the size of, then, if not too large, get and initialize the data10844block for storing the compiled pattern and names table. Integer overflow should10845no longer be possible because nowadays we limit the maximum value of10846cb.names_found and cb.name_entry_size. */1084710848re_blocksize =10849CU2BYTES((PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);1085010851#if defined SUPPORT_WIDE_CHARS10852if (cb.char_lists_size != 0)10853{10854#if PCRE2_CODE_UNIT_WIDTH != 3210855/* Align to 32 bit first. This ensures the10856allocated area will also be 32 bit aligned. 
*/10857re_blocksize = (PCRE2_SIZE)CLIST_ALIGN_TO(re_blocksize, sizeof(uint32_t));10858#endif10859re_blocksize += cb.char_lists_size;10860}10861#endif1086210863re_blocksize += CU2BYTES(length);1086410865if (re_blocksize > ccontext->max_pattern_compiled_length)10866{10867errorcode = ERR101;10868cb.erroroffset = 0;10869goto HAD_CB_ERROR;10870}1087110872re_blocksize += sizeof(pcre2_real_code);10873re = (pcre2_real_code *)10874ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);10875if (re == NULL)10876{10877errorcode = ERR21;10878cb.erroroffset = 0;10879goto HAD_CB_ERROR;10880}1088110882/* The compiler may put padding at the end of the pcre2_real_code structure in10883order to round it up to a multiple of 4 or 8 bytes. This means that when a10884compiled pattern is copied (for example, when serialized) undefined bytes are10885read, and this annoys debuggers such as valgrind. To avoid this, we explicitly10886write to the last 8 bytes of the structure before setting the fields. 
*/1088710888memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);10889re->memctl = ccontext->memctl;10890re->tables = tables;10891re->executable_jit = NULL;10892memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));10893re->blocksize = re_blocksize;10894re->code_start = re_blocksize - CU2BYTES(length);10895re->magic_number = MAGIC_NUMBER;10896re->compile_options = options;10897re->overall_options = cb.external_options;10898re->extra_options = xoptions;10899re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;10900re->limit_heap = limit_heap;10901re->limit_match = limit_match;10902re->limit_depth = limit_depth;10903re->first_codeunit = 0;10904re->last_codeunit = 0;10905re->bsr_convention = bsr;10906re->newline_convention = newline;10907re->max_lookbehind = 0;10908re->minlength = 0;10909re->top_bracket = 0;10910re->top_backref = 0;10911re->name_entry_size = cb.name_entry_size;10912re->name_count = cb.names_found;10913re->optimization_flags = optim_flags;1091410915/* The basic block is immediately followed by the name table, and the compiled10916code follows after that. */1091710918codestart = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);1091910920/* Update the compile data block for the actual compile. The starting points of10921the name/number translation table and of the code are passed around in the10922compile data block. The start/end pattern and initial options are already set10923from the pre-compile phase, as is the name_entry_size field. */1092410925cb.parens_depth = 0;10926cb.assert_depth = 0;10927cb.lastcapture = 0;10928cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));10929cb.start_code = codestart;10930cb.req_varyopt = 0;10931cb.had_accept = FALSE;10932cb.had_pruneorskip = FALSE;10933#ifdef SUPPORT_WIDE_CHARS10934cb.char_lists_size = 0;10935#endif109361093710938/* If any named groups were found, create the name/number table from the list10939created in the pre-pass. 
*/1094010941if (cb.names_found > 0)10942{10943named_group *ng = cb.named_groups;10944uint32_t tablecount = 0;1094510946/* Length 0 represents duplicates, and they have already been handled. */10947for (i = 0; i < cb.names_found; i++, ng++)10948if (ng->length > 0)10949tablecount = PRIV(compile_add_name_to_table)(&cb, ng, tablecount);1095010951PCRE2_ASSERT(tablecount == cb.names_found);10952}1095310954/* Set up a starting, non-extracting bracket, then compile the expression. On10955error, errorcode will be set non-zero, so we don't need to look at the result10956of the function here. */1095710958pptr = cb.parsed_pattern;10959code = (PCRE2_UCHAR *)codestart;10960*code = OP_BRA;10961regexrc = compile_regex(re->overall_options, re->extra_options, &code,10962&pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,10963NULL, &cb, NULL);10964if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;10965re->top_bracket = cb.bracount;10966re->top_backref = cb.top_backref;10967re->max_lookbehind = cb.max_lookbehind;1096810969if (cb.had_accept)10970{10971reqcu = 0; /* Must disable after (*ACCEPT) */10972reqcuflags = REQ_NONE;10973re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */10974}1097510976/* Fill in the final opcode and check for disastrous overflow. If no overflow,10977but the estimated length exceeds the really used length, adjust the value of10978re->blocksize, and if valgrind support is configured, mark the extra allocated10979memory as unaddressable, so that any out-of-bound reads can be detected. 
*/1098010981*code++ = OP_END;10982usedlength = code - codestart;10983/* LCOV_EXCL_START */10984if (usedlength > length)10985{10986PCRE2_DEBUG_UNREACHABLE();10987errorcode = ERR23; /* Overflow of code block - internal error */10988cb.erroroffset = 0;10989goto HAD_CB_ERROR;10990}10991/* LCOV_EXCL_STOP */1099210993re->blocksize -= CU2BYTES(length - usedlength);10994#ifdef SUPPORT_VALGRIND10995VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));10996#endif1099710998/* Scan the pattern for recursion/subroutine calls and convert the group10999numbers into offsets. Maintain a small cache so that repeated groups containing11000recursions are efficiently handled. */1100111002#define RSCAN_CACHE_SIZE 81100311004if (errorcode == 0 && cb.had_recurse)11005{11006PCRE2_UCHAR *rcode;11007PCRE2_SPTR rgroup;11008unsigned int ccount = 0;11009int start = RSCAN_CACHE_SIZE;11010recurse_cache rc[RSCAN_CACHE_SIZE];1101111012for (rcode = find_recurse(codestart, utf);11013rcode != NULL;11014rcode = find_recurse(rcode + 1 + LINK_SIZE, utf))11015{11016int p, groupnumber;1101711018groupnumber = (int)GET(rcode, 1);11019if (groupnumber == 0) rgroup = codestart; else11020{11021PCRE2_SPTR search_from = codestart;11022rgroup = NULL;11023for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)11024{11025if (groupnumber == rc[p].groupnumber)11026{11027rgroup = rc[p].group;11028break;11029}1103011031/* Group n+1 must always start to the right of group n, so we can save11032search time below when the new group number is greater than any of the11033previously found groups. 
*/1103411035if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;11036}1103711038if (rgroup == NULL)11039{11040rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);11041/* LCOV_EXCL_START */11042if (rgroup == NULL)11043{11044PCRE2_DEBUG_UNREACHABLE();11045errorcode = ERR53;11046break;11047}11048/* LCOV_EXCL_STOP */1104911050if (--start < 0) start = RSCAN_CACHE_SIZE - 1;11051rc[start].groupnumber = groupnumber;11052rc[start].group = rgroup;11053if (ccount < RSCAN_CACHE_SIZE) ccount++;11054}11055}1105611057PUT(rcode, 1, (uint32_t)(rgroup - codestart));11058}11059}1106011061/* In rare debugging situations we sometimes need to look at the compiled code11062at this stage. */1106311064#ifdef DEBUG_CALL_PRINTINT11065pcre2_printint(re, stderr, TRUE);11066fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);11067#endif1106811069/* Unless disabled, check whether any single character iterators can be11070auto-possessified. The function overwrites the appropriate opcode values, so11071the type of the pointer must be cast. NOTE: the intermediate variable "temp" is11072used in this code because at least one compiler gives a warning about loss of11073"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the11074function call. */1107511076if (errorcode == 0 && (optim_flags & PCRE2_OPTIM_AUTO_POSSESS) != 0)11077{11078PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;11079int possessify_rc = PRIV(auto_possessify)(temp, &cb);11080/* LCOV_EXCL_START */11081if (possessify_rc != 0)11082{11083PCRE2_DEBUG_UNREACHABLE();11084errorcode = ERR80;11085cb.erroroffset = 0;11086}11087/* LCOV_EXCL_STOP */11088}1108911090/* Failed to compile, or error while post-processing. */1109111092if (errorcode != 0) goto HAD_CB_ERROR;1109311094/* Successful compile. 
If the anchored option was not passed, set it if11095we can determine that the pattern is anchored by virtue of ^ characters or \A11096or anything else, such as starting with non-atomic .* when DOTALL is set and11097there are no occurrences of *PRUNE or *SKIP (though there is an option to11098disable this case). */1109911100if ((re->overall_options & PCRE2_ANCHORED) == 0)11101{11102BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);11103if (is_anchored(codestart, 0, &cb, 0, FALSE, dotstar_anchor))11104re->overall_options |= PCRE2_ANCHORED;11105}1110611107/* Set up the first code unit or startline flag, the required code unit, and11108then study the pattern. This code need not be obeyed if PCRE2_OPTIM_START_OPTIMIZE11109is disabled, as the data it would create will not be used. Note that a first code11110unit (but not the startline flag) is useful for anchored patterns because it11111can still give a quick "no match" and also avoid searching for a last code11112unit. */1111311114if ((optim_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)11115{11116int minminlength = 0; /* For minimal minlength from first/required CU */11117int study_rc;1111811119/* If we do not have a first code unit, see if there is one that is asserted11120(these are not saved during the compile because they can cause conflicts with11121actual literals that follow). */1112211123if (firstcuflags >= REQ_NONE) {11124uint32_t assertedcuflags = 0;11125uint32_t assertedcu = find_firstassertedcu(codestart, &assertedcuflags, 0);11126/* It would be wrong to use the asserted first code unit as `firstcu` for11127* regexes which are able to match a 1-character string (e.g. /(?=a)b?a/)11128* For that example, if we set both firstcu and reqcu to 'a', it would mean11129* the subject string needs to be at least 2 characters long, which is wrong.11130* With more analysis, we would be able to set firstcu in more cases. 
*/11131if (assertedcuflags < REQ_NONE && assertedcu != reqcu) {11132firstcu = assertedcu;11133firstcuflags = assertedcuflags;11134}11135}1113611137/* Save the data for a first code unit. The existence of one means the11138minimum length must be at least 1. */1113911140if (firstcuflags < REQ_NONE)11141{11142re->first_codeunit = firstcu;11143re->flags |= PCRE2_FIRSTSET;11144minminlength++;1114511146/* Handle caseless first code units. */1114711148if ((firstcuflags & REQ_CASELESS) != 0)11149{11150if (firstcu < 128 || (!utf && !ucp && firstcu < 255))11151{11152if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;11153}1115411155/* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.11156In 8-bit UTF mode, code units in the range 128-255 are introductory code11157units and cannot have another case, but if UCP is set they may do. */1115811159#ifdef SUPPORT_UNICODE11160#if PCRE2_CODE_UNIT_WIDTH == 811161else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)11162re->flags |= PCRE2_FIRSTCASELESS;11163#else11164else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&11165UCD_OTHERCASE(firstcu) != firstcu)11166re->flags |= PCRE2_FIRSTCASELESS;11167#endif11168#endif /* SUPPORT_UNICODE */11169}11170}1117111172/* When there is no first code unit, for non-anchored patterns, see if we can11173set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all11174branches start with ^ and also when all branches start with non-atomic .* for11175non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option11176that disables this case.) */1117711178else if ((re->overall_options & PCRE2_ANCHORED) == 0)11179{11180BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);11181if (is_startline(codestart, 0, &cb, 0, FALSE, dotstar_anchor))11182re->flags |= PCRE2_STARTLINE;11183}1118411185/* Handle the "required code unit", if one is set. 
In the UTF case we can11186increment the minimum minimum length only if we are sure this really is a11187different character and not a non-starting code unit of the first character,11188because the minimum length count is in characters, not code units. */1118911190if (reqcuflags < REQ_NONE)11191{11192#if PCRE2_CODE_UNIT_WIDTH == 1611193if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */11194firstcuflags >= REQ_NONE || /* First not set */11195(firstcu & 0xf800) != 0xd800 || /* First not surrogate */11196(reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */11197#elif PCRE2_CODE_UNIT_WIDTH == 811198if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */11199firstcuflags >= REQ_NONE || /* First not set */11200(firstcu & 0x80) == 0 || /* First is ASCII */11201(reqcu & 0x80) == 0) /* Req is ASCII */11202#endif11203{11204minminlength++;11205}1120611207/* In the case of an anchored pattern, set up the value only if it follows11208a variable length item in the pattern. */1120911210if ((re->overall_options & PCRE2_ANCHORED) == 0 ||11211(reqcuflags & REQ_VARY) != 0)11212{11213re->last_codeunit = reqcu;11214re->flags |= PCRE2_LASTSET;1121511216/* Handle caseless required code units as for first code units (above). */1121711218if ((reqcuflags & REQ_CASELESS) != 0)11219{11220if (reqcu < 128 || (!utf && !ucp && reqcu < 255))11221{11222if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;11223}11224#ifdef SUPPORT_UNICODE11225#if PCRE2_CODE_UNIT_WIDTH == 811226else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)11227re->flags |= PCRE2_LASTCASELESS;11228#else11229else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&11230UCD_OTHERCASE(reqcu) != reqcu)11231re->flags |= PCRE2_LASTCASELESS;11232#endif11233#endif /* SUPPORT_UNICODE */11234}11235}11236}1123711238/* Study the compiled pattern to set up information such as a bitmap of11239starting code units and a minimum matching length. 
*/1124011241study_rc = PRIV(study)(re);11242/* LCOV_EXCL_START */11243if (study_rc != 0)11244{11245PCRE2_DEBUG_UNREACHABLE();11246errorcode = ERR31;11247cb.erroroffset = 0;11248goto HAD_CB_ERROR;11249}11250/* LCOV_EXCL_STOP */1125111252/* If study() set a bitmap of starting code units, it implies a minimum11253length of at least one. */1125411255if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)11256minminlength = 1;1125711258/* If the minimum length set (or not set) by study() is less than the minimum11259implied by required code units, override it. */1126011261if (re->minlength < minminlength) re->minlength = minminlength;11262} /* End of start-of-match optimizations. */1126311264/* Control ends up here in all cases. When running under valgrind, make a11265pattern's terminating zero defined again. If memory was obtained for the parsed11266version of the pattern, free it before returning. Also free the list of named11267groups if a larger one had to be obtained, and likewise the group information11268vector. */1126911270#ifdef SUPPORT_UNICODE11271/* All items must be freed. */11272PCRE2_ASSERT(cb.first_data == NULL);11273#endif1127411275EXIT:11276#ifdef SUPPORT_VALGRIND11277if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));11278#endif11279if (cb.parsed_pattern != stack_parsed_pattern)11280ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);11281if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)11282ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);11283if (cb.groupinfo != stack_groupinfo)11284ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);1128511286return re; /* Will be NULL after an error */1128711288/* Errors discovered in parse_regex() set the offset value in the compile11289block. Errors discovered before it is called must compute it from the ptr11290value. 
After parse_regex() is called, the offset in the compile block is set to11291the end of the pattern, but certain errors in compile_regex() may reset it if11292an offset is available in the parsed pattern. */1129311294HAD_CB_ERROR:11295ptr = pattern + cb.erroroffset;1129611297HAD_EARLY_ERROR:11298/* Ensure we don't return out-of-range erroroffset. */11299PCRE2_ASSERT(ptr >= pattern);11300PCRE2_ASSERT(ptr <= (pattern + patlen));11301/* Ensure that the erroroffset never slices a UTF-encoded character in half.11302If the input is invalid, then we return an offset just before the first invalid11303character, so the text to the left of the offset must always be valid. */11304#if defined PCRE2_DEBUG && defined SUPPORT_UNICODE11305if (ptr > pattern && utf)11306{11307PCRE2_SPTR prev = ptr - 1;11308PCRE2_SIZE dummyoffset;11309BACKCHAR(prev);11310PCRE2_ASSERT(prev >= pattern);11311PCRE2_ASSERT(PRIV(valid_utf)(prev, ptr - prev, &dummyoffset) == 0);11312}11313#endif11314*erroroffset = ptr - pattern;1131511316HAD_ERROR:11317*errorptr = errorcode;11318pcre2_code_free(re);11319re = NULL;1132011321if (cb.first_data != NULL)11322{11323compile_data* current_data = cb.first_data;11324do11325{11326compile_data* next_data = current_data->next;11327cb.cx->memctl.free(current_data, cb.cx->memctl.memory_data);11328current_data = next_data;11329}11330while (current_data != NULL);11331}1133211333goto EXIT;11334}1133511336/* These #undefs are here to enable unity builds with CMake. */1133711338#undef NLBLOCK /* Block containing newline information */11339#undef PSSTART /* Field containing processed string start */11340#undef PSEND /* Field containing processed string end */1134111342/* End of pcre2_compile.c */113431134411345