Path: blob/master/thirdparty/pcre2/src/pcre2_compile.c
9898 views
/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8Original API code Copyright (c) 1997-2012 University of Cambridge9New API code Copyright (c) 2016-2024 University of Cambridge1011-----------------------------------------------------------------------------12Redistribution and use in source and binary forms, with or without13modification, are permitted provided that the following conditions are met:1415* Redistributions of source code must retain the above copyright notice,16this list of conditions and the following disclaimer.1718* Redistributions in binary form must reproduce the above copyright19notice, this list of conditions and the following disclaimer in the20documentation and/or other materials provided with the distribution.2122* Neither the name of the University of Cambridge nor the names of its23contributors may be used to endorse or promote products derived from24this software without specific prior written permission.2526THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE29ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE36POSSIBILITY OF SUCH DAMAGE.37-----------------------------------------------------------------------------38*/394041#ifdef HAVE_CONFIG_H42#include "config.h"43#endif4445#define NLBLOCK cb /* Block containing newline information */46#define PSSTART start_pattern /* Field containing processed string start */47#define PSEND end_pattern /* Field containing processed string end */4849#include "pcre2_compile.h"5051/* In rare error cases debugging might require calling pcre2_printint(). */5253#if 054#ifdef EBCDIC55#define PRINTABLE(c) ((c) >= 64 && (c) < 255)56#else57#define PRINTABLE(c) ((c) >= 32 && (c) < 127)58#endif59#include "pcre2_printint.c"60#define DEBUG_CALL_PRINTINT61#endif6263/* Other debugging code can be enabled by these defines. */6465/* #define DEBUG_SHOW_CAPTURES */66/* #define DEBUG_SHOW_PARSED */6768/* There are a few things that vary with different code unit sizes. Handle them69by defining macros in order to minimize #if usage. */7071#if PCRE2_CODE_UNIT_WIDTH == 872#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 573#define XDIGIT(c) xdigitab[c]7475#else /* Either 16-bit or 32-bit */76#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)7778#if PCRE2_CODE_UNIT_WIDTH == 1679#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 68081#else /* 32-bit */82#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 683#endif84#endif8586/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which87consists of uint32_t elements. 
Assume that if uint32_t can't hold it, two of88them will be able to (i.e. assume a 64-bit world). */8990#if PCRE2_SIZE_MAX <= UINT32_MAX91#define PUTOFFSET(s,p) *p++ = s92#define GETOFFSET(s,p) s = *p++93#define GETPLUSOFFSET(s,p) s = *(++p)94#define READPLUSOFFSET(s,p) s = p[1]95#define SKIPOFFSET(p) p++96#define SIZEOFFSET 197#else98#define PUTOFFSET(s,p) \99{ *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); }100#define GETOFFSET(s,p) \101{ s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; }102#define GETPLUSOFFSET(s,p) \103{ s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; }104#define READPLUSOFFSET(s,p) \105{ s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; }106#define SKIPOFFSET(p) p += 2107#define SIZEOFFSET 2108#endif109110/* Function definitions to allow mutual recursion */111112static int113compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,114uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,115open_capitem *, compile_block *, PCRE2_SIZE *);116117static int118get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,119compile_block *);120121static BOOL122set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,123compile_block *);124125static int126check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,127compile_block *, int *);128129130/*************************************************131* Code parameters and static tables *132*************************************************/133134#define MAX_GROUP_NUMBER 65535u135#define MAX_REPEAT_COUNT 65535u136#define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1)137138/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in139different ways in the different pattern scans. The parsing and group-140identifying pre-scan uses it to handle nesting, and needs it to be 16-bit141aligned for this. 
Having defined the size in code units, we set up142C16_WORK_SIZE as the number of elements in the 16-bit vector.143144During the first compiling phase, when determining how much memory is required,145the regex is partly compiled into this space, but the compiled parts are146discarded as soon as they can be, so that hopefully there will never be an147overrun. The code does, however, check for an overrun, which can occur for148pathological patterns. The size of the workspace depends on LINK_SIZE because149the length of compiled items varies with this.150151In the real compile phase, this workspace is not currently used. */152153#define COMPILE_WORK_SIZE (3000*LINK_SIZE) /* Size in code units */154155#define C16_WORK_SIZE \156((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))157158/* A uint32_t vector is used for caching information about the size of159capturing groups, to improve performance. A default is created on the stack of160this size. */161162#define GROUPINFO_DEFAULT_SIZE 256163164/* The overrun tests check for a slightly smaller size so that they detect the165overrun before it actually does run off the end of the data block. */166167#define WORK_SIZE_SAFETY_MARGIN (100)168169/* This value determines the size of the initial vector that is used for170remembering named groups during the pre-compile. It is allocated on the stack,171but if it is too small, it is expanded, in a similar way to the workspace. The172value is the number of slots in the list. */173174#define NAMED_GROUP_LIST_SIZE 20175176/* The pre-compiling pass over the pattern creates a parsed pattern in a vector177of uint32_t. For short patterns this lives on the stack, with this size. Heap178memory is used for longer patterns. */179180#define PARSED_PATTERN_DEFAULT_SIZE 1024181182/* Maximum length value to check against when making sure that the variable183that holds the compiled pattern length does not overflow. 
We make it a bit less184than INT_MAX to allow for adding in group terminating code units, so that we185don't have to check them every time. */186187#define OFLOW_MAX (INT_MAX - 20)188189/* Table of extra lengths for each of the meta codes. Must be kept in step with190the definitions above. For some items these values are a basic length to which191a variable amount has to be added. */192193static unsigned char meta_extra_lengths[] = {1940, /* META_END */1950, /* META_ALT */1960, /* META_ATOMIC */1970, /* META_BACKREF - more if group is >= 10 */1981+SIZEOFFSET, /* META_BACKREF_BYNAME */1991, /* META_BIGVALUE */2003, /* META_CALLOUT_NUMBER */2013+SIZEOFFSET, /* META_CALLOUT_STRING */2020, /* META_CAPTURE */2030, /* META_CIRCUMFLEX */2040, /* META_CLASS */2050, /* META_CLASS_EMPTY */2060, /* META_CLASS_EMPTY_NOT */2070, /* META_CLASS_END */2080, /* META_CLASS_NOT */2090, /* META_COND_ASSERT */210SIZEOFFSET, /* META_COND_DEFINE */2111+SIZEOFFSET, /* META_COND_NAME */2121+SIZEOFFSET, /* META_COND_NUMBER */2131+SIZEOFFSET, /* META_COND_RNAME */2141+SIZEOFFSET, /* META_COND_RNUMBER */2153, /* META_COND_VERSION */216SIZEOFFSET, /* META_OFFSET */2170, /* META_SCS */2181, /* META_SCS_NAME */2191, /* META_SCS_NUMBER */2200, /* META_DOLLAR */2210, /* META_DOT */2220, /* META_ESCAPE - one more for ESC_P and ESC_p */2230, /* META_KET */2240, /* META_NOCAPTURE */2252, /* META_OPTIONS */2261, /* META_POSIX */2271, /* META_POSIX_NEG */2280, /* META_RANGE_ESCAPED */2290, /* META_RANGE_LITERAL */230SIZEOFFSET, /* META_RECURSE */2311+SIZEOFFSET, /* META_RECURSE_BYNAME */2320, /* META_SCRIPT_RUN */2330, /* META_LOOKAHEAD */2340, /* META_LOOKAHEADNOT */235SIZEOFFSET, /* META_LOOKBEHIND */236SIZEOFFSET, /* META_LOOKBEHINDNOT */2370, /* META_LOOKAHEAD_NA */238SIZEOFFSET, /* META_LOOKBEHIND_NA */2391, /* META_MARK - plus the string length */2400, /* META_ACCEPT */2410, /* META_FAIL */2420, /* META_COMMIT */2431, /* META_COMMIT_ARG - plus the string length */2440, /* META_PRUNE */2451, /* 
META_PRUNE_ARG - plus the string length */2460, /* META_SKIP */2471, /* META_SKIP_ARG - plus the string length */2480, /* META_THEN */2491, /* META_THEN_ARG - plus the string length */2500, /* META_ASTERISK */2510, /* META_ASTERISK_PLUS */2520, /* META_ASTERISK_QUERY */2530, /* META_PLUS */2540, /* META_PLUS_PLUS */2550, /* META_PLUS_QUERY */2560, /* META_QUERY */2570, /* META_QUERY_PLUS */2580, /* META_QUERY_QUERY */2592, /* META_MINMAX */2602, /* META_MINMAX_PLUS */2612, /* META_MINMAX_QUERY */2620, /* META_ECLASS_AND */2630, /* META_ECLASS_OR */2640, /* META_ECLASS_SUB */2650, /* META_ECLASS_XOR */2660 /* META_ECLASS_NOT */267};268269/* Types for skipping parts of a parsed pattern. */270271enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };272273/* Values and flags for the unsigned xxcuflags variables that accompany xxcu274variables, which are concerned with first and required code units. A value275greater than or equal to REQ_NONE means "no code unit set"; otherwise the276matching xxcu variable is set, and the low valued bits are relevant. */277278#define REQ_UNSET 0xffffffffu /* Not yet found anything */279#define REQ_NONE 0xfffffffeu /* Found not fixed character */280#define REQ_CASELESS 0x00000001u /* Code unit in xxcu is caseless */281#define REQ_VARY 0x00000002u /* Code unit is followed by non-literal */282283/* These flags are used in the groupinfo vector. */284285#define GI_SET_FIXED_LENGTH 0x80000000u286#define GI_NOT_FIXED_LENGTH 0x40000000u287#define GI_FIXED_LENGTH_MASK 0x0000ffffu288289/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC290and is fast (a good compiler can turn it into a subtraction and unsigned291comparison). */292293#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)294295/* Table to identify hex digits. The tables in chartables are dependent on the296locale, and may mark arbitrary characters as digits. We want to recognize only2970-9, a-z, and A-Z as hex digits, which is why we have a private table here. 
It298costs 256 bytes, but it is a lot faster than doing character value tests (at299least in some simple cases I timed), and in some applications one wants PCRE2300to compile efficiently as well as match efficiently. The value in the table is301the binary hex digit value, or 0xff for non-hex digits. */302303/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in304UTF-8 mode. */305306#ifndef EBCDIC307static const uint8_t xdigitab[] =308{3090xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 */3100xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */3110xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 */3120xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */3130xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - ' */3140xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ( - / */3150x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 */3160x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 8 - ? */3170xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* @ - G */3180xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H - O */3190xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* P - W */3200xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* X - _ */3210xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* ` - g */3220xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h - o */3230xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* p - w */3240xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* x -127 */3250xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */3260xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */3270xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */3280xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */3290xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */3300xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */3310xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */3320xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */3330xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */3340xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 2ff-207 */3350xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 
*/3360xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */3370xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */3380xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */3390xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */3400xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */341342#else343344/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */345346static const uint8_t xdigitab[] =347{3480xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 0 */3490xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */3500xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 10 */3510xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */3520xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 32- 39 20 */3530xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40- 47 */3540xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 48- 55 30 */3550xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 56- 63 */3560xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - 71 40 */3570xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 72- | */3580xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* & - 87 50 */3590xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 88- 95 */3600xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - -103 60 */3610xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ? 
*/3620xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */3630xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- " */3640xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g 80 */3650xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h -143 */3660xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p 90 */3670xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* q -159 */3680xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x A0 */3690xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* y -175 */3700xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ^ -183 B0 */3710xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */3720xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* { - G C0 */3730xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H -207 */3740xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* } - P D0 */3750xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Q -223 */3760xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* \ - X E0 */3770xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Y -239 */3780x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 F0 */3790x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/* 8 -255 */380#endif /* EBCDIC */381382383/* Table for handling alphanumeric escaped characters. Positive returns are384simple data values; negative values are for special things like \d and so on.385Zero means further processing is needed (for things like \x), or the escape is386invalid. */387388/* This is the "normal" table for ASCII systems or for EBCDIC systems running389in UTF-8 mode. It runs from '0' to 'z'. */390391#ifndef EBCDIC392#define ESCAPES_FIRST CHAR_0393#define ESCAPES_LAST CHAR_z394#define UPPER_CASE(c) (c-32)395396static const short int escapes[] = {397/* 0 */ 0, /* 1 */ 0,398/* 2 */ 0, /* 3 */ 0,399/* 4 */ 0, /* 5 */ 0,400/* 6 */ 0, /* 7 */ 0,401/* 8 */ 0, /* 9 */ 0,402/* : */ CHAR_COLON, /* ; */ CHAR_SEMICOLON,403/* < */ CHAR_LESS_THAN_SIGN, /* = */ CHAR_EQUALS_SIGN,404/* > */ CHAR_GREATER_THAN_SIGN, /* ? 
*/ CHAR_QUESTION_MARK,405/* @ */ CHAR_COMMERCIAL_AT, /* A */ -ESC_A,406/* B */ -ESC_B, /* C */ -ESC_C,407/* D */ -ESC_D, /* E */ -ESC_E,408/* F */ 0, /* G */ -ESC_G,409/* H */ -ESC_H, /* I */ 0,410/* J */ 0, /* K */ -ESC_K,411/* L */ 0, /* M */ 0,412/* N */ -ESC_N, /* O */ 0,413/* P */ -ESC_P, /* Q */ -ESC_Q,414/* R */ -ESC_R, /* S */ -ESC_S,415/* T */ 0, /* U */ 0,416/* V */ -ESC_V, /* W */ -ESC_W,417/* X */ -ESC_X, /* Y */ 0,418/* Z */ -ESC_Z, /* [ */ CHAR_LEFT_SQUARE_BRACKET,419/* \ */ CHAR_BACKSLASH, /* ] */ CHAR_RIGHT_SQUARE_BRACKET,420/* ^ */ CHAR_CIRCUMFLEX_ACCENT, /* _ */ CHAR_UNDERSCORE,421/* ` */ CHAR_GRAVE_ACCENT, /* a */ CHAR_BEL,422/* b */ -ESC_b, /* c */ 0,423/* d */ -ESC_d, /* e */ CHAR_ESC,424/* f */ CHAR_FF, /* g */ 0,425/* h */ -ESC_h, /* i */ 0,426/* j */ 0, /* k */ -ESC_k,427/* l */ 0, /* m */ 0,428/* n */ CHAR_LF, /* o */ 0,429/* p */ -ESC_p, /* q */ 0,430/* r */ CHAR_CR, /* s */ -ESC_s,431/* t */ CHAR_HT, /* u */ 0,432/* v */ -ESC_v, /* w */ -ESC_w,433/* x */ 0, /* y */ 0,434/* z */ -ESC_z435};436437#else438439/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.440It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code441is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a442because it is defined as 'a', which of course picks up the ASCII value. 
*/443444#if 'a' == 0x81 /* Check for a real EBCDIC environment */445#define ESCAPES_FIRST CHAR_a446#define ESCAPES_LAST CHAR_9447#define UPPER_CASE(c) (c+64)448#else /* Testing in an ASCII environment */449#define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */450#define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */451#define UPPER_CASE(c) (c-32)452#endif453454static const short int escapes[] = {455/* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0,456/* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0,457/* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p,458/* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0,459/* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0,460/* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0,461/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,462/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',463/* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G,464/* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0,465/* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P,466/* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0,467/* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X,468/* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0,469/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,470/* F8 */ 0, 0471};472473/* We also need a table of characters that may follow \c in an EBCDIC474environment for characters 0-31. */475476static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";477478#endif /* EBCDIC */479480481/* Table of special "verbs" like (*PRUNE). This is a short table, so it is482searched linearly. Put all the names into a single string, in order to reduce483the number of relocations when a shared library is dynamically linked. The484string is built from string macros so that it works in UTF-8 mode on EBCDIC485platforms. 
*/486487typedef struct verbitem {488unsigned int len; /* Length of verb name */489uint32_t meta; /* Base META_ code */490int has_arg; /* Argument requirement */491} verbitem;492493static const char verbnames[] =494"\0" /* Empty name is a shorthand for MARK */495STRING_MARK0496STRING_ACCEPT0497STRING_F0498STRING_FAIL0499STRING_COMMIT0500STRING_PRUNE0501STRING_SKIP0502STRING_THEN;503504static const verbitem verbs[] = {505{ 0, META_MARK, +1 }, /* > 0 => must have an argument */506{ 4, META_MARK, +1 },507{ 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */508{ 1, META_FAIL, -1 },509{ 4, META_FAIL, -1 },510{ 6, META_COMMIT, 0 },511{ 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */512{ 4, META_SKIP, 0 },513{ 4, META_THEN, 0 }514};515516static const int verbcount = sizeof(verbs)/sizeof(verbitem);517518/* Verb opcodes, indexed by their META code offset from META_MARK. */519520static const uint32_t verbops[] = {521OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,522OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };523524/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. 
*/525526typedef struct alasitem {527unsigned int len; /* Length of name */528uint32_t meta; /* Base META_ code */529} alasitem;530531static const char alasnames[] =532STRING_pla0533STRING_plb0534STRING_napla0535STRING_naplb0536STRING_nla0537STRING_nlb0538STRING_positive_lookahead0539STRING_positive_lookbehind0540STRING_non_atomic_positive_lookahead0541STRING_non_atomic_positive_lookbehind0542STRING_negative_lookahead0543STRING_negative_lookbehind0544STRING_scs0545STRING_scan_substring0546STRING_atomic0547STRING_sr0548STRING_asr0549STRING_script_run0550STRING_atomic_script_run;551552static const alasitem alasmeta[] = {553{ 3, META_LOOKAHEAD },554{ 3, META_LOOKBEHIND },555{ 5, META_LOOKAHEAD_NA },556{ 5, META_LOOKBEHIND_NA },557{ 3, META_LOOKAHEADNOT },558{ 3, META_LOOKBEHINDNOT },559{ 18, META_LOOKAHEAD },560{ 19, META_LOOKBEHIND },561{ 29, META_LOOKAHEAD_NA },562{ 30, META_LOOKBEHIND_NA },563{ 18, META_LOOKAHEADNOT },564{ 19, META_LOOKBEHINDNOT },565{ 3, META_SCS },566{ 14, META_SCS },567{ 6, META_ATOMIC },568{ 2, META_SCRIPT_RUN }, /* sr = script run */569{ 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */570{ 10, META_SCRIPT_RUN }, /* script run */571{ 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */572};573574static const int alascount = sizeof(alasmeta)/sizeof(alasitem);575576/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */577578static uint32_t chartypeoffset[] = {579OP_STAR - OP_STAR, OP_STARI - OP_STAR,580OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };581582/* Tables of names of POSIX character classes and their lengths. The names are583now all in a single string, to reduce the number of relocations when a shared584library is dynamically loaded. The list of lengths is terminated by a zero585length entry. 
The first three must be alpha, lower, upper, as this is assumed586for handling case independence.587588The indices for several classes are stored in pcre2_compile.h - these must589be kept in sync with posix_names, posix_name_lengths, posix_class_maps,590and posix_substitutes. */591592static const char posix_names[] =593STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0594STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0595STRING_graph0 STRING_print0 STRING_punct0 STRING_space0596STRING_word0 STRING_xdigit;597598static const uint8_t posix_name_lengths[] = {5995, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };600601/* Table of class bit maps for each POSIX class. Each class is formed from a602base map, with an optional addition or removal of another map. Then, for some603classes, there is some additional tweaking: for [:blank:] the vertical space604characters are removed, and for [:alpha:] and [:alnum:] the underscore605character is removed. The triples in the table consist of the base map offset,606second map offset or -1 if no second map, and a non-negative value for map607addition or a negative value for map subtraction (if there are two maps). The608absolute value of the third field has these meanings: 0 => no tweaking, 1 =>609remove vertical space characters, 2 => remove underscore. 
*/610611const int PRIV(posix_class_maps)[] = {612cbit_word, cbit_digit, -2, /* alpha */613cbit_lower, -1, 0, /* lower */614cbit_upper, -1, 0, /* upper */615cbit_word, -1, 2, /* alnum - word without underscore */616cbit_print, cbit_cntrl, 0, /* ascii */617cbit_space, -1, 1, /* blank - a GNU extension */618cbit_cntrl, -1, 0, /* cntrl */619cbit_digit, -1, 0, /* digit */620cbit_graph, -1, 0, /* graph */621cbit_print, -1, 0, /* print */622cbit_punct, -1, 0, /* punct */623cbit_space, -1, 0, /* space */624cbit_word, -1, 0, /* word - a Perl extension */625cbit_xdigit, -1, 0 /* xdigit */626};627628#ifdef SUPPORT_UNICODE629630/* The POSIX class Unicode property substitutes that are used in UCP mode must631be in the order of the POSIX class names, defined above. */632633static int posix_substitutes[] = {634PT_GC, ucp_L, /* alpha */635PT_PC, ucp_Ll, /* lower */636PT_PC, ucp_Lu, /* upper */637PT_ALNUM, 0, /* alnum */638-1, 0, /* ascii, treat as non-UCP */639-1, 1, /* blank, treat as \h */640PT_PC, ucp_Cc, /* cntrl */641PT_PC, ucp_Nd, /* digit */642PT_PXGRAPH, 0, /* graph */643PT_PXPRINT, 0, /* print */644PT_PXPUNCT, 0, /* punct */645PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */646PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */647PT_PXXDIGIT, 0 /* xdigit */ /* Perl has additional hex digits */648};649#endif /* SUPPORT_UNICODE */650651/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset652are allowed. 
*/653654#define PUBLIC_LITERAL_COMPILE_OPTIONS \655(PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \656PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \657PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)658659#define PUBLIC_COMPILE_OPTIONS \660(PUBLIC_LITERAL_COMPILE_OPTIONS| \661PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \662PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \663PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \664PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \665PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \666PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_ALT_EXTENDED_CLASS)667668#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \669(PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD| \670PCRE2_EXTRA_CASELESS_RESTRICT|PCRE2_EXTRA_TURKISH_CASING)671672#define PUBLIC_COMPILE_EXTRA_OPTIONS \673(PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \674PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \675PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \676PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \677PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \678PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_PYTHON_OCTAL|PCRE2_EXTRA_NO_BS0| \679PCRE2_EXTRA_NEVER_CALLOUT)680681/* This is a table of start-of-pattern options such as (*UTF) and settings such682as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward683compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is684generic and always supported. 
*/685686enum { PSO_OPT, /* Value is an option bit */687PSO_XOPT, /* Value is an xoption bit */688PSO_FLG, /* Value is a flag bit */689PSO_NL, /* Value is a newline type */690PSO_BSR, /* Value is a \R type */691PSO_LIMH, /* Read integer value for heap limit */692PSO_LIMM, /* Read integer value for match limit */693PSO_LIMD, /* Read integer value for depth limit */694PSO_OPTMZ /* Value is an optimization bit */695};696697typedef struct pso {698const char *name;699uint16_t length;700uint16_t type;701uint32_t value;702} pso;703704/* NB: STRING_UTFn_RIGHTPAR contains the length as well */705706static const pso pso_list[] = {707{ STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF },708{ STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF },709{ STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP },710{ STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET },711{ STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET },712{ STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPTMZ, PCRE2_OPTIM_AUTO_POSSESS },713{ STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPTMZ, PCRE2_OPTIM_DOTSTAR_ANCHOR },714{ STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },715{ STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPTMZ, PCRE2_OPTIM_START_OPTIMIZE },716{ STRING_CASELESS_RESTRICT_RIGHTPAR, 18, PSO_XOPT, PCRE2_EXTRA_CASELESS_RESTRICT },717{ STRING_TURKISH_CASING_RIGHTPAR, 15, PSO_XOPT, PCRE2_EXTRA_TURKISH_CASING },718{ STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 },719{ STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },720{ STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 },721{ STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 },722{ STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR },723{ STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF },724{ STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF },725{ STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY },726{ STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL },727{ STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF },728{ STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF },729{ 
STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE }
};

/* This table is used when converting repeating opcodes into possessified
versions as a result of an explicit possessive quantifier such as ++. A zero
value means there is no possessified version - in those cases the item in
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
because all relevant opcodes are less than that. */

/* The table is indexed by opcode value. Each non-zero entry sits in the slot
of the greedy form of a repeat; the slot that follows it (the minimizing form)
is always zero, as are the slots of the already-possessive forms. */

static const uint8_t opcode_possessify[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */

  0,                       /* NOTI */
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  0,                       /* EXACT */
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */

  OP_POSSTARI, 0,          /* STARI, MINSTARI */
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  0,                       /* EXACTI */
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  0,                       /* NOTEXACT */
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */

  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  0,                       /* NOTEXACTI */
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  0,                       /* TYPEEXACT */
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */

  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */

  0, 0, 0, 0,              /* CLASS, NCLASS, XCLASS, ECLASS */
  0, 0,                    /* REF, REFI */
  0, 0,                    /* DNREF, DNREFI */
  0, 0,                    /* RECURSE, CALLOUT */
};

/* Compile-time check that the table has the correct size. */
STATIC_ASSERT(sizeof(opcode_possessify) == OP_CALLOUT+1, opcode_possessify);


#ifdef DEBUG_SHOW_PARSED
/*************************************************
*     Show the parsed pattern for debugging      *
*************************************************/

/* For debugging the pre-scan, this code, which outputs the parsed data vector,
can be enabled. */

/* The function walks the uint32_t vector at cb->parsed_pattern and writes one
line per item to stderr. Each line starts with the item's index in the vector
and its raw value in hex. The walk stops after printing META_END, or when an
unrecognized META code is met. Several META items are followed by extra data
words; each case below steps pptr past exactly the words it consumes, so the
order of the *pptr++ expressions matters.

NOTE(review): offsets are held in a PCRE2_SIZE variable but printed with %zd,
and several uint32_t values are printed with %d. If PCRE2_SIZE is an unsigned
size type, %zu would be the matching conversion - confirm before changing.

Argument:
  cb          the compile block whose parsed_pattern vector is to be shown

Returns:      nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);   /* Data part of the current item */

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Items below META_END are not META codes. The value is echoed as a
  character only when it lies strictly between 32 and 128. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr);
    pptr++;
    }

  /* Otherwise dispatch on the META code; pptr is advanced past the META item
  itself before the case body runs. */

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %d", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
    break;

    /* For references below 10 the offset is taken from the compile block
    rather than from the vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
    break;

    /* \P and \p are followed by one data word: upper 16 bits are printed as
    the first number, lower 16 bits as the second. */

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? CHAR_P:CHAR_p,
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      /* No table entry matched: show a question mark instead. */
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", cc);
      }
    break;

    /* The three {min,max} forms are each followed by two data words. */

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    /* The lookbehind items are followed by two data words, of which only the
    first is printed. */

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %d %d", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %d %d", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %d %d", meta_arg, *pptr);
    pptr += 2;
    break;

    /* Three data words follow; the third is shown as the callout number and
    the first two as "next". */

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
      }
    break;

    /* The by-name items below are each followed by a length word and then an
    offset. */

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* Here the offset comes first and the number after it, so the number is
    read at index SIZEOFFSET before the offset is consumed. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* Three data words: a flag (zero is shown as "=", anything else as ">="),
    then the two numbers printed either side of the dot. */

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%d.", *pptr++);
    fprintf(stderr, "%d)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_OFFSET:
    fprintf(stderr, "META_OFFSET offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_SCS:
    fprintf(stderr, "META (*scan_substring:");
    break;

    case META_SCS_NAME:
    fprintf(stderr, "META_SCS_NAME length=%d relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    case META_SCS_NUMBER:
    fprintf(stderr, "META_SCS_NUMBER %d relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    /* The verbs with an argument share the code at SHOWARG: a length word is
    followed by that many words, each printed as a character when it lies
    strictly between 32 and 128, and otherwise in \x{..} form. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;

    case META_ECLASS_AND: fprintf(stderr, "META_ECLASS_AND"); break;
    case META_ECLASS_OR: fprintf(stderr, "META_ECLASS_OR"); break;
    case META_ECLASS_SUB: fprintf(stderr, "META_ECLASS_SUB"); break;
    case META_ECLASS_XOR: fprintf(stderr, "META_ECLASS_XOR"); break;
    case META_ECLASS_NOT: fprintf(stderr, "META_ECLASS_NOT"); break;
    }
  fprintf(stderr, "\n");   /* Terminate the line for this item */
  }
return;
}
#endif  /* DEBUG_SHOW_PARSED */



/*************************************************
*               Copy compiled code               *
*************************************************/

/* Compiled JIT code cannot be copied, so the new compiled block has no
associated JIT data. 
*/10961097PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION1098pcre2_code_copy(const pcre2_code *code)1099{1100PCRE2_SIZE *ref_count;1101pcre2_code *newcode;11021103if (code == NULL) return NULL;1104newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);1105if (newcode == NULL) return NULL;1106memcpy(newcode, code, code->blocksize);1107newcode->executable_jit = NULL;11081109/* If the code is one that has been deserialized, increment the reference count1110in the decoded tables. */11111112if ((code->flags & PCRE2_DEREF_TABLES) != 0)1113{1114ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);1115(*ref_count)++;1116}11171118return newcode;1119}1120112111221123/*************************************************1124* Copy compiled code and character tables *1125*************************************************/11261127/* Compiled JIT code cannot be copied, so the new compiled block has no1128associated JIT data. This version of code_copy also makes a separate copy of1129the character tables. 
*/11301131PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION1132pcre2_code_copy_with_tables(const pcre2_code *code)1133{1134PCRE2_SIZE* ref_count;1135pcre2_code *newcode;1136uint8_t *newtables;11371138if (code == NULL) return NULL;1139newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);1140if (newcode == NULL) return NULL;1141memcpy(newcode, code, code->blocksize);1142newcode->executable_jit = NULL;11431144newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),1145code->memctl.memory_data);1146if (newtables == NULL)1147{1148code->memctl.free((void *)newcode, code->memctl.memory_data);1149return NULL;1150}1151memcpy(newtables, code->tables, TABLES_LENGTH);1152ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);1153*ref_count = 1;11541155newcode->tables = newtables;1156newcode->flags |= PCRE2_DEREF_TABLES;1157return newcode;1158}1159116011611162/*************************************************1163* Free compiled code *1164*************************************************/11651166PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION1167pcre2_code_free(pcre2_code *code)1168{1169PCRE2_SIZE* ref_count;11701171if (code != NULL)1172{1173#ifdef SUPPORT_JIT1174if (code->executable_jit != NULL)1175PRIV(jit_free)(code->executable_jit, &code->memctl);1176#endif11771178if ((code->flags & PCRE2_DEREF_TABLES) != 0)1179{1180/* Decoded tables belong to the codes after deserialization, and they must1181be freed when there are no more references to them. The *ref_count should1182always be > 0. 
*/11831184ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);1185if (*ref_count > 0)1186{1187(*ref_count)--;1188if (*ref_count == 0)1189code->memctl.free((void *)code->tables, code->memctl.memory_data);1190}1191}11921193code->memctl.free(code, code->memctl.memory_data);1194}1195}1196119711981199/*************************************************1200* Read a number, possibly signed *1201*************************************************/12021203/* This function is used to read numbers in the pattern. The initial pointer1204must be at the sign or first digit of the number. When relative values1205(introduced by + or -) are allowed, they are relative group numbers, and the1206result must be greater than zero.12071208Arguments:1209ptrptr points to the character pointer variable1210ptrend points to the end of the input string1211allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this1212max_value the largest number allowed;1213you must not pass a value for max_value larger than1214INT_MAX/10 - 1 because this function relies on max_value to1215avoid integer overflow1216max_error the error to give for an over-large number1217intptr where to put the result1218errcodeptr where to put an error code12191220Returns: TRUE - a number was read1221FALSE - errorcode == 0 => no number was found1222errorcode != 0 => an error occurred1223*/12241225static BOOL1226read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,1227uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)1228{1229int sign = 0;1230uint32_t n = 0;1231PCRE2_SPTR ptr = *ptrptr;1232BOOL yield = FALSE;12331234PCRE2_ASSERT(max_value <= INT_MAX/10 - 1);12351236*errorcodeptr = 0;12371238if (allow_sign >= 0 && ptr < ptrend)1239{1240if (*ptr == CHAR_PLUS)1241{1242sign = +1;1243max_value -= allow_sign;1244ptr++;1245}1246else if (*ptr == CHAR_MINUS)1247{1248sign = -1;1249ptr++;1250}1251}12521253if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;1254while (ptr < ptrend && 
IS_DIGIT(*ptr))1255{1256n = n * 10 + (*ptr++ - CHAR_0);1257if (n > max_value)1258{1259*errorcodeptr = max_error;1260while (ptr < ptrend && IS_DIGIT(*ptr)) ptr++;1261goto EXIT;1262}1263}12641265if (allow_sign >= 0 && sign != 0)1266{1267if (n == 0)1268{1269*errorcodeptr = ERR26; /* +0 and -0 are not allowed */1270goto EXIT;1271}12721273if (sign > 0) n += allow_sign;1274else if (n > (uint32_t)allow_sign)1275{1276*errorcodeptr = ERR15; /* Non-existent subpattern */1277goto EXIT;1278}1279else n = allow_sign + 1 - n;1280}12811282yield = TRUE;12831284EXIT:1285*intptr = n;1286*ptrptr = ptr;1287return yield;1288}1289129012911292/*************************************************1293* Read repeat counts *1294*************************************************/12951296/* Read an item of the form {n,m} and return the values when non-NULL pointers1297are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a1298larger value is used for "unlimited". We have to use signed arguments for1299read_number() because it is capable of returning a signed value. As of Perl13005.34.0 either n or m may be absent, but not both. 
Perl also allows spaces and1301tabs after { and before } and between the numbers and the comma, so we do too.13021303Arguments:1304ptrptr points to pointer to character after '{'1305ptrend pointer to end of input1306minp if not NULL, pointer to int for min1307maxp if not NULL, pointer to int for max1308errorcodeptr points to error code variable13091310Returns: FALSE if not a repeat quantifier, errorcode set zero1311FALSE on error, with errorcode set non-zero1312TRUE on success, with pointer updated to point after '}'1313*/13141315static BOOL1316read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,1317uint32_t *maxp, int *errorcodeptr)1318{1319PCRE2_SPTR p = *ptrptr;1320PCRE2_SPTR pp;1321BOOL yield = FALSE;1322BOOL had_minimum = FALSE;1323int32_t min = 0;1324int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */13251326*errorcodeptr = 0;1327while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;13281329/* Check the syntax before interpreting. Otherwise, a non-quantifier sequence1330such as "X{123456ABC" would incorrectly give a "number too big in quantifier"1331error. */13321333pp = p;1334if (pp < ptrend && IS_DIGIT(*pp))1335{1336had_minimum = TRUE;1337while (++pp < ptrend && IS_DIGIT(*pp)) {}1338}13391340while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;1341if (pp >= ptrend) return FALSE;13421343if (*pp == CHAR_RIGHT_CURLY_BRACKET)1344{1345if (!had_minimum) return FALSE;1346}1347else1348{1349if (*pp++ != CHAR_COMMA) return FALSE;1350while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;1351if (pp >= ptrend) return FALSE;1352if (IS_DIGIT(*pp))1353{1354while (++pp < ptrend && IS_DIGIT(*pp)) {}1355}1356else if (!had_minimum) return FALSE;1357while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;1358if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;1359}13601361/* Now process the quantifier for real. We know it must be {n} or {n,} or {,m}1362or {n,m}. 
The only error that read_number() can return is for a number that is1363too big. If *errorcodeptr is returned as zero it means no number was found. */13641365/* Deal with {,m} or n too big. If we successfully read m there is no need to1366check m >= n because n defaults to zero. */13671368if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))1369{1370if (*errorcodeptr != 0) goto EXIT; /* n too big */1371p++; /* Skip comma and subsequent spaces */1372while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;1373if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))1374{1375if (*errorcodeptr != 0) goto EXIT; /* m too big */1376}1377}13781379/* Have read one number. Deal with {n} or {n,} or {n,m} */13801381else1382{1383while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;1384if (*p == CHAR_RIGHT_CURLY_BRACKET)1385{1386max = min;1387}1388else /* Handle {n,} or {n,m} */1389{1390p++; /* Skip comma and subsequent spaces */1391while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;1392if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))1393{1394if (*errorcodeptr != 0) goto EXIT; /* m too big */1395}13961397if (max < min)1398{1399*errorcodeptr = ERR4;1400goto EXIT;1401}1402}1403}14041405/* Valid quantifier exists */14061407while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;1408p++;1409yield = TRUE;1410if (minp != NULL) *minp = (uint32_t)min;1411if (maxp != NULL) *maxp = (uint32_t)max;14121413/* Update the pattern pointer */14141415EXIT:1416*ptrptr = p;1417return yield;1418}1419142014211422/*************************************************1423* Handle escapes *1424*************************************************/14251426/* This function is called when a \ has been encountered. It either returns a1427positive value for a simple escape such as \d, or 0 for a data character, which1428is placed in chptr. A backreference to group n is returned as -(n+1). 
On1429entry, ptr is pointing at the character after \. On exit, it points after the1430final code unit of the escape sequence.14311432This function is also called from pcre2_substitute() to handle escape sequences1433in replacement strings. In this case, the cb argument is NULL, and in the case1434of escapes that have further processing, only sequences that define a data1435character are recognised. The options argument is the final value of the1436compiled pattern's options.14371438Arguments:1439ptrptr points to the input position pointer1440ptrend points to the end of the input1441chptr points to a returned data character1442errorcodeptr points to the errorcode variable (containing zero)1443options the current options bits1444xoptions the current extra options bits1445bracount the number of capturing parentheses encountered so far1446isclass TRUE if in a character class1447cb compile data block or NULL when called from pcre2_substitute()14481449Returns: zero => a data character1450positive => a special escape sequence1451negative => a numerical back reference1452on error, errorcodeptr is set non-zero1453*/14541455int1456PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,1457int *errorcodeptr, uint32_t options, uint32_t xoptions, uint32_t bracount,1458BOOL isclass, compile_block *cb)1459{1460BOOL utf = (options & PCRE2_UTF) != 0;1461BOOL alt_bsux =1462((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;1463PCRE2_SPTR ptr = *ptrptr;1464uint32_t c, cc;1465int escape = 0;1466int i;14671468/* If backslash is at the end of the string, it's an error. */14691470if (ptr >= ptrend)1471{1472*errorcodeptr = ERR1;1473return 0;1474}14751476GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */1477*errorcodeptr = 0; /* Be optimistic */14781479/* Non-alphanumerics are literals, so we just leave the value in c. An initial1480value test saves a memory lookup for code points outside the alphanumeric1481range. 
*/14821483if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */14841485/* Otherwise, do a table lookup. Non-zero values need little processing here. A1486positive value is a literal value for something like \n. A negative value is1487the negation of one of the ESC_ macros that is passed back for handling by the1488calling function. Some extra checking is needed for \N because only \N{U+dddd}1489is supported. If the value is zero, further processing is handled below. */14901491else if ((i = escapes[c - ESCAPES_FIRST]) != 0)1492{1493if (i > 0)1494{1495c = (uint32_t)i;1496if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)1497c = CHAR_LF;1498}1499else /* Negative table entry */1500{1501escape = -i; /* Else return a special escape */1502if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))1503cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */15041505/* Perl supports \N{name} for character names and \N{U+dddd} for numerical1506Unicode code points, as well as plain \N for "not newline". PCRE does not1507support \N{name}. However, it does support quantification such as \N{2,3},1508so if \N{ is not followed by U+dddd we check for a quantifier. */15091510if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)1511{1512PCRE2_SPTR p = ptr + 1;15131514/* Perl ignores spaces and tabs after { */15151516while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;15171518/* \N{U+ can be handled by the \x{ code. However, this construction is1519not valid in EBCDIC environments because it specifies a Unicode1520character, not a codepoint in the local code. For example \N{U+0041}1521must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode1522casing semantics for the entire pattern, so allow it only in UTF (i.e.1523Unicode) mode. 
*/15241525if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)1526{1527#ifndef EBCDIC1528if (utf)1529{1530ptr = p + 2;1531escape = 0; /* Not a fancy escape after all */1532goto COME_FROM_NU;1533}1534#endif1535*errorcodeptr = ERR93;1536}15371538/* Give an error in contexts where quantifiers are not allowed1539(character classes; substitution strings). */15401541else if (isclass || cb == NULL)1542{1543*errorcodeptr = ERR37;1544}15451546/* Give an error if what follows is not a quantifier, but don't override1547an error set by the quantifier reader (e.g. number overflow). */15481549else1550{1551if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&1552*errorcodeptr == 0)1553*errorcodeptr = ERR37;1554}1555}1556}1557}15581559/* Escapes that need further processing, including those that are unknown, have1560a zero entry in the lookup table. When called from pcre2_substitute(), only \c,1561\o, and \x are recognized (\u and \U can never appear as they are used for case1562forcing). */15631564else1565{1566int s;1567PCRE2_SPTR oldptr;1568BOOL overflow;15691570/* Filter calls from pcre2_substitute(). */15711572if (cb == NULL)1573{1574if (c < CHAR_0 ||1575(c > CHAR_9 && (c != CHAR_c && c != CHAR_o && c != CHAR_x && c != CHAR_g)))1576{1577*errorcodeptr = ERR3;1578return 0;1579}1580alt_bsux = FALSE; /* Do not modify \x handling */1581}15821583switch (c)1584{1585/* A number of Perl escapes are not handled by PCRE. We give an explicit1586error. */15871588case CHAR_F:1589case CHAR_l:1590case CHAR_L:1591*errorcodeptr = ERR37;1592break;15931594/* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX1595is set. Otherwise, \u must be followed by exactly four hex digits or, if1596PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.1597Otherwise it is a lowercase u letter. This gives some compatibility with1598ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT1599allowed. 
When \u{ is not followed by hex digits, a special return is given1600because otherwise \u{ 12} (for example) would be treated as u{12}. */16011602case CHAR_u:1603if (!alt_bsux) *errorcodeptr = ERR37; else1604{1605uint32_t xc;16061607if (ptr >= ptrend) break;1608if (*ptr == CHAR_LEFT_CURLY_BRACKET &&1609(xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)1610{1611PCRE2_SPTR hptr = ptr + 1;16121613cc = 0;1614while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)1615{1616if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */1617{1618*errorcodeptr = ERR77;1619ptr = hptr; /* Show where */1620break; /* *hptr != } will cause another break below */1621}1622cc = (cc << 4) | xc;1623hptr++;1624}16251626if (hptr == ptr + 1 || /* No hex digits */1627hptr >= ptrend || /* Hit end of input */1628*hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */1629{1630if (isclass) break; /* In a class, just treat as '\u' literal */1631escape = ESC_ub; /* Special return */1632ptr++; /* Skip { */1633break; /* Hex escape not recognized */1634}16351636c = cc; /* Accept the code point */1637ptr = hptr + 1;1638}16391640else /* Must be exactly 4 hex digits */1641{1642if (ptrend - ptr < 4) break; /* Less than 4 chars */1643if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */1644if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */1645cc = (cc << 4) | xc;1646if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */1647cc = (cc << 4) | xc;1648if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */1649c = (cc << 4) | xc;1650ptr += 4;1651}16521653if (utf)1654{1655if (c > 0x10ffffU) *errorcodeptr = ERR77;1656else1657if (c >= 0xd800 && c <= 0xdfff &&1658(xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)1659*errorcodeptr = ERR73;1660}1661else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;1662}1663break;16641665/* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,1666in which case it is an upper case letter. 
*/16671668case CHAR_U:1669if (!alt_bsux) *errorcodeptr = ERR37;1670break;16711672/* In a character class, \g is just a literal "g". Outside a character1673class, \g must be followed by one of a number of specific things:16741675(1) A number, either plain or braced. If positive, it is an absolute1676backreference. If negative, it is a relative backreference. This is a Perl16775.10 feature.16781679(2) Perl 5.10 also supports \g{name} as a reference to a named group. This1680is part of Perl's movement towards a unified syntax for back references. As1681this is synonymous with \k{name}, we fudge it up by pretending it really1682was \k{name}.16831684(3) For Oniguruma compatibility we also support \g followed by a name or a1685number either in angle brackets or in single quotes. However, these are1686(possibly recursive) subroutine calls, _not_ backreferences. We return1687the ESC_g code.16881689Summary: Return a negative number for a numerical back reference (offset1690by 1), ESC_k for a named back reference, and ESC_g for a named or1691numbered subroutine call.16921693The above describes the \g behaviour inside patterns. Inside replacement1694strings (pcre2_substitute) we support only \g<nameornum> for Python1695compatibility. 
Return ESG_g for the named case, and -(num+1) for the1696numbered case.1697*/16981699case CHAR_g:1700if (isclass) break;17011702if (ptr >= ptrend)1703{1704*errorcodeptr = ERR57;1705break;1706}17071708if (cb == NULL)1709{1710PCRE2_SPTR p;1711/* Substitution strings */1712if (*ptr != CHAR_LESS_THAN_SIGN)1713{1714*errorcodeptr = ERR57;1715break;1716}17171718p = ptr + 1;17191720if (!read_number(&p, ptrend, -1, MAX_GROUP_NUMBER, ERR61, &s,1721errorcodeptr))1722{1723if (*errorcodeptr == 0) escape = ESC_g; /* No number found */1724break;1725}17261727if (p >= ptrend || *p != CHAR_GREATER_THAN_SIGN)1728{1729/* not advancing ptr; report error at the \g character */1730*errorcodeptr = ERR57;1731break;1732}17331734/* This is the reason that back references are returned as -(s+1) rather1735than just -s. In a pattern, \0 is not a back reference, but \g<0> is1736valid in a substitution string, so this must be representable. */1737ptr = p + 1;1738escape = -(s+1);1739break;1740}17411742if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)1743{1744escape = ESC_g;1745break;1746}17471748/* If there is a brace delimiter, try to read a numerical reference. If1749there isn't one, assume we have a name and treat it as \k. 
*/17501751if (*ptr == CHAR_LEFT_CURLY_BRACKET)1752{1753PCRE2_SPTR p = ptr + 1;17541755while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;1756if (!read_number(&p, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,1757errorcodeptr))1758{1759if (*errorcodeptr == 0) escape = ESC_k; /* No number found */1760break;1761}1762while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;17631764if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)1765{1766/* not advancing ptr; report error at the \g character */1767*errorcodeptr = ERR57;1768break;1769}1770ptr = p + 1;1771}17721773/* Read an undelimited number */17741775else1776{1777if (!read_number(&ptr, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,1778errorcodeptr))1779{1780if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */1781break;1782}1783}17841785if (s <= 0)1786{1787*errorcodeptr = ERR15;1788break;1789}17901791escape = -(s+1);1792break;17931794/* The handling of escape sequences consisting of a string of digits1795starting with one that is not zero is not straightforward. Perl has changed1796over the years. Nowadays \g{} for backreferences and \o{} for octal are1797recommended to avoid the ambiguities in the old syntax.17981799Outside a character class, the digits are read as a decimal number. If the1800number is less than 10, or if there are that many previous extracting left1801brackets, it is a back reference. Otherwise, up to three octal digits are1802read to form an escaped character code. Thus \123 is likely to be octal 1231803(cf \0123, which is octal 012 followed by the literal 3). This is the "Perl1804style" of handling ambiguous octal/backrefences such as \12.18051806There is an alternative disambiguation strategy, selected by1807PCRE2_EXTRA_PYTHON_OCTAL, which follows Python's behaviour. An octal must1808have either a leading zero, or exactly three octal digits; otherwise it's1809a backreference. 
The disambiguation is stable, and does not depend on how1810many capture groups are defined (it's simply an invalid backreference if1811there is no corresponding capture group). Additionally, octal values above1812\377 (\xff) are rejected.18131814Inside a character class, \ followed by a digit is always either a literal18158 or 9 or an octal number. */18161817case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:1818case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:18191820if (isclass)1821{1822/* Fall through to octal handling; never a backreference inside a class. */1823}1824else if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0)1825{1826/* Python-style disambiguation. */1827if (ptr[-1] <= CHAR_7 && ptr + 1 < ptrend && ptr[0] >= CHAR_0 &&1828ptr[0] <= CHAR_7 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)1829{1830/* We peeked a three-digit octal, so fall through */1831}1832else1833{1834/* We are at a digit, so the only possible error from read_number() is1835a number that is too large. */1836ptr--; /* Back to the digit */18371838if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))1839{1840*errorcodeptr = ERR61;1841break;1842}18431844escape = -(s+1);1845break;1846}1847}1848else1849{1850/* Perl-style disambiguation. */1851oldptr = ptr;1852ptr--; /* Back to the digit */18531854/* As we know we are at a digit, the only possible error from1855read_number() is a number that is too large to be a group number. Because1856that number might be still valid if read as an octal, errorcodeptr is not1857set on failure and therefore a sentinel value of INT_MAX is used instead1858of the original value, and will be used later to properly set the error,1859if not falling through. */18601861if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))1862s = INT_MAX;18631864/* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x1865are octal escapes if there are not that many previous captures. 
*/18661867if (s < 10 || c >= CHAR_8 || (unsigned)s <= bracount)1868{1869/* s > MAX_GROUP_NUMBER should not be possible because of read_number(),1870but we keep it just to be safe and because it will also catch the1871sentinel value that was set on failure by that function. */18721873if ((unsigned)s > MAX_GROUP_NUMBER)1874{1875PCRE2_ASSERT(s == INT_MAX);1876*errorcodeptr = ERR61;1877}1878else escape = -(s+1); /* Indicates a back reference */1879break;1880}18811882ptr = oldptr; /* Put the pointer back and fall through */1883}18841885/* Handle a digit following \ when the number is not a back reference, or1886we are within a character class. If the first digit is 8 or 9, Perl used to1887generate a binary zero and then treat the digit as a following literal. At1888least by Perl 5.18 this changed so as not to insert the binary zero. */18891890if (c >= CHAR_8) break;18911892/* Fall through */18931894/* \0 always starts an octal number, but we may drop through to here with a1895larger first octal digit. The original code used just to take the least1896significant 8 bits of octal numbers (I think this is what early Perls used1897to do). Nowadays we allow for larger numbers in UTF-8 mode and 16/32-bit mode,1898but no more than 3 octal digits. */18991900case CHAR_0:1901c -= CHAR_0;1902while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)1903c = c * 8 + *ptr++ - CHAR_0;1904if (c > 0xff)1905{1906if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0) *errorcodeptr = ERR102;1907#if PCRE2_CODE_UNIT_WIDTH == 81908else if (!utf) *errorcodeptr = ERR51;1909#endif1910}19111912/* PCRE2_EXTRA_NO_BS0 disables the NUL escape '\0' but doesn't affect1913two- or three-character octal escapes \00 and \000, nor \x00. */19141915if ((xoptions & PCRE2_EXTRA_NO_BS0) != 0 && c == 0 && i == 1)1916*errorcodeptr = ERR98;1917break;19181919/* \o is a relatively new Perl feature, supporting a more general way of1920specifying character codes in octal. 
The only supported form is \o{ddd},1921with optional spaces or tabs after { and before }. */19221923case CHAR_o:1924if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)1925{1926ptr--;1927*errorcodeptr = ERR55;1928break;1929}19301931while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;1932if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)1933{1934*errorcodeptr = ERR78;1935break;1936}19371938c = 0;1939overflow = FALSE;1940while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)1941{1942cc = *ptr++;1943if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */1944#if PCRE2_CODE_UNIT_WIDTH == 321945if (c >= 0x20000000u) { overflow = TRUE; break; }1946#endif1947c = (c << 3) + (cc - CHAR_0);1948#if PCRE2_CODE_UNIT_WIDTH == 81949if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }1950#elif PCRE2_CODE_UNIT_WIDTH == 161951if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }1952#elif PCRE2_CODE_UNIT_WIDTH == 321953if (utf && c > 0x10ffffU) { overflow = TRUE; break; }1954#endif1955}19561957while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;19581959if (overflow)1960{1961while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;1962*errorcodeptr = ERR34;1963}1964else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)1965{1966if (utf && c >= 0xd800 && c <= 0xdfff &&1967(xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)1968{1969ptr--;1970*errorcodeptr = ERR73;1971}1972}1973else1974{1975ptr--;1976*errorcodeptr = ERR64;1977}1978break;19791980/* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed1981by two hexadecimal digits. Otherwise it is a lowercase x letter. 
*/19821983case CHAR_x:1984if (alt_bsux)1985{1986uint32_t xc;1987if (ptrend - ptr < 2) break; /* Less than 2 characters */1988if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */1989if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */1990c = (cc << 4) | xc;1991ptr += 2;1992}19931994/* Handle \x in Perl's style. \x{ddd} is a character code which can be1995greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex1996digits. If not, { used to be treated as a data character. However, Perl1997seems to read hex digits up to the first non-such, and ignore the rest, so1998that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE1999now gives an error. */20002001else2002{2003if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)2004{2005ptr++;2006while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;20072008#ifndef EBCDIC2009COME_FROM_NU:2010#endif2011if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)2012{2013*errorcodeptr = ERR78;2014break;2015}2016c = 0;2017overflow = FALSE;20182019while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)2020{2021ptr++;2022if (c == 0 && cc == 0) continue; /* Leading zeroes */2023#if PCRE2_CODE_UNIT_WIDTH == 322024if (c >= 0x10000000l) { overflow = TRUE; break; }2025#endif2026c = (c << 4) | cc;2027if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))2028{2029overflow = TRUE;2030break;2031}2032}20332034/* Perl ignores spaces and tabs before } */20352036while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;20372038/* On overflow, skip remaining hex digits */20392040if (overflow)2041{2042while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;2043*errorcodeptr = ERR34;2044}2045else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)2046{2047if (utf && c >= 0xd800 && c <= 0xdfff &&2048(xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)2049{2050ptr--;2051*errorcodeptr = ERR73;2052}2053}20542055/* If the sequence of hex digits (followed by optional space) 
does not2056end with '}', give an error. We used just to recognize this construct2057and fall through to the normal \x handling, but nowadays Perl gives an2058error, which seems much more sensible, so we do too. */20592060else2061{2062ptr--;2063*errorcodeptr = ERR67;2064}2065} /* End of \x{} processing */20662067/* Read a up to two hex digits after \x */20682069else2070{2071/* Perl has the surprising/broken behaviour that \x without following2072hex digits is treated as an escape for NUL. Their source code laments2073this but keeps it for backwards compatibility. A warning is printed2074when "use warnings" is enabled. Because we don't have warnings, we2075simply forbid it. */2076if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff)2077{2078/* Not a hex digit */2079*errorcodeptr = ERR78;2080break;2081}2082ptr++;2083c = cc;20842085/* With "use re 'strict'" Perl actually requires exactly two digits (error2086for \x, \xA and \xAAA). While \x was already rejected, this seems overly2087strict, and there seems little incentive to align with that, given the2088backwards-compatibility cost.20892090For comparison, note that other engines disagree. For example:2091- Java allows 1 or 2 hex digits. Error if 0 digits. No error if >2 digits2092- .NET requires 2 hex digits. Error if 0, 1 digits. No error if >2 digits.2093*/2094if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */2095ptr++;2096c = (c << 4) | cc;2097} /* End of \xdd handling */2098} /* End of Perl-style \x handling */2099break;21002101/* The handling of \c is different in ASCII and EBCDIC environments. In an2102ASCII (or Unicode) environment, an error is given if the character2103following \c is not a printable ASCII character. Otherwise, the following2104character is upper-cased if it is a letter, and after that the 0x40 bit is2105flipped. The result is the value of the escape.21062107In an EBCDIC environment the handling of \c is compatible with the2108specification in the perlebcdic document. 
The following character must be2109a letter or one of small number of special characters. These provide a2110means of defining the character values 0-31.21112112For testing the EBCDIC handling of \c in an ASCII environment, recognize2113the EBCDIC value of 'c' explicitly. */21142115#if defined EBCDIC && 'a' != 0x812116case 0x83:2117#else2118case CHAR_c:2119#endif2120if (ptr >= ptrend)2121{2122*errorcodeptr = ERR2;2123break;2124}2125c = *ptr;2126if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);21272128/* Handle \c in an ASCII/Unicode environment. */21292130#ifndef EBCDIC /* ASCII/UTF-8 coding */2131if (c < 32 || c > 126) /* Excludes all non-printable ASCII */2132{2133*errorcodeptr = ERR68;2134break;2135}2136c ^= 0x40;21372138/* Handle \c in an EBCDIC environment. The special case \c? is converted to2139255 (0xff) or 95 (0x5f) if other characters suggest we are using the2140POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)2141The other valid sequences correspond to a list of specific characters. */21422143#else2144if (c == CHAR_QUESTION_MARK)2145c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;2146else2147{2148for (i = 0; i < 32; i++)2149{2150if (c == ebcdic_escape_c[i]) break;2151}2152if (i < 32) c = i; else *errorcodeptr = ERR68;2153}2154#endif /* EBCDIC */21552156ptr++;2157break;21582159/* Any other alphanumeric following \ is an error. Perl gives an error only2160if in warning mode, but PCRE doesn't have a warning mode. */21612162default:2163*errorcodeptr = ERR3;2164*ptrptr = ptr - 1; /* Point to the character at fault */2165return 0;2166}2167}21682169/* Set the pointer to the next character before returning. 
*/21702171*ptrptr = ptr;2172*chptr = c;2173return escape;2174}2175217621772178#ifdef SUPPORT_UNICODE2179/*************************************************2180* Handle \P and \p *2181*************************************************/21822183/* This function is called after \P or \p has been encountered, provided that2184PCRE2 is compiled with support for UTF and Unicode properties. On entry, the2185contents of ptrptr are pointing after the P or p. On exit, it is left pointing2186after the final code unit of the escape sequence.21872188Arguments:2189ptrptr the pattern position pointer2190negptr a boolean that is set TRUE for negation else FALSE2191ptypeptr an unsigned int that is set to the type value2192pdataptr an unsigned int that is set to the detailed property value2193errorcodeptr the error code variable2194cb the compile data21952196Returns: TRUE if the type value was found, or FALSE for an invalid type2197*/21982199static BOOL2200get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,2201uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)2202{2203PCRE2_UCHAR c;2204PCRE2_SIZE i, bot, top;2205PCRE2_SPTR ptr = *ptrptr;2206PCRE2_UCHAR name[50];2207PCRE2_UCHAR *vptr = NULL;2208uint16_t ptscript = PT_NOTSCRIPT;22092210if (ptr >= cb->end_pattern) goto ERROR_RETURN;2211c = *ptr++;2212*negptr = FALSE;22132214/* \P or \p can be followed by a name in {}, optionally preceded by ^ for2215negation. We must be handling Unicode encoding here, though we may be compiling2216for UTF-8 input in an EBCDIC environment. (PCRE2 does not support both EBCDIC2217input and Unicode input in the same build.) In accordance with Unicode's "loose2218matching" rules, ASCII white space, hyphens, and underscores are ignored. We2219don't use isspace() or tolower() because (a) code points may be greater than2220255, and (b) they wouldn't work when compiling for Unicode in an EBCDIC2221environment. 
*/22222223if (c == CHAR_LEFT_CURLY_BRACKET)2224{2225if (ptr >= cb->end_pattern) goto ERROR_RETURN;22262227for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)2228{2229REDO:22302231if (ptr >= cb->end_pattern) goto ERROR_RETURN;2232c = *ptr++;22332234/* Skip ignorable Unicode characters. */22352236while (c == CHAR_UNDERSCORE || c == CHAR_MINUS || c == CHAR_SPACE ||2237(c >= CHAR_HT && c <= CHAR_CR))2238{2239if (ptr >= cb->end_pattern) goto ERROR_RETURN;2240c = *ptr++;2241}22422243/* The first significant character being circumflex negates the meaning of2244the item. */22452246if (i == 0 && !*negptr && c == CHAR_CIRCUMFLEX_ACCENT)2247{2248*negptr = TRUE;2249goto REDO;2250}22512252if (c == CHAR_RIGHT_CURLY_BRACKET) break;22532254/* Names consist of ASCII letters and digits, but equals and colon may also2255occur as a name/value separator. We must also allow for \p{L&}. A simple2256check for a value between '&' and 'z' suffices because anything else in a2257name or value will cause an "unknown property" error anyway. */22582259if (c < CHAR_AMPERSAND || c > CHAR_z) goto ERROR_RETURN;22602261/* Lower case a capital letter or remember where the name/value separator2262is. */22632264if (c >= CHAR_A && c <= CHAR_Z) c |= 0x20;2265else if ((c == CHAR_COLON || c == CHAR_EQUALS_SIGN) && vptr == NULL)2266vptr = name + i;22672268name[i] = c;2269}22702271/* Error if the loop didn't end with '}' - either we hit the end of the2272pattern or the name was longer than any legal property name. */22732274if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;2275name[i] = 0;2276}22772278/* If { doesn't follow \p or \P there is just one following character, which2279must be an ASCII letter. 
*/22802281else if (c >= CHAR_A && c <= CHAR_Z)2282{2283name[0] = c | 0x20; /* Lower case */2284name[1] = 0;2285}2286else if (c >= CHAR_a && c <= CHAR_z)2287{2288name[0] = c;2289name[1] = 0;2290}2291else goto ERROR_RETURN;22922293*ptrptr = ptr; /* Update pattern pointer */22942295/* If the property contains ':' or '=' we have class name and value separately2296specified. The following are supported:22972298. Bidi_Class (synonym bc), for which the property names are "bidi<name>".2299. Script (synonym sc) for which the property name is the script name2300. Script_Extensions (synonym scx), ditto23012302As this is a small number, we currently just check the names directly. If this2303grows, a sorted table and a switch will be neater.23042305For both the script properties, set a PT_xxx value so that (1) they can be2306distinguished and (2) invalid script names that happen to be the name of2307another property can be diagnosed. */23082309if (vptr != NULL)2310{2311int offset = 0;2312PCRE2_UCHAR sname[8];23132314*vptr = 0; /* Terminate property name */2315if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||2316PRIV(strcmp_c8)(name, STRING_bc) == 0)2317{2318offset = 4;2319sname[0] = CHAR_b;2320sname[1] = CHAR_i; /* There is no strcpy_c8 function */2321sname[2] = CHAR_d;2322sname[3] = CHAR_i;2323}23242325else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||2326PRIV(strcmp_c8)(name, STRING_sc) == 0)2327ptscript = PT_SC;23282329else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||2330PRIV(strcmp_c8)(name, STRING_scx) == 0)2331ptscript = PT_SCX;23322333else2334{2335*errorcodeptr = ERR47;2336return FALSE;2337}23382339/* Adjust the string in name[] as needed */23402341memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));2342if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));2343}23442345/* Search for a recognized property using binary chop. 
*/23462347bot = 0;2348top = PRIV(utt_size);23492350while (bot < top)2351{2352int r;2353i = (bot + top) >> 1;2354r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);23552356/* When a matching property is found, some extra checking is needed when the2357\p{xx:yy} syntax is used and xx is either sc or scx. */23582359if (r == 0)2360{2361*pdataptr = PRIV(utt)[i].value;2362if (vptr == NULL || ptscript == PT_NOTSCRIPT)2363{2364*ptypeptr = PRIV(utt)[i].type;2365return TRUE;2366}23672368switch (PRIV(utt)[i].type)2369{2370case PT_SC:2371*ptypeptr = PT_SC;2372return TRUE;23732374case PT_SCX:2375*ptypeptr = ptscript;2376return TRUE;2377}23782379break; /* Non-script found */2380}23812382if (r > 0) bot = i + 1; else top = i;2383}23842385*errorcodeptr = ERR47; /* Unrecognized property */2386return FALSE;23872388ERROR_RETURN: /* Malformed \P or \p */2389*errorcodeptr = ERR46;2390*ptrptr = ptr;2391return FALSE;2392}2393#endif2394239523962397/*************************************************2398* Check for POSIX class syntax *2399*************************************************/24002401/* This function is called when the sequence "[:" or "[." or "[=" is2402encountered in a character class. It checks whether this is followed by a2403sequence of characters terminated by a matching ":]" or ".]" or "=]". If we2404reach an unescaped ']' without the special preceding character, return FALSE.24052406Originally, this function only recognized a sequence of letters between the2407terminators, but it seems that Perl recognizes any sequence of characters,2408though of course unknown POSIX names are subsequently rejected. Perl gives an2409"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE2410didn't consider this to be a POSIX class. Likewise for [:1234:].24112412The problem in trying to be exactly like Perl is in the handling of escapes. 
We2413have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX2414class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code2415below handles the special cases \\ and \], but does not try to do any other2416escape processing. This makes it different from Perl for cases such as2417[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does2418not recognize "l\ower". This is a lesser evil than not diagnosing bad classes2419when Perl does, I think.24202421A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.2422It seems that the appearance of a nested POSIX class supersedes an apparent2423external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or2424a digit. This is handled by returning FALSE if the start of a new group with2425the same terminator is encountered, since the next closing sequence must close2426the nested group, not the outer one.24272428In Perl, unescaped square brackets may also appear as part of class names. For2429example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for2430[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not2431seem right at all. PCRE does not allow closing square brackets in POSIX class2432names.24332434Arguments:2435ptr pointer to the character after the initial [ (colon, dot, equals)2436ptrend pointer to the end of the pattern2437endptr where to return a pointer to the terminating ':', '.', or '='24382439Returns: TRUE or FALSE2440*/24412442static BOOL2443check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)2444{2445PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */2446terminator = *ptr++; /* compiler warns about "non-constant" initializer. 
*/24472448for (; ptrend - ptr >= 2; ptr++)2449{2450if (*ptr == CHAR_BACKSLASH &&2451(ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))2452ptr++;24532454else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||2455*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;24562457else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)2458{2459*endptr = ptr;2460return TRUE;2461}2462}24632464return FALSE;2465}2466246724682469/*************************************************2470* Check POSIX class name *2471*************************************************/24722473/* This function is called to check the name given in a POSIX-style class entry2474such as [:alnum:].24752476Arguments:2477ptr points to the first letter2478len the length of the name24792480Returns: a value representing the name, or -1 if unknown2481*/24822483static int2484check_posix_name(PCRE2_SPTR ptr, int len)2485{2486const char *pn = posix_names;2487int yield = 0;2488while (posix_name_lengths[yield] != 0)2489{2490if (len == posix_name_lengths[yield] &&2491PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;2492pn += posix_name_lengths[yield] + 1;2493yield++;2494}2495return -1;2496}2497249824992500/*************************************************2501* Read a subpattern or VERB name *2502*************************************************/25032504/* This function is called from parse_regex() below whenever it needs to read2505the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial2506pointer must be to the preceding character. If that character is '*' we are2507reading a verb or alpha assertion name. The pointer is updated to point after2508the name, for a VERB or alpha assertion name, or after tha name's terminator2509for a subpattern name. Returning both the offset and the name pointer is2510redundant information, but some callers use one and some the other, so it is2511simplest just to return both. 
When the name is in braces, spaces and tabs are2512allowed (and ignored) at either end.25132514Arguments:2515ptrptr points to the character pointer variable2516ptrend points to the end of the input string2517utf true if the input is UTF-encoded2518terminator the terminator of a subpattern name must be this2519offsetptr where to put the offset from the start of the pattern2520nameptr where to put a pointer to the name in the input2521namelenptr where to put the length of the name2522errcodeptr where to put an error code2523cb pointer to the compile data block25242525Returns: TRUE if a name was read2526FALSE otherwise, with error code set2527*/25282529static BOOL2530read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,2531PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,2532int *errorcodeptr, compile_block *cb)2533{2534PCRE2_SPTR ptr = *ptrptr;2535BOOL is_group = (*ptr++ != CHAR_ASTERISK);2536BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;25372538if (is_braced)2539while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;25402541if (ptr >= ptrend) /* No characters in name */2542{2543*errorcodeptr = is_group? ERR62: /* Subpattern name expected */2544ERR60; /* Verb not recognized or malformed */2545goto FAILED;2546}25472548*nameptr = ptr;2549*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);25502551/* If this logic were ever to change, the matching function in pcre2_substitute.c2552ought to be updated to match. */25532554/* In UTF mode, a group name may contain letters and decimal digits as defined2555by Unicode properties, and underscores, but must not start with a digit. 
*/25562557#ifdef SUPPORT_UNICODE2558if (utf && is_group)2559{2560uint32_t c, type;25612562GETCHAR(c, ptr);2563type = UCD_CHARTYPE(c);25642565if (type == ucp_Nd)2566{2567*errorcodeptr = ERR44;2568goto FAILED;2569}25702571for(;;)2572{2573if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&2574c != CHAR_UNDERSCORE) break;2575ptr++;2576FORWARDCHARTEST(ptr, ptrend);2577if (ptr >= ptrend) break;2578GETCHAR(c, ptr);2579type = UCD_CHARTYPE(c);2580}2581}2582else2583#else2584(void)utf; /* Avoid compiler warning */2585#endif /* SUPPORT_UNICODE */25862587/* Handle non-group names and group names in non-UTF modes. A group name must2588not start with a digit. If either of the others start with a digit it just2589won't be recognized. */25902591{2592if (is_group && IS_DIGIT(*ptr))2593{2594*errorcodeptr = ERR44;2595goto FAILED;2596}25972598while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)2599{2600ptr++;2601}2602}26032604/* Check name length */26052606if (ptr > *nameptr + MAX_NAME_SIZE)2607{2608*errorcodeptr = ERR48;2609goto FAILED;2610}2611*namelenptr = (uint32_t)(ptr - *nameptr);26122613/* Subpattern names must not be empty, and their terminator is checked here.2614(What follows a verb or alpha assertion name is checked separately.) 
*/26152616if (is_group)2617{2618if (ptr == *nameptr)2619{2620*errorcodeptr = ERR62; /* Subpattern name expected */2621goto FAILED;2622}2623if (is_braced)2624while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;2625if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)2626{2627*errorcodeptr = ERR42;2628goto FAILED;2629}2630ptr++;2631}26322633*ptrptr = ptr;2634return TRUE;26352636FAILED:2637*ptrptr = ptr;2638return FALSE;2639}2640264126422643/*************************************************2644* Manage callouts at start of cycle *2645*************************************************/26462647/* At the start of a new item in parse_regex() we are able to record the2648details of the previous item in a prior callout, and also to set up an2649automatic callout if enabled. Avoid having two adjacent automatic callouts,2650which would otherwise happen for items such as \Q that contribute nothing to2651the parsed pattern.26522653Arguments:2654ptr current pattern pointer2655pcalloutptr points to a pointer to previous callout, or NULL2656auto_callout TRUE if auto_callouts are enabled2657parsed_pattern the parsed pattern pointer2658cb compile block26592660Returns: possibly updated parsed_pattern pointer.2661*/26622663static uint32_t *2664manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,2665uint32_t *parsed_pattern, compile_block *cb)2666{2667uint32_t *previous_callout = *pcalloutptr;26682669if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -2670cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);26712672if (!auto_callout) previous_callout = NULL; else2673{2674if (previous_callout == NULL ||2675previous_callout != parsed_pattern - 4 ||2676previous_callout[3] != 255)2677{2678previous_callout = parsed_pattern; /* Set up new automatic callout */2679parsed_pattern += 4;2680previous_callout[0] = META_CALLOUT_NUMBER;2681previous_callout[2] = 0;2682previous_callout[3] = 255;2683}2684previous_callout[1] = (uint32_t)(ptr - 
cb->start_pattern);2685}26862687*pcalloutptr = previous_callout;2688return parsed_pattern;2689}2690269126922693/*************************************************2694* Handle \d, \D, \s, \S, \w, \W *2695*************************************************/26962697/* This function is called from parse_regex() below, both for freestanding2698escapes, and those within classes, to handle those escapes that may change when2699Unicode property support is requested. Note that PCRE2_UCP will never be set2700without Unicode support because that is checked when pcre2_compile() is called.27012702Arguments:2703escape the ESC_... value2704parsed_pattern where to add the code2705options options bits2706xoptions extra options bits27072708Returns: updated value of parsed_pattern2709*/2710static uint32_t *2711handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,2712uint32_t xoptions)2713{2714uint32_t ascii_option = 0;2715uint32_t prop = ESC_p;27162717switch(escape)2718{2719case ESC_D:2720prop = ESC_P;2721/* Fall through */2722case ESC_d:2723ascii_option = PCRE2_EXTRA_ASCII_BSD;2724break;27252726case ESC_S:2727prop = ESC_P;2728/* Fall through */2729case ESC_s:2730ascii_option = PCRE2_EXTRA_ASCII_BSS;2731break;27322733case ESC_W:2734prop = ESC_P;2735/* Fall through */2736case ESC_w:2737ascii_option = PCRE2_EXTRA_ASCII_BSW;2738break;2739}27402741if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)2742{2743*parsed_pattern++ = META_ESCAPE + escape;2744}2745else2746{2747*parsed_pattern++ = META_ESCAPE + prop;2748switch(escape)2749{2750case ESC_d:2751case ESC_D:2752*parsed_pattern++ = (PT_PC << 16) | ucp_Nd;2753break;27542755case ESC_s:2756case ESC_S:2757*parsed_pattern++ = PT_SPACE << 16;2758break;27592760case ESC_w:2761case ESC_W:2762*parsed_pattern++ = PT_WORD << 16;2763break;2764}2765}27662767return parsed_pattern;2768}2769277027712772/*************************************************2773* Maximum size of parsed_pattern for given input 
*2774*************************************************/27752776/* This function is called from parse_regex() below, to determine the amount2777of memory to allocate for parsed_pattern. It is also called to check whether2778the amount of data written respects the amount of memory allocated.27792780Arguments:2781ptr points to the start of the pattern2782ptrend points to the end of the pattern2783utf TRUE in UTF mode2784options the options bits27852786Returns: the number of uint32_t units for parsed_pattern2787*/2788static ptrdiff_t2789max_parsed_pattern(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, BOOL utf,2790uint32_t options)2791{2792PCRE2_SIZE big32count = 0;2793ptrdiff_t parsed_size_needed;27942795/* When PCRE2_AUTO_CALLOUT is not set, in all but one case the number of2796unsigned 32-bit ints written out to the parsed pattern is bounded by the length2797of the pattern. The exceptional case is when running in 32-bit, non-UTF mode,2798when literal characters greater than META_END (0x80000000) have to be coded as2799two units. In this case, therefore, we scan the pattern to check for such2800values. */28012802#if PCRE2_CODE_UNIT_WIDTH == 322803if (!utf)2804{2805PCRE2_SPTR p;2806for (p = ptr; p < ptrend; p++) if (*p >= META_END) big32count++;2807}2808#else2809(void)utf; /* Avoid compiler warning */2810#endif28112812parsed_size_needed = (ptrend - ptr) + big32count;28132814/* When PCRE2_AUTO_CALLOUT is set we have to assume a numerical callout (42815elements) for each character. This is overkill, but memory is plentiful these2816days. */28172818if ((options & PCRE2_AUTO_CALLOUT) != 0)2819parsed_size_needed += (ptrend - ptr) * 4;28202821return parsed_size_needed;2822}2823282428252826/*************************************************2827* Parse regex and identify named groups *2828*************************************************/28292830/* This function is called first of all. 
It scans the pattern and does two2831things: (1) It identifies capturing groups and makes a table of named capturing2832groups so that information about them is fully available to both the compiling2833scans. (2) It writes a parsed version of the pattern with comments omitted and2834escapes processed into the parsed_pattern vector.28352836Arguments:2837ptr points to the start of the pattern2838options compiling dynamic options (may change during the scan)2839has_lookbehind points to a boolean, set TRUE if a lookbehind is found2840cb pointer to the compile data block28412842Returns: zero on success or a non-zero error code, with the2843error offset placed in the cb field2844*/28452846/* A structure and some flags for dealing with nested groups. */28472848typedef struct nest_save {2849uint16_t nest_depth;2850uint16_t reset_group;2851uint16_t max_group;2852uint16_t flags;2853uint32_t options;2854uint32_t xoptions;2855} nest_save;28562857#define NSF_RESET 0x0001u2858#define NSF_CONDASSERT 0x0002u2859#define NSF_ATOMICSR 0x0004u28602861/* Options that are changeable within the pattern must be tracked during2862parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,2863but all must be tracked so that META_OPTIONS items set the correct values for2864the main compiling phase. */28652866#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \2867PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \2868PCRE2_UNGREEDY)28692870#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \2871PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \2872PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX)28732874/* States used for analyzing ranges in character classes. The two OK values2875must be last. 
*/28762877enum {2878RANGE_NO, /* State after '[' (initial), or '[a-z'; hyphen is literal */2879RANGE_STARTED, /* State after '[1-'; last-emitted code is META_RANGE_XYZ */2880RANGE_FORBID_NO, /* State after '[\d'; '-]' is allowed but not '-1]' */2881RANGE_FORBID_STARTED, /* State after '[\d-'*/2882RANGE_OK_ESCAPED, /* State after '[\1'; hyphen may be a range */2883RANGE_OK_LITERAL /* State after '[1'; hyphen may be a range */2884};28852886/* States used for analyzing operators and operands in extended character2887classes. */28882889enum {2890CLASS_OP_EMPTY, /* At start of an expression; empty previous contents */2891CLASS_OP_OPERAND, /* Have preceding operand; after "z" a "--" can follow */2892CLASS_OP_OPERATOR /* Have preceding operator; after "--" operand must follow */2893};28942895/* States used for determining the parse mode in character classes. The two2896PERL_EXT values must be last. */28972898enum {2899CLASS_MODE_NORMAL, /* Ordinary PCRE2 '[...]' class. */2900CLASS_MODE_ALT_EXT, /* UTS#18-style extended '[...]' class. */2901CLASS_MODE_PERL_EXT, /* Perl extended '(?[...])' class. */2902CLASS_MODE_PERL_EXT_LEAF /* Leaf within extended '(?[ [...] ])' class. */2903};29042905/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates2906the storing of literal values in the main parsed pattern, where they can always2907be quantified. */29082909#if PCRE2_CODE_UNIT_WIDTH == 322910#define PARSED_LITERAL(c, p) \2911{ \2912if (c >= META_END) *p++ = META_BIGVALUE; \2913*p++ = c; \2914okquantifier = TRUE; \2915}2916#else2917#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;2918#endif29192920/* Here's the actual function. 
*/29212922static int parse_regex(PCRE2_SPTR ptr, uint32_t options, uint32_t xoptions,2923BOOL *has_lookbehind, compile_block *cb)2924{2925uint32_t c;2926uint32_t delimiter;2927uint32_t namelen;2928uint32_t class_range_state;2929uint32_t class_op_state;2930uint32_t class_mode_state;2931uint32_t *class_start;2932uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */2933uint32_t *verbstartptr = NULL;2934uint32_t *previous_callout = NULL;2935uint32_t *parsed_pattern = cb->parsed_pattern;2936uint32_t *parsed_pattern_end = cb->parsed_pattern_end;2937uint32_t *this_parsed_item = NULL;2938uint32_t *prev_parsed_item = NULL;2939uint32_t meta_quantifier = 0;2940uint32_t add_after_mark = 0;2941uint16_t nest_depth = 0;2942int16_t class_depth_m1 = -1; /* The m1 means minus 1. */2943int16_t class_maxdepth_m1 = -1;2944int after_manual_callout = 0;2945int expect_cond_assert = 0;2946int errorcode = 0;2947int escape;2948int i;2949BOOL inescq = FALSE;2950BOOL inverbname = FALSE;2951BOOL utf = (options & PCRE2_UTF) != 0;2952BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;2953BOOL isdupname;2954BOOL negate_class;2955BOOL okquantifier = FALSE;2956PCRE2_SPTR thisptr;2957PCRE2_SPTR name;2958PCRE2_SPTR ptrend = cb->end_pattern;2959PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */2960PCRE2_SPTR class_range_forbid_ptr = NULL;2961named_group *ng;2962nest_save *top_nest, *end_nests;2963#ifdef PCRE2_DEBUG2964uint32_t *parsed_pattern_check;2965ptrdiff_t parsed_pattern_extra = 0;2966ptrdiff_t parsed_pattern_extra_check = 0;2967PCRE2_SPTR ptr_check;2968#endif29692970PCRE2_ASSERT(parsed_pattern != NULL);29712972/* Insert leading items for word and line matching (features provided for the2973benefit of pcre2grep). 
*/29742975if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)2976{2977*parsed_pattern++ = META_CIRCUMFLEX;2978*parsed_pattern++ = META_NOCAPTURE;2979}2980else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)2981{2982*parsed_pattern++ = META_ESCAPE + ESC_b;2983*parsed_pattern++ = META_NOCAPTURE;2984}29852986#ifdef PCRE2_DEBUG2987parsed_pattern_check = parsed_pattern;2988ptr_check = ptr;2989#endif29902991/* If the pattern is actually a literal string, process it separately to avoid2992cluttering up the main loop. */29932994if ((options & PCRE2_LITERAL) != 0)2995{2996while (ptr < ptrend)2997{2998if (parsed_pattern >= parsed_pattern_end)2999{3000PCRE2_DEBUG_UNREACHABLE();3001errorcode = ERR63; /* Internal error (parsed pattern overflow) */3002goto FAILED;3003}3004thisptr = ptr;3005GETCHARINCTEST(c, ptr);3006if (auto_callout)3007parsed_pattern = manage_callouts(thisptr, &previous_callout,3008auto_callout, parsed_pattern, cb);3009PARSED_LITERAL(c, parsed_pattern);3010}3011goto PARSED_END;3012}30133014/* Process a real regex which may contain meta-characters. */30153016top_nest = NULL;3017end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);30183019/* The size of the nest_save structure might not be a factor of the size of the3020workspace. Therefore we must round down end_nests so as to correctly avoid3021creating a nest_save that spans the end of the workspace. 
*/30223023end_nests = (nest_save *)((char *)end_nests -3024((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));30253026/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */30273028if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;30293030/* Now scan the pattern */30313032while (ptr < ptrend)3033{3034int prev_expect_cond_assert;3035uint32_t min_repeat = 0, max_repeat = 0;3036uint32_t set, unset, *optset;3037uint32_t xset, xunset, *xoptset;3038uint32_t terminator;3039uint32_t prev_meta_quantifier;3040BOOL prev_okquantifier;3041PCRE2_SPTR tempptr;3042PCRE2_SIZE offset;30433044if (nest_depth > cb->cx->parens_nest_limit)3045{3046errorcode = ERR19;3047goto FAILED; /* Parentheses too deeply nested */3048}30493050/* Check that we haven't emitted too much into parsed_pattern. We allocate3051a suitably-sized buffer upfront, then do unchecked writes to it. If we only3052write a little bit too much, everything will appear to be OK, because the3053upfront size is an overestimate... but a malicious pattern could end up3054forcing a write past the buffer end. We must catch this during3055development. */30563057#ifdef PCRE2_DEBUG3058/* Strong post-write check. Won't help in release builds - at this point3059the write has already occurred so it's too late. However, should stop us3060committing unsafe code. */3061PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +3062(parsed_pattern_extra - parsed_pattern_extra_check) <=3063max_parsed_pattern(ptr_check, ptr, utf, options));3064parsed_pattern_check = parsed_pattern;3065parsed_pattern_extra_check = parsed_pattern_extra;3066ptr_check = ptr;3067#endif30683069if (parsed_pattern >= parsed_pattern_end)3070{3071/* Weak pre-write check; only ensures parsed_pattern[0] is writeable3072(but the code below can write many chars). Better than nothing. 
*/3073PCRE2_DEBUG_UNREACHABLE();3074errorcode = ERR63; /* Internal error (parsed pattern overflow) */3075goto FAILED;3076}30773078/* If the last time round this loop something was added, parsed_pattern will3079no longer be equal to this_parsed_item. Remember where the previous item3080started and reset for the next item. Note that sometimes round the loop,3081nothing gets added (e.g. for ignored white space). */30823083if (this_parsed_item != parsed_pattern)3084{3085prev_parsed_item = this_parsed_item;3086this_parsed_item = parsed_pattern;3087}30883089/* Get next input character, save its position for callout handling. */30903091thisptr = ptr;3092GETCHARINCTEST(c, ptr);30933094/* Copy quoted literals until \E, allowing for the possibility of automatic3095callouts, except when processing a (*VERB) "name". */30963097if (inescq)3098{3099if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)3100{3101inescq = FALSE;3102ptr++; /* Skip E */3103}3104else3105{3106if (expect_cond_assert > 0) /* A literal is not allowed if we are */3107{ /* expecting a conditional assertion, */3108ptr--; /* but an empty \Q\E sequence is OK. */3109errorcode = ERR28;3110goto FAILED;3111}3112if (inverbname)3113{ /* Don't use PARSED_LITERAL() because it */3114#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */3115if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;3116#endif3117*parsed_pattern++ = c;3118}3119else3120{3121if (after_manual_callout-- <= 0)3122parsed_pattern = manage_callouts(thisptr, &previous_callout,3123auto_callout, parsed_pattern, cb);3124PARSED_LITERAL(c, parsed_pattern);3125}3126meta_quantifier = 0;3127}3128continue; /* Next character */3129}31303131/* If we are processing the "name" part of a (*VERB:NAME) item, all3132characters up to the closing parenthesis are literals except when3133PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q3134and \E and escaped characters are allowed (no character types such as \d). 
If3135PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do3136this by not entering the special (*VERB:NAME) processing - they are then3137picked up below. Note that c is a character, not a code unit, so we must not3138use MAX_255 to test its size because MAX_255 tests code units and is assumed3139TRUE in 8-bit mode. */31403141if (inverbname &&3142(3143/* EITHER: not both options set */3144((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=3145(PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||3146#ifdef SUPPORT_UNICODE3147/* OR: character > 255 AND not Unicode Pattern White Space */3148(c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||3149#endif3150/* OR: not a # comment or isspace() white space */3151(c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 03152#ifdef SUPPORT_UNICODE3153/* and not CHAR_NEL when Unicode is supported */3154&& c != CHAR_NEL3155#endif3156)))3157{3158PCRE2_SIZE verbnamelength;31593160switch(c)3161{3162default: /* Don't use PARSED_LITERAL() because it */3163#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */3164if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;3165#endif3166*parsed_pattern++ = c;3167break;31683169case CHAR_RIGHT_PARENTHESIS:3170inverbname = FALSE;3171/* This is the length in characters */3172verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);3173/* But the limit on the length is in code units */3174if (ptr - verbnamestart - 1 > (int)MAX_MARK)3175{3176ptr--;3177errorcode = ERR76;3178goto FAILED;3179}3180*verblengthptr = (uint32_t)verbnamelength;31813182/* If this name was on a verb such as (*ACCEPT) which does not continue,3183a (*MARK) was generated for the name. We now add the original verb as the3184next item. 
*/31853186if (add_after_mark != 0)3187{3188*parsed_pattern++ = add_after_mark;3189add_after_mark = 0;3190}3191break;31923193case CHAR_BACKSLASH:3194if ((options & PCRE2_ALT_VERBNAMES) != 0)3195{3196escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,3197xoptions, cb->bracount, FALSE, cb);3198if (errorcode != 0) goto FAILED;3199}3200else escape = 0; /* Treat all as literal */32013202switch(escape)3203{3204case 0: /* Don't use PARSED_LITERAL() because it */3205#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */3206if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;3207#endif3208*parsed_pattern++ = c;3209break;32103211case ESC_ub:3212*parsed_pattern++ = CHAR_u;3213PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);3214break;32153216case ESC_Q:3217inescq = TRUE;3218break;32193220case ESC_E: /* Ignore */3221break;32223223default:3224errorcode = ERR40; /* Invalid in verb name */3225goto FAILED;3226}3227}3228continue; /* Next character in pattern */3229}32303231/* Not a verb name character. At this point we must process everything that3232must not change the quantification state. This is mainly comments, but we3233handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as3234A+, as in Perl. An isolated \E is ignored. */32353236if (c == CHAR_BACKSLASH && ptr < ptrend)3237{3238if (*ptr == CHAR_Q || *ptr == CHAR_E)3239{3240inescq = *ptr == CHAR_Q;3241ptr++;3242continue;3243}3244}32453246/* Skip over whitespace and # comments in extended mode. Note that c is a3247character, not a code unit, so we must not use MAX_255 to test its size3248because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The3249whitespace characters are those designated as "Pattern White Space" by3250Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is3251U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a3252subset of space characters that match \h and \v. 
*/32533254if ((options & PCRE2_EXTENDED) != 0)3255{3256if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;3257#ifdef SUPPORT_UNICODE3258if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;3259#endif3260if (c == CHAR_NUMBER_SIGN)3261{3262while (ptr < ptrend)3263{3264if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */3265{ /* IS_NEWLINE sets cb->nllen. */3266ptr += cb->nllen;3267break;3268}3269ptr++;3270#ifdef SUPPORT_UNICODE3271if (utf) FORWARDCHARTEST(ptr, ptrend);3272#endif3273}3274continue; /* Next character in pattern */3275}3276}32773278/* Skip over bracketed comments */32793280if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&3281ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)3282{3283while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);3284if (ptr >= ptrend)3285{3286errorcode = ERR18; /* A special error for missing ) in a comment */3287goto FAILED; /* to make it easier to debug. */3288}3289ptr++;3290continue; /* Next character in pattern */3291}32923293/* If the next item is not a quantifier, fill in length of any previous3294callout and create an auto callout if required. */32953296if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&3297(c != CHAR_LEFT_CURLY_BRACKET ||3298(tempptr = ptr,3299!read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))3300{3301if (after_manual_callout-- <= 0)3302{3303parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,3304parsed_pattern, cb);3305this_parsed_item = parsed_pattern; /* New start for current item */3306}3307}33083309/* If expect_cond_assert is 2, we have just passed (?( and are expecting an3310assertion, possibly preceded by a callout. If the value is 1, we have just3311had the callout and expect an assertion. There must be at least 3 more3312characters in all cases. When expect_cond_assert is 2, we know that the3313current character is an opening parenthesis, as otherwise we wouldn't be3314here. 
However, when it is 1, we need to check, and it's easiest just to check3315always. Note that expect_cond_assert may be negative, since all callouts just3316decrement it. */33173318if (expect_cond_assert > 0)3319{3320BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&3321(ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);3322if (ok)3323{3324if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */3325{3326ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;3327}3328else switch(ptr[1]) /* Traditional symbolic format */3329{3330case CHAR_C:3331ok = expect_cond_assert == 2;3332break;33333334case CHAR_EQUALS_SIGN:3335case CHAR_EXCLAMATION_MARK:3336break;33373338case CHAR_LESS_THAN_SIGN:3339ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;3340break;33413342default:3343ok = FALSE;3344}3345}33463347if (!ok)3348{3349ptr--; /* Adjust error offset */3350errorcode = ERR28;3351goto FAILED;3352}3353}33543355/* Remember whether we are expecting a conditional assertion, and set the3356default for this item. */33573358prev_expect_cond_assert = expect_cond_assert;3359expect_cond_assert = 0;33603361/* Remember quantification status for the previous significant item, then set3362default for this item. */33633364prev_okquantifier = okquantifier;3365prev_meta_quantifier = meta_quantifier;3366okquantifier = FALSE;3367meta_quantifier = 0;33683369/* If the previous significant item was a quantifier, adjust the parsed code3370if there is a following modifier. The base meta value is always followed by3371the PLUS and QUERY values, in that order. We do this here rather than after3372reading a quantifier so that intervening comments and /x whitespace can be3373ignored without having to replicate code. */33743375if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))3376{3377parsed_pattern[(prev_meta_quantifier == META_MINMAX)? 
-3 : -1] =3378prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?33790x00020000u : 0x00010000u);3380continue; /* Next character in pattern */3381}33823383/* Process the next item in the main part of a pattern. */33843385switch(c)3386{3387default: /* Non-special character */3388PARSED_LITERAL(c, parsed_pattern);3389break;339033913392/* ---- Escape sequence ---- */33933394case CHAR_BACKSLASH:3395tempptr = ptr;3396escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,3397xoptions, cb->bracount, FALSE, cb);3398if (errorcode != 0)3399{3400ESCAPE_FAILED:3401if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)3402goto FAILED;3403ptr = tempptr;3404if (ptr >= ptrend) c = CHAR_BACKSLASH; else3405{3406GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */3407}3408escape = 0; /* Treat as literal character */3409}34103411/* The escape was a data escape or literal character. */34123413if (escape == 0)3414{3415PARSED_LITERAL(c, parsed_pattern);3416}34173418/* The escape was a back (or forward) reference. We keep the offset in3419order to give a more useful diagnostic for a bad forward reference. For3420references to groups numbered less than 10 we can't use more than two items3421in parsed_pattern because they may be just two characters in the input (and3422in a 64-bit world an offset may need two elements). So for them, the offset3423of the first occurrent is held in a special vector. */34243425else if (escape < 0)3426{3427offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);3428escape = -escape - 1;3429*parsed_pattern++ = META_BACKREF | (uint32_t)escape;3430if (escape < 10)3431{3432if (cb->small_ref_offset[escape] == PCRE2_UNSET)3433cb->small_ref_offset[escape] = offset;3434}3435else3436{3437PUTOFFSET(offset, parsed_pattern);3438}3439okquantifier = TRUE;3440}34413442/* The escape was a character class such as \d etc. or other special3443escape indicator such as \A or \X. 
Most of them generate just a single3444parsed item, but \P and \p are followed by a 16-bit type and a 16-bit3445value. They are supported only when Unicode is available. The type and3446value are packed into a single 32-bit value so that the whole sequences3447uses only two elements in the parsed_vector. This is because the same3448coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is3449set.34503451There are also some cases where the escape sequence is followed by a name:3452\k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>3453and \g'name' are subroutine calls by name; \g{name} is a synonym for3454\k{name}. Note that \g<number> and \g'number' are handled by check_escape()3455and returned as a negative value (handled above). A name is coded as an3456offset into the pattern and a length. */34573458else switch (escape)3459{3460case ESC_C:3461#ifdef NEVER_BACKSLASH_C3462errorcode = ERR85;3463goto ESCAPE_FAILED;3464#else3465if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)3466{3467errorcode = ERR83;3468goto ESCAPE_FAILED;3469}3470#endif3471okquantifier = TRUE;3472*parsed_pattern++ = META_ESCAPE + escape;3473break;34743475/* This is a special return that happens only in EXTRA_ALT_BSUX mode,3476when \u{ is not followed by hex digits and }. It requests two literal3477characters, u and { and we need this, as otherwise \u{ 12} (for example)3478would be treated as u{12} now that spaces are allowed in quantifiers. */34793480case ESC_ub:3481*parsed_pattern++ = CHAR_u;3482PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);3483break;34843485case ESC_X:3486#ifndef SUPPORT_UNICODE3487errorcode = ERR45; /* Supported only with Unicode support */3488goto ESCAPE_FAILED;3489#endif3490case ESC_H:3491case ESC_h:3492case ESC_N:3493case ESC_R:3494case ESC_V:3495case ESC_v:3496okquantifier = TRUE;3497*parsed_pattern++ = META_ESCAPE + escape;3498break;34993500default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. 
*/3501*parsed_pattern++ = META_ESCAPE + escape;3502break;35033504/* Escapes that may change in UCP mode. */35053506case ESC_d:3507case ESC_D:3508case ESC_s:3509case ESC_S:3510case ESC_w:3511case ESC_W:3512okquantifier = TRUE;3513parsed_pattern = handle_escdsw(escape, parsed_pattern, options,3514xoptions);3515break;35163517/* Unicode property matching */35183519case ESC_P:3520case ESC_p:3521#ifdef SUPPORT_UNICODE3522{3523BOOL negated;3524uint16_t ptype = 0, pdata = 0;3525if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))3526goto ESCAPE_FAILED;3527if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;3528*parsed_pattern++ = META_ESCAPE + escape;3529*parsed_pattern++ = (ptype << 16) | pdata;3530okquantifier = TRUE;3531}3532#else3533errorcode = ERR45;3534goto ESCAPE_FAILED;3535#endif3536break; /* End \P and \p */35373538/* When \g is used with quotes or angle brackets as delimiters, it is a3539numerical or named subroutine call, and control comes here. When used3540with brace delimiters it is a numerical back reference and does not come3541here because check_escape() returns it directly as a reference. \k is3542always a named back reference. */35433544case ESC_g:3545case ESC_k:3546if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&3547*ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))3548{3549errorcode = (escape == ESC_g)? ERR57 : ERR69;3550goto ESCAPE_FAILED;3551}3552terminator = (*ptr == CHAR_LESS_THAN_SIGN)?3553CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?3554CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;35553556/* For a non-braced \g, check for a numerical recursion. 
*/35573558if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)3559{3560PCRE2_SPTR p = ptr + 1;35613562if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,3563&errorcode))3564{3565if (p >= ptrend || *p != terminator)3566{3567errorcode = ERR57;3568goto ESCAPE_FAILED;3569}3570ptr = p;3571goto SET_RECURSION;3572}3573if (errorcode != 0) goto ESCAPE_FAILED;3574}35753576/* Not a numerical recursion. Perl allows spaces and tabs after { and3577before } but not for other delimiters. */35783579if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,3580&errorcode, cb)) goto ESCAPE_FAILED;35813582/* \k and \g when used with braces are back references, whereas \g used3583with quotes or angle brackets is a recursion */35843585*parsed_pattern++ =3586(escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?3587META_BACKREF_BYNAME : META_RECURSE_BYNAME;3588*parsed_pattern++ = namelen;35893590PUTOFFSET(offset, parsed_pattern);3591okquantifier = TRUE;3592break; /* End special escape processing */3593}3594break; /* End escape sequence processing */359535963597/* ---- Single-character special items ---- */35983599case CHAR_CIRCUMFLEX_ACCENT:3600*parsed_pattern++ = META_CIRCUMFLEX;3601break;36023603case CHAR_DOLLAR_SIGN:3604*parsed_pattern++ = META_DOLLAR;3605break;36063607case CHAR_DOT:3608*parsed_pattern++ = META_DOT;3609okquantifier = TRUE;3610break;361136123613/* ---- Single-character quantifiers ---- */36143615case CHAR_ASTERISK:3616meta_quantifier = META_ASTERISK;3617goto CHECK_QUANTIFIER;36183619case CHAR_PLUS:3620meta_quantifier = META_PLUS;3621goto CHECK_QUANTIFIER;36223623case CHAR_QUESTION_MARK:3624meta_quantifier = META_QUERY;3625goto CHECK_QUANTIFIER;362636273628/* ---- Potential {n,m} quantifier ---- */36293630case CHAR_LEFT_CURLY_BRACKET:3631if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,3632&errorcode))3633{3634if (errorcode != 0) goto FAILED; /* Error in quantifier. 
*/3635PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */3636break; /* No more quantifier processing */3637}3638meta_quantifier = META_MINMAX;3639/* Fall through */364036413642/* ---- Quantifier post-processing ---- */36433644/* Check that a quantifier is allowed after the previous item. This3645guarantees that there is a previous item. */36463647CHECK_QUANTIFIER:3648if (!prev_okquantifier)3649{3650errorcode = ERR9;3651goto FAILED_BACK; // TODO https://github.com/PCRE2Project/pcre2/issues/5493652}36533654/* Most (*VERB)s are not allowed to be quantified, but an ungreedy3655quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a3656sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by3657wrapping it in non-capturing brackets, but we have to allow for a preceding3658(*MARK) for when (*ACCEPT) has an argument. */36593660if (*prev_parsed_item == META_ACCEPT)3661{3662uint32_t *p;3663for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];3664*verbstartptr = META_NOCAPTURE;3665parsed_pattern[1] = META_KET;3666parsed_pattern += 2;36673668#ifdef PCRE2_DEBUG3669PCRE2_ASSERT(parsed_pattern_extra >= 2);3670parsed_pattern_extra -= 2;3671#endif3672}36733674/* Now we can put the quantifier into the parsed pattern vector. At this3675stage, we have only the basic quantifier. The check for a following + or ?3676modifier happens at the top of the loop, after any intervening comments3677have been removed. */36783679*parsed_pattern++ = meta_quantifier;3680if (c == CHAR_LEFT_CURLY_BRACKET)3681{3682*parsed_pattern++ = min_repeat;3683*parsed_pattern++ = max_repeat;3684}3685break;368636873688/* ---- Character class ---- */36893690case CHAR_LEFT_SQUARE_BRACKET:36913692/* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is3693used for "start of word" and "end of word". As these are otherwise illegal3694sequences, we don't break anything by recognizing them. 
They are replaced3695by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are3696erroneous and are handled by the normal code below. */36973698if (ptrend - ptr >= 6 &&3699(PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||3700PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))3701{3702*parsed_pattern++ = META_ESCAPE + ESC_b;37033704if (ptr[2] == CHAR_LESS_THAN_SIGN)3705{3706*parsed_pattern++ = META_LOOKAHEAD;3707}3708else3709{3710*parsed_pattern++ = META_LOOKBEHIND;3711*has_lookbehind = TRUE;37123713/* The offset is used only for the "non-fixed length" error; this won't3714occur here, so just store zero. */37153716PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);3717}37183719if ((options & PCRE2_UCP) == 0)3720*parsed_pattern++ = META_ESCAPE + ESC_w;3721else3722{3723*parsed_pattern++ = META_ESCAPE + ESC_p;3724*parsed_pattern++ = PT_WORD << 16;3725}3726*parsed_pattern++ = META_KET;3727ptr += 6;3728okquantifier = TRUE;3729break;3730}37313732/* PCRE supports POSIX class stuff inside a class. Perl gives an error if3733they are encountered at the top level, so we'll do that too. */37343735if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||3736*ptr == CHAR_EQUALS_SIGN) &&3737check_posix_syntax(ptr, ptrend, &tempptr))3738{3739errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;3740goto FAILED;3741}37423743class_mode_state = ((options & PCRE2_ALT_EXTENDED_CLASS) != 0)?3744CLASS_MODE_ALT_EXT : CLASS_MODE_NORMAL;37453746/* Jump here from '(?[...])'. That jump must initialize class_mode_state,3747set c to the '[' character, and ptr to just after the '['. */37483749FROM_PERL_EXTENDED_CLASS:3750okquantifier = TRUE;37513752/* In an EBCDIC environment, Perl treats alphabetic ranges specially3753because there are holes in the encoding, and simply using the range A-Z3754(for example) would include the characters in the holes. This applies only3755to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]3756in this respect. 
In order to accommodate this, we keep track of whether3757character values are literal or not, and a state variable for handling3758ranges. */37593760/* Loop for the contents of the class. Classes may be nested, if3761PCRE2_ALT_EXTENDED_CLASS is set, or the class is of the form (?[...]). */37623763/* c is still set to '[' so the loop will handle the start of the class. */37643765class_depth_m1 = -1;3766class_maxdepth_m1 = -1;3767class_range_state = RANGE_NO;3768class_op_state = CLASS_OP_EMPTY;3769class_start = NULL;37703771for (;;)3772{3773BOOL char_is_literal = TRUE;37743775/* Inside \Q...\E everything is literal except \E */37763777if (inescq)3778{3779if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)3780{3781inescq = FALSE; /* Reset literal state */3782ptr++; /* Skip the 'E' */3783goto CLASS_CONTINUE;3784}37853786/* Surprisingly, you cannot use \Q..\E to escape a character inside a3787Perl extended class. However, empty \Q\E sequences are allowed, so here3788were're only giving an error if the \Q..\E is non-empty. */37893790if (class_mode_state == CLASS_MODE_PERL_EXT)3791{3792errorcode = ERR116;3793goto FAILED;3794}37953796goto CLASS_LITERAL;3797}37983799/* Skip over space and tab (only) in extended-more mode, or anywhere3800inside a Perl extended class (which implies /xx). */38013802if ((c == CHAR_SPACE || c == CHAR_HT) &&3803((options & PCRE2_EXTENDED_MORE) != 0 ||3804class_mode_state >= CLASS_MODE_PERL_EXT))3805goto CLASS_CONTINUE;38063807/* Handle POSIX class names. Perl allows a negation extension of the3808form [:^name:]. A square bracket that doesn't match the syntax is3809treated as a literal. We also recognize the POSIX constructions3810[.ch.] and [=ch=] ("collating elements") and fault them, as Perl38115.6 and 5.8 do. 
*/38123813if (class_depth_m1 >= 0 &&3814c == CHAR_LEFT_SQUARE_BRACKET &&3815ptrend - ptr >= 3 &&3816(*ptr == CHAR_COLON || *ptr == CHAR_DOT ||3817*ptr == CHAR_EQUALS_SIGN) &&3818check_posix_syntax(ptr, ptrend, &tempptr))3819{3820BOOL posix_negate = FALSE;3821int posix_class;38223823/* Perl treats a hyphen before a POSIX class as a literal, not the3824start of a range. However, it gives a warning in its warning mode. PCRE3825does not have a warning mode, so we give an error, because this is3826likely an error on the user's part. */38273828if (class_range_state == RANGE_STARTED)3829{3830ptr = tempptr + 2;3831errorcode = ERR50;3832goto FAILED;3833}38343835/* Perl treats a hyphen after a POSIX class as a literal, not the3836start of a range. However, it gives a warning in its warning mode3837unless the hyphen is the last character in the class. PCRE does not3838have a warning mode, so we give an error, because this is likely an3839error on the user's part.38403841Roll back to the hyphen for the error position. */38423843if (class_range_state == RANGE_FORBID_STARTED)3844{3845ptr = class_range_forbid_ptr;3846errorcode = ERR50;3847goto FAILED;3848}38493850/* Disallow implicit union in Perl extended classes. */38513852if (class_op_state == CLASS_OP_OPERAND &&3853class_mode_state == CLASS_MODE_PERL_EXT)3854{3855ptr = tempptr + 2;3856errorcode = ERR113;3857goto FAILED;3858}38593860if (*ptr != CHAR_COLON)3861{3862ptr = tempptr + 2;3863errorcode = ERR13;3864goto FAILED;3865}38663867if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)3868{3869posix_negate = TRUE;3870ptr++;3871}38723873posix_class = check_posix_name(ptr, (int)(tempptr - ptr));3874ptr = tempptr + 2;3875if (posix_class < 0)3876{3877errorcode = ERR30;3878goto FAILED;3879}38803881/* Set "a hyphen is forbidden to be the start of a range". For the '-]'3882case, the hyphen is treated as a literal, but for '-1' it is disallowed3883(because it would be interpreted as range). 
*/38843885class_range_state = RANGE_FORBID_NO;3886class_op_state = CLASS_OP_OPERAND;38873888/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some3889of the POSIX classes are converted to use Unicode properties \p or \P3890or, in one case, \h or \H. The substitutes table has two values per3891class, containing the type and value of a \p or \P item. The special3892cases are specified with a negative type: a non-zero value causes \h or3893\H to be used, and a zero value falls through to behave like a non-UCP3894POSIX class. There are now also some extra options that force ASCII for3895some classes. */38963897#ifdef SUPPORT_UNICODE3898if ((options & PCRE2_UCP) != 0 &&3899(xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&3900!((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&3901(posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))3902{3903int ptype = posix_substitutes[2*posix_class];3904int pvalue = posix_substitutes[2*posix_class + 1];39053906if (ptype >= 0)3907{3908*parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);3909*parsed_pattern++ = (ptype << 16) | pvalue;3910goto CLASS_CONTINUE;3911}39123913if (pvalue != 0)3914{3915*parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);3916goto CLASS_CONTINUE;3917}39183919/* Fall through */3920}3921#endif /* SUPPORT_UNICODE */39223923/* Non-UCP POSIX class */39243925*parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;3926*parsed_pattern++ = posix_class;3927}39283929/* Check for the start of the outermost class, or the start of a nested class. */39303931else if ((c == CHAR_LEFT_SQUARE_BRACKET &&3932(class_depth_m1 < 0 || class_mode_state == CLASS_MODE_ALT_EXT ||3933class_mode_state == CLASS_MODE_PERL_EXT)) ||3934(c == CHAR_LEFT_PARENTHESIS &&3935class_mode_state == CLASS_MODE_PERL_EXT))3936{3937uint32_t start_c = c;3938uint32_t new_class_mode_state;39393940/* Update the class mode, if moving into a 'leaf' inside a Perl extended3941class. 
*/39423943if (start_c == CHAR_LEFT_SQUARE_BRACKET &&3944class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 >= 0)3945new_class_mode_state = CLASS_MODE_PERL_EXT_LEAF;3946else3947new_class_mode_state = class_mode_state;39483949/* Tidy up the other class before starting the nested class. */3950/* -[ beginning a nested class is a literal '-' */39513952if (class_range_state == RANGE_STARTED)3953parsed_pattern[-1] = CHAR_MINUS;39543955/* Disallow implicit union in Perl extended classes. */39563957if (class_op_state == CLASS_OP_OPERAND &&3958class_mode_state == CLASS_MODE_PERL_EXT)3959{3960errorcode = ERR113;3961goto FAILED;3962}39633964/* Validate nesting depth */3965if (class_depth_m1 >= ECLASS_NEST_LIMIT - 1)3966{3967errorcode = ERR107;3968goto FAILED; /* Classes too deeply nested */3969}39703971/* Process the character class start. If the first character is '^', set3972the negation flag. If the first few characters (either before or after ^)3973are \Q\E or \E or space or tab in extended-more mode, we skip them too.3974This makes for compatibility with Perl. 
*/39753976negate_class = FALSE;3977for (;;)3978{3979if (ptr >= ptrend)3980{3981if (start_c == CHAR_LEFT_PARENTHESIS)3982errorcode = ERR14; /* Missing terminating ')' */3983else3984errorcode = ERR6; /* Missing terminating ']' */3985goto FAILED;3986}39873988GETCHARINCTEST(c, ptr);3989if (new_class_mode_state == CLASS_MODE_PERL_EXT) break;3990else if (c == CHAR_BACKSLASH)3991{3992if (ptr < ptrend && *ptr == CHAR_E) ptr++;3993else if (ptrend - ptr >= 3 &&3994PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)3995ptr += 3;3996else3997break;3998}3999else if ((c == CHAR_SPACE || c == CHAR_HT) && /* Note: just these two */4000((options & PCRE2_EXTENDED_MORE) != 0 ||4001new_class_mode_state >= CLASS_MODE_PERL_EXT))4002continue;4003else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)4004negate_class = TRUE;4005else break;4006}40074008/* Now the real contents of the class; c has the first "real" character.4009Empty classes are permitted only if the option is set, and if it's not4010a Perl-extended class. */40114012if (c == CHAR_RIGHT_SQUARE_BRACKET &&4013(cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0 &&4014new_class_mode_state < CLASS_MODE_PERL_EXT)4015{4016PCRE2_ASSERT(start_c == CHAR_LEFT_SQUARE_BRACKET);40174018if (class_start != NULL)4019{4020PCRE2_ASSERT(class_depth_m1 >= 0);4021/* Represents that the class is an extended class. */4022*class_start |= CLASS_IS_ECLASS;4023class_start = NULL;4024}40254026*parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;40274028/* Leave nesting depth unchanged; but check for zero depth to handle the4029very first (top-level) class being empty. */4030if (class_depth_m1 < 0) break;40314032class_range_state = RANGE_NO; /* for processing the containing class */4033class_op_state = CLASS_OP_OPERAND;4034goto CLASS_CONTINUE;4035}40364037/* Enter a non-empty class. */40384039if (class_start != NULL)4040{4041PCRE2_ASSERT(class_depth_m1 >= 0);4042/* Represents that the class is an extended class. 
*/4043*class_start |= CLASS_IS_ECLASS;4044class_start = NULL;4045}40464047class_start = parsed_pattern;4048*parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;4049class_range_state = RANGE_NO;4050class_op_state = CLASS_OP_EMPTY;4051class_mode_state = new_class_mode_state;4052++class_depth_m1;4053if (class_maxdepth_m1 < class_depth_m1)4054class_maxdepth_m1 = class_depth_m1;4055/* Reset; no op seen yet at new depth. */4056cb->class_op_used[class_depth_m1] = 0;40574058/* Implement the special start-of-class literal meaning of ']'. */4059if (c == CHAR_RIGHT_SQUARE_BRACKET &&4060new_class_mode_state != CLASS_MODE_PERL_EXT)4061{4062class_range_state = RANGE_OK_LITERAL;4063class_op_state = CLASS_OP_OPERAND;4064PARSED_LITERAL(c, parsed_pattern);4065goto CLASS_CONTINUE;4066}40674068continue; /* We have already loaded c with the next character */4069}40704071/* Check for the end of the class. */40724073else if (c == CHAR_RIGHT_SQUARE_BRACKET ||4074(c == CHAR_RIGHT_PARENTHESIS && class_mode_state == CLASS_MODE_PERL_EXT))4075{4076/* In Perl extended mode, the ']' can only be used to match the4077opening '[', and ')' must match an opening parenthesis. */4078if (class_mode_state == CLASS_MODE_PERL_EXT)4079{4080if (c == CHAR_RIGHT_SQUARE_BRACKET && class_depth_m1 != 0)4081{4082errorcode = ERR14;4083goto FAILED_BACK;4084}4085if (c == CHAR_RIGHT_PARENTHESIS && class_depth_m1 < 1)4086{4087errorcode = ERR22;4088goto FAILED;4089}4090}40914092/* Check no trailing operator. */4093if (class_op_state == CLASS_OP_OPERATOR)4094{4095errorcode = ERR110;4096goto FAILED;4097}40984099/* Check no empty expression for Perl extended expressions. 
*/4100if (class_mode_state == CLASS_MODE_PERL_EXT &&4101class_op_state == CLASS_OP_EMPTY)4102{4103errorcode = ERR114;4104goto FAILED;4105}41064107/* -] at the end of a class is a literal '-' */4108if (class_range_state == RANGE_STARTED)4109parsed_pattern[-1] = CHAR_MINUS;41104111*parsed_pattern++ = META_CLASS_END;41124113if (--class_depth_m1 < 0)4114{4115/* Check for and consume ')' after '(?[...]'. */4116PCRE2_ASSERT(class_mode_state != CLASS_MODE_PERL_EXT_LEAF);4117if (class_mode_state == CLASS_MODE_PERL_EXT)4118{4119if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)4120{4121errorcode = ERR115;4122goto FAILED;4123}41244125ptr++;4126}41274128break;4129}41304131class_range_state = RANGE_NO; /* for processing the containing class */4132class_op_state = CLASS_OP_OPERAND;4133if (class_mode_state == CLASS_MODE_PERL_EXT_LEAF)4134class_mode_state = CLASS_MODE_PERL_EXT;4135/* The extended class flag has already4136been set for the parent class. */4137class_start = NULL;4138}41394140/* Handle a Perl set binary operator */41414142else if (class_mode_state == CLASS_MODE_PERL_EXT &&4143(c == CHAR_PLUS || c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||4144c == CHAR_AMPERSAND || c == CHAR_CIRCUMFLEX_ACCENT))4145{4146/* Check that there was a preceding operand. */4147if (class_op_state != CLASS_OP_OPERAND)4148{4149errorcode = ERR109;4150goto FAILED;4151}41524153if (class_start != NULL)4154{4155PCRE2_ASSERT(class_depth_m1 >= 0);4156/* Represents that the class is an extended class. */4157*class_start |= CLASS_IS_ECLASS;4158class_start = NULL;4159}41604161PCRE2_ASSERT(class_range_state != RANGE_STARTED &&4162class_range_state != RANGE_FORBID_STARTED);41634164*parsed_pattern++ = c == CHAR_PLUS? META_ECLASS_OR :4165c == CHAR_VERTICAL_LINE? META_ECLASS_OR :4166c == CHAR_MINUS? META_ECLASS_SUB :4167c == CHAR_AMPERSAND? 
META_ECLASS_AND :4168META_ECLASS_XOR;4169class_range_state = RANGE_NO;4170class_op_state = CLASS_OP_OPERATOR;4171}41724173/* Handle a Perl set unary operator */41744175else if (class_mode_state == CLASS_MODE_PERL_EXT &&4176c == CHAR_EXCLAMATION_MARK)4177{4178/* Check that the "!" has not got a preceding operand (i.e. it's the4179start of the class, or follows an operator). */4180if (class_op_state == CLASS_OP_OPERAND)4181{4182errorcode = ERR113;4183goto FAILED;4184}41854186if (class_start != NULL)4187{4188PCRE2_ASSERT(class_depth_m1 >= 0);4189/* Represents that the class is an extended class. */4190*class_start |= CLASS_IS_ECLASS;4191class_start = NULL;4192}41934194PCRE2_ASSERT(class_range_state != RANGE_STARTED &&4195class_range_state != RANGE_FORBID_STARTED);41964197*parsed_pattern++ = META_ECLASS_NOT;4198class_range_state = RANGE_NO;4199class_op_state = CLASS_OP_OPERATOR;4200}42014202/* Handle a UTS#18 set operator */42034204else if (class_mode_state == CLASS_MODE_ALT_EXT &&4205(c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||4206c == CHAR_AMPERSAND || c == CHAR_TILDE) &&4207ptr < ptrend && *ptr == c)4208{4209++ptr;42104211/* Check there isn't a triple-repetition. */4212if (ptr < ptrend && *ptr == c)4213{4214while (ptr < ptrend && *ptr == c) ++ptr; /* Improve error offset. */4215errorcode = ERR108;4216goto FAILED;4217}42184219/* Check for a preceding operand. */4220if (class_op_state != CLASS_OP_OPERAND)4221{4222errorcode = ERR109;4223goto FAILED;4224}42254226/* Check for mixed precedence. Forbid [A--B&&C]. */4227if (cb->class_op_used[class_depth_m1] != 0 &&4228cb->class_op_used[class_depth_m1] != (uint8_t)c)4229{4230errorcode = ERR111;4231goto FAILED;4232}42334234if (class_start != NULL)4235{4236PCRE2_ASSERT(class_depth_m1 >= 0);4237/* Represents that the class is an extended class. 
*/4238*class_start |= CLASS_IS_ECLASS;4239class_start = NULL;4240}42414242/* Dangling '-' before an operator is a literal */4243if (class_range_state == RANGE_STARTED)4244parsed_pattern[-1] = CHAR_MINUS;42454246*parsed_pattern++ = c == CHAR_VERTICAL_LINE? META_ECLASS_OR :4247c == CHAR_MINUS? META_ECLASS_SUB :4248c == CHAR_AMPERSAND? META_ECLASS_AND :4249META_ECLASS_XOR;4250class_range_state = RANGE_NO;4251class_op_state = CLASS_OP_OPERATOR;4252cb->class_op_used[class_depth_m1] = (uint8_t)c;4253}42544255/* Handle escapes in a class */42564257else if (c == CHAR_BACKSLASH)4258{4259tempptr = ptr;4260escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,4261xoptions, cb->bracount, TRUE, cb);42624263if (errorcode != 0)4264{4265if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0 ||4266class_mode_state >= CLASS_MODE_PERL_EXT)4267goto FAILED;4268ptr = tempptr;4269if (ptr >= ptrend) c = CHAR_BACKSLASH; else4270{4271GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */4272}4273escape = 0; /* Treat as literal character */4274}42754276switch(escape)4277{4278case 0: /* Escaped character code point is in c */4279char_is_literal = FALSE;4280goto CLASS_LITERAL; /* (a few lines above) */42814282case ESC_b:4283c = CHAR_BS; /* \b is backspace in a class */4284char_is_literal = FALSE;4285goto CLASS_LITERAL;42864287case ESC_k:4288c = CHAR_k; /* \k is not special in a class, just like \g */4289char_is_literal = FALSE;4290goto CLASS_LITERAL;42914292case ESC_Q:4293inescq = TRUE; /* Enter literal mode */4294goto CLASS_CONTINUE;42954296case ESC_E: /* Ignore orphan \E */4297goto CLASS_CONTINUE;42984299case ESC_B: /* Always an error in a class */4300case ESC_R:4301case ESC_X:4302errorcode = ERR7;4303ptr--; // TODO https://github.com/PCRE2Project/pcre2/issues/5494304goto FAILED;43054306case ESC_N: /* Not permitted by Perl either */4307errorcode = ERR71;4308goto FAILED;43094310case ESC_H:4311case ESC_h:4312case ESC_V:4313case ESC_v:4314*parsed_pattern++ = 
META_ESCAPE + escape;4315break;43164317/* These escapes may be converted to Unicode property tests when4318PCRE2_UCP is set. */43194320case ESC_d:4321case ESC_D:4322case ESC_s:4323case ESC_S:4324case ESC_w:4325case ESC_W:4326parsed_pattern = handle_escdsw(escape, parsed_pattern, options,4327xoptions);4328break;43294330/* Explicit Unicode property matching */43314332case ESC_P:4333case ESC_p:4334#ifdef SUPPORT_UNICODE4335{4336BOOL negated;4337uint16_t ptype = 0, pdata = 0;4338if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))4339goto FAILED;43404341/* In caseless matching, particular characteristics Lu, Ll, and Lt4342get converted to the general characteristic L&. That is, upper,4343lower, and title case letters are all conflated. */43444345if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&4346(pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))4347{4348ptype = PT_LAMP;4349pdata = 0;4350}43514352if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;4353*parsed_pattern++ = META_ESCAPE + escape;4354*parsed_pattern++ = (ptype << 16) | pdata;4355}4356#else4357errorcode = ERR45;4358goto FAILED;4359#endif4360break; /* End \P and \p */43614362/* All others are not allowed in a class */43634364default:4365PCRE2_DEBUG_UNREACHABLE();4366/* Fall through */43674368case ESC_A:4369case ESC_Z:4370case ESC_z:4371case ESC_G:4372case ESC_K:4373case ESC_C:4374errorcode = ERR7;4375ptr--; // TODO https://github.com/PCRE2Project/pcre2/issues/5494376goto FAILED;4377}43784379/* All the switch-cases above which end in "break" describe a set4380of characters. None may start a range. */43814382/* The second part of a range can be a single-character escape4383sequence (detected above), but not any of the other escapes. Perl4384treats a hyphen as a literal in such circumstances. However, in Perl's4385warning mode, a warning is given, so PCRE now faults it, as it is4386almost certainly a mistake on the user's part. 
*/43874388if (class_range_state == RANGE_STARTED)4389{4390errorcode = ERR50;4391goto FAILED;4392}43934394/* Perl gives a warning unless the hyphen following a multi-character4395escape is the last character in the class. PCRE throws an error. */43964397if (class_range_state == RANGE_FORBID_STARTED)4398{4399ptr = class_range_forbid_ptr;4400errorcode = ERR50;4401goto FAILED;4402}44034404/* Disallow implicit union in Perl extended classes. */44054406if (class_op_state == CLASS_OP_OPERAND &&4407class_mode_state == CLASS_MODE_PERL_EXT)4408{4409errorcode = ERR113;4410goto FAILED;4411}44124413class_range_state = RANGE_FORBID_NO;4414class_op_state = CLASS_OP_OPERAND;4415}44164417/* Forbid unescaped literals, and the special meaning of '-', inside a4418Perl extended class. */44194420else if (class_mode_state == CLASS_MODE_PERL_EXT)4421{4422errorcode = ERR116;4423goto FAILED;4424}44254426/* Handle potential start of range */44274428else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)4429{4430*parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?4431META_RANGE_LITERAL : META_RANGE_ESCAPED;4432class_range_state = RANGE_STARTED;4433}44344435/* Handle forbidden start of range */44364437else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)4438{4439*parsed_pattern++ = CHAR_MINUS;4440class_range_state = RANGE_FORBID_STARTED;4441class_range_forbid_ptr = ptr;4442}44434444/* Handle a literal character */44454446else4447{4448CLASS_LITERAL:44494450/* Disallow implicit union in Perl extended classes. 
*/44514452if (class_op_state == CLASS_OP_OPERAND &&4453class_mode_state == CLASS_MODE_PERL_EXT)4454{4455errorcode = ERR113;4456goto FAILED;4457}44584459if (class_range_state == RANGE_STARTED)4460{4461if (c == parsed_pattern[-2]) /* Optimize one-char range */4462parsed_pattern--;4463else if (parsed_pattern[-2] > c) /* Check range is in order */4464{4465errorcode = ERR8;4466goto FAILED_BACK; // TODO https://github.com/PCRE2Project/pcre2/issues/5494467}4468else4469{4470if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)4471parsed_pattern[-1] = META_RANGE_ESCAPED;4472PARSED_LITERAL(c, parsed_pattern);4473}4474class_range_state = RANGE_NO;4475class_op_state = CLASS_OP_OPERAND;4476}4477else if (class_range_state == RANGE_FORBID_STARTED)4478{4479ptr = class_range_forbid_ptr;4480errorcode = ERR50;4481goto FAILED;4482}4483else /* Potential start of range */4484{4485class_range_state = char_is_literal?4486RANGE_OK_LITERAL : RANGE_OK_ESCAPED;4487class_op_state = CLASS_OP_OPERAND;4488PARSED_LITERAL(c, parsed_pattern);4489}4490}44914492/* Proceed to next thing in the class. */44934494CLASS_CONTINUE:4495if (ptr >= ptrend)4496{4497if (class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 > 0)4498errorcode = ERR14; /* Missing terminating ')' */4499if (class_mode_state == CLASS_MODE_ALT_EXT &&4500class_depth_m1 == 0 && class_maxdepth_m1 == 1)4501errorcode = ERR112; /* Missing terminating ']', but we saw '[ [ ]...' */4502else4503errorcode = ERR6; /* Missing terminating ']' */4504goto FAILED;4505}4506GETCHARINCTEST(c, ptr);4507} /* End of class-processing loop */45084509break; /* End of character class */451045114512/* ---- Opening parenthesis ---- */45134514case CHAR_LEFT_PARENTHESIS:4515if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;45164517/* If ( is not followed by ? it is either a capture or a special verb or an4518alpha assertion or a positive non-atomic lookahead. 
*/45194520if (*ptr != CHAR_QUESTION_MARK)4521{4522const char *vn;45234524/* Handle capturing brackets (or non-capturing if auto-capture is turned4525off). */45264527if (*ptr != CHAR_ASTERISK)4528{4529nest_depth++;4530if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)4531{4532if (cb->bracount >= MAX_GROUP_NUMBER)4533{4534errorcode = ERR97;4535goto FAILED;4536}4537cb->bracount++;4538*parsed_pattern++ = META_CAPTURE | cb->bracount;4539}4540else *parsed_pattern++ = META_NOCAPTURE;4541}45424543/* Do nothing for (* followed by end of pattern or ) so it gives a "bad4544quantifier" error rather than "(*MARK) must have an argument". */45454546else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)4547break;45484549/* Handle "alpha assertions" such as (*pla:...). Most of these are4550synonyms for the historical symbolic assertions, but the script run and4551non-atomic lookaround ones are new. They are distinguished by starting4552with a lower case letter. Checking both ends of the alphabet makes this4553work in all character codes. */45544555else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)4556{4557uint32_t meta;45584559vn = alasnames;4560if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,4561&errorcode, cb)) goto FAILED;4562if (ptr >= ptrend || *ptr != CHAR_COLON)4563{4564errorcode = ERR95; /* Malformed */4565goto FAILED;4566}45674568/* Scan the table of alpha assertion names */45694570for (i = 0; i < alascount; i++)4571{4572if (namelen == alasmeta[i].len &&4573PRIV(strncmp_c8)(name, vn, namelen) == 0)4574break;4575vn += alasmeta[i].len + 1;4576}45774578if (i >= alascount)4579{4580errorcode = ERR95; /* Alpha assertion not recognized */4581goto FAILED;4582}45834584/* Check for expecting an assertion condition. If so, only atomic4585lookaround assertions are valid. 
*/45864587meta = alasmeta[i].meta;4588if (prev_expect_cond_assert > 0 &&4589(meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))4590{4591errorcode = ERR28; /* Atomic assertion expected */4592goto FAILED;4593}45944595/* The lookaround alphabetic synonyms can mostly be handled by jumping4596to the code that handles the traditional symbolic forms. */45974598switch(meta)4599{4600default:4601PCRE2_DEBUG_UNREACHABLE();4602errorcode = ERR89; /* Unknown code; should never occur because */4603goto FAILED; /* the meta values come from a table above. */46044605case META_ATOMIC:4606goto ATOMIC_GROUP;46074608case META_LOOKAHEAD:4609goto POSITIVE_LOOK_AHEAD;46104611case META_LOOKAHEAD_NA:4612goto POSITIVE_NONATOMIC_LOOK_AHEAD;46134614case META_LOOKAHEADNOT:4615goto NEGATIVE_LOOK_AHEAD;46164617case META_SCS:4618if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;46194620if (*ptr != CHAR_LEFT_PARENTHESIS)4621{4622errorcode = ERR15;4623goto FAILED;4624}46254626ptr++;4627*parsed_pattern++ = META_SCS;4628/* Temporary variable, zero in the first iteration. */4629offset = 0;46304631for (;;)4632{4633PCRE2_SIZE next_offset = (PCRE2_SIZE)(ptr - cb->start_pattern);46344635/* Handle (scan_substring:([+-]number)... 
*/4636if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,4637&i, &errorcode))4638{4639PCRE2_ASSERT(i >= 0);4640if (i <= 0)4641{4642errorcode = ERR15;4643goto FAILED;4644}4645meta = META_SCS_NUMBER;4646namelen = (uint32_t)i;4647}4648else if (errorcode != 0) goto FAILED; /* Number too big */4649else4650{4651if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;46524653/* Handle (*scan_substring:('name') or (*scan_substring:(<name>) */4654if (*ptr == CHAR_LESS_THAN_SIGN)4655terminator = CHAR_GREATER_THAN_SIGN;4656else if (*ptr == CHAR_APOSTROPHE)4657terminator = CHAR_APOSTROPHE;4658else4659{4660errorcode = ERR15;4661goto FAILED;4662}46634664if (!read_name(&ptr, ptrend, utf, terminator, &next_offset,4665&name, &namelen, &errorcode, cb)) goto FAILED;46664667meta = META_SCS_NAME;4668}46694670PCRE2_ASSERT(next_offset > 0);4671if (offset == 0 || (next_offset - offset) >= 0x10000)4672{4673*parsed_pattern++ = META_OFFSET;4674PUTOFFSET(next_offset, parsed_pattern);4675offset = next_offset;4676}46774678/* The offset is encoded as a relative offset, because for some4679inputs such as ",2" in (*scs:(1,2,3)...), we only have space for4680two uint32_t values, and an opcode and absolute offset may require4681three uint32_t values. */4682*parsed_pattern++ = meta | (uint32_t)(next_offset - offset);4683*parsed_pattern++ = namelen;4684offset = next_offset;46854686if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;46874688if (*ptr == CHAR_RIGHT_PARENTHESIS) break;46894690if (*ptr != CHAR_COMMA)4691{4692errorcode = ERR24;4693goto FAILED;4694}46954696ptr++;4697}4698ptr++;4699goto POST_ASSERTION;47004701case META_LOOKBEHIND:4702case META_LOOKBEHINDNOT:4703case META_LOOKBEHIND_NA:4704*parsed_pattern++ = meta;4705ptr--;4706goto POST_LOOKBEHIND;47074708/* The script run facilities are handled here. Unicode support is4709required (give an error if not, as this is a security issue). Always4710record a META_SCRIPT_RUN item. 
Then, for the atomic version, insert4711META_ATOMIC and remember that we need two META_KETs at the end. */47124713case META_SCRIPT_RUN:4714case META_ATOMIC_SCRIPT_RUN:4715#ifdef SUPPORT_UNICODE4716*parsed_pattern++ = META_SCRIPT_RUN;4717nest_depth++;4718ptr++;4719if (meta == META_ATOMIC_SCRIPT_RUN)4720{4721*parsed_pattern++ = META_ATOMIC;4722if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);4723else if (++top_nest >= end_nests)4724{4725errorcode = ERR84;4726goto FAILED;4727}4728top_nest->nest_depth = nest_depth;4729top_nest->flags = NSF_ATOMICSR;4730top_nest->options = options & PARSE_TRACKED_OPTIONS;4731top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;47324733#ifdef PCRE2_DEBUG4734/* We'll write out two META_KETs for a single ")" in the input4735pattern, so we reserve space for that in our bounds check. */4736parsed_pattern_extra++;4737#endif4738}4739break;4740#else /* SUPPORT_UNICODE */4741errorcode = ERR96;4742goto FAILED;4743#endif4744}4745}474647474748/* ---- Handle (*VERB) and (*VERB:NAME) ---- */47494750else4751{4752vn = verbnames;4753if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,4754&errorcode, cb)) goto FAILED;4755if (ptr >= ptrend || (*ptr != CHAR_COLON &&4756*ptr != CHAR_RIGHT_PARENTHESIS))4757{4758errorcode = ERR60; /* Malformed */4759goto FAILED;4760}47614762/* Scan the table of verb names */47634764for (i = 0; i < verbcount; i++)4765{4766if (namelen == verbs[i].len &&4767PRIV(strncmp_c8)(name, vn, namelen) == 0)4768break;4769vn += verbs[i].len + 1;4770}47714772if (i >= verbcount)4773{4774errorcode = ERR60; /* Verb not recognized */4775goto FAILED;4776}47774778/* An empty argument is treated as no argument. 
*/47794780if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&4781ptr[1] == CHAR_RIGHT_PARENTHESIS)4782ptr++; /* Advance to the closing parens */47834784/* Check for mandatory non-empty argument; this is (*MARK) */47854786if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)4787{4788errorcode = ERR66;4789goto FAILED;4790}47914792/* Remember where this verb, possibly with a preceding (*MARK), starts,4793for handling quantified (*ACCEPT). */47944795verbstartptr = parsed_pattern;4796okquantifier = (verbs[i].meta == META_ACCEPT);4797#ifdef PCRE2_DEBUG4798/* Reserve space in our bounds check for optionally wrapping the (*ACCEPT)4799with a non-capturing bracket, if there is a following quantifier. */4800if (okquantifier) parsed_pattern_extra += 2;4801#endif48024803/* It appears that Perl allows any characters whatsoever, other than a4804closing parenthesis, to appear in arguments ("names"), so we no longer4805insist on letters, digits, and underscores. Perl does not, however, do4806any interpretation within arguments, and has no means of including a4807closing parenthesis. PCRE supports escape processing but only when it4808is requested by an option. We set inverbname TRUE here, and let the4809main loop take care of this so that escape and \x processing is done by4810the main code above. */48114812if (*ptr++ == CHAR_COLON) /* Skip past : or ) */4813{4814/* Some optional arguments can be treated as a preceding (*MARK) */48154816if (verbs[i].has_arg < 0)4817{4818add_after_mark = verbs[i].meta;4819*parsed_pattern++ = META_MARK;4820}48214822/* The remaining verbs with arguments (except *MARK) need a different4823opcode. */48244825else4826{4827*parsed_pattern++ = verbs[i].meta +4828((verbs[i].meta != META_MARK)? 0x00010000u:0);4829}48304831/* Set up for reading the name in the main loop. 
*/48324833verblengthptr = parsed_pattern++;4834verbnamestart = ptr;4835inverbname = TRUE;4836}4837else /* No verb "name" argument */4838{4839*parsed_pattern++ = verbs[i].meta;4840}4841} /* End of (*VERB) handling */4842break; /* Done with this parenthesis */4843} /* End of groups that don't start with (? */484448454846/* ---- Items starting (? ---- */48474848/* The type of item is determined by what follows (?. Handle (?| and option4849changes under "default" because both need a new block on the nest stack.4850Comments starting with (?# are handled above. Note that there is some4851ambiguity about the sequence (?- because if a digit follows it's a relative4852recursion or subroutine call whereas otherwise it's an option unsetting. */48534854if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;48554856switch(*ptr)4857{4858default:4859if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))4860goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */48614862/* We now have either (?| or a (possibly empty) option setting,4863optionally followed by a non-capturing group. */48644865nest_depth++;4866if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);4867else if (++top_nest >= end_nests)4868{4869errorcode = ERR84;4870goto FAILED;4871}4872top_nest->nest_depth = nest_depth;4873top_nest->flags = 0;4874top_nest->options = options & PARSE_TRACKED_OPTIONS;4875top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;48764877/* Start of non-capturing group that resets the capture count for each4878branch. */48794880if (*ptr == CHAR_VERTICAL_LINE)4881{4882top_nest->reset_group = (uint16_t)cb->bracount;4883top_nest->max_group = (uint16_t)cb->bracount;4884top_nest->flags |= NSF_RESET;4885cb->external_flags |= PCRE2_DUPCAPUSED;4886*parsed_pattern++ = META_NOCAPTURE;4887ptr++;4888}48894890/* Scan for options imnrsxJU to be set or unset. 
*/48914892else4893{4894BOOL hyphenok = TRUE;4895uint32_t oldoptions = options;4896uint32_t oldxoptions = xoptions;48974898top_nest->reset_group = 0;4899top_nest->max_group = 0;4900set = unset = 0;4901optset = &set;4902xset = xunset = 0;4903xoptset = &xset;49044905/* ^ at the start unsets irmnsx and disables the subsequent use of - */49064907if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)4908{4909options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|4910PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);4911xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);4912hyphenok = FALSE;4913ptr++;4914}49154916while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&4917*ptr != CHAR_COLON)4918{4919switch (*ptr++)4920{4921case CHAR_MINUS:4922if (!hyphenok)4923{4924errorcode = ERR94;4925ptr--; /* Correct the offset */4926goto FAILED;4927}4928optset = &unset;4929xoptset = &xunset;4930hyphenok = FALSE;4931break;49324933/* There are some two-character sequences that start with 'a'. */49344935case CHAR_a:4936if (ptr < ptrend)4937{4938if (*ptr == CHAR_D)4939{4940*xoptset |= PCRE2_EXTRA_ASCII_BSD;4941ptr++;4942break;4943}4944if (*ptr == CHAR_P)4945{4946*xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);4947ptr++;4948break;4949}4950if (*ptr == CHAR_S)4951{4952*xoptset |= PCRE2_EXTRA_ASCII_BSS;4953ptr++;4954break;4955}4956if (*ptr == CHAR_T)4957{4958*xoptset |= PCRE2_EXTRA_ASCII_DIGIT;4959ptr++;4960break;4961}4962if (*ptr == CHAR_W)4963{4964*xoptset |= PCRE2_EXTRA_ASCII_BSW;4965ptr++;4966break;4967}4968}4969*xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|4970PCRE2_EXTRA_ASCII_BSW|4971PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;4972break;49734974case CHAR_J: /* Record that it changed in the external options */4975*optset |= PCRE2_DUPNAMES;4976cb->external_flags |= PCRE2_JCHANGED;4977break;49784979case CHAR_i: *optset |= PCRE2_CASELESS; break;4980case CHAR_m: *optset |= PCRE2_MULTILINE; break;4981case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;4982case 
CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;4983case CHAR_s: *optset |= PCRE2_DOTALL; break;4984case CHAR_U: *optset |= PCRE2_UNGREEDY; break;49854986/* If x appears twice it sets the extended extended option. */49874988case CHAR_x:4989*optset |= PCRE2_EXTENDED;4990if (ptr < ptrend && *ptr == CHAR_x)4991{4992*optset |= PCRE2_EXTENDED_MORE;4993ptr++;4994}4995break;49964997default:4998errorcode = ERR11;4999ptr--; /* Correct the offset */5000goto FAILED;5001}5002}50035004/* If we are setting extended without extended-more, ensure that any5005existing extended-more gets unset. Also, unsetting extended must also5006unset extended-more. */50075008if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||5009(unset & PCRE2_EXTENDED) != 0)5010unset |= PCRE2_EXTENDED_MORE;50115012options = (options | set) & (~unset);5013xoptions = (xoptions | xset) & (~xunset);50145015/* If the options ended with ')' this is not the start of a nested5016group with option changes, so the options change at this level.5017In this case, if the previous level set up a nest block, discard the5018one we have just created. Otherwise adjust it for the previous level.5019If the options ended with ':' we are starting a non-capturing group,5020possibly with an options setting. */50215022if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;5023if (*ptr++ == CHAR_RIGHT_PARENTHESIS)5024{5025nest_depth--; /* This is not a nested group after all. */5026if (top_nest > (nest_save *)(cb->start_workspace) &&5027(top_nest-1)->nest_depth == nest_depth) top_nest--;5028else top_nest->nest_depth = nest_depth;5029}5030else *parsed_pattern++ = META_NOCAPTURE;50315032/* If nothing changed, no need to record. */50335034if (options != oldoptions || xoptions != oldxoptions)5035{5036*parsed_pattern++ = META_OPTIONS;5037*parsed_pattern++ = options;5038*parsed_pattern++ = xoptions;5039}5040} /* End options processing */5041break; /* End default case after (? 
*/504250435044/* ---- Python syntax support ---- */50455046case CHAR_P:5047if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;50485049/* (?P<name> is the same as (?<name>, which defines a named group. */50505051if (*ptr == CHAR_LESS_THAN_SIGN)5052{5053terminator = CHAR_GREATER_THAN_SIGN;5054goto DEFINE_NAME;5055}50565057/* (?P>name) is the same as (?&name), which is a recursion or subroutine5058call. */50595060if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;50615062/* (?P=name) is the same as \k<name>, a back reference by name. Anything5063else after (?P is an error. */50645065if (*ptr != CHAR_EQUALS_SIGN)5066{5067errorcode = ERR41;5068goto FAILED;5069}5070if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,5071&namelen, &errorcode, cb)) goto FAILED;5072*parsed_pattern++ = META_BACKREF_BYNAME;5073*parsed_pattern++ = namelen;5074PUTOFFSET(offset, parsed_pattern);5075okquantifier = TRUE;5076break; /* End of (?P processing */507750785079/* ---- Recursion/subroutine calls by number ---- */50805081case CHAR_R:5082i = 0; /* (?R) == (?R0) */5083ptr++;5084if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)5085{5086errorcode = ERR58;5087goto FAILED;5088}5089goto SET_RECURSION;50905091/* An item starting (?- followed by a digit comes here via the "default"5092case because (?- followed by a non-digit is an options setting. */50935094case CHAR_PLUS:5095if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))5096{5097errorcode = ERR29; /* Missing number */5098goto FAILED;5099}5100/* Fall through */51015102case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:5103case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:5104RECURSION_BYNUMBER:5105if (!read_number(&ptr, ptrend,5106(IS_DIGIT(*ptr))? 
-1:(int)(cb->bracount), /* + and - are relative */5107MAX_GROUP_NUMBER, ERR61,5108&i, &errorcode)) goto FAILED;5109PCRE2_ASSERT(i >= 0); /* NB (?0) is permitted, represented by i=0 */5110if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)5111goto UNCLOSED_PARENTHESIS;51125113SET_RECURSION:5114*parsed_pattern++ = META_RECURSE | (uint32_t)i;5115offset = (PCRE2_SIZE)(ptr - cb->start_pattern);5116ptr++;5117PUTOFFSET(offset, parsed_pattern);5118okquantifier = TRUE;5119break; /* End of recursive call by number handling */512051215122/* ---- Recursion/subroutine calls by name ---- */51235124case CHAR_AMPERSAND:5125RECURSE_BY_NAME:5126if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,5127&namelen, &errorcode, cb)) goto FAILED;5128*parsed_pattern++ = META_RECURSE_BYNAME;5129*parsed_pattern++ = namelen;5130PUTOFFSET(offset, parsed_pattern);5131okquantifier = TRUE;5132break;51335134/* ---- Callout with numerical or string argument ---- */51355136case CHAR_C:5137if ((xoptions & PCRE2_EXTRA_NEVER_CALLOUT) != 0)5138{5139errorcode = ERR103;5140goto FAILED;5141}51425143if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;51445145/* If the previous item was a condition starting (?(? an assertion,5146optionally preceded by a callout, is expected. This is checked later on,5147during actual compilation. However we need to identify this kind of5148assertion in this pass because it must not be qualified. The value of5149expect_cond_assert is set to 2 after (?(? is processed. We decrement it5150for a callout - still leaving a positive value that identifies the5151assertion. Multiple callouts or any other items will make it zero or5152less, which doesn't matter because they will cause an error later. */51535154expect_cond_assert = prev_expect_cond_assert - 1;51555156/* If previous_callout is not NULL, it means this follows a previous5157callout. If it was a manual callout, do nothing; this means its "length5158of next pattern item" field will remain zero. 
If it was an automatic5159callout, abolish it. */51605161if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&5162previous_callout == parsed_pattern - 4 &&5163parsed_pattern[-1] == 255)5164parsed_pattern = previous_callout;51655166/* Save for updating next pattern item length, and skip one item before5167completing. */51685169previous_callout = parsed_pattern;5170after_manual_callout = 1;51715172/* Handle a string argument; specific delimiter is required. */51735174if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))5175{5176PCRE2_SIZE calloutlength;5177PCRE2_SPTR startptr = ptr;51785179delimiter = 0;5180for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)5181{5182if (*ptr == PRIV(callout_start_delims)[i])5183{5184delimiter = PRIV(callout_end_delims)[i];5185break;5186}5187}5188if (delimiter == 0)5189{5190errorcode = ERR82;5191goto FAILED;5192}51935194*parsed_pattern = META_CALLOUT_STRING;5195parsed_pattern += 3; /* Skip pattern info */51965197for (;;)5198{5199if (++ptr >= ptrend)5200{5201errorcode = ERR81;5202ptr = startptr; /* To give a more useful message */5203goto FAILED;5204}5205if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))5206break;5207}52085209calloutlength = (PCRE2_SIZE)(ptr - startptr);5210if (calloutlength > UINT32_MAX)5211{5212errorcode = ERR72;5213goto FAILED;5214}5215*parsed_pattern++ = (uint32_t)calloutlength;5216offset = (PCRE2_SIZE)(startptr - cb->start_pattern);5217PUTOFFSET(offset, parsed_pattern);5218}52195220/* Handle a callout with an optional numerical argument, which must be5221less than or equal to 255. A missing argument gives 0. 
*/52225223else5224{5225int n = 0;5226*parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */5227parsed_pattern += 3; /* Skip pattern info */5228while (ptr < ptrend && IS_DIGIT(*ptr))5229{5230n = n * 10 + (*ptr++ - CHAR_0);5231if (n > 255)5232{5233errorcode = ERR38;5234goto FAILED;5235}5236}5237*parsed_pattern++ = n;5238}52395240/* Both formats must have a closing parenthesis */52415242if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)5243{5244errorcode = ERR39;5245goto FAILED;5246}5247ptr++;52485249/* Remember the offset to the next item in the pattern, and set a default5250length. This should get updated after the next item is read. */52515252previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);5253previous_callout[2] = 0;5254break; /* End callout */525552565257/* ---- Conditional group ---- */52585259/* A condition can be an assertion, a number (referring to a numbered5260group's having been set), a name (referring to a named group), or 'R',5261referring to overall recursion. R<digits> and R&name are also permitted5262for recursion state tests. Numbers may be preceded by + or - to specify a5263relative group number.52645265There are several syntaxes for testing a named group: (?(name)) is used5266by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).52675268There are two unfortunate ambiguities. 'R' can be the recursive thing or5269the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be5270the Perl DEFINE feature or the Python named test. We look for a name5271first; if not found, we try the other case.52725273For compatibility with auto-callouts, we allow a callout to be specified5274before a condition that is an assertion. */52755276case CHAR_LEFT_PARENTHESIS:5277if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;5278nest_depth++;52795280/* If the next character is ? or * there must be an assertion next5281(optionally preceded by a callout). We do not check this here, but5282instead we set expect_cond_assert to 2. 
If this is still greater than5283zero (callouts decrement it) when the next assertion is read, it will be5284marked as a condition that must not be repeated. A value greater than5285zero also causes checking that an assertion (possibly with callout)5286follows. */52875288if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)5289{5290*parsed_pattern++ = META_COND_ASSERT;5291ptr--; /* Pull pointer back to the opening parenthesis. */5292expect_cond_assert = 2;5293break; /* End of conditional */5294}52955296/* Handle (?([+-]number)... */52975298if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,5299&errorcode))5300{5301PCRE2_ASSERT(i >= 0);5302if (i <= 0)5303{5304errorcode = ERR15;5305goto FAILED;5306}5307*parsed_pattern++ = META_COND_NUMBER;5308offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);5309PUTOFFSET(offset, parsed_pattern);5310*parsed_pattern++ = i;5311}5312else if (errorcode != 0) goto FAILED; /* Number too big */53135314/* No number found. Handle the special case (?(VERSION[>]=n.m)... */53155316else if (ptrend - ptr >= 10 &&5317PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&5318ptr[7] != CHAR_RIGHT_PARENTHESIS)5319{5320uint32_t ge = 0;5321int major = 0;5322int minor = 0;53235324ptr += 7;5325if (*ptr == CHAR_GREATER_THAN_SIGN)5326{5327ge = 1;5328ptr++;5329}53305331/* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT5332references its argument twice. 
*/53335334if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))5335goto BAD_VERSION_CONDITION;53365337if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))5338goto FAILED;53395340if (ptr >= ptrend) goto BAD_VERSION_CONDITION;5341if (*ptr == CHAR_DOT)5342{5343if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;5344minor = (*ptr++ - CHAR_0) * 10;5345if (ptr >= ptrend) goto BAD_VERSION_CONDITION;5346if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;5347if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)5348goto BAD_VERSION_CONDITION;5349}53505351*parsed_pattern++ = META_COND_VERSION;5352*parsed_pattern++ = ge;5353*parsed_pattern++ = major;5354*parsed_pattern++ = minor;5355}53565357/* All the remaining cases now require us to read a name. We cannot at5358this stage distinguish ambiguous cases such as (?(R12) which might be a5359recursion test by number or a name, because the named groups have not yet5360all been identified. Those cases are treated as names, but given a5361different META code. */53625363else5364{5365BOOL was_r_ampersand = FALSE;53665367if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)5368{5369terminator = CHAR_RIGHT_PARENTHESIS;5370was_r_ampersand = TRUE;5371ptr++;5372}5373else if (*ptr == CHAR_LESS_THAN_SIGN)5374terminator = CHAR_GREATER_THAN_SIGN;5375else if (*ptr == CHAR_APOSTROPHE)5376terminator = CHAR_APOSTROPHE;5377else5378{5379terminator = CHAR_RIGHT_PARENTHESIS;5380ptr--; /* Point to char before name */5381}5382if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,5383&errorcode, cb)) goto FAILED;53845385/* Handle (?(R&name) */53865387if (was_r_ampersand)5388{5389*parsed_pattern = META_COND_RNAME;5390ptr--; /* Back to closing parens */5391}53925393/* Handle (?(name). If the name is "DEFINE" we identify it with a5394special code. Likewise if the name consists of R followed only by5395digits. Otherwise, handle it like a quoted name. 
*/53965397else if (terminator == CHAR_RIGHT_PARENTHESIS)5398{5399if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)5400*parsed_pattern = META_COND_DEFINE;5401else5402{5403for (i = 1; i < (int)namelen; i++)5404if (!IS_DIGIT(name[i])) break;5405*parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?5406META_COND_RNUMBER : META_COND_NAME;5407}5408ptr--; /* Back to closing parens */5409}54105411/* Handle (?('name') or (?(<name>) */54125413else *parsed_pattern = META_COND_NAME;54145415/* All these cases except DEFINE end with the name length and offset;5416DEFINE just has an offset (for the "too many branches" error). */54175418if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;5419PUTOFFSET(offset, parsed_pattern);5420} /* End cases that read a name */54215422/* Check the closing parenthesis of the condition */54235424if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)5425{5426errorcode = ERR24;5427goto FAILED;5428}5429ptr++;5430break; /* End of condition processing */543154325433/* ---- Atomic group ---- */54345435case CHAR_GREATER_THAN_SIGN:5436ATOMIC_GROUP: /* Come from (*atomic: */5437*parsed_pattern++ = META_ATOMIC;5438nest_depth++;5439ptr++;5440break;544154425443/* ---- Lookahead assertions ---- */54445445case CHAR_EQUALS_SIGN:5446POSITIVE_LOOK_AHEAD: /* Come from (*pla: */5447*parsed_pattern++ = META_LOOKAHEAD;5448ptr++;5449goto POST_ASSERTION;54505451case CHAR_ASTERISK:5452POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (*napla: */5453*parsed_pattern++ = META_LOOKAHEAD_NA;5454ptr++;5455goto POST_ASSERTION;54565457case CHAR_EXCLAMATION_MARK:5458NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */5459*parsed_pattern++ = META_LOOKAHEADNOT;5460ptr++;5461goto POST_ASSERTION;546254635464/* ---- Lookbehind assertions ---- */54655466/* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<5467is the start of the name of a capturing group. 
*/54685469case CHAR_LESS_THAN_SIGN:5470if (ptrend - ptr <= 1 ||5471(ptr[1] != CHAR_EQUALS_SIGN &&5472ptr[1] != CHAR_EXCLAMATION_MARK &&5473ptr[1] != CHAR_ASTERISK))5474{5475terminator = CHAR_GREATER_THAN_SIGN;5476goto DEFINE_NAME;5477}5478*parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?5479META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?5480META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;54815482POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */5483*has_lookbehind = TRUE;5484offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);5485PUTOFFSET(offset, parsed_pattern);5486ptr += 2;5487/* Fall through */54885489/* If the previous item was a condition starting (?(? an assertion,5490optionally preceded by a callout, is expected. This is checked later on,5491during actual compilation. However we need to identify this kind of5492assertion in this pass because it must not be qualified. The value of5493expect_cond_assert is set to 2 after (?(? is processed. We decrement it5494for a callout - still leaving a positive value that identifies the5495assertion. Multiple callouts or any other items will make it zero or5496less, which doesn't matter because they will cause an error later. */54975498POST_ASSERTION:5499nest_depth++;5500if (prev_expect_cond_assert > 0)5501{5502if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);5503else if (++top_nest >= end_nests)5504{5505errorcode = ERR84;5506goto FAILED;5507}5508top_nest->nest_depth = nest_depth;5509top_nest->flags = NSF_CONDASSERT;5510top_nest->options = options & PARSE_TRACKED_OPTIONS;5511top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;5512}5513break;551455155516/* ---- Define a named group ---- */55175518/* A named group may be defined as (?'name') or (?<name>). In the latter5519case we jump to DEFINE_NAME from the disambiguation of (?< above with the5520terminator set to '>'. 
*/55215522case CHAR_APOSTROPHE:5523terminator = CHAR_APOSTROPHE; /* Terminator */55245525DEFINE_NAME:5526if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,5527&errorcode, cb)) goto FAILED;55285529/* We have a name for this capturing group. It is also assigned a number,5530which is its primary means of identification. */55315532if (cb->bracount >= MAX_GROUP_NUMBER)5533{5534errorcode = ERR97;5535goto FAILED;5536}5537cb->bracount++;5538*parsed_pattern++ = META_CAPTURE | cb->bracount;5539nest_depth++;55405541/* Check not too many names */55425543if (cb->names_found >= MAX_NAME_COUNT)5544{5545errorcode = ERR49;5546goto FAILED;5547}55485549/* Adjust the entry size to accommodate the longest name found. */55505551if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)5552cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);55535554/* Scan the list to check for duplicates. For duplicate names, if the5555number is the same, break the loop, which causes the name to be5556discarded; otherwise, if DUPNAMES is not set, give an error.5557If it is set, allow the name with a different number, but continue5558scanning in case this is a duplicate with the same number. For5559non-duplicate names, give an error if the number is duplicated. 
*/55605561isdupname = FALSE;5562ng = cb->named_groups;5563for (i = 0; i < cb->names_found; i++, ng++)5564{5565if (namelen == ng->length &&5566PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)5567{5568if (ng->number == cb->bracount) break;5569if ((options & PCRE2_DUPNAMES) == 0)5570{5571errorcode = ERR43;5572goto FAILED;5573}5574isdupname = ng->isdup = TRUE; /* Mark as a duplicate */5575cb->dupnames = TRUE; /* Duplicate names exist */5576}5577else if (ng->number == cb->bracount)5578{5579errorcode = ERR65;5580goto FAILED;5581}5582}55835584if (i < cb->names_found) break; /* Ignore duplicate with same number */55855586/* Increase the list size if necessary */55875588if (cb->names_found >= cb->named_group_list_size)5589{5590uint32_t newsize = cb->named_group_list_size * 2;5591named_group *newspace =5592cb->cx->memctl.malloc(newsize * sizeof(named_group),5593cb->cx->memctl.memory_data);5594if (newspace == NULL)5595{5596errorcode = ERR21;5597goto FAILED;5598}55995600memcpy(newspace, cb->named_groups,5601cb->named_group_list_size * sizeof(named_group));5602if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)5603cb->cx->memctl.free((void *)cb->named_groups,5604cb->cx->memctl.memory_data);5605cb->named_groups = newspace;5606cb->named_group_list_size = newsize;5607}56085609/* Add this name to the list */56105611cb->named_groups[cb->names_found].name = name;5612cb->named_groups[cb->names_found].length = (uint16_t)namelen;5613cb->named_groups[cb->names_found].number = cb->bracount;5614cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;5615cb->names_found++;5616break;561756185619/* ---- Perl extended character class ---- */56205621/* These are of the form '(?[...])'. We handle these via the same parser5622that consumes ordinary '[...]' classes, but with a flag set to activate5623the extended behaviour. */56245625case CHAR_LEFT_SQUARE_BRACKET:5626class_mode_state = CLASS_MODE_PERL_EXT;5627c = *ptr++;5628goto FROM_PERL_EXTENDED_CLASS;5629} /* End of (? 
switch */5630break; /* End of ( handling */563156325633/* ---- Branch terminators ---- */56345635/* Alternation: reset the capture count if we are in a (?| group. */56365637case CHAR_VERTICAL_LINE:5638if (top_nest != NULL && top_nest->nest_depth == nest_depth &&5639(top_nest->flags & NSF_RESET) != 0)5640{5641if (cb->bracount > top_nest->max_group)5642top_nest->max_group = (uint16_t)cb->bracount;5643cb->bracount = top_nest->reset_group;5644}5645*parsed_pattern++ = META_ALT;5646break;56475648/* End of group; reset the capture count to the maximum if we are in a (?|5649group and/or reset the options that are tracked during parsing. Disallow5650quantifier for a condition that is an assertion. */56515652case CHAR_RIGHT_PARENTHESIS:5653okquantifier = TRUE;5654if (top_nest != NULL && top_nest->nest_depth == nest_depth)5655{5656options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;5657xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;5658if ((top_nest->flags & NSF_RESET) != 0 &&5659top_nest->max_group > cb->bracount)5660cb->bracount = top_nest->max_group;5661if ((top_nest->flags & NSF_CONDASSERT) != 0)5662okquantifier = FALSE;56635664if ((top_nest->flags & NSF_ATOMICSR) != 0)5665{5666*parsed_pattern++ = META_KET;56675668#ifdef PCRE2_DEBUG5669PCRE2_ASSERT(parsed_pattern_extra > 0);5670parsed_pattern_extra--;5671#endif5672}56735674if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;5675else top_nest--;5676}5677if (nest_depth == 0) /* Unmatched closing parenthesis */5678{5679errorcode = ERR22;5680goto FAILED_BACK; // TODO https://github.com/PCRE2Project/pcre2/issues/5495681}5682nest_depth--;5683*parsed_pattern++ = META_KET;5684break;5685} /* End of switch on pattern character */5686} /* End of main character scan loop */56875688/* End of pattern reached. Check for missing ) at the end of a verb name. 
*/56895690if (inverbname && ptr >= ptrend)5691{5692errorcode = ERR60;5693goto FAILED;5694}569556965697PARSED_END:56985699PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +5700(parsed_pattern_extra - parsed_pattern_extra_check) <=5701max_parsed_pattern(ptr_check, ptr, utf, options));57025703/* Manage callout for the final item */57045705parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,5706parsed_pattern, cb);57075708/* Insert trailing items for word and line matching (features provided for the5709benefit of pcre2grep). */57105711if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)5712{5713*parsed_pattern++ = META_KET;5714*parsed_pattern++ = META_DOLLAR;5715}5716else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)5717{5718*parsed_pattern++ = META_KET;5719*parsed_pattern++ = META_ESCAPE + ESC_b;5720}57215722/* Terminate the parsed pattern, then return success if all groups are closed.5723Otherwise we have unclosed parentheses. */57245725if (parsed_pattern >= parsed_pattern_end)5726{5727PCRE2_DEBUG_UNREACHABLE();5728errorcode = ERR63; /* Internal error (parsed pattern overflow) */5729goto FAILED;5730}57315732*parsed_pattern = META_END;5733if (nest_depth == 0) return 0;57345735UNCLOSED_PARENTHESIS:5736errorcode = ERR14;57375738/* Come here for all failures. */57395740FAILED:5741cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);5742return errorcode;57435744/* Some errors need to indicate the previous character. */57455746FAILED_BACK:5747ptr--;5748goto FAILED;57495750/* This failure happens several times. */57515752BAD_VERSION_CONDITION:5753errorcode = ERR79;5754goto FAILED;5755}5756575757585759/*************************************************5760* Find first significant opcode *5761*************************************************/57625763/* This is called by several functions that scan a compiled expression looking5764for a fixed first character, or an anchoring opcode etc. It skips over things5765that do not influence this. 
For some calls, it makes sense to skip negative5766forward and all backward assertions, and also the \b assertion; for others it5767does not.57685769Arguments:5770code pointer to the start of the group5771skipassert TRUE if certain assertions are to be skipped57725773Returns: pointer to the first significant opcode5774*/57755776static const PCRE2_UCHAR*5777first_significant_code(PCRE2_SPTR code, BOOL skipassert)5778{5779for (;;)5780{5781switch ((int)*code)5782{5783case OP_ASSERT_NOT:5784case OP_ASSERTBACK:5785case OP_ASSERTBACK_NOT:5786case OP_ASSERTBACK_NA:5787if (!skipassert) return code;5788do code += GET(code, 1); while (*code == OP_ALT);5789code += PRIV(OP_lengths)[*code];5790break;57915792case OP_WORD_BOUNDARY:5793case OP_NOT_WORD_BOUNDARY:5794case OP_UCP_WORD_BOUNDARY:5795case OP_NOT_UCP_WORD_BOUNDARY:5796if (!skipassert) return code;5797/* Fall through */57985799case OP_CALLOUT:5800case OP_CREF:5801case OP_DNCREF:5802case OP_RREF:5803case OP_DNRREF:5804case OP_FALSE:5805case OP_TRUE:5806code += PRIV(OP_lengths)[*code];5807break;58085809case OP_CALLOUT_STR:5810code += GET(code, 1 + 2*LINK_SIZE);5811break;58125813case OP_SKIPZERO:5814code += 2 + GET(code, 2) + LINK_SIZE;5815break;58165817case OP_COND:5818case OP_SCOND:5819if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */5820code[GET(code, 1)] != OP_KET) /* More than one branch */5821return code;5822code += GET(code, 1) + 1 + LINK_SIZE;5823break;58245825case OP_MARK:5826case OP_COMMIT_ARG:5827case OP_PRUNE_ARG:5828case OP_SKIP_ARG:5829case OP_THEN_ARG:5830code += code[1] + PRIV(OP_lengths)[*code];5831break;58325833default:5834return code;5835}5836}58375838PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */5839}5840584158425843/*************************************************5844* Find details of duplicate group names *5845*************************************************/58465847/* This is called from compile_branch() when it needs to know the index and5848count of duplicates in the names 
table when processing named backreferences,5849either directly, or as conditions.58505851Arguments:5852name points to the name5853length the length of the name5854indexptr where to put the index5855countptr where to put the count of duplicates5856errorcodeptr where to put an error code5857cb the compile block58585859Returns: TRUE if OK, FALSE if not, error code set5860*/58615862static BOOL5863find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,5864int *countptr, int *errorcodeptr, compile_block *cb)5865{5866uint32_t i, groupnumber;5867int count;5868PCRE2_UCHAR *slot = cb->name_table;58695870/* Find the first entry in the table */58715872for (i = 0; i < cb->names_found; i++)5873{5874if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&5875slot[IMM2_SIZE+length] == 0) break;5876slot += cb->name_entry_size;5877}58785879/* This should not occur, because this function is called only when we know we5880have duplicate names. Give an internal error. */58815882if (i >= cb->names_found)5883{5884PCRE2_DEBUG_UNREACHABLE();5885*errorcodeptr = ERR53;5886cb->erroroffset = name - cb->start_pattern;5887return FALSE;5888}58895890/* Record the index and then see how many duplicates there are, updating the5891backref map and maximum back reference as we do. */58925893*indexptr = i;5894count = 0;58955896for (;;)5897{5898count++;5899groupnumber = GET2(slot,0);5900cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;5901if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;5902if (++i >= cb->names_found) break;5903slot += cb->name_entry_size;5904if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||5905(slot+IMM2_SIZE)[length] != 0) break;5906}59075908*countptr = count;5909return TRUE;5910}5911591259135914/*************************************************5915* Compile one branch *5916*************************************************/59175918/* Scan the parsed pattern, compiling it into the a vector of PCRE2_UCHAR. 
If5919the options are changed during the branch, the pointer is used to change the5920external options bits. This function is used during the pre-compile phase when5921we are trying to find out the amount of memory needed, as well as during the5922real compile phase. The value of lengthptr distinguishes the two phases.59235924Arguments:5925optionsptr pointer to the option bits5926xoptionsptr pointer to the extra option bits5927codeptr points to the pointer to the current code point5928pptrptr points to the current parsed pattern pointer5929errorcodeptr points to error code variable5930firstcuptr place to put the first required code unit5931firstcuflagsptr place to put the first code unit flags5932reqcuptr place to put the last required code unit5933reqcuflagsptr place to put the last required code unit flags5934bcptr points to current branch chain5935open_caps points to current capitem5936cb contains pointers to tables etc.5937lengthptr NULL during the real compile phase5938points to length accumulator during pre-compile phase59395940Returns: 0 There's been an error, *errorcodeptr is non-zero5941+1 Success, this branch must match at least one character5942-1 Success, this branch may match an empty string5943*/59445945static int5946compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,5947PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,5948uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,5949uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,5950compile_block *cb, PCRE2_SIZE *lengthptr)5951{5952int bravalue = 0;5953int okreturn = -1;5954int group_return = 0;5955uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */5956uint32_t greedy_default, greedy_non_default;5957uint32_t repeat_type, op_type;5958uint32_t options = *optionsptr; /* May change dynamically */5959uint32_t xoptions = *xoptionsptr; /* May change dynamically */5960uint32_t firstcu, reqcu;5961uint32_t zeroreqcu, 
zerofirstcu;5962uint32_t *pptr = *pptrptr;5963uint32_t meta, meta_arg;5964uint32_t firstcuflags, reqcuflags;5965uint32_t zeroreqcuflags, zerofirstcuflags;5966uint32_t req_caseopt, reqvary, tempreqvary;5967/* Some opcodes, such as META_SCS_NUMBER or META_SCS_NAME,5968depends on the previous value of offset. */5969PCRE2_SIZE offset = 0;5970PCRE2_SIZE length_prevgroup = 0;5971PCRE2_UCHAR *code = *codeptr;5972PCRE2_UCHAR *last_code = code;5973PCRE2_UCHAR *orig_code = code;5974PCRE2_UCHAR *tempcode;5975PCRE2_UCHAR *previous = NULL;5976PCRE2_UCHAR op_previous;5977BOOL groupsetfirstcu = FALSE;5978BOOL had_accept = FALSE;5979BOOL matched_char = FALSE;5980BOOL previous_matched_char = FALSE;5981BOOL reset_caseful = FALSE;59825983/* We can fish out the UTF setting once and for all into a BOOL, but we must5984not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically5985as we process the pattern. */59865987#ifdef SUPPORT_UNICODE5988BOOL utf = (options & PCRE2_UTF) != 0;5989BOOL ucp = (options & PCRE2_UCP) != 0;5990#else /* No Unicode support */5991BOOL utf = FALSE;5992#endif59935994/* Set up the default and non-default settings for greediness */59955996greedy_default = ((options & PCRE2_UNGREEDY) != 0);5997greedy_non_default = greedy_default ^ 1;59985999/* Initialize no first unit, no required unit. REQ_UNSET means "no char6000matching encountered yet". It gets changed to REQ_NONE if we hit something that6001matches a non-fixed first unit; reqcu just remains unset if we never find one.60026003When we hit a repeat whose minimum is zero, we may have to adjust these values6004to take the zero repeat into account. This is implemented by setting them to6005zerofirstcu and zeroreqcu when such a repeat is encountered. The individual6006item types that can be repeated set these backoff variables appropriately. 
*/60076008firstcu = reqcu = zerofirstcu = zeroreqcu = 0;6009firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;60106011/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,6012according to the current setting of the caseless flag. The REQ_CASELESS value6013leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables6014to record the case status of the value. This is used only for ASCII characters.6015*/60166017req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;60186019/* Switch on next META item until the end of the branch */60206021for (;; pptr++)6022{6023BOOL possessive_quantifier;6024BOOL note_group_empty;6025uint32_t mclength;6026uint32_t skipunits;6027uint32_t subreqcu, subfirstcu;6028uint32_t groupnumber;6029uint32_t verbarglen, verbculen;6030uint32_t subreqcuflags, subfirstcuflags;6031open_capitem *oc;6032PCRE2_UCHAR mcbuffer[8];60336034/* Get next META item in the pattern and its potential argument. */60356036meta = META_CODE(*pptr);6037meta_arg = META_DATA(*pptr);60386039/* If we are in the pre-compile phase, accumulate the length used for the6040previous cycle of this loop, unless the next item is a quantifier. */60416042if (lengthptr != NULL)6043{6044if (code > cb->start_workspace + cb->workspace_size -6045WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */6046{6047if (code >= cb->start_workspace + cb->workspace_size)6048{6049PCRE2_DEBUG_UNREACHABLE();6050*errorcodeptr = ERR52; /* Over-ran workspace - internal error */6051}6052else6053*errorcodeptr = ERR86;6054return 0;6055}60566057/* There is at least one situation where code goes backwards: this is the6058case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier6059is processed, the whole class is eliminated. However, it is created first,6060so we have to allow memory for it. Therefore, don't ever reduce the length6061at this point. 
*/60626063if (code < last_code) code = last_code;60646065/* If the next thing is not a quantifier, we add the length of the previous6066item into the total, and reset the code pointer to the start of the6067workspace. Otherwise leave the previous item available to be quantified. */60686069if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)6070{6071if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))6072{6073*errorcodeptr = ERR20; /* Integer overflow */6074return 0;6075}6076*lengthptr += (PCRE2_SIZE)(code - orig_code);6077if (*lengthptr > MAX_PATTERN_SIZE)6078{6079*errorcodeptr = ERR20; /* Pattern is too large */6080return 0;6081}6082code = orig_code;6083}60846085/* Remember where this code item starts so we can catch the "backwards"6086case above next time round. */60876088last_code = code;6089}60906091/* Process the next parsed pattern item. If it is not a quantifier, remember6092where it starts so that it can be quantified when a quantifier follows.6093Checking for the legality of quantifiers happens in parse_regex(), except for6094a quantifier after an assertion that is a condition. */60956096if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)6097{6098previous = code;6099if (matched_char && !had_accept) okreturn = 1;6100}61016102previous_matched_char = matched_char;6103matched_char = FALSE;6104note_group_empty = FALSE;6105skipunits = 0; /* Default value for most subgroups */61066107switch(meta)6108{6109/* ===================================================================*/6110/* The branch terminates at pattern end or | or ) */61116112case META_END:6113case META_ALT:6114case META_KET:6115*firstcuptr = firstcu;6116*firstcuflagsptr = firstcuflags;6117*reqcuptr = reqcu;6118*reqcuflagsptr = reqcuflags;6119*codeptr = code;6120*pptrptr = pptr;6121return okreturn;612261236124/* ===================================================================*/6125/* Handle single-character metacharacters. 
In multiline mode, ^ disables6126the setting of any following char as a first character. */61276128case META_CIRCUMFLEX:6129if ((options & PCRE2_MULTILINE) != 0)6130{6131if (firstcuflags == REQ_UNSET)6132zerofirstcuflags = firstcuflags = REQ_NONE;6133*code++ = OP_CIRCM;6134}6135else *code++ = OP_CIRC;6136break;61376138case META_DOLLAR:6139*code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;6140break;61416142/* There can never be a first char if '.' is first, whatever happens about6143repeats. The value of reqcu doesn't change either. */61446145case META_DOT:6146matched_char = TRUE;6147if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;6148zerofirstcu = firstcu;6149zerofirstcuflags = firstcuflags;6150zeroreqcu = reqcu;6151zeroreqcuflags = reqcuflags;6152*code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;6153break;615461556156/* ===================================================================*/6157/* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.6158Otherwise, an initial ']' is taken as a data character. When empty classes6159are allowed, [] must generate an empty class - we have no dedicated opcode6160to optimise the representation, but it's a rare case (the '(*FAIL)'6161construct would be a clearer way for a pattern author to represent a6162non-matching branch, but it does have different semantics to '[]' if both6163are followed by a quantifier). The empty-negated [^] matches any character,6164so is useful: generate OP_ALLANY for this. 
*/61656166case META_CLASS_EMPTY:6167case META_CLASS_EMPTY_NOT:6168matched_char = TRUE;6169if (meta == META_CLASS_EMPTY_NOT) *code++ = OP_ALLANY;6170else6171{6172*code++ = OP_CLASS;6173memset(code, 0, 32);6174code += 32 / sizeof(PCRE2_UCHAR);6175}61766177if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;6178zerofirstcu = firstcu;6179zerofirstcuflags = firstcuflags;6180break;618161826183/* ===================================================================*/6184/* Non-empty character class. If the included characters are all < 256, we6185build a 32-byte bitmap of the permitted characters, except in the special6186case where there is only one such character. For negated classes, we build6187the map as usual, then invert it at the end. However, we use a different6188opcode so that data characters > 255 can be handled correctly.61896190If the class contains characters outside the 0-255 range, a different6191opcode is compiled. It may optionally have a bit map for characters < 256,6192but those above are explicitly listed afterwards. A flag code unit tells6193whether the bitmap is present, and whether this is a negated class or6194not. */61956196case META_CLASS_NOT:6197case META_CLASS:6198matched_char = TRUE;61996200/* Check for complex extended classes and handle them separately. */62016202if ((*pptr & CLASS_IS_ECLASS) != 0)6203{6204if (!PRIV(compile_class_nested)(options, xoptions, &pptr, &code,6205errorcodeptr, cb, lengthptr))6206return 0;6207goto CLASS_END_PROCESSING;6208}62096210/* We can optimize the case of a single character in a class by generating6211OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's6212negative. In the negative case there can be no first char if this item is6213first, whatever repeat count may follow. In the case of reqcu, save the6214previous value for reinstating. */62156216/* NOTE: at present this optimization is not effective if the only6217character in a class in 32-bit, non-UCP mode has its top bit set. 
*/62186219if (pptr[1] < META_END && pptr[2] == META_CLASS_END)6220{6221uint32_t c = pptr[1];62226223pptr += 2; /* Move on to class end */6224if (meta == META_CLASS) /* A positive one-char class can be */6225{ /* handled as a normal literal character. */6226meta = c; /* Set up the character */6227goto NORMAL_CHAR_SET;6228}62296230/* Handle a negative one-character class */62316232zeroreqcu = reqcu;6233zeroreqcuflags = reqcuflags;6234if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;6235zerofirstcu = firstcu;6236zerofirstcuflags = firstcuflags;62376238/* For caseless UTF or UCP mode, check whether this character has more6239than one other case. If so, generate a special OP_NOTPROP item instead of6240OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any6241caseless set that starts with an ASCII character. If the character is6242affected by the special Turkish rules, hardcode the not-matching6243characters using a caseset. */62446245#ifdef SUPPORT_UNICODE6246if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)6247{6248uint32_t caseset;62496250if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==6251PCRE2_EXTRA_TURKISH_CASING &&6252UCD_ANY_I(c))6253{6254caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);6255}6256else if ((caseset = UCD_CASESET(c)) != 0 &&6257(xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&6258PRIV(ucd_caseless_sets)[caseset] < 128)6259{6260caseset = 0; /* Ignore the caseless set if it's restricted. */6261}62626263if (caseset != 0)6264{6265*code++ = OP_NOTPROP;6266*code++ = PT_CLIST;6267*code++ = caseset;6268break; /* We are finished with this class */6269}6270}6271#endif6272/* Char has only one other (usable) case, or UCP not available */62736274*code++ = ((options & PCRE2_CASELESS) != 0)? 
OP_NOTI: OP_NOT;6275code += PUTCHAR(c, code);6276break; /* We are finished with this class */6277} /* End of 1-char optimization */62786279/* Handle character classes that contain more than just one literal6280character. If there are exactly two characters in a positive class, see if6281they are case partners. This can be optimized to generate a caseless single6282character match (which also sets first/required code units if relevant).6283When casing restrictions apply, ignore a caseless set if both characters6284are ASCII. When Turkish casing applies, an 'i' does not match its normal6285Unicode "othercase". */62866287if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&6288pptr[3] == META_CLASS_END)6289{6290uint32_t c = pptr[1];62916292#ifdef SUPPORT_UNICODE6293if ((UCD_CASESET(c) == 0 ||6294((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&6295c < 128 && pptr[2] < 128)) &&6296!((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==6297PCRE2_EXTRA_TURKISH_CASING &&6298UCD_ANY_I(c)))6299#endif6300{6301uint32_t d;63026303#ifdef SUPPORT_UNICODE6304if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else6305#endif6306{6307#if PCRE2_CODE_UNIT_WIDTH != 86308if (c > 255) d = c; else6309#endif6310d = TABLE_GET(c, cb->fcc, c);6311}63126313if (c != d && pptr[2] == d)6314{6315pptr += 3; /* Move on to class end */6316meta = c;6317if ((options & PCRE2_CASELESS) == 0)6318{6319reset_caseful = TRUE;6320options |= PCRE2_CASELESS;6321req_caseopt = REQ_CASELESS;6322}6323goto CLASS_CASELESS_CHAR;6324}6325}6326}63276328/* Now emit the OP_CLASS/OP_NCLASS/OP_XCLASS/OP_ALLANY opcode. */63296330pptr = PRIV(compile_class_not_nested)(options, xoptions, pptr + 1,6331&code, meta == META_CLASS_NOT, NULL,6332errorcodeptr, cb, lengthptr);6333if (pptr == NULL) return 0;6334PCRE2_ASSERT(*pptr == META_CLASS_END);63356336CLASS_END_PROCESSING:63376338/* If this class is the first thing in the branch, there can be no first6339char setting, whatever the repeat count. 
Any reqcu setting must remain6340unchanged after any kind of repeat. */63416342if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;6343zerofirstcu = firstcu;6344zerofirstcuflags = firstcuflags;6345zeroreqcu = reqcu;6346zeroreqcuflags = reqcuflags;6347break; /* End of class processing */634863496350/* ===================================================================*/6351/* Deal with (*VERB)s. */63526353/* Check for open captures before ACCEPT and close those that are within6354the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an6355assertion. In the first pass, just accumulate the length required;6356otherwise hitting (*ACCEPT) inside many nested parentheses can cause6357workspace overflow. Do not set firstcu after *ACCEPT. */63586359case META_ACCEPT:6360cb->had_accept = had_accept = TRUE;6361for (oc = open_caps;6362oc != NULL && oc->assert_depth >= cb->assert_depth;6363oc = oc->next)6364{6365if (lengthptr != NULL)6366{6367*lengthptr += CU2BYTES(1) + IMM2_SIZE;6368}6369else6370{6371*code++ = OP_CLOSE;6372PUT2INC(code, 0, oc->number);6373}6374}6375*code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;6376if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;6377break;63786379case META_PRUNE:6380case META_SKIP:6381cb->had_pruneorskip = TRUE;6382/* Fall through */6383case META_COMMIT:6384case META_FAIL:6385*code++ = verbops[(meta - META_MARK) >> 16];6386break;63876388case META_THEN:6389cb->external_flags |= PCRE2_HASTHEN;6390*code++ = OP_THEN;6391break;63926393/* Handle verbs with arguments. Arguments can be very long, especially in639416- and 32-bit modes, and can overflow the workspace in the first pass.6395However, the argument length is constrained to be small enough to fit in6396one code unit. This check happens in parse_regex(). In the first pass,6397instead of putting the argument into memory, we just update the length6398counter and set up an empty argument. 
*/63996400case META_THEN_ARG:6401cb->external_flags |= PCRE2_HASTHEN;6402goto VERB_ARG;64036404case META_PRUNE_ARG:6405case META_SKIP_ARG:6406cb->had_pruneorskip = TRUE;6407/* Fall through */6408case META_MARK:6409case META_COMMIT_ARG:6410VERB_ARG:6411*code++ = verbops[(meta - META_MARK) >> 16];6412/* The length is in characters. */6413verbarglen = *(++pptr);6414verbculen = 0;6415tempcode = code++;6416for (int i = 0; i < (int)verbarglen; i++)6417{6418meta = *(++pptr);6419#ifdef SUPPORT_UNICODE6420if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else6421#endif6422{6423mclength = 1;6424mcbuffer[0] = meta;6425}6426if (lengthptr != NULL) *lengthptr += mclength; else6427{6428memcpy(code, mcbuffer, CU2BYTES(mclength));6429code += mclength;6430verbculen += mclength;6431}6432}64336434*tempcode = verbculen; /* Fill in the code unit length */6435*code++ = 0; /* Terminating zero */6436break;643764386439/* ===================================================================*/6440/* Handle options change. The new setting must be passed back for use in6441subsequent branches. Reset the greedy defaults and the case value for6442firstcu and reqcu. */64436444case META_OPTIONS:6445*optionsptr = options = *(++pptr);6446*xoptionsptr = xoptions = *(++pptr);6447greedy_default = ((options & PCRE2_UNGREEDY) != 0);6448greedy_non_default = greedy_default ^ 1;6449req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;6450break;64516452case META_OFFSET:6453GETPLUSOFFSET(offset, pptr);6454break;64556456case META_SCS:6457bravalue = OP_ASSERT_SCS;6458cb->assert_depth += 1;6459goto GROUP_PROCESS;646064616462/* ===================================================================*/6463/* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous6464because it could be a numerical check on recursion, or a name check on a6465group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that6466we can handle it either way. 
We first try for a name; if not found, process6467the number. */64686469case META_COND_RNUMBER: /* (?(Rdigits) */6470case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */6471case META_COND_RNAME: /* (?(R&name) - test for recursion */6472case META_SCS_NAME: /* Name of scan substring */6473bravalue = OP_COND;6474{6475int count, index;6476unsigned int i;6477PCRE2_SPTR name;6478named_group *ng = cb->named_groups;6479uint32_t length = *(++pptr);64806481if (meta == META_SCS_NAME)6482offset += meta_arg;6483else6484GETPLUSOFFSET(offset, pptr);6485name = cb->start_pattern + offset;64866487/* In the first pass, the names generated in the pre-pass are available,6488but the main name table has not yet been created. Scan the list of names6489generated in the pre-pass in order to get a number and whether or not6490this name is duplicated. If it is not duplicated, we can handle it as a6491numerical group. */64926493for (i = 0; i < cb->names_found; i++, ng++)6494if (length == ng->length &&6495PRIV(strncmp)(name, ng->name, length) == 0) break;64966497if (i >= cb->names_found)6498{6499/* If the name was not found we have a bad reference, unless we are6500dealing with R<digits>, which is treated as a recursion test by6501number. */65026503groupnumber = 0;6504if (meta == META_COND_RNUMBER)6505{6506for (i = 1; i < length; i++)6507{6508groupnumber = groupnumber * 10 + (name[i] - CHAR_0);6509if (groupnumber > MAX_GROUP_NUMBER)6510{6511*errorcodeptr = ERR61;6512cb->erroroffset = offset + i;6513return 0;6514}6515}6516}65176518if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)6519{6520*errorcodeptr = ERR15;6521cb->erroroffset = offset;6522return 0;6523}65246525/* (?Rdigits) treated as a recursion reference by number. A value of6526zero (which is the result of both (?R) and (?R0)) means "any", and is6527translated into RREF_ANY (which is 0xffff). 
*/65286529if (groupnumber == 0) groupnumber = RREF_ANY;6530code[1+LINK_SIZE] = OP_RREF;6531PUT2(code, 2+LINK_SIZE, groupnumber);6532skipunits = 1+IMM2_SIZE;6533goto GROUP_PROCESS_NOTE_EMPTY;6534}6535else if (!ng->isdup)6536{6537/* Otherwise found a duplicated name */6538if (ng->number > cb->top_backref) cb->top_backref = ng->number;65396540if (meta == META_SCS_NAME)6541{6542code[0] = OP_CREF;6543PUT2(code, 1, ng->number);6544code += 1+IMM2_SIZE;6545break;6546}65476548code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;6549PUT2(code, 2+LINK_SIZE, ng->number);6550skipunits = 1+IMM2_SIZE;6551if (meta != META_SCS_NAME) goto GROUP_PROCESS_NOTE_EMPTY;6552cb->assert_depth += 1;6553goto GROUP_PROCESS;6554}65556556/* We have a duplicated name. In the compile pass we have to search the6557main table in order to get the index and count values. */65586559count = 0; /* Values for first pass (avoids compiler warning) */6560index = 0;6561if (lengthptr == NULL && !find_dupname_details(name, length, &index,6562&count, errorcodeptr, cb)) return 0;65636564if (meta == META_SCS_NAME)6565{6566code[0] = OP_DNCREF;6567PUT2(code, 1, index);6568PUT2(code, 1+IMM2_SIZE, count);6569code += 1+2*IMM2_SIZE;6570break;6571}65726573/* A duplicated name was found. Note that if an R<digits> name is found6574(META_COND_RNUMBER), it is a reference test, not a recursion test. */65756576code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_DNRREF : OP_DNCREF;65776578/* Insert appropriate data values. */6579skipunits = 1+2*IMM2_SIZE;6580PUT2(code, 2+LINK_SIZE, index);6581PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);6582}65836584PCRE2_ASSERT(meta != META_SCS_NAME);6585goto GROUP_PROCESS_NOTE_EMPTY;65866587/* The DEFINE condition is always false. Its internal groups may never6588be called, so matched_char must remain false, hence the jump to6589GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. 
*/65906591case META_COND_DEFINE:6592bravalue = OP_COND;6593GETPLUSOFFSET(offset, pptr);6594code[1+LINK_SIZE] = OP_DEFINE;6595skipunits = 1;6596goto GROUP_PROCESS;65976598/* Conditional test of a group's being set. */65996600case META_COND_NUMBER:6601case META_SCS_NUMBER:6602bravalue = OP_COND;6603if (meta == META_SCS_NUMBER)6604offset += meta_arg;6605else6606GETPLUSOFFSET(offset, pptr);66076608groupnumber = *(++pptr);6609if (groupnumber > cb->bracount)6610{6611*errorcodeptr = ERR15;6612cb->erroroffset = offset;6613return 0;6614}6615if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;66166617if (meta == META_SCS_NUMBER)6618{6619code[0] = OP_CREF;6620PUT2(code, 1, groupnumber);6621code += 1+IMM2_SIZE;6622break;6623}66246625/* Point at initial ( for too many branches error */6626offset -= 2;6627code[1+LINK_SIZE] = OP_CREF;6628skipunits = 1+IMM2_SIZE;6629PUT2(code, 2+LINK_SIZE, groupnumber);6630goto GROUP_PROCESS_NOTE_EMPTY;66316632/* Test for the PCRE2 version. */66336634case META_COND_VERSION:6635bravalue = OP_COND;6636if (pptr[1] > 0)6637code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||6638(PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?6639OP_TRUE : OP_FALSE;6640else6641code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?6642OP_TRUE : OP_FALSE;6643skipunits = 1;6644pptr += 3;6645goto GROUP_PROCESS_NOTE_EMPTY;66466647/* The condition is an assertion, possibly preceded by a callout. */66486649case META_COND_ASSERT:6650bravalue = OP_COND;6651goto GROUP_PROCESS_NOTE_EMPTY;665266536654/* ===================================================================*/6655/* Handle all kinds of nested bracketed groups. The non-capturing,6656non-conditional cases are here; others come to GROUP_PROCESS via goto. */66576658case META_LOOKAHEAD:6659bravalue = OP_ASSERT;6660cb->assert_depth += 1;6661goto GROUP_PROCESS;66626663case META_LOOKAHEAD_NA:6664bravalue = OP_ASSERT_NA;6665cb->assert_depth += 1;6666goto GROUP_PROCESS;66676668/* Optimize (?!) 
to (*FAIL) unless it is quantified - which is a weird6669thing to do, but Perl allows all assertions to be quantified, and when6670they contain capturing parentheses there may be a potential use for6671this feature. Not that that applies to a quantified (?!) but we allow6672it for uniformity. */66736674case META_LOOKAHEADNOT:6675if (pptr[1] == META_KET &&6676(pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))6677{6678*code++ = OP_FAIL;6679pptr++;6680}6681else6682{6683bravalue = OP_ASSERT_NOT;6684cb->assert_depth += 1;6685goto GROUP_PROCESS;6686}6687break;66886689case META_LOOKBEHIND:6690bravalue = OP_ASSERTBACK;6691cb->assert_depth += 1;6692goto GROUP_PROCESS;66936694case META_LOOKBEHINDNOT:6695bravalue = OP_ASSERTBACK_NOT;6696cb->assert_depth += 1;6697goto GROUP_PROCESS;66986699case META_LOOKBEHIND_NA:6700bravalue = OP_ASSERTBACK_NA;6701cb->assert_depth += 1;6702goto GROUP_PROCESS;67036704case META_ATOMIC:6705bravalue = OP_ONCE;6706goto GROUP_PROCESS_NOTE_EMPTY;67076708case META_SCRIPT_RUN:6709bravalue = OP_SCRIPT_RUN;6710goto GROUP_PROCESS_NOTE_EMPTY;67116712case META_NOCAPTURE:6713bravalue = OP_BRA;6714/* Fall through */67156716/* Process nested bracketed regex. The nesting depth is maintained for the6717benefit of the stackguard function. The test for too deep nesting is now6718done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;6719others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take6720note of whether or not they may match an empty string. 
*/67216722GROUP_PROCESS_NOTE_EMPTY:6723note_group_empty = TRUE;67246725GROUP_PROCESS:6726cb->parens_depth += 1;6727*code = bravalue;6728pptr++;6729tempcode = code;6730tempreqvary = cb->req_varyopt; /* Save value before group */6731length_prevgroup = 0; /* Initialize for pre-compile phase */67326733if ((group_return =6734compile_regex(6735options, /* The options state */6736xoptions, /* The extra options state */6737&tempcode, /* Where to put code (updated) */6738&pptr, /* Input pointer (updated) */6739errorcodeptr, /* Where to put an error message */6740skipunits, /* Skip over bracket number */6741&subfirstcu, /* For possible first char */6742&subfirstcuflags,6743&subreqcu, /* For possible last char */6744&subreqcuflags,6745bcptr, /* Current branch chain */6746open_caps, /* Pointer to capture stack */6747cb, /* Compile data block */6748(lengthptr == NULL)? NULL : /* Actual compile phase */6749&length_prevgroup /* Pre-compile phase */6750)) == 0)6751return 0; /* Error */67526753cb->parens_depth -= 1;67546755/* If that was a non-conditional significant group (not an assertion, not a6756DEFINE) that matches at least one character, then the current item matches6757a character. Conditionals are handled below. */67586759if (note_group_empty && bravalue != OP_COND && group_return > 0)6760matched_char = TRUE;67616762/* If we've just compiled an assertion, pop the assert depth. */67636764if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERT_SCS)6765cb->assert_depth -= 1;67666767/* At the end of compiling, code is still pointing to the start of the6768group, while tempcode has been updated to point past the end of the group.6769The parsed pattern pointer (pptr) is on the closing META_KET.67706771If this is a conditional bracket, check that there are no more than6772two branches in the group, or just one if it's a DEFINE group. We do this6773in the real compile phase, not in the pre-pass, where the whole group may6774not be available. 
*/67756776if (bravalue == OP_COND && lengthptr == NULL)6777{6778PCRE2_UCHAR *tc = code;6779int condcount = 0;67806781do {6782condcount++;6783tc += GET(tc,1);6784}6785while (*tc != OP_KET);67866787/* A DEFINE group is never obeyed inline (the "condition" is always6788false). It must have only one branch. Having checked this, change the6789opcode to OP_FALSE. */67906791if (code[LINK_SIZE+1] == OP_DEFINE)6792{6793if (condcount > 1)6794{6795cb->erroroffset = offset;6796*errorcodeptr = ERR54;6797return 0;6798}6799code[LINK_SIZE+1] = OP_FALSE;6800bravalue = OP_DEFINE; /* A flag to suppress char handling below */6801}68026803/* A "normal" conditional group. If there is just one branch, we must not6804make use of its firstcu or reqcu, because this is equivalent to an6805empty second branch. Also, it may match an empty string. If there are two6806branches, this item must match a character if the group must. */68076808else6809{6810if (condcount > 2)6811{6812cb->erroroffset = offset;6813*errorcodeptr = ERR27;6814return 0;6815}6816if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;6817else if (group_return > 0) matched_char = TRUE;6818}6819}68206821/* In the pre-compile phase, update the length by the length of the group,6822less the brackets at either end. Then reduce the compiled code to just a6823set of non-capturing brackets so that it doesn't use much memory if it is6824duplicated by a quantifier.*/68256826if (lengthptr != NULL)6827{6828if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)6829{6830*errorcodeptr = ERR20;6831return 0;6832}6833*lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;6834code++; /* This already contains bravalue */6835PUTINC(code, 0, 1 + LINK_SIZE);6836*code++ = OP_KET;6837PUTINC(code, 0, 1 + LINK_SIZE);6838break; /* No need to waste time with special character handling */6839}68406841/* Otherwise update the main code pointer to the end of the group. 
*/68426843code = tempcode;68446845/* For a DEFINE group, required and first character settings are not6846relevant. */68476848if (bravalue == OP_DEFINE) break;68496850/* Handle updating of the required and first code units for other types of6851group. Update for normal brackets of all kinds, and conditions with two6852branches (see code above). If the bracket is followed by a quantifier with6853zero repeat, we have to back off. Hence the definition of zeroreqcu and6854zerofirstcu outside the main loop so that they can be accessed for the back6855off. */68566857zeroreqcu = reqcu;6858zeroreqcuflags = reqcuflags;6859zerofirstcu = firstcu;6860zerofirstcuflags = firstcuflags;6861groupsetfirstcu = FALSE;68626863if (bravalue >= OP_ONCE) /* Not an assertion */6864{6865/* If we have not yet set a firstcu in this branch, take it from the6866subpattern, remembering that it was set here so that a repeat of more6867than one can replicate it as reqcu if necessary. If the subpattern has6868no firstcu, set "none" for the whole branch. In both cases, a zero6869repeat forces firstcu to "none". */68706871if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)6872{6873if (subfirstcuflags < REQ_NONE)6874{6875firstcu = subfirstcu;6876firstcuflags = subfirstcuflags;6877groupsetfirstcu = TRUE;6878}6879else firstcuflags = REQ_NONE;6880zerofirstcuflags = REQ_NONE;6881}68826883/* If firstcu was previously set, convert the subpattern's firstcu6884into reqcu if there wasn't one, using the vary flag that was in6885existence beforehand. */68866887else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)6888{6889subreqcu = subfirstcu;6890subreqcuflags = subfirstcuflags | tempreqvary;6891}68926893/* If the subpattern set a required code unit (or set a first code unit6894that isn't really the first code unit - see above), set it. 
*/68956896if (subreqcuflags < REQ_NONE)6897{6898reqcu = subreqcu;6899reqcuflags = subreqcuflags;6900}6901}69026903/* For a forward assertion, we take the reqcu, if set, provided that the6904group has also set a firstcu. This can be helpful if the pattern that6905follows the assertion doesn't set a different char. For example, it's6906useful for /(?=abcde).+/. We can't set firstcu for an assertion, however6907because it leads to incorrect effect for patterns such as /(?=a)a.+/ when6908the "real" "a" would then become a reqcu instead of a firstcu. This is6909overcome by a scan at the end if there's no firstcu, looking for an6910asserted first char. A similar effect for patterns like /(?=.*X)X$/ means6911we must only take the reqcu when the group also set a firstcu. Otherwise,6912in that example, 'X' ends up set for both. */69136914else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&6915subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)6916{6917reqcu = subreqcu;6918reqcuflags = subreqcuflags;6919}69206921break; /* End of nested group handling */692269236924/* ===================================================================*/6925/* Handle named backreferences and recursions. */69266927case META_BACKREF_BYNAME:6928case META_RECURSE_BYNAME:6929{6930int count, index;6931PCRE2_SPTR name;6932BOOL is_dupname = FALSE;6933named_group *ng = cb->named_groups;6934uint32_t length = *(++pptr);69356936GETPLUSOFFSET(offset, pptr);6937name = cb->start_pattern + offset;69386939/* In the first pass, the names generated in the pre-pass are available,6940but the main name table has not yet been created. Scan the list of names6941generated in the pre-pass in order to get a number and whether or not6942this name is duplicated. 
*/69436944groupnumber = 0;6945for (unsigned int i = 0; i < cb->names_found; i++, ng++)6946{6947if (length == ng->length &&6948PRIV(strncmp)(name, ng->name, length) == 0)6949{6950is_dupname = ng->isdup;6951groupnumber = ng->number;69526953/* For a recursion, that's all that is needed. We can now go to6954the code that handles numerical recursion, applying it to the first6955group with the given name. */69566957if (meta == META_RECURSE_BYNAME)6958{6959meta_arg = groupnumber;6960goto HANDLE_NUMERICAL_RECURSION;6961}69626963/* For a back reference, update the back reference map and the6964maximum back reference. */69656966cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;6967if (groupnumber > cb->top_backref)6968cb->top_backref = groupnumber;6969}6970}69716972/* If the name was not found we have a bad reference. */69736974if (groupnumber == 0)6975{6976*errorcodeptr = ERR15;6977cb->erroroffset = offset;6978return 0;6979}69806981/* If a back reference name is not duplicated, we can handle it as6982a numerical reference. */69836984if (!is_dupname)6985{6986meta_arg = groupnumber;6987goto HANDLE_SINGLE_REFERENCE;6988}69896990/* If a back reference name is duplicated, we generate a different6991opcode to a numerical back reference. In the second pass we must6992search for the index and count in the final name table. */69936994count = 0; /* Values for first pass (avoids compiler warning) */6995index = 0;6996if (lengthptr == NULL && !find_dupname_details(name, length, &index,6997&count, errorcodeptr, cb)) return 0;69986999if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;7000*code++ = ((options & PCRE2_CASELESS) != 0)? 
OP_DNREFI : OP_DNREF;7001PUT2INC(code, 0, index);7002PUT2INC(code, 0, count);7003if ((options & PCRE2_CASELESS) != 0)7004*code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?7005REFI_FLAG_CASELESS_RESTRICT : 0) |7006(((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?7007REFI_FLAG_TURKISH_CASING : 0);7008}7009break;701070117012/* ===================================================================*/7013/* Handle a numerical callout. */70147015case META_CALLOUT_NUMBER:7016code[0] = OP_CALLOUT;7017PUT(code, 1, pptr[1]); /* Offset to next pattern item */7018PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */7019code[1 + 2*LINK_SIZE] = pptr[3];7020pptr += 3;7021code += PRIV(OP_lengths)[OP_CALLOUT];7022break;702370247025/* ===================================================================*/7026/* Handle a callout with a string argument. In the pre-pass we just compute7027the length without generating anything. The length in pptr[3] includes both7028delimiters; in the actual compile only the first one is copied, but a7029terminating zero is added. Any doubled delimiters within the string make7030this an overestimate, but it is not worth bothering about. */70317032case META_CALLOUT_STRING:7033if (lengthptr != NULL)7034{7035*lengthptr += pptr[3] + (1 + 4*LINK_SIZE);7036pptr += 3;7037SKIPOFFSET(pptr);7038}70397040/* In the real compile we can copy the string. The starting delimiter is7041included so that the client can discover it if they want. We also pass the7042start offset to help a script language give better error messages. 
*/70437044else7045{7046PCRE2_SPTR pp;7047uint32_t delimiter;7048uint32_t length = pptr[3];7049PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);70507051code[0] = OP_CALLOUT_STR;7052PUT(code, 1, pptr[1]); /* Offset to next pattern item */7053PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */70547055pptr += 3;7056GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */7057pp = cb->start_pattern + offset;7058delimiter = *callout_string++ = *pp++;7059if (delimiter == CHAR_LEFT_CURLY_BRACKET)7060delimiter = CHAR_RIGHT_CURLY_BRACKET;7061PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */70627063/* The syntax of the pattern was checked in the parsing scan. The length7064includes both delimiters, but we have passed the opening one just above,7065so we reduce length before testing it. The test is for > 1 because we do7066not want to copy the final delimiter. This also ensures that pp[1] is7067accessible. */70687069while (--length > 1)7070{7071if (*pp == delimiter && pp[1] == delimiter)7072{7073*callout_string++ = delimiter;7074pp += 2;7075length--;7076}7077else *callout_string++ = *pp++;7078}7079*callout_string++ = CHAR_NUL;70807081/* Set the length of the entire item, the advance to its end. */70827083PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));7084code = callout_string;7085}7086break;708770887089/* ===================================================================*/7090/* Handle repetition. The different types are all sorted out in the parsing7091pass. 
*/70927093case META_MINMAX_PLUS:7094case META_MINMAX_QUERY:7095case META_MINMAX:7096repeat_min = *(++pptr);7097repeat_max = *(++pptr);7098goto REPEAT;70997100case META_ASTERISK:7101case META_ASTERISK_PLUS:7102case META_ASTERISK_QUERY:7103repeat_min = 0;7104repeat_max = REPEAT_UNLIMITED;7105goto REPEAT;71067107case META_PLUS:7108case META_PLUS_PLUS:7109case META_PLUS_QUERY:7110repeat_min = 1;7111repeat_max = REPEAT_UNLIMITED;7112goto REPEAT;71137114case META_QUERY:7115case META_QUERY_PLUS:7116case META_QUERY_QUERY:7117repeat_min = 0;7118repeat_max = 1;71197120REPEAT:7121if (previous_matched_char && repeat_min > 0) matched_char = TRUE;71227123/* Remember whether this is a variable length repeat, and default to7124single-char opcodes. */71257126reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;71277128/* Adjust first and required code units for a zero repeat. */71297130if (repeat_min == 0)7131{7132firstcu = zerofirstcu;7133firstcuflags = zerofirstcuflags;7134reqcu = zeroreqcu;7135reqcuflags = zeroreqcuflags;7136}71377138/* Note the greediness and possessiveness. */71397140switch (meta)7141{7142case META_MINMAX_PLUS:7143case META_ASTERISK_PLUS:7144case META_PLUS_PLUS:7145case META_QUERY_PLUS:7146repeat_type = 0; /* Force greedy */7147possessive_quantifier = TRUE;7148break;71497150case META_MINMAX_QUERY:7151case META_ASTERISK_QUERY:7152case META_PLUS_QUERY:7153case META_QUERY_QUERY:7154repeat_type = greedy_non_default;7155possessive_quantifier = FALSE;7156break;71577158default:7159repeat_type = greedy_default;7160possessive_quantifier = FALSE;7161break;7162}71637164/* Save start of previous item, in case we have to move it up in order to7165insert something before it, and remember what it was. */71667167PCRE2_ASSERT(previous != NULL);7168tempcode = previous;7169op_previous = *previous;71707171/* Now handle repetition for the different types of item. 
If the repeat7172minimum and the repeat maximum are both 1, we can ignore the quantifier for7173non-parenthesized items, as they have only one alternative. For anything in7174parentheses, we must not ignore if {1} is possessive. */71757176switch (op_previous)7177{7178/* If previous was a character or negated character match, abolish the7179item and generate a repeat item instead. If a char item has a minimum of7180more than one, ensure that it is set in reqcu - it might not be if a7181sequence such as x{3} is the first thing in a branch because the x will7182have gone into firstcu instead. */71837184case OP_CHAR:7185case OP_CHARI:7186case OP_NOT:7187case OP_NOTI:7188if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;7189op_type = chartypeoffset[op_previous - OP_CHAR];71907191/* Deal with UTF characters that take up more than one code unit. */71927193#ifdef MAYBE_UTF_MULTI7194if (utf && NOT_FIRSTCU(code[-1]))7195{7196PCRE2_UCHAR *lastchar = code - 1;7197BACKCHAR(lastchar);7198mclength = (uint32_t)(code - lastchar); /* Length of UTF character */7199memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */7200}7201else7202#endif /* MAYBE_UTF_MULTI */72037204/* Handle the case of a single code unit - either with no UTF support, or7205with UTF disabled, or for a single-code-unit UTF character. In the latter7206case, for a repeated positive match, get the caseless flag for the7207required code unit from the previous character, because a class like [Aa]7208sets a caseless A but by now the req_caseopt flag has been reset. 
*/72097210{7211mcbuffer[0] = code[-1];7212mclength = 1;7213if (op_previous <= OP_CHARI && repeat_min > 1)7214{7215reqcu = mcbuffer[0];7216reqcuflags = cb->req_varyopt;7217if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;7218}7219}7220goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */72217222/* If previous was a character class or a back reference, we put the7223repeat stuff after it, but just skip the item if the repeat was {0,0}. */72247225#ifdef SUPPORT_WIDE_CHARS7226case OP_XCLASS:7227case OP_ECLASS:7228#endif7229case OP_CLASS:7230case OP_NCLASS:7231case OP_REF:7232case OP_REFI:7233case OP_DNREF:7234case OP_DNREFI:72357236if (repeat_max == 0)7237{7238code = previous;7239goto END_REPEAT;7240}7241if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;72427243if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)7244*code++ = OP_CRSTAR + repeat_type;7245else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)7246*code++ = OP_CRPLUS + repeat_type;7247else if (repeat_min == 0 && repeat_max == 1)7248*code++ = OP_CRQUERY + repeat_type;7249else7250{7251*code++ = OP_CRRANGE + repeat_type;7252PUT2INC(code, 0, repeat_min);7253if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */7254PUT2INC(code, 0, repeat_max);7255}7256break;72577258/* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets7259because pcre2_match() could not handle backtracking into recursively7260called groups. Now that this backtracking is available, we no longer need7261to do this. However, we still need to replicate recursions as we do for7262groups so as to have independent backtracking points. We can replicate7263for the minimum number of repeats directly. For optional repeats we now7264wrap the recursion in OP_BRA brackets and make use of the bracket7265repetition. 
*/72667267case OP_RECURSE:7268if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)7269goto END_REPEAT;72707271/* Generate unwrapped repeats for a non-zero minimum, except when the7272minimum is 1 and the maximum unlimited, because that can be handled with7273OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the7274minimum, we just need to generate the appropriate additional copies.7275Otherwise we need to generate one more, to simulate the situation when7276the minimum is zero. */72777278if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))7279{7280int replicate = repeat_min;7281if (repeat_min == repeat_max) replicate--;72827283/* In the pre-compile phase, we don't actually do the replication. We7284just adjust the length as if we had. Do some paranoid checks for7285potential integer overflow. */72867287if (lengthptr != NULL)7288{7289PCRE2_SIZE delta;7290if (PRIV(ckd_smul)(&delta, replicate, 1 + LINK_SIZE) ||7291OFLOW_MAX - *lengthptr < delta)7292{7293*errorcodeptr = ERR20;7294return 0;7295}7296*lengthptr += delta;7297}72987299else for (int i = 0; i < replicate; i++)7300{7301memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));7302previous = code;7303code += 1 + LINK_SIZE;7304}73057306/* If the number of repeats is fixed, we are done. Otherwise, adjust7307the counts and fall through. */73087309if (repeat_min == repeat_max) break;7310if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;7311repeat_min = 0;7312}73137314/* Wrap the recursion call in OP_BRA brackets. */73157316(void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));7317op_previous = *previous = OP_BRA;7318PUT(previous, 1, 2 + 2*LINK_SIZE);7319previous[2 + 2*LINK_SIZE] = OP_KET;7320PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);7321code += 2 + 2 * LINK_SIZE;7322length_prevgroup = 3 + 3*LINK_SIZE;7323group_return = -1; /* Set "may match empty string" */73247325/* Now treat as a repeated OP_BRA. 
*/7326/* Fall through */73277328/* If previous was a bracket group, we may have to replicate it in7329certain cases. Note that at this point we can encounter only the "basic"7330bracket opcodes such as BRA and CBRA, as this is the place where they get7331converted into the more special varieties such as BRAPOS and SBRA.7332Originally, PCRE did not allow repetition of assertions, but now it does,7333for Perl compatibility. */73347335case OP_ASSERT:7336case OP_ASSERT_NOT:7337case OP_ASSERT_NA:7338case OP_ASSERTBACK:7339case OP_ASSERTBACK_NOT:7340case OP_ASSERTBACK_NA:7341case OP_ASSERT_SCS:7342case OP_ONCE:7343case OP_SCRIPT_RUN:7344case OP_BRA:7345case OP_CBRA:7346case OP_COND:7347{7348int len = (int)(code - previous);7349PCRE2_UCHAR *bralink = NULL;7350PCRE2_UCHAR *brazeroptr = NULL;73517352if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)7353goto END_REPEAT;73547355/* Repeating a DEFINE group (or any group where the condition is always7356FALSE and there is only one branch) is pointless, but Perl allows the7357syntax, so we just ignore the repeat. */73587359if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&7360previous[GET(previous, 1)] != OP_ALT)7361goto END_REPEAT;73627363/* Perl allows all assertions to be quantified, and when they contain7364capturing parentheses and/or are optional there are potential uses for7365this feature. PCRE2 used to force the maximum quantifier to 1 on the7366invalid grounds that further repetition was never useful. This was7367always a bit pointless, since an assertion could be wrapped with a7368repeated group to achieve the effect. General repetition is now7369permitted, but if the maximum is unlimited it is set to one more than7370the minimum. 
*/73717372if (op_previous < OP_ONCE) /* Assertion */7373{7374if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;7375}73767377/* The case of a zero minimum is special because of the need to stick7378OP_BRAZERO in front of it, and because the group appears once in the7379data, whereas in other cases it appears the minimum number of times. For7380this reason, it is simplest to treat this case separately, as otherwise7381the code gets far too messy. There are several special subcases when the7382minimum is zero. */73837384if (repeat_min == 0)7385{7386/* If the maximum is also zero, we used to just omit the group from7387the output altogether, like this:73887389** if (repeat_max == 0)7390** {7391** code = previous;7392** goto END_REPEAT;7393** }73947395However, that fails when a group or a subgroup within it is7396referenced as a subroutine from elsewhere in the pattern, so now we7397stick in OP_SKIPZERO in front of it so that it is skipped on7398execution. As we don't have a list of which groups are referenced, we7399cannot do this selectively.74007401If the maximum is 1 or unlimited, we just have to stick in the7402BRAZERO and do no more at this point. */74037404if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)7405{7406(void)memmove(previous + 1, previous, CU2BYTES(len));7407code++;7408if (repeat_max == 0)7409{7410*previous++ = OP_SKIPZERO;7411goto END_REPEAT;7412}7413brazeroptr = previous; /* Save for possessive optimizing */7414*previous++ = OP_BRAZERO + repeat_type;7415}74167417/* If the maximum is greater than 1 and limited, we have to replicate7418in a nested fashion, sticking OP_BRAZERO before each set of brackets.7419The first one has to be handled carefully because it's the original7420copy, which has to be moved up. The remainder can be handled by code7421that is common with the non-zero minimum case below. We have to7422adjust the value of repeat_max, since one less copy is required. 
*/74237424else7425{7426int linkoffset;7427(void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));7428code += 2 + LINK_SIZE;7429*previous++ = OP_BRAZERO + repeat_type;7430*previous++ = OP_BRA;74317432/* We chain together the bracket link offset fields that have to be7433filled in later when the ends of the brackets are reached. */74347435linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);7436bralink = previous;7437PUTINC(previous, 0, linkoffset);7438}74397440if (repeat_max != REPEAT_UNLIMITED) repeat_max--;7441}74427443/* If the minimum is greater than zero, replicate the group as many7444times as necessary, and adjust the maximum to the number of subsequent7445copies that we need. */74467447else7448{7449if (repeat_min > 1)7450{7451/* In the pre-compile phase, we don't actually do the replication.7452We just adjust the length as if we had. Do some paranoid checks for7453potential integer overflow. */74547455if (lengthptr != NULL)7456{7457PCRE2_SIZE delta;7458if (PRIV(ckd_smul)(&delta, repeat_min - 1,7459(int)length_prevgroup) ||7460OFLOW_MAX - *lengthptr < delta)7461{7462*errorcodeptr = ERR20;7463return 0;7464}7465*lengthptr += delta;7466}74677468/* This is compiling for real. If there is a set first code unit7469for the group, and we have not yet set a "required code unit", set7470it. */74717472else7473{7474if (groupsetfirstcu && reqcuflags >= REQ_NONE)7475{7476reqcu = firstcu;7477reqcuflags = firstcuflags;7478}7479for (uint32_t i = 1; i < repeat_min; i++)7480{7481memcpy(code, previous, CU2BYTES(len));7482code += len;7483}7484}7485}74867487if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;7488}74897490/* This code is common to both the zero and non-zero minimum cases. If7491the maximum is limited, it replicates the group in a nested fashion,7492remembering the bracket starts on a stack. In the case of a zero7493minimum, the first one was set up above. 
In all cases the repeat_max7494now specifies the number of additional copies needed. Again, we must7495remember to replicate entries on the forward reference list. */74967497if (repeat_max != REPEAT_UNLIMITED)7498{7499/* In the pre-compile phase, we don't actually do the replication. We7500just adjust the length as if we had. For each repetition we must add75011 to the length for BRAZERO and for all but the last repetition we7502must add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some7503paranoid checks to avoid integer overflow. */75047505if (lengthptr != NULL && repeat_max > 0)7506{7507PCRE2_SIZE delta;7508if (PRIV(ckd_smul)(&delta, repeat_max,7509(int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||7510OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)7511{7512*errorcodeptr = ERR20;7513return 0;7514}7515delta -= (2 + 2*LINK_SIZE); /* Last one doesn't nest */7516*lengthptr += delta;7517}75187519/* This is compiling for real */75207521else for (uint32_t i = repeat_max; i >= 1; i--)7522{7523*code++ = OP_BRAZERO + repeat_type;75247525/* All but the final copy start a new nesting, maintaining the7526chain of brackets outstanding. */75277528if (i != 1)7529{7530int linkoffset;7531*code++ = OP_BRA;7532linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);7533bralink = code;7534PUTINC(code, 0, linkoffset);7535}75367537memcpy(code, previous, CU2BYTES(len));7538code += len;7539}75407541/* Now chain through the pending brackets, and fill in their length7542fields (which are holding the chain links pro tem). */75437544while (bralink != NULL)7545{7546int oldlinkoffset;7547int linkoffset = (int)(code - bralink + 1);7548PCRE2_UCHAR *bra = code - linkoffset;7549oldlinkoffset = GET(bra, 1);7550bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;7551*code++ = OP_KET;7552PUTINC(code, 0, linkoffset);7553PUT(bra, 1, linkoffset);7554}7555}75567557/* If the maximum is unlimited, set a repeater in the final copy. 
For7558SCRIPT_RUN and ONCE brackets, that's all we need to do. However,7559possessively repeated ONCE brackets can be converted into non-capturing7560brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this7561saves having to deal with possessive ONCEs specially.75627563Otherwise, when we are doing the actual compile phase, check to see7564whether this group is one that could match an empty string. If so,7565convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so7566that runtime checking can be done. [This check is also applied to ONCE7567and SCRIPT_RUN groups at runtime, but in a different way.]75687569Then, if the quantifier was possessive and the bracket is not a7570conditional, we convert the BRA code to the POS form, and the KET code7571to KETRPOS. (It turns out to be convenient at runtime to detect this7572kind of subpattern at both the start and at the end.) The use of7573special opcodes makes it possible to reduce greatly the stack usage in7574pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to7575OP_BRAPOSZERO.75767577Then, if the minimum number of matches is 1 or 0, cancel the possessive7578flag so that the default action below, of wrapping everything inside7579atomic brackets, does not happen. When the minimum is greater than 1,7580there will be earlier copies of the group, and so we still have to wrap7581the whole thing. */75827583else7584{7585PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;7586PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);75877588/* Convert possessive ONCE brackets to non-capturing */75897590if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;75917592/* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need7593to do is to set the KET. 
*/75947595if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)7596*ketcode = OP_KETRMAX + repeat_type;75977598/* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs7599(which have been converted to non-capturing above). */76007601else7602{7603/* In the compile phase, adjust the opcode if the group can match7604an empty string. For a conditional group with only one branch, the7605value of group_return will not show "could be empty", so we must7606check that separately. */76077608if (lengthptr == NULL)7609{7610if (group_return < 0) *bracode += OP_SBRA - OP_BRA;7611if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)7612*bracode = OP_SCOND;7613}76147615/* Handle possessive quantifiers. */76167617if (possessive_quantifier)7618{7619/* For COND brackets, we wrap the whole thing in a possessively7620repeated non-capturing bracket, because we have not invented POS7621versions of the COND opcodes. */76227623if (*bracode == OP_COND || *bracode == OP_SCOND)7624{7625int nlen = (int)(code - bracode);7626(void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));7627code += 1 + LINK_SIZE;7628nlen += 1 + LINK_SIZE;7629*bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;7630*code++ = OP_KETRPOS;7631PUTINC(code, 0, nlen);7632PUT(bracode, 1, nlen);7633}76347635/* For non-COND brackets, we modify the BRA code and use KETRPOS. */76367637else7638{7639*bracode += 1; /* Switch to xxxPOS opcodes */7640*ketcode = OP_KETRPOS;7641}76427643/* If the minimum is zero, mark it as possessive, then unset the7644possessive flag when the minimum is 0 or 1. */76457646if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;7647if (repeat_min < 2) possessive_quantifier = FALSE;7648}76497650/* Non-possessive quantifier */76517652else *ketcode = OP_KETRMAX + repeat_type;7653}7654}7655}7656break;76577658/* If previous was a character type match (\d or similar), abolish it and7659create a suitable repeat item. 
The code is shared with single-character7660repeats by setting op_type to add a suitable offset into repeat_type.7661Note that the Unicode property types will be present only when7662SUPPORT_UNICODE is defined, but we don't wrap the little bits of code7663here because it just makes it horribly messy. */76647665default:7666if (op_previous >= OP_EODN || op_previous <= OP_WORD_BOUNDARY)7667{7668PCRE2_DEBUG_UNREACHABLE();7669*errorcodeptr = ERR10; /* Not a character type - internal error */7670return 0;7671}7672else7673{7674int prop_type, prop_value;7675PCRE2_UCHAR *oldcode;76767677if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;76787679op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */7680mclength = 0; /* Not a character */76817682if (op_previous == OP_PROP || op_previous == OP_NOTPROP)7683{7684prop_type = previous[1];7685prop_value = previous[2];7686}7687else7688{7689/* Come here from just above with a character in mcbuffer/mclength.7690You must also set op_type before the jump. */7691OUTPUT_SINGLE_REPEAT:7692prop_type = prop_value = -1;7693}76947695/* At this point, if prop_type == prop_value == -1 we either have a7696character in mcbuffer when mclength is greater than zero, or we have7697mclength zero, in which case there is a non-property character type in7698op_previous. If prop_type/value are not negative, we have a property7699character type in op_previous. */77007701oldcode = code; /* Save where we were */7702code = previous; /* Usually overwrite previous item */77037704/* If the maximum is zero then the minimum must also be zero; Perl allows7705this case, so we do too - by simply omitting the item altogether. */77067707if (repeat_max == 0) goto END_REPEAT;77087709/* Combine the op_type with the repeat_type */77107711repeat_type += op_type;77127713/* A minimum of zero is handled either as the special case * or ?, or as7714an UPTO, with the maximum given. 
*/77157716if (repeat_min == 0)7717{7718if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;7719else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;7720else7721{7722*code++ = OP_UPTO + repeat_type;7723PUT2INC(code, 0, repeat_max);7724}7725}77267727/* A repeat minimum of 1 is optimized into some special cases. If the7728maximum is unlimited, we use OP_PLUS. Otherwise, the original item is7729left in place and, if the maximum is greater than 1, we use OP_UPTO with7730one less than the maximum. */77317732else if (repeat_min == 1)7733{7734if (repeat_max == REPEAT_UNLIMITED)7735*code++ = OP_PLUS + repeat_type;7736else7737{7738code = oldcode; /* Leave previous item in place */7739if (repeat_max == 1) goto END_REPEAT;7740*code++ = OP_UPTO + repeat_type;7741PUT2INC(code, 0, repeat_max - 1);7742}7743}77447745/* The case {n,n} is just an EXACT, while the general case {n,m} is7746handled as an EXACT followed by an UPTO or STAR or QUERY. */77477748else7749{7750*code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */7751PUT2INC(code, 0, repeat_min);77527753/* Unless repeat_max equals repeat_min, fill in the data for EXACT,7754and then generate the second opcode. For a repeated Unicode property7755match, there are two extra values that define the required property,7756and mclength is set zero to indicate this. 
*/77577758if (repeat_max != repeat_min)7759{7760if (mclength > 0)7761{7762memcpy(code, mcbuffer, CU2BYTES(mclength));7763code += mclength;7764}7765else7766{7767*code++ = op_previous;7768if (prop_type >= 0)7769{7770*code++ = prop_type;7771*code++ = prop_value;7772}7773}77747775/* Now set up the following opcode */77767777if (repeat_max == REPEAT_UNLIMITED)7778*code++ = OP_STAR + repeat_type;7779else7780{7781repeat_max -= repeat_min;7782if (repeat_max == 1)7783{7784*code++ = OP_QUERY + repeat_type;7785}7786else7787{7788*code++ = OP_UPTO + repeat_type;7789PUT2INC(code, 0, repeat_max);7790}7791}7792}7793}77947795/* Fill in the character or character type for the final opcode. */77967797if (mclength > 0)7798{7799memcpy(code, mcbuffer, CU2BYTES(mclength));7800code += mclength;7801}7802else7803{7804*code++ = op_previous;7805if (prop_type >= 0)7806{7807*code++ = prop_type;7808*code++ = prop_value;7809}7810}7811}7812break;7813} /* End of switch on different op_previous values */781478157816/* If the character following a repeat is '+', possessive_quantifier is7817TRUE. For some opcodes, there are special alternative opcodes for this7818case. For anything else, we wrap the entire repeated item inside OP_ONCE7819brackets. Logically, the '+' notation is just syntactic sugar, taken from7820Sun's Java package, but the special opcodes can optimize it.78217822Some (but not all) possessively repeated subpatterns have already been7823completely handled in the code just above. For them, possessive_quantifier7824is always FALSE at this stage. Note that the repeated item starts at7825tempcode, not at previous, which might be the first part of a string whose7826(former) last char we repeated. */78277828if (possessive_quantifier)7829{7830int len;78317832/* Possessifying an EXACT quantifier has no effect, so we can ignore it.7833However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},7834{5,}, or {5,10}). 
We skip over an EXACT item; if the length of what7835remains is greater than zero, there's a further opcode that can be7836handled. If not, do nothing, leaving the EXACT alone. */78377838switch(*tempcode)7839{7840case OP_TYPEEXACT:7841tempcode += PRIV(OP_lengths)[*tempcode] +7842((tempcode[1 + IMM2_SIZE] == OP_PROP7843|| tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);7844break;78457846/* CHAR opcodes are used for exacts whose count is 1. */78477848case OP_CHAR:7849case OP_CHARI:7850case OP_NOT:7851case OP_NOTI:7852case OP_EXACT:7853case OP_EXACTI:7854case OP_NOTEXACT:7855case OP_NOTEXACTI:7856tempcode += PRIV(OP_lengths)[*tempcode];7857#ifdef SUPPORT_UNICODE7858if (utf && HAS_EXTRALEN(tempcode[-1]))7859tempcode += GET_EXTRALEN(tempcode[-1]);7860#endif7861break;78627863/* For the class opcodes, the repeat operator appears at the end;7864adjust tempcode to point to it. */78657866case OP_CLASS:7867case OP_NCLASS:7868tempcode += 1 + 32/sizeof(PCRE2_UCHAR);7869break;78707871#ifdef SUPPORT_WIDE_CHARS7872case OP_XCLASS:7873case OP_ECLASS:7874tempcode += GET(tempcode, 1);7875break;7876#endif7877}78787879/* If tempcode is equal to code (which points to the end of the repeated7880item), it means we have skipped an EXACT item but there is no following7881QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In7882all other cases, tempcode will be pointing to the repeat opcode, and will7883be less than code, so the value of len will be greater than 0. */78847885len = (int)(code - tempcode);7886if (len > 0)7887{7888unsigned int repcode = *tempcode;78897890/* There is a table for possessifying opcodes, all of which are less7891than OP_CALLOUT. A zero entry means there is no possessified version.7892*/78937894if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)7895*tempcode = opcode_possessify[repcode];78967897/* For opcode without a special possessified version, wrap the item in7898ONCE brackets. 
*/78997900else7901{7902(void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));7903code += 1 + LINK_SIZE;7904len += 1 + LINK_SIZE;7905tempcode[0] = OP_ONCE;7906*code++ = OP_KET;7907PUTINC(code, 0, len);7908PUT(tempcode, 1, len);7909}7910}7911}79127913/* We set the "follows varying string" flag for subsequently encountered7914reqcus if it isn't already set and we have just passed a varying length7915item. */79167917END_REPEAT:7918cb->req_varyopt |= reqvary;7919break;792079217922/* ===================================================================*/7923/* Handle a 32-bit data character with a value greater than META_END. */79247925case META_BIGVALUE:7926pptr++;7927goto NORMAL_CHAR;792879297930/* ===============================================================*/7931/* Handle a back reference by number, which is the meta argument. The7932pattern offsets for back references to group numbers less than 10 are held7933in a special vector, to avoid using more than two parsed pattern elements7934in 64-bit environments. We only need the offset to the first occurrence,7935because if that doesn't fail, subsequent ones will also be OK. */79367937case META_BACKREF:7938if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];7939else GETPLUSOFFSET(offset, pptr);79407941if (meta_arg > cb->bracount)7942{7943cb->erroroffset = offset;7944*errorcodeptr = ERR15; /* Non-existent subpattern */7945return 0;7946}79477948/* Come here from named backref handling when the reference is to a7949single group (that is, not to a duplicated name). The back reference7950data will have already been updated. We must disable firstcu if not7951set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'7952later. */79537954HANDLE_SINGLE_REFERENCE:7955if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;7956*code++ = ((options & PCRE2_CASELESS) != 0)? 
OP_REFI : OP_REF;7957PUT2INC(code, 0, meta_arg);7958if ((options & PCRE2_CASELESS) != 0)7959*code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?7960REFI_FLAG_CASELESS_RESTRICT : 0) |7961(((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?7962REFI_FLAG_TURKISH_CASING : 0);79637964/* Update the map of back references, and keep the highest one. We7965could do this in parse_regex() for numerical back references, but not7966for named back references, because we don't know the numbers to which7967named back references refer. So we do it all in this function. */79687969cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;7970if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;7971break;797279737974/* ===============================================================*/7975/* Handle recursion by inserting the number of the called group (which is7976the meta argument) after OP_RECURSE. At the end of compiling the pattern is7977scanned and these numbers are replaced by offsets within the pattern. It is7978done like this to avoid problems with forward references and adjusting7979offsets when groups are duplicated and moved (as discovered in previous7980implementations). Note that a recursion does not have a set first7981character. */79827983case META_RECURSE:7984GETPLUSOFFSET(offset, pptr);7985if (meta_arg > cb->bracount)7986{7987cb->erroroffset = offset;7988*errorcodeptr = ERR15; /* Non-existent subpattern */7989return 0;7990}7991HANDLE_NUMERICAL_RECURSION:7992*code = OP_RECURSE;7993PUT(code, 1, meta_arg);7994code += 1 + LINK_SIZE;7995groupsetfirstcu = FALSE;7996cb->had_recurse = TRUE;7997if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;7998zerofirstcu = firstcu;7999zerofirstcuflags = firstcuflags;8000break;800180028003/* ===============================================================*/8004/* Handle capturing parentheses; the number is the meta argument. 
*/80058006case META_CAPTURE:8007bravalue = OP_CBRA;8008skipunits = IMM2_SIZE;8009PUT2(code, 1+LINK_SIZE, meta_arg);8010cb->lastcapture = meta_arg;8011goto GROUP_PROCESS_NOTE_EMPTY;801280138014/* ===============================================================*/8015/* Handle escape sequence items. For ones like \d, the ESC_values are8016arranged to be the same as the corresponding OP_values in the default case8017when PCRE2_UCP is not set (which is the only case in which they will appear8018here).80198020Note: \Q and \E are never seen here, as they were dealt with in8021parse_pattern(). Neither are numerical back references or recursions, which8022were turned into META_BACKREF or META_RECURSE items, respectively. \k and8023\g, when followed by names, are turned into META_BACKREF_BYNAME or8024META_RECURSE_BYNAME. */80258026case META_ESCAPE:80278028/* We can test for escape sequences that consume a character because their8029values lie between ESC_b and ESC_Z; this may have to change if any new ones8030are ever created. For these sequences, we disable the setting of a first8031character if it hasn't already been set. */80328033if (meta_arg > ESC_b && meta_arg < ESC_Z)8034{8035matched_char = TRUE;8036if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;8037}80388039/* Set values to reset to if this is followed by a zero repeat. */80408041zerofirstcu = firstcu;8042zerofirstcuflags = firstcuflags;8043zeroreqcu = reqcu;8044zeroreqcuflags = reqcuflags;80458046/* If Unicode is not supported, \P and \p are not allowed and are8047faulted at parse time, so will never appear here. */80488049#ifdef SUPPORT_UNICODE8050if (meta_arg == ESC_P || meta_arg == ESC_p)8051{8052uint32_t ptype = *(++pptr) >> 16;8053uint32_t pdata = *pptr & 0xffff;80548055/* In caseless matching, particular characteristics Lu, Ll, and Lt get8056converted to the general characteristic L&. That is, upper, lower, and8057title case letters are all conflated. 
*/80588059if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&8060(pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))8061{8062ptype = PT_LAMP;8063pdata = 0;8064}80658066/* The special case of \p{Any} is compiled to OP_ALLANY and \P{Any}8067is compiled to [] so as to benefit from the auto-anchoring code. */80688069if (ptype == PT_ANY)8070{8071if (meta_arg == ESC_P)8072{8073*code++ = OP_CLASS;8074memset(code, 0, 32);8075code += 32 / sizeof(PCRE2_UCHAR);8076}8077else8078*code++ = OP_ALLANY;8079}8080else8081{8082*code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;8083*code++ = ptype;8084*code++ = pdata;8085}8086break; /* End META_ESCAPE */8087}8088#endif80898090/* \K is forbidden in lookarounds since 10.38 because that's what Perl has8091done. However, there's an option, in case anyone was relying on it. */80928093if (cb->assert_depth > 0 && meta_arg == ESC_K &&8094(xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)8095{8096*errorcodeptr = ERR99;8097return 0;8098}80998100/* For the rest (including \X when Unicode is supported - if not it's8101faulted at parse time), the OP value is the escape value when PCRE2_UCP is8102not set; if it is set, most of them do not show up here because they are8103converted into Unicode property tests in parse_regex().81048105In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY8106instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.8107There are special UCP codes for \B and \b which are used in UCP mode unless8108"word" matching is being forced to ASCII.81098110Note that \b and \B do a one-character lookbehind, and \A also behaves as8111if it does. 
*/81128113switch(meta_arg)8114{8115case ESC_C:8116cb->external_flags |= PCRE2_HASBKC; /* Record */8117#if PCRE2_CODE_UNIT_WIDTH == 328118meta_arg = OP_ALLANY;8119#else8120if (!utf) meta_arg = OP_ALLANY;8121#endif8122break;81238124case ESC_B:8125case ESC_b:8126if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)8127meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :8128OP_UCP_WORD_BOUNDARY;8129/* Fall through */81308131case ESC_A:8132if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;8133break;8134}81358136*code++ = meta_arg;8137break; /* End META_ESCAPE */813881398140/* ===================================================================*/8141/* Handle an unrecognized meta value. A parsed pattern value less than8142META_END is a literal. Otherwise we have a problem. */81438144default:8145if (meta >= META_END)8146{8147PCRE2_DEBUG_UNREACHABLE();8148*errorcodeptr = ERR89; /* Internal error - unrecognized. */8149return 0;8150}81518152/* Handle a literal character. We come here by goto in the case of a815332-bit, non-UTF character whose value is greater than META_END. */81548155NORMAL_CHAR:8156meta = *pptr; /* Get the full 32 bits */8157NORMAL_CHAR_SET: /* Character is already in meta */8158matched_char = TRUE;81598160/* For caseless UTF or UCP mode, check whether this character has more than8161one other case. If so, generate a special OP_PROP item instead of OP_CHARI.8162When casing restrictions apply, ignore caseless sets that start with an8163ASCII character. If the character is affected by the special Turkish rules,8164hardcode the matching characters using a caseset. */81658166#ifdef SUPPORT_UNICODE8167if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)8168{8169uint32_t caseset;81708171if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==8172PCRE2_EXTRA_TURKISH_CASING &&8173UCD_ANY_I(meta))8174{8175caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(meta)? 
0 : 3);8176}8177else if ((caseset = UCD_CASESET(meta)) != 0 &&8178(xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&8179PRIV(ucd_caseless_sets)[caseset] < 128)8180{8181caseset = 0; /* Ignore the caseless set if it's restricted. */8182}81838184if (caseset != 0)8185{8186*code++ = OP_PROP;8187*code++ = PT_CLIST;8188*code++ = caseset;8189if (firstcuflags == REQ_UNSET)8190firstcuflags = zerofirstcuflags = REQ_NONE;8191break; /* End handling this meta item */8192}8193}8194#endif81958196/* Caseful matches, or caseless and not one of the multicase characters. We8197come here by goto in the case of a positive class that contains only8198case-partners of a character with just two cases; matched_char has already8199been set TRUE and options fudged if necessary. */82008201CLASS_CASELESS_CHAR:82028203/* Get the character's code units into mcbuffer, with the length in8204mclength. When not in UTF mode, the length is always 1. */82058206#ifdef SUPPORT_UNICODE8207if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else8208#endif8209{8210mclength = 1;8211mcbuffer[0] = meta;8212}82138214/* Generate the appropriate code */82158216*code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;8217memcpy(code, mcbuffer, CU2BYTES(mclength));8218code += mclength;82198220/* Remember if \r or \n were seen */82218222if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)8223cb->external_flags |= PCRE2_HASCRORLF;82248225/* Set the first and required code units appropriately. If no previous8226first code unit, set it from this character, but revert to none on a zero8227repeat. Otherwise, leave the firstcu value alone, and don't change it on8228a zero repeat. */82298230if (firstcuflags == REQ_UNSET)8231{8232zerofirstcuflags = REQ_NONE;8233zeroreqcu = reqcu;8234zeroreqcuflags = reqcuflags;82358236/* If the character is more than one code unit long, we can set a single8237firstcu only if it is not to be matched caselessly. 
Multiple possible8238starting code units may be picked up later in the studying code. */82398240if (mclength == 1 || req_caseopt == 0)8241{8242firstcu = mcbuffer[0];8243firstcuflags = req_caseopt;8244if (mclength != 1)8245{8246reqcu = code[-1];8247reqcuflags = cb->req_varyopt;8248}8249}8250else firstcuflags = reqcuflags = REQ_NONE;8251}82528253/* firstcu was previously set; we can set reqcu only if the length is82541 or the matching is caseful. */82558256else8257{8258zerofirstcu = firstcu;8259zerofirstcuflags = firstcuflags;8260zeroreqcu = reqcu;8261zeroreqcuflags = reqcuflags;8262if (mclength == 1 || req_caseopt == 0)8263{8264reqcu = code[-1];8265reqcuflags = req_caseopt | cb->req_varyopt;8266}8267}82688269/* If caselessness was temporarily instated, reset it. */82708271if (reset_caseful)8272{8273options &= ~PCRE2_CASELESS;8274req_caseopt = 0;8275reset_caseful = FALSE;8276}82778278break; /* End literal character handling */8279} /* End of big switch */8280} /* End of big loop */82818282PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */8283return 0; /* Avoid compiler warnings */8284}8285828682878288/*************************************************8289* Compile regex: a sequence of alternatives *8290*************************************************/82918292/* On entry, pptr is pointing past the bracket meta, but on return it points to8293the closing bracket or META_END. The code variable is pointing at the code unit8294into which the BRA operator has been stored. This function is used during the8295pre-compile phase when we are trying to find out the amount of memory needed,8296as well as during the real compile phase. 
The value of lengthptr distinguishes
the two phases.

Arguments:
  options           option bits, including any changes for this subpattern
  xoptions          extra option bits, ditto
  codeptr           -> the address of the current code pointer
  pptrptr           -> the address of the current parsed pattern pointer
  errorcodeptr      -> pointer to error code variable
  skipunits         skip this many code units at start (for brackets and OP_COND)
  firstcuptr        place to put the first required code unit
  firstcuflagsptr   place to put the first code unit flags
  reqcuptr          place to put the last required code unit
  reqcuflagsptr     place to put the last required code unit flags
  bcptr             pointer to the chain of currently open branches
  open_caps         pointer to the chain of currently open capturing groups
  cb                points to the data block with tables pointers etc.
  lengthptr         NULL during the real compile phase
                    points to length accumulator during pre-compile phase

Returns:            0 There has been an error
                   +1 Success, this group must match at least one character
                   -1 Success, this group may match an empty string
*/

static int
compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
  uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
  compile_block *cb, PCRE2_SIZE *lengthptr)
{
PCRE2_UCHAR *code = *codeptr;
PCRE2_UCHAR *last_branch = code;     /* Start of the most recent branch */
PCRE2_UCHAR *start_bracket = code;   /* Used for the length stored in the ket */
BOOL lookbehind;
open_capitem capitem;
int capnumber = 0;
int okreturn = 1;                    /* Becomes -1 if any branch can be empty */
uint32_t *pptr = *pptrptr;
uint32_t firstcu, reqcu;
uint32_t lookbehindlength;
uint32_t lookbehindminlength;
uint32_t firstcuflags, reqcuflags;
PCRE2_SIZE length;
branch_chain bc;

/* If set, call the external function that checks for stack availability. A
non-zero result from the guard is reported as ERR33. */

if (cb->cx->stack_guard != NULL &&
    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
  {
  *errorcodeptr= ERR33;
  return 0;
  }

/* Miscellaneous initialization */

bc.outer = bcptr;
bc.current_branch = code;

firstcu = reqcu = 0;
firstcuflags = reqcuflags = REQ_UNSET;

/* Accumulate the length for use in the pre-compile phase. Start with the
length of the BRA and KET and any extra code units that are required at the
beginning. We accumulate in a local variable to save frequent testing of
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
start and end of each alternative, because compiled items are discarded during
the pre-compile phase so that the workspace is not exceeded. */

length = 2 + 2*LINK_SIZE + skipunits;

/* Remember if this is a lookbehind assertion, and if it is, save its length
and skip over the pattern offset. The length for the first branch is taken from
the data field of the preceding parsed item (pptr[-1]); the minimum length is
the parsed item at pptr. */

lookbehind = *code == OP_ASSERTBACK ||
             *code == OP_ASSERTBACK_NOT ||
             *code == OP_ASSERTBACK_NA;

if (lookbehind)
  {
  lookbehindlength = META_DATA(pptr[-1]);
  lookbehindminlength = *pptr;
  pptr += SIZEOFFSET;
  }
else lookbehindlength = lookbehindminlength = 0;

/* If this is a capturing subpattern, add to the chain of open capturing items
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
need be tested here; changing this opcode to one of its variants, e.g.
OP_SCBRAPOS, happens later, after the group has been compiled. The new chain
item lives in this function's frame, so it is linked in only for the duration
of this call. */

if (*code == OP_CBRA)
  {
  capnumber = GET2(code, 1 + LINK_SIZE);
  capitem.number = capnumber;
  capitem.next = open_caps;
  capitem.assert_depth = cb->assert_depth;
  open_caps = &capitem;
  }

/* Offset is set zero to mark that this bracket is still open */

PUT(code, 1, 0);
code += 1 + LINK_SIZE + skipunits;

/* Loop for each alternative branch */

for (;;)
  {
  int branch_return;
  uint32_t branchfirstcu = 0, branchreqcu = 0;
  uint32_t branchfirstcuflags = REQ_UNSET, branchreqcuflags = REQ_UNSET;

  /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
  is only a single minimum length for the whole assertion. When the minimum
  length is LOOKBEHIND_MAX it means that all branches are of fixed length,
  though not necessarily the same length. In this case, the original OP_REVERSE
  can be used. It can also be used if a branch in a variable length lookbehind
  has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
  maximum and minimum values. */

  if (lookbehind && lookbehindlength > 0)
    {
    if (lookbehindminlength == LOOKBEHIND_MAX ||
        lookbehindminlength == lookbehindlength)
      {
      *code++ = OP_REVERSE;
      PUT2INC(code, 0, lookbehindlength);
      length += 1 + IMM2_SIZE;
      }
    else
      {
      *code++ = OP_VREVERSE;
      PUT2INC(code, 0, lookbehindminlength);
      PUT2INC(code, 0, lookbehindlength);
      length += 1 + 2*IMM2_SIZE;
      }
    }

  /* Now compile the branch; in the pre-compile phase its length gets added
  into the length. A zero return means an error, which is passed straight
  back. */

  if ((branch_return =
        compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
          &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
          &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
    return 0;

  /* If a branch can match an empty string, so can the whole group. */

  if (branch_return < 0) okreturn = -1;

  /* In the real compile phase, there is some post-processing to be done. */

  if (lengthptr == NULL)
    {
    /* If this is the first branch, the firstcu and reqcu values for the
    branch become the values for the regex. (last_branch still points at the
    opening bracket until the first OP_ALT has been written.) */

    if (*last_branch != OP_ALT)
      {
      firstcu = branchfirstcu;
      firstcuflags = branchfirstcuflags;
      reqcu = branchreqcu;
      reqcuflags = branchreqcuflags;
      }

    /* If this is not the first branch, the first char and reqcu have to
    match the values from all the previous branches, except that if the
    previous value for reqcu didn't have REQ_VARY set, it can still match,
    and we set REQ_VARY for the group from this branch's value. */

    else
      {
      /* If we previously had a firstcu, but it doesn't match the new branch,
      we have to abandon the firstcu for the regex, but if there was
      previously no reqcu, it takes on the value of the old firstcu. */

      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
        {
        if (firstcuflags < REQ_NONE)
          {
          if (reqcuflags >= REQ_NONE)
            {
            reqcu = firstcu;
            reqcuflags = firstcuflags;
            }
          }
        firstcuflags = REQ_NONE;
        }

      /* If we (now or from before) have no firstcu, a firstcu from the
      branch becomes a reqcu if there isn't a branch reqcu. */

      if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
          branchreqcuflags >= REQ_NONE)
        {
        branchreqcu = branchfirstcu;
        branchreqcuflags = branchfirstcuflags;
        }

      /* Now ensure that the reqcus match */

      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
          reqcu != branchreqcu)
        reqcuflags = REQ_NONE;
      else
        {
        reqcu = branchreqcu;
        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
        }
      }
    }

  /* Handle reaching the end of the expression, either ')' or end of pattern.
  In the real compile phase, go back through the alternative branches and
  reverse the chain of offsets, with the field in the BRA item now becoming an
  offset to the first alternative. If there are no alternatives, it points to
  the end of the group. The length in the terminating ket is always the length
  of the whole bracketed item. Return leaving the pointer at the terminating
  char. */

  if (META_CODE(*pptr) != META_ALT)
    {
    if (lengthptr == NULL)
      {
      /* Each link currently holds the distance back to the previous branch
      (zero in the opening bracket, which ends the walk); replace it with the
      distance forward to the next one. */

      uint32_t branch_length = (uint32_t)(code - last_branch);
      do
        {
        uint32_t prev_length = GET(last_branch, 1);
        PUT(last_branch, 1, branch_length);
        branch_length = prev_length;
        last_branch -= branch_length;
        }
      while (branch_length > 0);
      }

    /* Fill in the ket */

    *code = OP_KET;
    PUT(code, 1, (uint32_t)(code - start_bracket));
    code += 1 + LINK_SIZE;

    /* Set values to pass back */

    *codeptr = code;
    *pptrptr = pptr;
    *firstcuptr = firstcu;
    *firstcuflagsptr = firstcuflags;
    *reqcuptr = reqcu;
    *reqcuflagsptr = reqcuflags;

    /* In the pre-compile phase add this group's length to the caller's
    accumulator, giving ERR20 if the total would exceed OFLOW_MAX. */

    if (lengthptr != NULL)
      {
      if (OFLOW_MAX - *lengthptr < length)
        {
        *errorcodeptr = ERR20;
        return 0;
        }
      *lengthptr += length;
      }
    return okreturn;
    }

  /* Another branch follows. In the pre-compile phase, we can move the code
  pointer back to where it was for the start of the first branch. (That is,
  pretend that each branch is the only one.)

  In the real compile phase, insert an ALT node. Its length field points back
  to the previous branch while the bracket remains open. At the end the chain
  is reversed. It's done like this so that the start of the bracket has a
  zero offset until it is closed, making it possible to detect recursion. */

  if (lengthptr != NULL)
    {
    code = *codeptr + 1 + LINK_SIZE + skipunits;
    length += 1 + LINK_SIZE;
    }
  else
    {
    *code = OP_ALT;
    PUT(code, 1, (int)(code - last_branch));
    bc.current_branch = last_branch = code;
    code += 1 + LINK_SIZE;
    }

  /* Set the maximum lookbehind length for the next branch (if not in a
  lookbehind the value will be zero) and then advance past the vertical bar. */

  lookbehindlength = META_DATA(*pptr);
  pptr++;
  }

PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
return 0;                  /* Avoid compiler warnings */
}



/*************************************************
*          Check for anchored pattern            *
*************************************************/

/* Try to find out if this is an anchored regular expression. Consider each
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
be found, because ^ generates OP_CIRCM in that mode.

We can also consider a regex to be anchored if OP_SOM starts all its branches.
This is the code for \G, which means "match at start of match position, taking
into account the match offset".

A branch is also implicitly anchored if it starts with .* and DOTALL is set,
because that will try the rest of the pattern at all possible matching points,
so there is no point trying again.... er ....

.... 
except when the .* appears inside capturing parentheses, and there is a8614subsequent back reference to those parentheses. We haven't enough information8615to catch that case precisely.86168617At first, the best we could do was to detect when .* was in capturing brackets8618and the highest back reference was greater than or equal to that level.8619However, by keeping a bitmap of the first 31 back references, we can catch some8620of the more common cases more precisely.86218622... A second exception is when the .* appears inside an atomic group, because8623this prevents the number of characters it matches from being adjusted.86248625Arguments:8626code points to start of the compiled pattern8627bracket_map a bitmap of which brackets we are inside while testing; this8628handles up to substring 31; after that we just have to take8629the less precise approach8630cb points to the compile data block8631atomcount atomic group level8632inassert TRUE if in an assertion8633dotstar_anchor TRUE if automatic anchoring optimization is enabled86348635Returns: TRUE or FALSE8636*/86378638static BOOL8639is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,8640int atomcount, BOOL inassert, BOOL dotstar_anchor)8641{8642do {8643PCRE2_SPTR scode = first_significant_code(8644code + PRIV(OP_lengths)[*code], FALSE);8645int op = *scode;86468647/* Non-capturing brackets */86488649if (op == OP_BRA || op == OP_BRAPOS ||8650op == OP_SBRA || op == OP_SBRAPOS)8651{8652if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))8653return FALSE;8654}86558656/* Capturing brackets */86578658else if (op == OP_CBRA || op == OP_CBRAPOS ||8659op == OP_SCBRA || op == OP_SCBRAPOS)8660{8661int n = GET2(scode, 1+LINK_SIZE);8662uint32_t new_map = bracket_map | ((n < 32)? 
(1u << n) : 1);8663if (!is_anchored(scode, new_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE;8664}86658666/* Positive forward assertion */86678668else if (op == OP_ASSERT || op == OP_ASSERT_NA)8669{8670if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) return FALSE;8671}86728673/* Condition. If there is no second branch, it can't be anchored. */86748675else if (op == OP_COND || op == OP_SCOND)8676{8677if (scode[GET(scode,1)] != OP_ALT) return FALSE;8678if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))8679return FALSE;8680}86818682/* Atomic groups */86838684else if (op == OP_ONCE)8685{8686if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))8687return FALSE;8688}86898690/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and8691it isn't in brackets that are or may be referenced or inside an atomic8692group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,8693because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/8694with the subject "aab", which matches "b", i.e. not at the start of a line.8695There is also an option that disables auto-anchoring. 
*/86968697else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||8698op == OP_TYPEPOSSTAR))8699{8700if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||8701atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)8702return FALSE;8703}87048705/* Check for explicit anchoring */87068707else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;87088709code += GET(code, 1);8710}8711while (*code == OP_ALT); /* Loop for each alternative */8712return TRUE;8713}8714871587168717/*************************************************8718* Check for starting with ^ or .* *8719*************************************************/87208721/* This is called to find out if every branch starts with ^ or .* so that8722"first char" processing can be done to speed things up in multiline8723matching and for non-DOTALL patterns that start with .* (which must start at8724the beginning or after \n). As in the case of is_anchored() (see above), we8725have to take account of back references to capturing brackets that contain .*8726because in that case we can't make the assumption. 
Also, the appearance of .*8727inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE8728or *SKIP does not count, because once again the assumption no longer holds.87298730Arguments:8731code points to start of the compiled pattern or a group8732bracket_map a bitmap of which brackets we are inside while testing; this8733handles up to substring 31; after that we just have to take8734the less precise approach8735cb points to the compile data8736atomcount atomic group level8737inassert TRUE if in an assertion8738dotstar_anchor TRUE if automatic anchoring optimization is enabled87398740Returns: TRUE or FALSE8741*/87428743static BOOL8744is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,8745int atomcount, BOOL inassert, BOOL dotstar_anchor)8746{8747do {8748PCRE2_SPTR scode = first_significant_code(8749code + PRIV(OP_lengths)[*code], FALSE);8750int op = *scode;87518752/* If we are at the start of a conditional assertion group, *both* the8753conditional assertion *and* what follows the condition must satisfy the test8754for start of line. Other kinds of condition fail. Note that there may be an8755auto-callout at the start of a condition. 
*/87568757if (op == OP_COND)8758{8759scode += 1 + LINK_SIZE;87608761if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];8762else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);87638764switch (*scode)8765{8766case OP_CREF:8767case OP_DNCREF:8768case OP_RREF:8769case OP_DNRREF:8770case OP_FAIL:8771case OP_FALSE:8772case OP_TRUE:8773return FALSE;87748775default: /* Assertion */8776if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))8777return FALSE;8778do scode += GET(scode, 1); while (*scode == OP_ALT);8779scode += 1 + LINK_SIZE;8780break;8781}8782scode = first_significant_code(scode, FALSE);8783op = *scode;8784}87858786/* Non-capturing brackets */87878788if (op == OP_BRA || op == OP_BRAPOS ||8789op == OP_SBRA || op == OP_SBRAPOS)8790{8791if (!is_startline(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))8792return FALSE;8793}87948795/* Capturing brackets */87968797else if (op == OP_CBRA || op == OP_CBRAPOS ||8798op == OP_SCBRA || op == OP_SCBRAPOS)8799{8800int n = GET2(scode, 1+LINK_SIZE);8801unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);8802if (!is_startline(scode, new_map, cb, atomcount, inassert, dotstar_anchor))8803return FALSE;8804}88058806/* Positive forward assertions */88078808else if (op == OP_ASSERT || op == OP_ASSERT_NA)8809{8810if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))8811return FALSE;8812}88138814/* Atomic brackets */88158816else if (op == OP_ONCE)8817{8818if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))8819return FALSE;8820}88218822/* .* means "start at start or after \n" if it isn't in atomic brackets or8823brackets that may be referenced or an assertion, and as long as the pattern8824does not contain *PRUNE or *SKIP, because these break the feature. Consider,8825for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",8826i.e. not at the start of a line. 
There is also an option that disables this8827optimization. */88288829else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)8830{8831if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||8832atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)8833return FALSE;8834}88358836/* Check for explicit circumflex; anything else gives a FALSE result. Note8837in particular that this includes atomic brackets OP_ONCE because the number8838of characters matched by .* cannot be adjusted inside them. */88398840else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;88418842/* Move on to the next alternative */88438844code += GET(code, 1);8845}8846while (*code == OP_ALT); /* Loop for each alternative */8847return TRUE;8848}8849885088518852/*************************************************8853* Scan compiled regex for recursion reference *8854*************************************************/88558856/* This function scans through a compiled pattern until it finds an instance of8857OP_RECURSE.88588859Arguments:8860code points to start of expression8861utf TRUE in UTF mode88628863Returns: pointer to the opcode for OP_RECURSE, or NULL if not found8864*/88658866static PCRE2_UCHAR *8867find_recurse(PCRE2_UCHAR *code, BOOL utf)8868{8869for (;;)8870{8871PCRE2_UCHAR c = *code;8872if (c == OP_END) return NULL;8873if (c == OP_RECURSE) return code;88748875/* XCLASS is used for classes that cannot be represented just by a bit map.8876This includes negated single high-valued characters. ECLASS is used for8877classes that use set operations internally. CALLOUT_STR is used for8878callouts with string arguments. In each case the length in the table is8879zero; the actual length is stored in the compiled code. 
*/88808881if (c == OP_XCLASS || c == OP_ECLASS) code += GET(code, 1);8882else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);88838884/* Otherwise, we can get the item's length from the table, except that for8885repeated character types, we have to test for \p and \P, which have an extra8886two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,8887we must add in its length. */88888889else8890{8891switch(c)8892{8893case OP_TYPESTAR:8894case OP_TYPEMINSTAR:8895case OP_TYPEPLUS:8896case OP_TYPEMINPLUS:8897case OP_TYPEQUERY:8898case OP_TYPEMINQUERY:8899case OP_TYPEPOSSTAR:8900case OP_TYPEPOSPLUS:8901case OP_TYPEPOSQUERY:8902if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;8903break;89048905case OP_TYPEPOSUPTO:8906case OP_TYPEUPTO:8907case OP_TYPEMINUPTO:8908case OP_TYPEEXACT:8909if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)8910code += 2;8911break;89128913case OP_MARK:8914case OP_COMMIT_ARG:8915case OP_PRUNE_ARG:8916case OP_SKIP_ARG:8917case OP_THEN_ARG:8918code += code[1];8919break;8920}89218922/* Add in the fixed length from the table */89238924code += PRIV(OP_lengths)[c];89258926/* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may8927be followed by a multi-unit character. The length in the table is a8928minimum, so we have to arrange to skip the extra units. 
*/89298930#ifdef MAYBE_UTF_MULTI8931if (utf) switch(c)8932{8933case OP_CHAR:8934case OP_CHARI:8935case OP_NOT:8936case OP_NOTI:8937case OP_EXACT:8938case OP_EXACTI:8939case OP_NOTEXACT:8940case OP_NOTEXACTI:8941case OP_UPTO:8942case OP_UPTOI:8943case OP_NOTUPTO:8944case OP_NOTUPTOI:8945case OP_MINUPTO:8946case OP_MINUPTOI:8947case OP_NOTMINUPTO:8948case OP_NOTMINUPTOI:8949case OP_POSUPTO:8950case OP_POSUPTOI:8951case OP_NOTPOSUPTO:8952case OP_NOTPOSUPTOI:8953case OP_STAR:8954case OP_STARI:8955case OP_NOTSTAR:8956case OP_NOTSTARI:8957case OP_MINSTAR:8958case OP_MINSTARI:8959case OP_NOTMINSTAR:8960case OP_NOTMINSTARI:8961case OP_POSSTAR:8962case OP_POSSTARI:8963case OP_NOTPOSSTAR:8964case OP_NOTPOSSTARI:8965case OP_PLUS:8966case OP_PLUSI:8967case OP_NOTPLUS:8968case OP_NOTPLUSI:8969case OP_MINPLUS:8970case OP_MINPLUSI:8971case OP_NOTMINPLUS:8972case OP_NOTMINPLUSI:8973case OP_POSPLUS:8974case OP_POSPLUSI:8975case OP_NOTPOSPLUS:8976case OP_NOTPOSPLUSI:8977case OP_QUERY:8978case OP_QUERYI:8979case OP_NOTQUERY:8980case OP_NOTQUERYI:8981case OP_MINQUERY:8982case OP_MINQUERYI:8983case OP_NOTMINQUERY:8984case OP_NOTMINQUERYI:8985case OP_POSQUERY:8986case OP_POSQUERYI:8987case OP_NOTPOSQUERY:8988case OP_NOTPOSQUERYI:8989if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);8990break;8991}8992#else8993(void)(utf); /* Keep compiler happy by referencing function argument */8994#endif /* MAYBE_UTF_MULTI */8995}8996}8997}8998899990009001/*************************************************9002* Check for asserted fixed first code unit *9003*************************************************/90049005/* During compilation, the "first code unit" settings from forward assertions9006are discarded, because they can cause conflicts with actual literals that9007follow. However, if we end up without a first code unit setting for an9008unanchored pattern, it is worth scanning the regex to see if there is an9009initial asserted first code unit. 
If all branches start with the same asserted9010code unit, or with a non-conditional bracket all of whose alternatives start9011with the same asserted code unit (recurse ad lib), then we return that code9012unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with9013REQ_NONE in the flags.90149015Arguments:9016code points to start of compiled pattern9017flags points to the first code unit flags9018inassert non-zero if in an assertion90199020Returns: the fixed first code unit, or 0 with REQ_NONE in flags9021*/90229023static uint32_t9024find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)9025{9026uint32_t c = 0;9027uint32_t cflags = REQ_NONE;90289029*flags = REQ_NONE;9030do {9031uint32_t d;9032uint32_t dflags;9033int xl = (*code == OP_CBRA || *code == OP_SCBRA ||9034*code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;9035PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);9036PCRE2_UCHAR op = *scode;90379038switch(op)9039{9040default:9041return 0;90429043case OP_BRA:9044case OP_BRAPOS:9045case OP_CBRA:9046case OP_SCBRA:9047case OP_CBRAPOS:9048case OP_SCBRAPOS:9049case OP_ASSERT:9050case OP_ASSERT_NA:9051case OP_ONCE:9052case OP_SCRIPT_RUN:9053d = find_firstassertedcu(scode, &dflags, inassert +9054((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));9055if (dflags >= REQ_NONE) return 0;9056if (cflags >= REQ_NONE) { c = d; cflags = dflags; }9057else if (c != d || cflags != dflags) return 0;9058break;90599060case OP_EXACT:9061scode += IMM2_SIZE;9062/* Fall through */90639064case OP_CHAR:9065case OP_PLUS:9066case OP_MINPLUS:9067case OP_POSPLUS:9068if (inassert == 0) return 0;9069if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }9070else if (c != scode[1]) return 0;9071break;90729073case OP_EXACTI:9074scode += IMM2_SIZE;9075/* Fall through */90769077case OP_CHARI:9078case OP_PLUSI:9079case OP_MINPLUSI:9080case OP_POSPLUSI:9081if (inassert == 0) return 0;90829083/* If the character is more than one code unit 
long, we cannot set its9084first code unit when matching caselessly. Later scanning may pick up9085multiple code units. */90869087#ifdef SUPPORT_UNICODE9088#if PCRE2_CODE_UNIT_WIDTH == 89089if (scode[1] >= 0x80) return 0;9090#elif PCRE2_CODE_UNIT_WIDTH == 169091if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;9092#endif9093#endif90949095if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }9096else if (c != scode[1]) return 0;9097break;9098}90999100code += GET(code, 1);9101}9102while (*code == OP_ALT);91039104*flags = cflags;9105return c;9106}9107910891099110/*************************************************9111* Add an entry to the name/number table *9112*************************************************/91139114/* This function is called between compiling passes to add an entry to the9115name/number table, maintaining alphabetical order. Checking for permitted9116and forbidden duplicates has already been done.91179118Arguments:9119cb the compile data block9120name the name to add9121length the length of the name9122groupno the group number9123tablecount the count of names in the table so far91249125Returns: nothing9126*/91279128static void9129add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,9130unsigned int groupno, uint32_t tablecount)9131{9132uint32_t i;9133PCRE2_UCHAR *slot = cb->name_table;91349135for (i = 0; i < tablecount; i++)9136{9137int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));9138if (crc == 0 && slot[IMM2_SIZE+length] != 0)9139crc = -1; /* Current name is a substring */91409141/* Make space in the table and break the loop for an earlier name. For a9142duplicate or later name, carry on. We do this for duplicates so that in the9143simple case (when ?(| is not used) they are in order of their numbers. In all9144cases they are in the order in which they appear in the pattern. 
*/91459146if (crc < 0)9147{9148(void)memmove(slot + cb->name_entry_size, slot,9149CU2BYTES((tablecount - i) * cb->name_entry_size));9150break;9151}91529153/* Continue the loop for a later or duplicate name */91549155slot += cb->name_entry_size;9156}91579158PUT2(slot, 0, groupno);9159memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));91609161/* Add a terminating zero and fill the rest of the slot with zeroes so that9162the memory is all initialized. Otherwise valgrind moans about uninitialized9163memory when saving serialized compiled patterns. */91649165memset(slot + IMM2_SIZE + length, 0,9166CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));9167}9168916991709171/*************************************************9172* Skip in parsed pattern *9173*************************************************/91749175/* This function is called to skip parts of the parsed pattern when finding the9176length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find9177the end of the branch, it is called to skip over an internal lookaround or9178(DEFINE) group, and it is also called to skip to the end of a class, during9179which it will never encounter nested groups (but there's no need to have9180special code for that).91819182When called to find the end of a branch or group, pptr must point to the first9183meta code inside the branch, not the branch-starting code. 
In other cases it9184can point to the item that causes the function to be called.91859186Arguments:9187pptr current pointer to skip from9188skiptype PSKIP_CLASS when skipping to end of class9189PSKIP_ALT when META_ALT ends the skip9190PSKIP_KET when only META_KET ends the skip91919192Returns: new value of pptr9193NULL if META_END is reached - should never occur9194or for an unknown meta value - likewise9195*/91969197static uint32_t *9198parsed_skip(uint32_t *pptr, uint32_t skiptype)9199{9200uint32_t nestlevel = 0;92019202for (;; pptr++)9203{9204uint32_t meta = META_CODE(*pptr);92059206switch(meta)9207{9208default: /* Just skip over most items */9209if (meta < META_END) continue; /* Literal */9210break;92119212case META_END:92139214/* The parsed regex is malformed; we have reached the end and did9215not find the end of the construct which we are skipping over. */92169217PCRE2_DEBUG_UNREACHABLE();9218return NULL;92199220/* The data for these items is variable in length. */92219222case META_BACKREF: /* Offset is present only if group >= 10 */9223if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;9224break;92259226case META_ESCAPE:9227if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)9228pptr += 1; /* Skip prop data */9229break;92309231case META_MARK: /* Add the length of the name. */9232case META_COMMIT_ARG:9233case META_PRUNE_ARG:9234case META_SKIP_ARG:9235case META_THEN_ARG:9236pptr += pptr[1];9237break;92389239/* These are the "active" items in this loop. 
*/92409241case META_CLASS_END:9242if (skiptype == PSKIP_CLASS) return pptr;9243break;92449245case META_ATOMIC:9246case META_CAPTURE:9247case META_COND_ASSERT:9248case META_COND_DEFINE:9249case META_COND_NAME:9250case META_COND_NUMBER:9251case META_COND_RNAME:9252case META_COND_RNUMBER:9253case META_COND_VERSION:9254case META_SCS:9255case META_LOOKAHEAD:9256case META_LOOKAHEADNOT:9257case META_LOOKAHEAD_NA:9258case META_LOOKBEHIND:9259case META_LOOKBEHINDNOT:9260case META_LOOKBEHIND_NA:9261case META_NOCAPTURE:9262case META_SCRIPT_RUN:9263nestlevel++;9264break;92659266case META_ALT:9267if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;9268break;92699270case META_KET:9271if (nestlevel == 0) return pptr;9272nestlevel--;9273break;9274}92759276/* The extra data item length for each meta is in a table. */92779278meta = (meta >> 16) & 0x7fff;9279if (meta >= sizeof(meta_extra_lengths)) return NULL;9280pptr += meta_extra_lengths[meta];9281}92829283PCRE2_UNREACHABLE(); /* Control never reaches here */9284}9285928692879288/*************************************************9289* Find length of a parsed group *9290*************************************************/92919292/* This is called for nested groups within a branch of a lookbehind whose9293length is being computed. On entry, the pointer must be at the first element9294after the group initializing code. On exit it points to OP_KET. 
Caching is used9295to improve processing speed when the same capturing group occurs many times.92969297Arguments:9298pptrptr pointer to pointer in the parsed pattern9299minptr where to return the minimum length9300isinline FALSE if a reference or recursion; TRUE for inline group9301errcodeptr pointer to the errorcode9302lcptr pointer to the loop counter9303group number of captured group or -1 for a non-capturing group9304recurses chain of recurse_check to catch mutual recursion9305cb pointer to the compile data93069307Returns: the maximum group length or a negative number9308*/93099310static int9311get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,9312int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)9313{9314uint32_t *gi = cb->groupinfo + 2 * group;9315int branchlength, branchminlength;9316int grouplength = -1;9317int groupminlength = INT_MAX;93189319/* The cache can be used only if there is no possibility of there being two9320groups with the same number. We do not need to set the end pointer for a group9321that is being processed as a back reference or recursion, but we must do so for9322an inline group. */93239324if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)9325{9326uint32_t groupinfo = gi[0];9327if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;9328if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)9329{9330if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);9331*minptr = gi[1];9332return groupinfo & GI_FIXED_LENGTH_MASK;9333}9334}93359336/* Scan the group. In this case we find the end pointer of necessity. 
*/93379338for(;;)9339{9340branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,9341recurses, cb);9342if (branchlength < 0) goto ISNOTFIXED;9343if (branchlength > grouplength) grouplength = branchlength;9344if (branchminlength < groupminlength) groupminlength = branchminlength;9345if (**pptrptr == META_KET) break;9346*pptrptr += 1; /* Skip META_ALT */9347}93489349if (group > 0)9350{9351gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);9352gi[1] = groupminlength;9353}93549355*minptr = groupminlength;9356return grouplength;93579358ISNOTFIXED:9359if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;9360return -1;9361}9362936393649365/*************************************************9366* Find length of a parsed branch *9367*************************************************/93689369/* Return fixed maximum and minimum lengths for a branch in a lookbehind,9370giving an error if the length is not limited. On entry, *pptrptr points to the9371first element inside the branch. On exit it is set to point to the ALT or KET.93729373Arguments:9374pptrptr pointer to pointer in the parsed pattern9375minptr where to return the minimum length9376errcodeptr pointer to error code9377lcptr pointer to loop counter9378recurses chain of recurse_check to catch mutual recursion9379cb pointer to compile block93809381Returns: the maximum length, or a negative value on error9382*/93839384static int9385get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,9386parsed_recurse_check *recurses, compile_block *cb)9387{9388int branchlength = 0;9389int branchminlength = 0;9390int grouplength, groupminlength;9391uint32_t lastitemlength = 0;9392uint32_t lastitemminlength = 0;9393uint32_t *pptr = *pptrptr;9394PCRE2_SIZE offset;9395parsed_recurse_check this_recurse;93969397/* A large and/or complex regex can take too long to process. This can happen9398more often when (?| groups are present in the pattern because their length9399cannot be cached. 
*/94009401if ((*lcptr)++ > 2000)9402{9403*errcodeptr = ERR35; /* Lookbehind is too complicated */9404return -1;9405}94069407/* Scan the branch, accumulating the length. */94089409for (;; pptr++)9410{9411parsed_recurse_check *r;9412uint32_t *gptr, *gptrend;9413uint32_t escape;9414uint32_t min, max;9415uint32_t group = 0;9416uint32_t itemlength = 0;9417uint32_t itemminlength = 0;94189419if (*pptr < META_END)9420{9421itemlength = itemminlength = 1;9422}94239424else switch (META_CODE(*pptr))9425{9426case META_KET:9427case META_ALT:9428goto EXIT;94299430/* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the9431actual termination. */94329433case META_ACCEPT:9434case META_FAIL:9435pptr = parsed_skip(pptr, PSKIP_ALT);9436if (pptr == NULL) goto PARSED_SKIP_FAILED;9437goto EXIT;94389439case META_MARK:9440case META_COMMIT_ARG:9441case META_PRUNE_ARG:9442case META_SKIP_ARG:9443case META_THEN_ARG:9444pptr += pptr[1] + 1;9445break;94469447case META_CIRCUMFLEX:9448case META_COMMIT:9449case META_DOLLAR:9450case META_PRUNE:9451case META_SKIP:9452case META_THEN:9453break;94549455case META_OPTIONS:9456pptr += 2;9457break;94589459case META_BIGVALUE:9460itemlength = itemminlength = 1;9461pptr += 1;9462break;94639464case META_CLASS:9465case META_CLASS_NOT:9466itemlength = itemminlength = 1;9467pptr = parsed_skip(pptr, PSKIP_CLASS);9468if (pptr == NULL) goto PARSED_SKIP_FAILED;9469break;94709471case META_CLASS_EMPTY_NOT:9472case META_DOT:9473itemlength = itemminlength = 1;9474break;94759476case META_CALLOUT_NUMBER:9477pptr += 3;9478break;94799480case META_CALLOUT_STRING:9481pptr += 3 + SIZEOFFSET;9482break;94839484/* Only some escapes consume a character. Of those, \R can match one or two9485characters, but \X is never allowed because it matches an unknown number of9486characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. 
*/94879488case META_ESCAPE:9489escape = META_DATA(*pptr);9490if (escape == ESC_X) return -1;9491if (escape == ESC_R)9492{9493itemminlength = 1;9494itemlength = 2;9495}9496else if (escape > ESC_b && escape < ESC_Z)9497{9498#if PCRE2_CODE_UNIT_WIDTH != 329499if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)9500{9501*errcodeptr = ERR36;9502return -1;9503}9504#endif9505itemlength = itemminlength = 1;9506if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */9507}9508break;95099510/* Lookaheads do not contribute to the length of this branch, but they may9511contain lookbehinds within them whose lengths need to be set. */95129513case META_LOOKAHEAD:9514case META_LOOKAHEADNOT:9515case META_LOOKAHEAD_NA:9516case META_SCS:9517*errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);9518if (*errcodeptr != 0) return -1;95199520/* Ignore any qualifiers that follow a lookahead assertion. */95219522switch (pptr[1])9523{9524case META_ASTERISK:9525case META_ASTERISK_PLUS:9526case META_ASTERISK_QUERY:9527case META_PLUS:9528case META_PLUS_PLUS:9529case META_PLUS_QUERY:9530case META_QUERY:9531case META_QUERY_PLUS:9532case META_QUERY_QUERY:9533pptr++;9534break;95359536case META_MINMAX:9537case META_MINMAX_PLUS:9538case META_MINMAX_QUERY:9539pptr += 3;9540break;95419542default:9543break;9544}9545break;95469547/* A nested lookbehind does not contribute any length to this lookbehind,9548but must itself be checked and have its lengths set. Note that9549set_lookbehind_lengths() updates pptr, leaving it pointing to the final ket9550of the group, so no need to update it here. */95519552case META_LOOKBEHIND:9553case META_LOOKBEHINDNOT:9554case META_LOOKBEHIND_NA:9555if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))9556return -1;9557break;95589559/* Back references and recursions are handled by very similar code. 
At this9560stage, the names generated in the parsing pass are available, but the main9561name table has not yet been created. So for the named varieties, scan the9562list of names in order to get the number of the first one in the pattern,9563and whether or not this name is duplicated. */95649565case META_BACKREF_BYNAME:9566if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)9567goto ISNOTFIXED;9568/* Fall through */95699570case META_RECURSE_BYNAME:9571{9572int i;9573PCRE2_SPTR name;9574BOOL is_dupname = FALSE;9575named_group *ng = cb->named_groups;9576uint32_t meta_code = META_CODE(*pptr);9577uint32_t length = *(++pptr);95789579GETPLUSOFFSET(offset, pptr);9580name = cb->start_pattern + offset;9581for (i = 0; i < cb->names_found; i++, ng++)9582{9583if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)9584{9585group = ng->number;9586is_dupname = ng->isdup;9587break;9588}9589}95909591if (group == 0)9592{9593*errcodeptr = ERR15; /* Non-existent subpattern */9594cb->erroroffset = offset;9595return -1;9596}95979598/* A numerical back reference can be fixed length if duplicate capturing9599groups are not being used. A non-duplicate named back reference can also9600be handled. */96019602if (meta_code == META_RECURSE_BYNAME ||9603(!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))9604goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */9605}9606goto ISNOTFIXED; /* Duplicate name or number */96079608/* The offset values for back references < 10 are in a separate vector9609because otherwise they would use more than two parsed pattern elements on961064-bit systems. 
*/96119612case META_BACKREF:9613if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||9614(cb->external_flags & PCRE2_DUPCAPUSED) != 0)9615goto ISNOTFIXED;9616group = META_DATA(*pptr);9617if (group < 10)9618{9619offset = cb->small_ref_offset[group];9620goto RECURSE_OR_BACKREF_LENGTH;9621}96229623/* Fall through */9624/* For groups >= 10 - picking up group twice does no harm. */96259626/* A true recursion implies not fixed length, but a subroutine call may9627be OK. Back reference "recursions" are also failed. */96289629case META_RECURSE:9630group = META_DATA(*pptr);9631GETPLUSOFFSET(offset, pptr);96329633RECURSE_OR_BACKREF_LENGTH:9634if (group > cb->bracount)9635{9636cb->erroroffset = offset;9637*errcodeptr = ERR15; /* Non-existent subpattern */9638return -1;9639}9640if (group == 0) goto ISNOTFIXED; /* Local recursion */9641for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)9642{9643if (META_CODE(*gptr) == META_BIGVALUE) gptr++;9644else if (*gptr == (META_CAPTURE | group)) break;9645}96469647/* We must start the search for the end of the group at the first meta code9648inside the group. Otherwise it will be treated as an enclosed group. */96499650gptrend = parsed_skip(gptr + 1, PSKIP_KET);9651if (gptrend == NULL) goto PARSED_SKIP_FAILED;9652if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */9653for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;9654if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */9655this_recurse.prev = recurses;9656this_recurse.groupptr = gptr;96579658/* We do not need to know the position of the end of the group, that is,9659gptr is not used after the call to get_grouplength(). Setting the second9660argument FALSE stops it scanning for the end when the length can be found9661in the cache. 
*/96629663gptr++;9664grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,9665lcptr, group, &this_recurse, cb);9666if (grouplength < 0)9667{9668if (*errcodeptr == 0) goto ISNOTFIXED;9669return -1; /* Error already set */9670}9671itemlength = grouplength;9672itemminlength = groupminlength;9673break;96749675/* A (DEFINE) group is never obeyed inline and so it does not contribute to9676the length of this branch. Skip from the following item to the next9677unpaired ket. */96789679case META_COND_DEFINE:9680pptr = parsed_skip(pptr + 1, PSKIP_KET);9681break;96829683/* Check other nested groups - advance past the initial data for each type9684and then seek a fixed length with get_grouplength(). */96859686case META_COND_NAME:9687case META_COND_NUMBER:9688case META_COND_RNAME:9689case META_COND_RNUMBER:9690pptr += 2 + SIZEOFFSET;9691goto CHECK_GROUP;96929693case META_COND_ASSERT:9694pptr += 1;9695goto CHECK_GROUP;96969697case META_COND_VERSION:9698pptr += 4;9699goto CHECK_GROUP;97009701case META_CAPTURE:9702group = META_DATA(*pptr);9703/* Fall through */97049705case META_ATOMIC:9706case META_NOCAPTURE:9707case META_SCRIPT_RUN:9708pptr++;9709CHECK_GROUP:9710grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,9711lcptr, group, recurses, cb);9712if (grouplength < 0) return -1;9713itemlength = grouplength;9714itemminlength = groupminlength;9715break;97169717case META_QUERY:9718case META_QUERY_PLUS:9719case META_QUERY_QUERY:9720min = 0;9721max = 1;9722goto REPETITION;97239724/* Exact repetition is OK; variable repetition is not. A repetition of zero9725must subtract the length that has already been added. 
*/97269727case META_MINMAX:9728case META_MINMAX_PLUS:9729case META_MINMAX_QUERY:9730min = pptr[1];9731max = pptr[2];9732pptr += 2;97339734REPETITION:9735if (max != REPEAT_UNLIMITED)9736{9737if (lastitemlength != 0 && /* Should not occur, but just in case */9738max != 0 &&9739(INT_MAX - branchlength)/lastitemlength < max - 1)9740{9741*errcodeptr = ERR87; /* Integer overflow; lookbehind too big */9742return -1;9743}9744if (min == 0) branchminlength -= lastitemminlength;9745else itemminlength = (min - 1) * lastitemminlength;9746if (max == 0) branchlength -= lastitemlength;9747else itemlength = (max - 1) * lastitemlength;9748break;9749}9750/* Fall through */97519752/* Any other item means this branch does not have a fixed length. */97539754default:9755ISNOTFIXED:9756*errcodeptr = ERR25; /* Not fixed length */9757return -1;9758}97599760/* Add the item length to the branchlength, checking for integer overflow and9761for the branch length exceeding the overall limit. Later, if there is at9762least one variable-length branch in the group, there is a test for the9763(smaller) variable-length branch length limit. */97649765if (INT_MAX - branchlength < (int)itemlength ||9766(branchlength += itemlength) > LOOKBEHIND_MAX)9767{9768*errcodeptr = ERR87;9769return -1;9770}97719772branchminlength += itemminlength;97739774/* Save this item length for use if the next item is a quantifier. */97759776lastitemlength = itemlength;9777lastitemminlength = itemminlength;9778}97799780EXIT:9781*pptrptr = pptr;9782*minptr = branchminlength;9783return branchlength;97849785PARSED_SKIP_FAILED:9786PCRE2_DEBUG_UNREACHABLE();9787*errcodeptr = ERR90; /* Unhandled META code - internal error */9788return -1;9789}9790979197929793/*************************************************9794* Set lengths in a lookbehind *9795*************************************************/97969797/* This function is called for each lookbehind, to set the lengths in its9798branches. 
An error occurs if any branch does not have a limited maximum length9799that is less than the limit (65535). On exit, the pointer must be left on the9800final ket.98019802The function also maintains the max_lookbehind value. Any lookbehind branch9803that contains a nested lookbehind may actually look further back than the9804length of the branch. The additional amount is passed back from9805get_branchlength() as an "extra" value.98069807Arguments:9808pptrptr pointer to pointer in the parsed pattern9809errcodeptr pointer to error code9810lcptr pointer to loop counter9811recurses chain of recurse_check to catch mutual recursion9812cb pointer to compile block98139814Returns: TRUE if all is well9815FALSE otherwise, with error code and offset set9816*/98179818static BOOL9819set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,9820parsed_recurse_check *recurses, compile_block *cb)9821{9822PCRE2_SIZE offset;9823uint32_t *bptr = *pptrptr;9824uint32_t *gbptr = bptr;9825int maxlength = 0;9826int minlength = INT_MAX;9827BOOL variable = FALSE;98289829READPLUSOFFSET(offset, bptr); /* Offset for error messages */9830*pptrptr += SIZEOFFSET;98319832/* Each branch can have a different maximum length, but we can keep only a9833single minimum for the whole group, because there's nowhere to save individual9834values in the META_ALT item. */98359836do9837{9838int branchlength, branchminlength;98399840*pptrptr += 1;9841branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,9842recurses, cb);98439844if (branchlength < 0)9845{9846/* The errorcode and offset may already be set from a nested lookbehind. 
*/9847if (*errcodeptr == 0) *errcodeptr = ERR25;9848if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;9849return FALSE;9850}98519852if (branchlength != branchminlength) variable = TRUE;9853if (branchminlength < minlength) minlength = branchminlength;9854if (branchlength > maxlength) maxlength = branchlength;9855if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;9856*bptr |= branchlength; /* branchlength never more than 65535 */9857bptr = *pptrptr;9858}9859while (META_CODE(*bptr) == META_ALT);98609861/* If any branch is of variable length, the whole lookbehind is of variable9862length. If the maximum length of any branch exceeds the maximum for variable9863lookbehinds, give an error. Otherwise, the minimum length is set in the word9864that follows the original group META value. For a fixed-length lookbehind, this9865is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but9866possibly different) length. */98679868if (variable)9869{9870gbptr[1] = minlength;9871if ((PCRE2_SIZE)maxlength > cb->max_varlookbehind)9872{9873*errcodeptr = ERR100;9874cb->erroroffset = offset;9875return FALSE;9876}9877}9878else gbptr[1] = LOOKBEHIND_MAX;98799880return TRUE;9881}9882988398849885/*************************************************9886* Check parsed pattern lookbehinds *9887*************************************************/98889889/* This function is called at the end of parsing a pattern if any lookbehinds9890were encountered. It scans the parsed pattern for them, calling9891set_lookbehind_lengths() for each one. At the start, the errorcode is zero and9892the error offset is marked unset. The enables the functions above not to9893override settings from deeper nestings.98949895This function is called recursively from get_branchlength() for lookaheads in9896order to process any lookbehinds that they may contain. 
It stops when it hits a9897non-nested closing parenthesis in this case, returning a pointer to it.98989899Arguments9900pptr points to where to start (start of pattern or start of lookahead)9901retptr if not NULL, return the ket pointer here9902recurses chain of recurse_check to catch mutual recursion9903cb points to the compile block9904lcptr points to loop counter99059906Returns: 0 on success, or an errorcode (cb->erroroffset will be set)9907*/99089909static int9910check_lookbehinds(uint32_t *pptr, uint32_t **retptr,9911parsed_recurse_check *recurses, compile_block *cb, int *lcptr)9912{9913int errorcode = 0;9914int nestlevel = 0;99159916cb->erroroffset = PCRE2_UNSET;99179918for (; *pptr != META_END; pptr++)9919{9920if (*pptr < META_END) continue; /* Literal */99219922switch (META_CODE(*pptr))9923{9924default:99259926/* The following erroroffset is a bogus but safe value. This branch should9927be avoided by providing a proper implementation for all supported cases9928below. */99299930PCRE2_DEBUG_UNREACHABLE();9931cb->erroroffset = 0;9932return ERR70; /* Unrecognized meta code */99339934case META_ESCAPE:9935if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)9936pptr += 1; /* Skip prop data */9937break;99389939case META_KET:9940if (--nestlevel < 0)9941{9942if (retptr != NULL) *retptr = pptr;9943return 0;9944}9945break;99469947case META_ATOMIC:9948case META_CAPTURE:9949case META_COND_ASSERT:9950case META_SCS:9951case META_LOOKAHEAD:9952case META_LOOKAHEADNOT:9953case META_LOOKAHEAD_NA:9954case META_NOCAPTURE:9955case META_SCRIPT_RUN:9956nestlevel++;9957break;99589959case META_ACCEPT:9960case META_ALT:9961case META_ASTERISK:9962case META_ASTERISK_PLUS:9963case META_ASTERISK_QUERY:9964case META_BACKREF:9965case META_CIRCUMFLEX:9966case META_CLASS:9967case META_CLASS_EMPTY:9968case META_CLASS_EMPTY_NOT:9969case META_CLASS_END:9970case META_CLASS_NOT:9971case META_COMMIT:9972case META_DOLLAR:9973case META_DOT:9974case META_FAIL:9975case META_PLUS:9976case 
META_PLUS_PLUS:9977case META_PLUS_QUERY:9978case META_PRUNE:9979case META_QUERY:9980case META_QUERY_PLUS:9981case META_QUERY_QUERY:9982case META_RANGE_ESCAPED:9983case META_RANGE_LITERAL:9984case META_SKIP:9985case META_THEN:9986break;99879988case META_OFFSET:9989case META_RECURSE:9990pptr += SIZEOFFSET;9991break;99929993case META_BACKREF_BYNAME:9994case META_RECURSE_BYNAME:9995pptr += 1 + SIZEOFFSET;9996break;99979998case META_COND_DEFINE:9999pptr += SIZEOFFSET;10000nestlevel++;10001break;1000210003case META_COND_NAME:10004case META_COND_NUMBER:10005case META_COND_RNAME:10006case META_COND_RNUMBER:10007pptr += 1 + SIZEOFFSET;10008nestlevel++;10009break;1001010011case META_COND_VERSION:10012pptr += 3;10013nestlevel++;10014break;1001510016case META_CALLOUT_STRING:10017pptr += 3 + SIZEOFFSET;10018break;1001910020case META_BIGVALUE:10021case META_POSIX:10022case META_POSIX_NEG:10023case META_SCS_NAME:10024case META_SCS_NUMBER:10025pptr += 1;10026break;1002710028case META_MINMAX:10029case META_MINMAX_QUERY:10030case META_MINMAX_PLUS:10031case META_OPTIONS:10032pptr += 2;10033break;1003410035case META_CALLOUT_NUMBER:10036pptr += 3;10037break;1003810039case META_MARK:10040case META_COMMIT_ARG:10041case META_PRUNE_ARG:10042case META_SKIP_ARG:10043case META_THEN_ARG:10044pptr += 1 + pptr[1];10045break;1004610047/* Note that set_lookbehind_lengths() updates pptr, leaving it pointing to10048the final ket of the group, so no need to update it here. 
*/1004910050case META_LOOKBEHIND:10051case META_LOOKBEHINDNOT:10052case META_LOOKBEHIND_NA:10053if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))10054return errorcode;10055break;10056}10057}1005810059return 0;10060}10061100621006310064/*************************************************10065* External function to compile a pattern *10066*************************************************/1006710068/* This function reads a regular expression in the form of a string and returns10069a pointer to a block of store holding a compiled version of the expression.1007010071Arguments:10072pattern the regular expression10073patlen the length of the pattern, or PCRE2_ZERO_TERMINATED10074options option bits10075errorptr pointer to errorcode10076erroroffset pointer to error offset10077ccontext points to a compile context or is NULL1007810079Returns: pointer to compiled data block, or NULL on error,10080with errorcode and erroroffset set10081*/1008210083PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION10084pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,10085int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)10086{10087BOOL utf; /* Set TRUE for UTF mode */10088BOOL ucp; /* Set TRUE for UCP mode */10089BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */10090BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */10091pcre2_real_code *re = NULL; /* What we will return */10092compile_block cb; /* "Static" compile-time data */10093const uint8_t *tables; /* Char tables base pointer */1009410095PCRE2_UCHAR *code; /* Current pointer in compiled code */10096PCRE2_UCHAR * codestart; /* Start of compiled code */10097PCRE2_SPTR ptr; /* Current pointer in pattern */10098uint32_t *pptr; /* Current pointer in parsed pattern */1009910100PCRE2_SIZE length = 1; /* Allow for final END opcode */10101PCRE2_SIZE usedlength; /* Actual length used */10102PCRE2_SIZE re_blocksize; /* Size of memory block */10103PCRE2_SIZE 
parsed_size_needed; /* Needed for parsed pattern */1010410105uint32_t firstcuflags, reqcuflags; /* Type of first/req code unit */10106uint32_t firstcu, reqcu; /* Value of first/req code unit */10107uint32_t setflags = 0; /* NL and BSR set flags */10108uint32_t xoptions; /* Flags from context, modified */1010910110uint32_t skipatstart; /* When checking (*UTF) etc */10111uint32_t limit_heap = UINT32_MAX;10112uint32_t limit_match = UINT32_MAX; /* Unset match limits */10113uint32_t limit_depth = UINT32_MAX;1011410115int newline = 0; /* Unset; can be set by the pattern */10116int bsr = 0; /* Unset; can be set by the pattern */10117int errorcode = 0; /* Initialize to avoid compiler warn */10118int regexrc; /* Return from compile */1011910120uint32_t i; /* Local loop counter */1012110122/* Enable all optimizations by default. */10123uint32_t optim_flags = ccontext != NULL ? ccontext->optimization_flags :10124PCRE2_OPTIMIZATION_ALL;1012510126/* Comments at the head of this file explain about these variables. */1012710128uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];10129uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];10130named_group named_groups[NAMED_GROUP_LIST_SIZE];1013110132/* The workspace is used in different ways in the different compiling phases.10133It needs to be 16-bit aligned for the preliminary parsing scan. */1013410135uint32_t c16workspace[C16_WORK_SIZE];10136PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;101371013810139/* -------------- Check arguments and set up the pattern ----------------- */1014010141/* There must be error code and offset pointers. */1014210143if (errorptr == NULL || erroroffset == NULL) return NULL;10144*errorptr = ERR0;10145*erroroffset = 0;1014610147/* There must be a pattern, but NULL is allowed with zero length. 
*/1014810149if (pattern == NULL)10150{10151if (patlen == 0) pattern = (PCRE2_SPTR)""; else10152{10153*errorptr = ERR16;10154return NULL;10155}10156}1015710158/* A NULL compile context means "use a default context" */1015910160if (ccontext == NULL)10161ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));1016210163/* PCRE2_MATCH_INVALID_UTF implies UTF */1016410165if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;1016610167/* Check that all undefined public option bits are zero. */1016810169if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||10170(ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)10171{10172*errorptr = ERR17;10173return NULL;10174}1017510176if ((options & PCRE2_LITERAL) != 0 &&10177((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||10178(ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))10179{10180*errorptr = ERR92;10181return NULL;10182}1018310184/* A zero-terminated pattern is indicated by the special length value10185PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */1018610187if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))10188patlen = PRIV(strlen)(pattern);10189(void)zero_terminated; /* Silence compiler; only used if Valgrind enabled */1019010191if (patlen > ccontext->max_pattern_length)10192{10193*errorptr = ERR88;10194return NULL;10195}1019610197/* Optimization flags in 'options' can override those in the compile context.10198This is because some options to disable optimizations were added before the10199optimization flags word existed, and we need to continue supporting them10200for backwards compatibility. 
*/1020110202if ((options & PCRE2_NO_AUTO_POSSESS) != 0)10203optim_flags &= ~PCRE2_OPTIM_AUTO_POSSESS;10204if ((options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)10205optim_flags &= ~PCRE2_OPTIM_DOTSTAR_ANCHOR;10206if ((options & PCRE2_NO_START_OPTIMIZE) != 0)10207optim_flags &= ~PCRE2_OPTIM_START_OPTIMIZE;1020810209/* From here on, all returns from this function should end up going via the10210EXIT label. */102111021210213/* ------------ Initialize the "static" compile data -------------- */1021410215tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);1021610217cb.lcc = tables + lcc_offset; /* Individual */10218cb.fcc = tables + fcc_offset; /* character */10219cb.cbits = tables + cbits_offset; /* tables */10220cb.ctypes = tables + ctypes_offset;1022110222cb.assert_depth = 0;10223cb.bracount = 0;10224cb.cx = ccontext;10225cb.dupnames = FALSE;10226cb.end_pattern = pattern + patlen;10227cb.erroroffset = 0;10228cb.external_flags = 0;10229cb.external_options = options;10230cb.groupinfo = stack_groupinfo;10231cb.had_recurse = FALSE;10232cb.lastcapture = 0;10233cb.max_lookbehind = 0; /* Max encountered */10234cb.max_varlookbehind = ccontext->max_varlookbehind; /* Limit */10235cb.name_entry_size = 0;10236cb.name_table = NULL;10237cb.named_groups = named_groups;10238cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;10239cb.names_found = 0;10240cb.parens_depth = 0;10241cb.parsed_pattern = stack_parsed_pattern;10242cb.req_varyopt = 0;10243cb.start_code = cworkspace;10244cb.start_pattern = pattern;10245cb.start_workspace = cworkspace;10246cb.workspace_size = COMPILE_WORK_SIZE;10247#ifdef SUPPORT_WIDE_CHARS10248cb.cranges = NULL;10249cb.next_cranges = NULL;10250cb.char_lists_size = 0;10251#endif1025210253/* Maximum back reference and backref bitmap. 
The bitmap records up to 31 back10254references to help in deciding whether (.*) can be treated as anchored or not.10255*/1025610257cb.top_backref = 0;10258cb.backref_map = 0;1025910260/* Escape sequences \1 to \9 are always back references, but as they are only10261two characters long, only two elements can be used in the parsed_pattern10262vector. The first contains the reference, and we'd like to use the second to10263record the offset in the pattern, so that forward references to non-existent10264groups can be diagnosed later with an offset. However, on 64-bit systems,10265PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first10266occurrence of \1 to \9, indexed by the second parsed_pattern value. All other10267references have enough space for the offset to be put into the parsed pattern.10268*/1026910270for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;102711027210273/* --------------- Start looking at the pattern --------------- */1027410275/* Unless PCRE2_LITERAL is set, check for global one-time option settings at10276the start of the pattern, and remember the offset to the actual regex. With10277valgrind support, make the terminator of a zero-terminated pattern10278inaccessible. This catches bugs that would otherwise only show up for10279non-zero-terminated patterns. 
*/1028010281#ifdef SUPPORT_VALGRIND10282if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));10283#endif1028410285xoptions = ccontext->extra_options;10286ptr = pattern;10287skipatstart = 0;1028810289if ((options & PCRE2_LITERAL) == 0)10290{10291while (patlen - skipatstart >= 2 &&10292ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&10293ptr[skipatstart+1] == CHAR_ASTERISK)10294{10295for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)10296{10297const pso *p = pso_list + i;1029810299if (patlen - skipatstart - 2 >= p->length &&10300PRIV(strncmp_c8)(ptr + skipatstart + 2, p->name, p->length) == 0)10301{10302uint32_t c, pp;1030310304skipatstart += p->length + 2;10305switch(p->type)10306{10307case PSO_OPT:10308cb.external_options |= p->value;10309break;1031010311case PSO_XOPT:10312xoptions |= p->value;10313break;1031410315case PSO_FLG:10316setflags |= p->value;10317break;1031810319case PSO_NL:10320newline = p->value;10321setflags |= PCRE2_NL_SET;10322break;1032310324case PSO_BSR:10325bsr = p->value;10326setflags |= PCRE2_BSR_SET;10327break;1032810329case PSO_LIMM:10330case PSO_LIMD:10331case PSO_LIMH:10332c = 0;10333pp = skipatstart;10334while (pp < patlen && IS_DIGIT(ptr[pp]))10335{10336if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */10337c = c*10 + (ptr[pp++] - CHAR_0);10338}10339if (pp >= patlen || pp == skipatstart || ptr[pp] != CHAR_RIGHT_PARENTHESIS)10340{10341errorcode = ERR60;10342ptr += pp;10343goto HAD_EARLY_ERROR;10344}10345if (p->type == PSO_LIMH) limit_heap = c;10346else if (p->type == PSO_LIMM) limit_match = c;10347else limit_depth = c;10348skipatstart = ++pp;10349break;1035010351case PSO_OPTMZ:10352optim_flags &= ~(p->value);1035310354/* For backward compatibility the three original VERBs to disable10355optimizations need to also update the corresponding bit in the10356external options. 
*/1035710358switch(p->value)10359{10360case PCRE2_OPTIM_AUTO_POSSESS:10361cb.external_options |= PCRE2_NO_AUTO_POSSESS;10362break;1036310364case PCRE2_OPTIM_DOTSTAR_ANCHOR:10365cb.external_options |= PCRE2_NO_DOTSTAR_ANCHOR;10366break;1036710368case PCRE2_OPTIM_START_OPTIMIZE:10369cb.external_options |= PCRE2_NO_START_OPTIMIZE;10370break;10371}1037210373break;1037410375default:10376/* All values in the enum need an explicit entry for this switch10377but until a better way to prevent coding mistakes is invented keep10378a catch all that triggers a debug build assert as a failsafe */10379PCRE2_DEBUG_UNREACHABLE();10380}10381break; /* Out of the table scan loop */10382}10383}10384if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */10385}10386PCRE2_ASSERT(skipatstart <= patlen);10387}1038810389/* End of pattern-start options; advance to start of real regex. */1039010391ptr += skipatstart;1039210393/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */1039410395#ifndef SUPPORT_UNICODE10396if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)10397{10398errorcode = ERR32;10399goto HAD_EARLY_ERROR;10400}10401#endif1040210403/* Check UTF. We have the original options in 'options', with that value as10404modified by (*UTF) etc in cb->external_options. The extra option10405PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the10406surrogate code points cannot be represented in UTF-16. 
*/1040710408utf = (cb.external_options & PCRE2_UTF) != 0;10409if (utf)10410{10411if ((options & PCRE2_NEVER_UTF) != 0)10412{10413errorcode = ERR74;10414goto HAD_EARLY_ERROR;10415}10416if ((options & PCRE2_NO_UTF_CHECK) == 0 &&10417(errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)10418goto HAD_ERROR; /* Offset was set by valid_utf() */1041910420#if PCRE2_CODE_UNIT_WIDTH == 1610421if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)10422{10423errorcode = ERR91;10424goto HAD_EARLY_ERROR;10425}10426#endif10427}1042810429/* Check UCP lockout. */1043010431ucp = (cb.external_options & PCRE2_UCP) != 0;10432if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)10433{10434errorcode = ERR75;10435goto HAD_EARLY_ERROR;10436}1043710438/* PCRE2_EXTRA_TURKISH_CASING checks */1043910440if ((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)10441{10442if (!utf && !ucp)10443{10444errorcode = ERR104;10445goto HAD_EARLY_ERROR;10446}1044710448#if PCRE2_CODE_UNIT_WIDTH == 810449if (!utf)10450{10451errorcode = ERR105;10452goto HAD_EARLY_ERROR;10453}10454#endif1045510456if ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)10457{10458errorcode = ERR106;10459goto HAD_EARLY_ERROR;10460}10461}1046210463/* Process the BSR setting. */1046410465if (bsr == 0) bsr = ccontext->bsr_convention;1046610467/* Process the newline setting. 
*/1046810469if (newline == 0) newline = ccontext->newline_convention;10470cb.nltype = NLTYPE_FIXED;10471switch(newline)10472{10473case PCRE2_NEWLINE_CR:10474cb.nllen = 1;10475cb.nl[0] = CHAR_CR;10476break;1047710478case PCRE2_NEWLINE_LF:10479cb.nllen = 1;10480cb.nl[0] = CHAR_NL;10481break;1048210483case PCRE2_NEWLINE_NUL:10484cb.nllen = 1;10485cb.nl[0] = CHAR_NUL;10486break;1048710488case PCRE2_NEWLINE_CRLF:10489cb.nllen = 2;10490cb.nl[0] = CHAR_CR;10491cb.nl[1] = CHAR_NL;10492break;1049310494case PCRE2_NEWLINE_ANY:10495cb.nltype = NLTYPE_ANY;10496break;1049710498case PCRE2_NEWLINE_ANYCRLF:10499cb.nltype = NLTYPE_ANYCRLF;10500break;1050110502default:10503PCRE2_DEBUG_UNREACHABLE();10504errorcode = ERR56;10505goto HAD_EARLY_ERROR;10506}1050710508/* Pre-scan the pattern to do two things: (1) Discover the named groups and10509their numerical equivalents, so that this information is always available for10510the remaining processing. (2) At the same time, parse the pattern and put a10511processed version into the parsed_pattern vector. This has escapes interpreted10512and comments removed (amongst other things). */1051310514/* Ensure that the parsed pattern buffer is big enough. For many smaller10515patterns the vector on the stack (which was set up above) can be used. */1051610517parsed_size_needed = max_parsed_pattern(ptr, cb.end_pattern, utf, options);1051810519/* Allow for 2x uint32_t at the start and 2 at the end, for10520PCRE2_EXTRA_MATCH_WORD or PCRE2_EXTRA_MATCH_LINE (which are exclusive). */1052110522if ((ccontext->extra_options &10523(PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)10524parsed_size_needed += 4;1052510526/* When PCRE2_AUTO_CALLOUT is set we allow for one callout at the end. 
*/1052710528if ((options & PCRE2_AUTO_CALLOUT) != 0)10529parsed_size_needed += 4;1053010531parsed_size_needed += 1; /* For the final META_END */1053210533if (parsed_size_needed > PARSED_PATTERN_DEFAULT_SIZE)10534{10535uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(10536parsed_size_needed * sizeof(uint32_t), ccontext->memctl.memory_data);10537if (heap_parsed_pattern == NULL)10538{10539*errorptr = ERR21;10540goto EXIT;10541}10542cb.parsed_pattern = heap_parsed_pattern;10543}10544cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed;1054510546/* Do the parsing scan. */1054710548errorcode = parse_regex(ptr, cb.external_options, xoptions, &has_lookbehind, &cb);10549if (errorcode != 0) goto HAD_CB_ERROR;1055010551/* If there are any lookbehinds, scan the parsed pattern to figure out their10552lengths. Workspace is needed to remember whether numbered groups are or are not10553of limited length, and if limited, what the minimum and maximum lengths are.10554This caching saves re-computing the length of any group that is referenced more10555than once, which is particularly relevant when recursion is involved.10556Unnumbered groups do not have this exposure because they cannot be referenced.10557If there are sufficiently few groups, the default index vector on the stack, as10558set up above, can be used. Otherwise we have to get/free some heap memory. The10559vector must be initialized to zero. 
*/1056010561if (has_lookbehind)10562{10563int loopcount = 0;10564if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)10565{10566cb.groupinfo = ccontext->memctl.malloc(10567(2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);10568if (cb.groupinfo == NULL)10569{10570errorcode = ERR21;10571cb.erroroffset = 0;10572goto HAD_CB_ERROR;10573}10574}10575memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));10576errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);10577if (errorcode != 0) goto HAD_CB_ERROR;10578}1057910580/* For debugging, there is a function that shows the parsed pattern vector. */1058110582#ifdef DEBUG_SHOW_PARSED10583fprintf(stderr, "+++ Pre-scan complete:\n");10584show_parsed(&cb);10585#endif1058610587/* For debugging capturing information this code can be enabled. */1058810589#ifdef DEBUG_SHOW_CAPTURES10590{10591named_group *ng = cb.named_groups;10592fprintf(stderr, "+++Captures: %d\n", cb.bracount);10593for (i = 0; i < cb.names_found; i++, ng++)10594{10595fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);10596}10597}10598#endif1059910600/* Pretend to compile the pattern while actually just accumulating the amount10601of memory required in the 'length' variable. This behaviour is triggered by10602passing a non-NULL final argument to compile_regex(). We pass a block of10603workspace (cworkspace) for it to compile parts of the pattern into; the10604compiled code is discarded when it is no longer needed, so hopefully this10605workspace will never overflow, though there is a test for its doing so.1060610607On error, errorcode will be set non-zero, so we don't need to look at the10608result of the function. The initial options have been put into the cb block,10609but we still have to pass a separate options variable (the first argument)10610because the options may change as the pattern is processed. 
*/1061110612cb.erroroffset = patlen; /* For any subsequent errors that do not set it */10613pptr = cb.parsed_pattern;10614code = cworkspace;10615*code = OP_BRA;1061610617(void)compile_regex(cb.external_options, xoptions, &code, &pptr,10618&errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,10619&cb, &length);1062010621if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */1062210623/* This should be caught in compile_regex(), but just in case... */1062410625#if defined SUPPORT_WIDE_CHARS10626PCRE2_ASSERT((cb.char_lists_size & 0x3) == 0);10627if (length > MAX_PATTERN_SIZE ||10628MAX_PATTERN_SIZE - length < (cb.char_lists_size / sizeof(PCRE2_UCHAR)))10629#else10630if (length > MAX_PATTERN_SIZE)10631#endif10632{10633errorcode = ERR20;10634goto HAD_CB_ERROR;10635}1063610637/* Compute the size of, then, if not too large, get and initialize the data10638block for storing the compiled pattern and names table. Integer overflow should10639no longer be possible because nowadays we limit the maximum value of10640cb.names_found and cb.name_entry_size. */1064110642re_blocksize =10643CU2BYTES((PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);1064410645#if defined SUPPORT_WIDE_CHARS10646if (cb.char_lists_size != 0)10647{10648#if PCRE2_CODE_UNIT_WIDTH != 3210649/* Align to 32 bit first. This ensures the10650allocated area will also be 32 bit aligned. 
*/10651re_blocksize = (PCRE2_SIZE)CLIST_ALIGN_TO(re_blocksize, sizeof(uint32_t));10652#endif10653re_blocksize += cb.char_lists_size;10654}10655#endif1065610657re_blocksize += CU2BYTES(length);1065810659if (re_blocksize > ccontext->max_pattern_compiled_length)10660{10661errorcode = ERR101;10662goto HAD_CB_ERROR;10663}1066410665re_blocksize += sizeof(pcre2_real_code);10666re = (pcre2_real_code *)10667ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);10668if (re == NULL)10669{10670errorcode = ERR21;10671goto HAD_CB_ERROR;10672}1067310674/* The compiler may put padding at the end of the pcre2_real_code structure in10675order to round it up to a multiple of 4 or 8 bytes. This means that when a10676compiled pattern is copied (for example, when serialized) undefined bytes are10677read, and this annoys debuggers such as valgrind. To avoid this, we explicitly10678write to the last 8 bytes of the structure before setting the fields. */1067910680memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);10681re->memctl = ccontext->memctl;10682re->tables = tables;10683re->executable_jit = NULL;10684memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));10685re->blocksize = re_blocksize;10686re->code_start = re_blocksize - CU2BYTES(length);10687re->magic_number = MAGIC_NUMBER;10688re->compile_options = options;10689re->overall_options = cb.external_options;10690re->extra_options = xoptions;10691re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;10692re->limit_heap = limit_heap;10693re->limit_match = limit_match;10694re->limit_depth = limit_depth;10695re->first_codeunit = 0;10696re->last_codeunit = 0;10697re->bsr_convention = bsr;10698re->newline_convention = newline;10699re->max_lookbehind = 0;10700re->minlength = 0;10701re->top_bracket = 0;10702re->top_backref = 0;10703re->name_entry_size = cb.name_entry_size;10704re->name_count = cb.names_found;10705re->optimization_flags = optim_flags;1070610707/* The basic block is immediately followed by the name 
table, and the compiled10708code follows after that. */1070910710codestart = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);1071110712/* Update the compile data block for the actual compile. The starting points of10713the name/number translation table and of the code are passed around in the10714compile data block. The start/end pattern and initial options are already set10715from the pre-compile phase, as is the name_entry_size field. */1071610717cb.parens_depth = 0;10718cb.assert_depth = 0;10719cb.lastcapture = 0;10720cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));10721cb.start_code = codestart;10722cb.req_varyopt = 0;10723cb.had_accept = FALSE;10724cb.had_pruneorskip = FALSE;10725#ifdef SUPPORT_WIDE_CHARS10726cb.char_lists_size = 0;10727#endif107281072910730/* If any named groups were found, create the name/number table from the list10731created in the pre-pass. */1073210733if (cb.names_found > 0)10734{10735named_group *ng = cb.named_groups;10736for (i = 0; i < cb.names_found; i++, ng++)10737add_name_to_table(&cb, ng->name, ng->length, ng->number, i);10738}1073910740/* Set up a starting, non-extracting bracket, then compile the expression. On10741error, errorcode will be set non-zero, so we don't need to look at the result10742of the function here. */1074310744pptr = cb.parsed_pattern;10745code = (PCRE2_UCHAR *)codestart;10746*code = OP_BRA;10747regexrc = compile_regex(re->overall_options, re->extra_options, &code,10748&pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,10749NULL, &cb, NULL);10750if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;10751re->top_bracket = cb.bracount;10752re->top_backref = cb.top_backref;10753re->max_lookbehind = cb.max_lookbehind;1075410755if (cb.had_accept)10756{10757reqcu = 0; /* Must disable after (*ACCEPT) */10758reqcuflags = REQ_NONE;10759re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */10760}1076110762/* Fill in the final opcode and check for disastrous overflow. 
If no overflow,10763but the estimated length exceeds the really used length, adjust the value of10764re->blocksize, and if valgrind support is configured, mark the extra allocated10765memory as unaddressable, so that any out-of-bound reads can be detected. */1076610767*code++ = OP_END;10768usedlength = code - codestart;10769if (usedlength > length)10770{10771PCRE2_DEBUG_UNREACHABLE();10772errorcode = ERR23; /* Overflow of code block - internal error */10773}10774else10775{10776re->blocksize -= CU2BYTES(length - usedlength);10777#ifdef SUPPORT_VALGRIND10778VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));10779#endif10780}1078110782/* Scan the pattern for recursion/subroutine calls and convert the group10783numbers into offsets. Maintain a small cache so that repeated groups containing10784recursions are efficiently handled. */1078510786#define RSCAN_CACHE_SIZE 81078710788if (errorcode == 0 && cb.had_recurse)10789{10790PCRE2_UCHAR *rcode;10791PCRE2_SPTR rgroup;10792unsigned int ccount = 0;10793int start = RSCAN_CACHE_SIZE;10794recurse_cache rc[RSCAN_CACHE_SIZE];1079510796for (rcode = find_recurse(codestart, utf);10797rcode != NULL;10798rcode = find_recurse(rcode + 1 + LINK_SIZE, utf))10799{10800int p, groupnumber;1080110802groupnumber = (int)GET(rcode, 1);10803if (groupnumber == 0) rgroup = codestart; else10804{10805PCRE2_SPTR search_from = codestart;10806rgroup = NULL;10807for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)10808{10809if (groupnumber == rc[p].groupnumber)10810{10811rgroup = rc[p].group;10812break;10813}1081410815/* Group n+1 must always start to the right of group n, so we can save10816search time below when the new group number is greater than any of the10817previously found groups. 
*/1081810819if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;10820}1082110822if (rgroup == NULL)10823{10824rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);10825if (rgroup == NULL)10826{10827PCRE2_DEBUG_UNREACHABLE();10828errorcode = ERR53;10829break;10830}10831if (--start < 0) start = RSCAN_CACHE_SIZE - 1;10832rc[start].groupnumber = groupnumber;10833rc[start].group = rgroup;10834if (ccount < RSCAN_CACHE_SIZE) ccount++;10835}10836}1083710838PUT(rcode, 1, (uint32_t)(rgroup - codestart));10839}10840}1084110842/* In rare debugging situations we sometimes need to look at the compiled code10843at this stage. */1084410845#ifdef DEBUG_CALL_PRINTINT10846pcre2_printint(re, stderr, TRUE);10847fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);10848#endif1084910850/* Unless disabled, check whether any single character iterators can be10851auto-possessified. The function overwrites the appropriate opcode values, so10852the type of the pointer must be cast. NOTE: the intermediate variable "temp" is10853used in this code because at least one compiler gives a warning about loss of10854"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the10855function call. */1085610857if (errorcode == 0 && (optim_flags & PCRE2_OPTIM_AUTO_POSSESS) != 0)10858{10859PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;10860if (PRIV(auto_possessify)(temp, &cb) != 0)10861{10862PCRE2_DEBUG_UNREACHABLE();10863errorcode = ERR80;10864}10865}1086610867/* Failed to compile, or error while post-processing. */1086810869if (errorcode != 0) goto HAD_CB_ERROR;1087010871/* Successful compile. If the anchored option was not passed, set it if10872we can determine that the pattern is anchored by virtue of ^ characters or \A10873or anything else, such as starting with non-atomic .* when DOTALL is set and10874there are no occurrences of *PRUNE or *SKIP (though there is an option to10875disable this case). 
*/1087610877if ((re->overall_options & PCRE2_ANCHORED) == 0)10878{10879BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);10880if (is_anchored(codestart, 0, &cb, 0, FALSE, dotstar_anchor))10881re->overall_options |= PCRE2_ANCHORED;10882}1088310884/* Set up the first code unit or startline flag, the required code unit, and10885then study the pattern. This code need not be obeyed if PCRE2_OPTIM_START_OPTIMIZE10886is disabled, as the data it would create will not be used. Note that a first code10887unit (but not the startline flag) is useful for anchored patterns because it10888can still give a quick "no match" and also avoid searching for a last code10889unit. */1089010891if ((optim_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)10892{10893int minminlength = 0; /* For minimal minlength from first/required CU */1089410895/* If we do not have a first code unit, see if there is one that is asserted10896(these are not saved during the compile because they can cause conflicts with10897actual literals that follow). */1089810899if (firstcuflags >= REQ_NONE) {10900uint32_t assertedcuflags = 0;10901uint32_t assertedcu = find_firstassertedcu(codestart, &assertedcuflags, 0);10902/* It would be wrong to use the asserted first code unit as `firstcu` for10903* regexes which are able to match a 1-character string (e.g. /(?=a)b?a/)10904* For that example, if we set both firstcu and reqcu to 'a', it would mean10905* the subject string needs to be at least 2 characters long, which is wrong.10906* With more analysis, we would be able to set firstcu in more cases. */10907if (assertedcuflags < REQ_NONE && assertedcu != reqcu) {10908firstcu = assertedcu;10909firstcuflags = assertedcuflags;10910}10911}1091210913/* Save the data for a first code unit. The existence of one means the10914minimum length must be at least 1. 
*/1091510916if (firstcuflags < REQ_NONE)10917{10918re->first_codeunit = firstcu;10919re->flags |= PCRE2_FIRSTSET;10920minminlength++;1092110922/* Handle caseless first code units. */1092310924if ((firstcuflags & REQ_CASELESS) != 0)10925{10926if (firstcu < 128 || (!utf && !ucp && firstcu < 255))10927{10928if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;10929}1093010931/* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.10932In 8-bit UTF mode, code units in the range 128-255 are introductory code10933units and cannot have another case, but if UCP is set they may do. */1093410935#ifdef SUPPORT_UNICODE10936#if PCRE2_CODE_UNIT_WIDTH == 810937else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)10938re->flags |= PCRE2_FIRSTCASELESS;10939#else10940else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&10941UCD_OTHERCASE(firstcu) != firstcu)10942re->flags |= PCRE2_FIRSTCASELESS;10943#endif10944#endif /* SUPPORT_UNICODE */10945}10946}1094710948/* When there is no first code unit, for non-anchored patterns, see if we can10949set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all10950branches start with ^ and also when all branches start with non-atomic .* for10951non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option10952that disables this case.) */1095310954else if ((re->overall_options & PCRE2_ANCHORED) == 0)10955{10956BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);10957if (is_startline(codestart, 0, &cb, 0, FALSE, dotstar_anchor))10958re->flags |= PCRE2_STARTLINE;10959}1096010961/* Handle the "required code unit", if one is set. In the UTF case we can10962increment the minimum minimum length only if we are sure this really is a10963different character and not a non-starting code unit of the first character,10964because the minimum length count is in characters, not code units. 
*/1096510966if (reqcuflags < REQ_NONE)10967{10968#if PCRE2_CODE_UNIT_WIDTH == 1610969if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */10970firstcuflags >= REQ_NONE || /* First not set */10971(firstcu & 0xf800) != 0xd800 || /* First not surrogate */10972(reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */10973#elif PCRE2_CODE_UNIT_WIDTH == 810974if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */10975firstcuflags >= REQ_NONE || /* First not set */10976(firstcu & 0x80) == 0 || /* First is ASCII */10977(reqcu & 0x80) == 0) /* Req is ASCII */10978#endif10979{10980minminlength++;10981}1098210983/* In the case of an anchored pattern, set up the value only if it follows10984a variable length item in the pattern. */1098510986if ((re->overall_options & PCRE2_ANCHORED) == 0 ||10987(reqcuflags & REQ_VARY) != 0)10988{10989re->last_codeunit = reqcu;10990re->flags |= PCRE2_LASTSET;1099110992/* Handle caseless required code units as for first code units (above). */1099310994if ((reqcuflags & REQ_CASELESS) != 0)10995{10996if (reqcu < 128 || (!utf && !ucp && reqcu < 255))10997{10998if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;10999}11000#ifdef SUPPORT_UNICODE11001#if PCRE2_CODE_UNIT_WIDTH == 811002else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)11003re->flags |= PCRE2_LASTCASELESS;11004#else11005else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&11006UCD_OTHERCASE(reqcu) != reqcu)11007re->flags |= PCRE2_LASTCASELESS;11008#endif11009#endif /* SUPPORT_UNICODE */11010}11011}11012}1101311014/* Study the compiled pattern to set up information such as a bitmap of11015starting code units and a minimum matching length. */1101611017if (PRIV(study)(re) != 0)11018{11019PCRE2_DEBUG_UNREACHABLE();11020errorcode = ERR31;11021goto HAD_CB_ERROR;11022}1102311024/* If study() set a bitmap of starting code units, it implies a minimum11025length of at least one. 
*/1102611027if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)11028minminlength = 1;1102911030/* If the minimum length set (or not set) by study() is less than the minimum11031implied by required code units, override it. */1103211033if (re->minlength < minminlength) re->minlength = minminlength;11034} /* End of start-of-match optimizations. */1103511036/* Control ends up here in all cases. When running under valgrind, make a11037pattern's terminating zero defined again. If memory was obtained for the parsed11038version of the pattern, free it before returning. Also free the list of named11039groups if a larger one had to be obtained, and likewise the group information11040vector. */1104111042#ifdef SUPPORT_UNICODE11043PCRE2_ASSERT(cb.cranges == NULL);11044#endif1104511046EXIT:11047#ifdef SUPPORT_VALGRIND11048if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));11049#endif11050if (cb.parsed_pattern != stack_parsed_pattern)11051ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);11052if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)11053ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);11054if (cb.groupinfo != stack_groupinfo)11055ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);1105611057return re; /* Will be NULL after an error */1105811059/* Errors discovered in parse_regex() set the offset value in the compile11060block. Errors discovered before it is called must compute it from the ptr11061value. After parse_regex() is called, the offset in the compile block is set to11062the end of the pattern, but certain errors in compile_regex() may reset it if11063an offset is available in the parsed pattern. 
*/1106411065HAD_CB_ERROR:11066ptr = pattern + cb.erroroffset;1106711068HAD_EARLY_ERROR:11069PCRE2_ASSERT(ptr >= pattern); /* Ensure we don't return invalid erroroffset */11070PCRE2_ASSERT(ptr <= (pattern + patlen));11071*erroroffset = ptr - pattern;1107211073HAD_ERROR:11074*errorptr = errorcode;11075pcre2_code_free(re);11076re = NULL;1107711078#ifdef SUPPORT_WIDE_CHARS11079if (cb.cranges != NULL)11080{11081class_ranges* cranges = cb.cranges;11082do11083{11084class_ranges* next_cranges = cranges->next;11085cb.cx->memctl.free(cranges, cb.cx->memctl.memory_data);11086cranges = next_cranges;11087}11088while (cranges != NULL);11089}11090#endif11091goto EXIT;11092}1109311094/* These #undefs are here to enable unity builds with CMake. */1109511096#undef NLBLOCK /* Block containing newline information */11097#undef PSSTART /* Field containing processed string start */11098#undef PSEND /* Field containing processed string end */1109911100/* End of pcre2_compile.c */111011110211103