Path: blob/master/thirdparty/pcre2/src/pcre2_auto_possess.c
9898 views
/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8Original API code Copyright (c) 1997-2012 University of Cambridge9New API code Copyright (c) 2016-2024 University of Cambridge1011-----------------------------------------------------------------------------12Redistribution and use in source and binary forms, with or without13modification, are permitted provided that the following conditions are met:1415* Redistributions of source code must retain the above copyright notice,16this list of conditions and the following disclaimer.1718* Redistributions in binary form must reproduce the above copyright19notice, this list of conditions and the following disclaimer in the20documentation and/or other materials provided with the distribution.2122* Neither the name of the University of Cambridge nor the names of its23contributors may be used to endorse or promote products derived from24this software without specific prior written permission.2526THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE36POSSIBILITY OF SUCH DAMAGE.37-----------------------------------------------------------------------------38*/3940/* This module contains functions that scan a compiled pattern and change41repeats into possessive repeats where possible. */424344#ifdef HAVE_CONFIG_H45#include "config.h"46#endif474849#include "pcre2_internal.h"5051/* This macro represents the max size of list[] and that is used to keep52track of UCD info in several places, it should be kept on sync with the53value used by GenerateUcd.py */54#define MAX_LIST 85556/*************************************************57* Tables for auto-possessification *58*************************************************/5960/* This table is used to check whether auto-possessification is possible61between adjacent character-type opcodes. The left-hand (repeated) opcode is62used to select the row, and the right-hand opcode is use to select the column.63A value of 1 means that auto-possessification is OK. For example, the second64value in the first row means that \D+\d can be turned into \D++\d.6566The Unicode property types (\P and \p) have to be present to fill out the table67because of what their opcode values are, but the table values should always be68zero because property types are handled separately in the code. The last four69columns apply to items that cannot be repeated, so there is no need to have70rows for them. Note that OP_DIGIT etc. are generated only when PCRE2_UCP is71*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */7273#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)74#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)7576static const uint8_t autoposstab[APTROWS][APTCOLS] = {77/* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */78{ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */79{ 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */80{ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */81{ 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */82{ 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */83{ 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */84{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */85{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */86{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */87{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */88{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */89{ 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */90{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */91{ 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */92{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */93{ 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */94{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */95};9697#ifdef SUPPORT_UNICODE98/* This table is used to check whether auto-possessification is possible99between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The100left-hand (repeated) opcode is used to select the row, and the right-hand101opcode is used to select the column. The values are as follows:1021030 Always return FALSE (never auto-possessify)1041 Character groups are distinct (possessify if both are OP_PROP)1052 Check character categories in the same group (general or particular)1063 TRUE if the two opcodes are not the same (PROP vs NOTPROP)1071084 Check left general category vs right particular category1095 Check right general category vs left particular category1101116 Left alphanum vs right general category1127 Left space vs right general category1138 Left word vs right general category1141159 Right alphanum vs left general category11610 Right space vs left general category11711 Right word vs left general category11811912 Left alphanum vs right particular category12013 Left space vs right particular category12114 Left word vs right particular category12212315 Right alphanum vs left particular category12416 Right space vs left particular category12517 Right word vs left particular category126*/127128static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {129/* LAMP GC PC SC SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */130{ 3, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_LAMP */131{ 0, 2, 4, 0, 0, 9, 10, 10, 11, 0, 0, 0, 0 }, /* PT_GC */132{ 0, 5, 2, 0, 0, 15, 16, 16, 17, 0, 0, 0, 0 }, /* PT_PC */133{ 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SC */134{ 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SCX */135{ 3, 6, 12, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_ALNUM */136{ 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_SPACE */137{ 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_PXSPACE */138{ 0, 8, 14, 0, 0, 0, 1, 1, 3, 0, 0, 0, 0 }, /* PT_WORD */139{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */140{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0 }, /* PT_UCNC */141{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_BIDICL */142{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* PT_BOOL */143/* PT_ANY does not need a record. */144};145146/* This table is used to check whether auto-possessification is possible147between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one148specifies a general category and the other specifies a particular category. The149row is selected by the general category and the column by the particular150category. The value is 1 if the particular category is not part of the general151category. */152153static const uint8_t catposstab[7][30] = {154/* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */155{ 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */156{ 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */157{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */158{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */159{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */160{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */161{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */162};163164/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against165a general or particular category. The properties in each row are those166that apply to the character set in question. Duplication means that a little167unnecessary work is done when checking, but this keeps things much simpler168because they can all use the same code. For more details see the comment where169this table is used.170171Note: SPACE and PXSPACE used to be different because Perl excluded VT from172"space", but from Perl 5.18 it's included, so both categories are treated the173same here. */174175static const uint8_t posspropstab[3][4] = {176{ ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */177{ ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */178{ ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */179};180#endif /* SUPPORT_UNICODE */181182183184#ifdef SUPPORT_UNICODE185/*************************************************186* Check a character and a property *187*************************************************/188189/* This function is called by compare_opcodes() when a property item is190adjacent to a fixed character.191192Arguments:193c the character194ptype the property type195pdata the data for the type196negated TRUE if it's a negated property (\P or \p{^)197198Returns: TRUE if auto-possessifying is OK199*/200201static BOOL202check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,203BOOL negated)204{205BOOL ok, rc;206const uint32_t *p;207const ucd_record *prop = GET_UCD(c);208209switch(ptype)210{211case PT_LAMP:212return (prop->chartype == ucp_Lu ||213prop->chartype == ucp_Ll ||214prop->chartype == ucp_Lt) == negated;215216case PT_GC:217return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;218219case PT_PC:220return (pdata == prop->chartype) == negated;221222case PT_SC:223return (pdata == prop->script) == negated;224225case PT_SCX:226ok = (pdata == prop->script227|| MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);228return ok == negated;229230/* These are specials */231232case PT_ALNUM:233return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||234PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;235236/* Perl space used to exclude VT, but from Perl 5.18 it is included, which237means that Perl space and POSIX space are now identical. PCRE was changed238at release 8.34. */239240case PT_SPACE: /* Perl space */241case PT_PXSPACE: /* POSIX space */242switch(c)243{244HSPACE_CASES:245VSPACE_CASES:246rc = negated;247break;248249default:250rc = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;251}252return rc;253254case PT_WORD:255return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||256PRIV(ucp_gentype)[prop->chartype] == ucp_N ||257c == CHAR_UNDERSCORE) == negated;258259case PT_CLIST:260p = PRIV(ucd_caseless_sets) + prop->caseset;261for (;;)262{263if (c < *p) return !negated;264if (c == *p++) return negated;265}266PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */267break;268269/* Haven't yet thought these through. */270271case PT_BIDICL:272return FALSE;273274case PT_BOOL:275return FALSE;276}277278return FALSE;279}280#endif /* SUPPORT_UNICODE */281282283284/*************************************************285* Base opcode of repeated opcodes *286*************************************************/287288/* Returns the base opcode for repeated single character type opcodes. If the289opcode is not a repeated character type, it returns with the original value.290291Arguments: c opcode292Returns: base opcode for the type293*/294295static PCRE2_UCHAR296get_repeat_base(PCRE2_UCHAR c)297{298return (c > OP_TYPEPOSUPTO)? c :299(c >= OP_TYPESTAR)? OP_TYPESTAR :300(c >= OP_NOTSTARI)? OP_NOTSTARI :301(c >= OP_NOTSTAR)? OP_NOTSTAR :302(c >= OP_STARI)? OP_STARI :303OP_STAR;304}305306307/*************************************************308* Fill the character property list *309*************************************************/310311/* Checks whether the code points to an opcode that can take part in auto-312possessification, and if so, fills a list with its properties.313314Arguments:315code points to start of expression316utf TRUE if in UTF mode317ucp TRUE if in UCP mode318fcc points to the case-flipping table319list points to output list320list[0] will be filled with the opcode321list[1] will be non-zero if this opcode322can match an empty character string323list[2..7] depends on the opcode324325Returns: points to the start of the next opcode if *code is accepted326NULL if *code is not accepted327*/328329static PCRE2_SPTR330get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,331uint32_t *list)332{333PCRE2_UCHAR c = *code;334PCRE2_UCHAR base;335PCRE2_SPTR end;336PCRE2_SPTR class_end;337uint32_t chr;338339#ifdef SUPPORT_UNICODE340uint32_t *clist_dest;341const uint32_t *clist_src;342#else343(void)utf; /* Suppress "unused parameter" compiler warnings */344(void)ucp;345#endif346347list[0] = c;348list[1] = FALSE;349code++;350351if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)352{353base = get_repeat_base(c);354c -= (base - OP_STAR);355356if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)357code += IMM2_SIZE;358359list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT &&360c != OP_POSPLUS);361362switch(base)363{364case OP_STAR:365list[0] = OP_CHAR;366break;367368case OP_STARI:369list[0] = OP_CHARI;370break;371372case OP_NOTSTAR:373list[0] = OP_NOT;374break;375376case OP_NOTSTARI:377list[0] = OP_NOTI;378break;379380case OP_TYPESTAR:381list[0] = *code;382code++;383break;384}385c = list[0];386}387388switch(c)389{390case OP_NOT_DIGIT:391case OP_DIGIT:392case OP_NOT_WHITESPACE:393case OP_WHITESPACE:394case OP_NOT_WORDCHAR:395case OP_WORDCHAR:396case OP_ANY:397case OP_ALLANY:398case OP_ANYNL:399case OP_NOT_HSPACE:400case OP_HSPACE:401case OP_NOT_VSPACE:402case OP_VSPACE:403case OP_EXTUNI:404case OP_EODN:405case OP_EOD:406case OP_DOLL:407case OP_DOLLM:408return code;409410case OP_CHAR:411case OP_NOT:412GETCHARINCTEST(chr, code);413list[2] = chr;414list[3] = NOTACHAR;415return code;416417case OP_CHARI:418case OP_NOTI:419list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;420GETCHARINCTEST(chr, code);421list[2] = chr;422423#ifdef SUPPORT_UNICODE424if (chr < 128 || (chr < 256 && !utf && !ucp))425list[3] = fcc[chr];426else427list[3] = UCD_OTHERCASE(chr);428#elif defined SUPPORT_WIDE_CHARS429list[3] = (chr < 256) ? fcc[chr] : chr;430#else431list[3] = fcc[chr];432#endif433434/* The othercase might be the same value. */435436if (chr == list[3])437list[3] = NOTACHAR;438else439list[4] = NOTACHAR;440return code;441442#ifdef SUPPORT_UNICODE443case OP_PROP:444case OP_NOTPROP:445if (code[0] != PT_CLIST)446{447list[2] = code[0];448list[3] = code[1];449return code + 2;450}451452/* Convert only if we have enough space. */453454clist_src = PRIV(ucd_caseless_sets) + code[1];455clist_dest = list + 2;456code += 2;457458do {459if (clist_dest >= list + MAX_LIST)460{461/* Early return if there is not enough space. GenerateUcd.py462generated a list with more than 5 characters and something463must be done about that going forward. */464PCRE2_DEBUG_UNREACHABLE(); /* Remove if it ever triggers */465list[2] = code[0];466list[3] = code[1];467return code;468}469*clist_dest++ = *clist_src;470}471while(*clist_src++ != NOTACHAR);472473/* All characters are stored. The terminating NOTACHAR is copied from the474clist itself. */475476list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;477return code;478#endif479480case OP_NCLASS:481case OP_CLASS:482#ifdef SUPPORT_WIDE_CHARS483case OP_XCLASS:484case OP_ECLASS:485if (c == OP_XCLASS || c == OP_ECLASS)486end = code + GET(code, 0) - 1;487else488#endif489end = code + 32 / sizeof(PCRE2_UCHAR);490class_end = end;491492switch(*end)493{494case OP_CRSTAR:495case OP_CRMINSTAR:496case OP_CRQUERY:497case OP_CRMINQUERY:498case OP_CRPOSSTAR:499case OP_CRPOSQUERY:500list[1] = TRUE;501end++;502break;503504case OP_CRPLUS:505case OP_CRMINPLUS:506case OP_CRPOSPLUS:507end++;508break;509510case OP_CRRANGE:511case OP_CRMINRANGE:512case OP_CRPOSRANGE:513list[1] = (GET2(end, 1) == 0);514end += 1 + 2 * IMM2_SIZE;515break;516}517list[2] = (uint32_t)(end - code);518list[3] = (uint32_t)(end - class_end);519return end;520}521522return NULL; /* Opcode not accepted */523}524525526527/*************************************************528* Scan further character sets for match *529*************************************************/530531/* Checks whether the base and the current opcode have a common character, in532which case the base cannot be possessified.533534Arguments:535code points to the byte code536utf TRUE in UTF mode537ucp TRUE in UCP mode538cb compile data block539base_list the data list of the base opcode540base_end the end of the base opcode541rec_limit points to recursion depth counter542543Returns: TRUE if the auto-possessification is possible544*/545546static BOOL547compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,548const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)549{550PCRE2_UCHAR c;551uint32_t list[MAX_LIST];552const uint32_t *chr_ptr;553const uint32_t *ochr_ptr;554const uint32_t *list_ptr;555PCRE2_SPTR next_code;556#ifdef SUPPORT_WIDE_CHARS557PCRE2_SPTR xclass_flags;558#endif559const uint8_t *class_bitset;560const uint8_t *set1, *set2, *set_end;561uint32_t chr;562BOOL accepted, invert_bits;563BOOL entered_a_group = FALSE;564565if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */566567/* Note: the base_list[1] contains whether the current opcode has a greedy568(represented by a non-zero value) quantifier. This is a different from569other character type lists, which store here that the character iterator570matches to an empty string (also represented by a non-zero value). */571572for(;;)573{574PCRE2_SPTR bracode;575576/* All operations move the code pointer forward.577Therefore infinite recursions are not possible. */578579c = *code;580581/* Skip over callouts */582583if (c == OP_CALLOUT)584{585code += PRIV(OP_lengths)[c];586continue;587}588589if (c == OP_CALLOUT_STR)590{591code += GET(code, 1 + 2*LINK_SIZE);592continue;593}594595/* At the end of a branch, skip to the end of the group and process it. */596597if (c == OP_ALT)598{599do code += GET(code, 1); while (*code == OP_ALT);600c = *code;601}602603/* Inspect the next opcode. */604605switch(c)606{607/* We can always possessify a greedy iterator at the end of the pattern,608which is reached after skipping over the final OP_KET. A non-greedy609iterator must never be possessified. */610611case OP_END:612return base_list[1] != 0;613614/* When an iterator is at the end of certain kinds of group we can inspect615what follows the group by skipping over the closing ket. Note that this616does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given617iteration is variable (could be another iteration or could be the next618item). As these two opcodes are not listed in the next switch, they will619end up as the next code to inspect, and return FALSE by virtue of being620unsupported. */621622case OP_KET:623case OP_KETRPOS:624/* The non-greedy case cannot be converted to a possessive form. */625626if (base_list[1] == 0) return FALSE;627628/* If the bracket is capturing it might be referenced by an OP_RECURSE629so its last iterator can never be possessified if the pattern contains630recursions. (This could be improved by keeping a list of group numbers that631are called by recursion.) */632633bracode = code - GET(code, 1);634switch(*bracode)635{636case OP_CBRA:637case OP_SCBRA:638case OP_CBRAPOS:639case OP_SCBRAPOS:640if (cb->had_recurse) return FALSE;641break;642643/* A script run might have to backtrack if the iterated item can match644characters from more than one script. So give up unless repeating an645explicit character. */646647case OP_SCRIPT_RUN:648if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI)649return FALSE;650break;651652/* Atomic sub-patterns and forward assertions can always auto-possessify653their last iterator. However, if the group was entered as a result of654checking a previous iterator, this is not possible. */655656case OP_ASSERT:657case OP_ASSERT_NOT:658case OP_ONCE:659return !entered_a_group;660661/* Fixed-length lookbehinds can be treated the same way, but variable662length lookbehinds must not auto-possessify their last iterator. Note663that in order to identify a variable length lookbehind we must check664through all branches, because some may be of fixed length. */665666case OP_ASSERTBACK:667case OP_ASSERTBACK_NOT:668do669{670if (bracode[1+LINK_SIZE] == OP_VREVERSE) return FALSE; /* Variable */671bracode += GET(bracode, 1);672}673while (*bracode == OP_ALT);674return !entered_a_group; /* Not variable length */675676/* Non-atomic assertions - don't possessify last iterator. This needs677more thought. */678679case OP_ASSERT_NA:680case OP_ASSERTBACK_NA:681return FALSE;682}683684/* Skip over the bracket and inspect what comes next. */685686code += PRIV(OP_lengths)[c];687continue;688689/* Handle cases where the next item is a group. */690691case OP_ONCE:692case OP_BRA:693case OP_CBRA:694next_code = code + GET(code, 1);695code += PRIV(OP_lengths)[c];696697/* Check each branch. We have to recurse a level for all but the last698branch. */699700while (*next_code == OP_ALT)701{702if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))703return FALSE;704code = next_code + 1 + LINK_SIZE;705next_code += GET(next_code, 1);706}707708entered_a_group = TRUE;709continue;710711case OP_BRAZERO:712case OP_BRAMINZERO:713714next_code = code + 1;715if (*next_code != OP_BRA && *next_code != OP_CBRA &&716*next_code != OP_ONCE) return FALSE;717718do next_code += GET(next_code, 1); while (*next_code == OP_ALT);719720/* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */721722next_code += 1 + LINK_SIZE;723if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,724rec_limit))725return FALSE;726727code += PRIV(OP_lengths)[c];728continue;729730/* The next opcode does not need special handling; fall through and use it731to see if the base can be possessified. */732733default:734break;735}736737/* We now have the next appropriate opcode to compare with the base. Check738for a supported opcode, and load its properties. */739740code = get_chr_property_list(code, utf, ucp, cb->fcc, list);741if (code == NULL) return FALSE; /* Unsupported */742743/* If either opcode is a small character list, set pointers for comparing744characters from that list with another list, or with a property. */745746if (base_list[0] == OP_CHAR)747{748chr_ptr = base_list + 2;749list_ptr = list;750}751else if (list[0] == OP_CHAR)752{753chr_ptr = list + 2;754list_ptr = base_list;755}756757/* Character bitsets can also be compared to certain opcodes. */758759else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS760#if PCRE2_CODE_UNIT_WIDTH == 8761/* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */762|| (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))763#endif764)765{766#if PCRE2_CODE_UNIT_WIDTH == 8767if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))768#else769if (base_list[0] == OP_CLASS)770#endif771{772set1 = (const uint8_t *)(base_end - base_list[2]);773list_ptr = list;774}775else776{777set1 = (const uint8_t *)(code - list[2]);778list_ptr = base_list;779}780781invert_bits = FALSE;782switch(list_ptr[0])783{784case OP_CLASS:785case OP_NCLASS:786set2 = (const uint8_t *)787((list_ptr == list ? code : base_end) - list_ptr[2]);788break;789790#ifdef SUPPORT_WIDE_CHARS791case OP_XCLASS:792xclass_flags = (list_ptr == list ? code : base_end) -793list_ptr[2] + LINK_SIZE;794if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;795if ((*xclass_flags & XCL_MAP) == 0)796{797/* No bits are set for characters < 256. */798if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0;799/* Might be an empty repeat. */800continue;801}802set2 = (const uint8_t *)(xclass_flags + 1);803break;804#endif805806case OP_NOT_DIGIT:807invert_bits = TRUE;808/* Fall through */809case OP_DIGIT:810set2 = (const uint8_t *)(cb->cbits + cbit_digit);811break;812813case OP_NOT_WHITESPACE:814invert_bits = TRUE;815/* Fall through */816case OP_WHITESPACE:817set2 = (const uint8_t *)(cb->cbits + cbit_space);818break;819820case OP_NOT_WORDCHAR:821invert_bits = TRUE;822/* Fall through */823case OP_WORDCHAR:824set2 = (const uint8_t *)(cb->cbits + cbit_word);825break;826827default:828return FALSE;829}830831/* Because the bit sets are unaligned bytes, we need to perform byte832comparison here. */833834set_end = set1 + 32;835if (invert_bits)836{837do838{839if ((*set1++ & ~(*set2++)) != 0) return FALSE;840}841while (set1 < set_end);842}843else844{845do846{847if ((*set1++ & *set2++) != 0) return FALSE;848}849while (set1 < set_end);850}851852if (list[1] == 0) return TRUE;853/* Might be an empty repeat. */854continue;855}856857/* Some property combinations also acceptable. Unicode property opcodes are858processed specially; the rest can be handled with a lookup table. */859860else861{862uint32_t leftop, rightop;863864leftop = base_list[0];865rightop = list[0];866867#ifdef SUPPORT_UNICODE868accepted = FALSE; /* Always set in non-unicode case. */869if (leftop == OP_PROP || leftop == OP_NOTPROP)870{871if (rightop == OP_EOD)872accepted = TRUE;873else if (rightop == OP_PROP || rightop == OP_NOTPROP)874{875int n;876const uint8_t *p;877BOOL same = leftop == rightop;878BOOL lisprop = leftop == OP_PROP;879BOOL risprop = rightop == OP_PROP;880BOOL bothprop = lisprop && risprop;881882/* There's a table that specifies how each combination is to be883processed:8840 Always return FALSE (never auto-possessify)8851 Character groups are distinct (possessify if both are OP_PROP)8862 Check character categories in the same group (general or particular)8873 Return TRUE if the two opcodes are not the same888... see comments below889*/890891n = propposstab[base_list[2]][list[2]];892switch(n)893{894case 0: break;895case 1: accepted = bothprop; break;896case 2: accepted = (base_list[3] == list[3]) != same; break;897case 3: accepted = !same; break;898899case 4: /* Left general category, right particular category */900accepted = risprop && catposstab[base_list[3]][list[3]] == same;901break;902903case 5: /* Right general category, left particular category */904accepted = lisprop && catposstab[list[3]][base_list[3]] == same;905break;906907/* This code is logically tricky. Think hard before fiddling with it.908The posspropstab table has four entries per row. Each row relates to909one of PCRE's special properties such as ALNUM or SPACE or WORD.910Only WORD actually needs all four entries, but using repeats for the911others means they can all use the same code below.912913The first two entries in each row are Unicode general categories, and914apply always, because all the characters they include are part of the915PCRE character set. The third and fourth entries are a general and a916particular category, respectively, that include one or more relevant917characters. One or the other is used, depending on whether the check918is for a general or a particular category. However, in both cases the919category contains more characters than the specials that are defined920for the property being tested against. Therefore, it cannot be used921in a NOTPROP case.922923Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.924Underscore is covered by ucp_P or ucp_Po. */925926case 6: /* Left alphanum vs right general category */927case 7: /* Left space vs right general category */928case 8: /* Left word vs right general category */929p = posspropstab[n-6];930accepted = risprop && lisprop ==931(list[3] != p[0] &&932list[3] != p[1] &&933(list[3] != p[2] || !lisprop));934break;935936case 9: /* Right alphanum vs left general category */937case 10: /* Right space vs left general category */938case 11: /* Right word vs left general category */939p = posspropstab[n-9];940accepted = lisprop && risprop ==941(base_list[3] != p[0] &&942base_list[3] != p[1] &&943(base_list[3] != p[2] || !risprop));944break;945946case 12: /* Left alphanum vs right particular category */947case 13: /* Left space vs right particular category */948case 14: /* Left word vs right particular category */949p = posspropstab[n-12];950accepted = risprop && lisprop ==951(catposstab[p[0]][list[3]] &&952catposstab[p[1]][list[3]] &&953(list[3] != p[3] || !lisprop));954break;955956case 15: /* Right alphanum vs left particular category */957case 16: /* Right space vs left particular category */958case 17: /* Right word vs left particular category */959p = posspropstab[n-15];960accepted = lisprop && risprop ==961(catposstab[p[0]][base_list[3]] &&962catposstab[p[1]][base_list[3]] &&963(base_list[3] != p[3] || !risprop));964break;965}966}967}968969else970#endif /* SUPPORT_UNICODE */971972accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&973rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&974autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];975976if (!accepted) return FALSE;977978if (list[1] == 0) return TRUE;979/* Might be an empty repeat. */980continue;981}982983/* Control reaches here only if one of the items is a small character list.984All characters are checked against the other side. */985986do987{988chr = *chr_ptr;989990switch(list_ptr[0])991{992case OP_CHAR:993ochr_ptr = list_ptr + 2;994do995{996if (chr == *ochr_ptr) return FALSE;997ochr_ptr++;998}999while(*ochr_ptr != NOTACHAR);1000break;10011002case OP_NOT:1003ochr_ptr = list_ptr + 2;1004do1005{1006if (chr == *ochr_ptr)1007break;1008ochr_ptr++;1009}1010while(*ochr_ptr != NOTACHAR);1011if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */1012break;10131014/* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not*1015set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */10161017case OP_DIGIT:1018if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE;1019break;10201021case OP_NOT_DIGIT:1022if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE;1023break;10241025case OP_WHITESPACE:1026if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE;1027break;10281029case OP_NOT_WHITESPACE:1030if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE;1031break;10321033case OP_WORDCHAR:1034if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE;1035break;10361037case OP_NOT_WORDCHAR:1038if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE;1039break;10401041case OP_HSPACE:1042switch(chr)1043{1044HSPACE_CASES: return FALSE;1045default: break;1046}1047break;10481049case OP_NOT_HSPACE:1050switch(chr)1051{1052HSPACE_CASES: break;1053default: return FALSE;1054}1055break;10561057case OP_ANYNL:1058case OP_VSPACE:1059switch(chr)1060{1061VSPACE_CASES: return FALSE;1062default: break;1063}1064break;10651066case OP_NOT_VSPACE:1067switch(chr)1068{1069VSPACE_CASES: break;1070default: return FALSE;1071}1072break;10731074case OP_DOLL:1075case OP_EODN:1076switch (chr)1077{1078case CHAR_CR:1079case CHAR_LF:1080case CHAR_VT:1081case CHAR_FF:1082case CHAR_NEL:1083#ifndef EBCDIC1084case 0x2028:1085case 0x2029:1086#endif /* Not EBCDIC */1087return FALSE;1088}1089break;10901091case OP_EOD: /* Can always possessify before \z */1092break;10931094#ifdef SUPPORT_UNICODE1095case OP_PROP:1096case OP_NOTPROP:1097if (!check_char_prop(chr, list_ptr[2], list_ptr[3],1098list_ptr[0] == OP_NOTPROP))1099return FALSE;1100break;1101#endif11021103case OP_NCLASS:1104if (chr > 255) return FALSE;1105/* Fall through */11061107case OP_CLASS:1108if (chr > 255) break;1109class_bitset = (const uint8_t *)1110((list_ptr == list ? code : base_end) - list_ptr[2]);1111if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE;1112break;11131114#ifdef SUPPORT_WIDE_CHARS1115case OP_XCLASS:1116if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -1117list_ptr[2] + LINK_SIZE, (const uint8_t*)cb->start_code, utf))1118return FALSE;1119break;11201121case OP_ECLASS:1122if (PRIV(eclass)(chr,1123(list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE,1124(list_ptr == list ? code : base_end) - list_ptr[3],1125(const uint8_t*)cb->start_code, utf))1126return FALSE;1127break;1128#endif /* SUPPORT_WIDE_CHARS */11291130default:1131return FALSE;1132}11331134chr_ptr++;1135}1136while(*chr_ptr != NOTACHAR);11371138/* At least one character must be matched from this opcode. */11391140if (list[1] == 0) return TRUE;1141}11421143PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */1144return FALSE; /* Avoid compiler warnings */1145}1146114711481149/*************************************************1150* Scan compiled regex for auto-possession *1151*************************************************/11521153/* Replaces single character iterations with their possessive alternatives1154if appropriate. This function modifies the compiled opcode! Hitting a1155non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a1156bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches1157overly complicated or large patterns. In these cases, the check just stops,1158leaving the remainder of the pattern unpossessified.11591160Arguments:1161code points to start of the byte code1162cb compile data block11631164Returns: 0 for success1165-1 if a non-existant opcode is encountered1166*/11671168int1169PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)1170{1171PCRE2_UCHAR c;1172PCRE2_SPTR end;1173PCRE2_UCHAR *repeat_opcode;1174uint32_t list[MAX_LIST];1175int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */1176BOOL utf = (cb->external_options & PCRE2_UTF) != 0;1177BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;11781179for (;;)1180{1181c = *code;11821183if (c >= OP_TABLE_LENGTH)1184{1185PCRE2_DEBUG_UNREACHABLE();1186return -1; /* Something gone wrong */1187}11881189if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)1190{1191c -= get_repeat_base(c) - OP_STAR;1192end = (c <= OP_MINUPTO) ?1193get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;1194list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;11951196if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,1197&rec_limit))1198{1199switch(c)1200{1201case OP_STAR:1202*code += OP_POSSTAR - OP_STAR;1203break;12041205case OP_MINSTAR:1206*code += OP_POSSTAR - OP_MINSTAR;1207break;12081209case OP_PLUS:1210*code += OP_POSPLUS - OP_PLUS;1211break;12121213case OP_MINPLUS:1214*code += OP_POSPLUS - OP_MINPLUS;1215break;12161217case OP_QUERY:1218*code += OP_POSQUERY - OP_QUERY;1219break;12201221case OP_MINQUERY:1222*code += OP_POSQUERY - OP_MINQUERY;1223break;12241225case OP_UPTO:1226*code += OP_POSUPTO - OP_UPTO;1227break;12281229case OP_MINUPTO:1230*code += OP_POSUPTO - OP_MINUPTO;1231break;1232}1233}1234c = *code;1235}1236else if (c == OP_CLASS || c == OP_NCLASS1237#ifdef SUPPORT_WIDE_CHARS1238|| c == OP_XCLASS || c == OP_ECLASS1239#endif1240)1241{1242#ifdef SUPPORT_WIDE_CHARS1243if (c == OP_XCLASS || c == OP_ECLASS)1244repeat_opcode = code + GET(code, 1);1245else1246#endif1247repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR));12481249c = *repeat_opcode;1250if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)1251{1252/* The return from get_chr_property_list() will never be NULL when1253*code (aka c) is one of the four class opcodes. However, gcc with1254-fanalyzer notes that a NULL return is possible, and grumbles. Hence we1255put in a check. */12561257end = get_chr_property_list(code, utf, ucp, cb->fcc, list);1258list[1] = (c & 1) == 0;12591260if (end != NULL &&1261compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))1262{1263switch (c)1264{1265case OP_CRSTAR:1266case OP_CRMINSTAR:1267*repeat_opcode = OP_CRPOSSTAR;1268break;12691270case OP_CRPLUS:1271case OP_CRMINPLUS:1272*repeat_opcode = OP_CRPOSPLUS;1273break;12741275case OP_CRQUERY:1276case OP_CRMINQUERY:1277*repeat_opcode = OP_CRPOSQUERY;1278break;12791280case OP_CRRANGE:1281case OP_CRMINRANGE:1282*repeat_opcode = OP_CRPOSRANGE;1283break;1284}1285}1286}1287c = *code;1288}12891290switch(c)1291{1292case OP_END:1293return 0;12941295case OP_TYPESTAR:1296case OP_TYPEMINSTAR:1297case OP_TYPEPLUS:1298case OP_TYPEMINPLUS:1299case OP_TYPEQUERY:1300case OP_TYPEMINQUERY:1301case OP_TYPEPOSSTAR:1302case OP_TYPEPOSPLUS:1303case OP_TYPEPOSQUERY:1304if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;1305break;13061307case OP_TYPEUPTO:1308case OP_TYPEMINUPTO:1309case OP_TYPEEXACT:1310case OP_TYPEPOSUPTO:1311if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)1312code += 2;1313break;13141315case OP_CALLOUT_STR:1316code += GET(code, 1 + 2*LINK_SIZE);1317break;13181319#ifdef SUPPORT_WIDE_CHARS1320case OP_XCLASS:1321case OP_ECLASS:1322code += GET(code, 1);1323break;1324#endif13251326case OP_MARK:1327case OP_COMMIT_ARG:1328case OP_PRUNE_ARG:1329case OP_SKIP_ARG:1330case OP_THEN_ARG:1331code += code[1];1332break;1333}13341335/* Add in the fixed length from the table */13361337code += PRIV(OP_lengths)[c];13381339/* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be1340followed by a multi-byte character. The length in the table is a minimum, so1341we have to arrange to skip the extra code units. */13421343#ifdef MAYBE_UTF_MULTI1344if (utf) switch(c)1345{1346case OP_CHAR:1347case OP_CHARI:1348case OP_NOT:1349case OP_NOTI:1350case OP_STAR:1351case OP_MINSTAR:1352case OP_PLUS:1353case OP_MINPLUS:1354case OP_QUERY:1355case OP_MINQUERY:1356case OP_UPTO:1357case OP_MINUPTO:1358case OP_EXACT:1359case OP_POSSTAR:1360case OP_POSPLUS:1361case OP_POSQUERY:1362case OP_POSUPTO:1363case OP_STARI:1364case OP_MINSTARI:1365case OP_PLUSI:1366case OP_MINPLUSI:1367case OP_QUERYI:1368case OP_MINQUERYI:1369case OP_UPTOI:1370case OP_MINUPTOI:1371case OP_EXACTI:1372case OP_POSSTARI:1373case OP_POSPLUSI:1374case OP_POSQUERYI:1375case OP_POSUPTOI:1376case OP_NOTSTAR:1377case OP_NOTMINSTAR:1378case OP_NOTPLUS:1379case OP_NOTMINPLUS:1380case OP_NOTQUERY:1381case OP_NOTMINQUERY:1382case OP_NOTUPTO:1383case OP_NOTMINUPTO:1384case OP_NOTEXACT:1385case OP_NOTPOSSTAR:1386case OP_NOTPOSPLUS:1387case OP_NOTPOSQUERY:1388case OP_NOTPOSUPTO:1389case OP_NOTSTARI:1390case OP_NOTMINSTARI:1391case OP_NOTPLUSI:1392case OP_NOTMINPLUSI:1393case OP_NOTQUERYI:1394case OP_NOTMINQUERYI:1395case OP_NOTUPTOI:1396case OP_NOTMINUPTOI:1397case OP_NOTEXACTI:1398case OP_NOTPOSSTARI:1399case OP_NOTPOSPLUSI:1400case OP_NOTPOSQUERYI:1401case OP_NOTPOSUPTOI:1402if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);1403break;1404}1405#else1406(void)(utf); /* Keep compiler happy by referencing function argument */1407#endif /* SUPPORT_WIDE_CHARS */1408}1409}14101411/* End of pcre2_auto_possess.c */141214131414