Path: blob/master/thirdparty/pcre2/src/pcre2_auto_possess.c
21971 views
/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8Original API code Copyright (c) 1997-2012 University of Cambridge9New API code Copyright (c) 2016-2024 University of Cambridge1011-----------------------------------------------------------------------------12Redistribution and use in source and binary forms, with or without13modification, are permitted provided that the following conditions are met:1415* Redistributions of source code must retain the above copyright notice,16this list of conditions and the following disclaimer.1718* Redistributions in binary form must reproduce the above copyright19notice, this list of conditions and the following disclaimer in the20documentation and/or other materials provided with the distribution.2122* Neither the name of the University of Cambridge nor the names of its23contributors may be used to endorse or promote products derived from24this software without specific prior written permission.2526THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE36POSSIBILITY OF SUCH DAMAGE.37-----------------------------------------------------------------------------38*/394041/* This module contains functions that scan a compiled pattern and change42repeats into possessive repeats where possible. */434445#include "pcre2_internal.h"46474849/* This macro represents the max size of list[] and that is used to keep50track of UCD info in several places, it should be kept on sync with the51value used by GenerateUcd.py */52#define MAX_LIST 85354/*************************************************55* Tables for auto-possessification *56*************************************************/5758/* This table is used to check whether auto-possessification is possible59between adjacent character-type opcodes. The left-hand (repeated) opcode is60used to select the row, and the right-hand opcode is use to select the column.61A value of 1 means that auto-possessification is OK. For example, the second62value in the first row means that \D+\d can be turned into \D++\d.6364The Unicode property types (\P and \p) have to be present to fill out the table65because of what their opcode values are, but the table values should always be66zero because property types are handled separately in the code. The last four67columns apply to items that cannot be repeated, so there is no need to have68rows for them. Note that OP_DIGIT etc. are generated only when PCRE2_UCP is69*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */7071#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)72#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)7374static const uint8_t autoposstab[APTROWS][APTCOLS] = {75/* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */76{ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */77{ 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */78{ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */79{ 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */80{ 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */81{ 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */82{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */83{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */84{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */85{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */86{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */87{ 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */88{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */89{ 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */90{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */91{ 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */92{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */93};9495#ifdef SUPPORT_UNICODE96/* This table is used to check whether auto-possessification is possible97between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The98left-hand (repeated) opcode is used to select the row, and the right-hand99opcode is used to select the column. The values are as follows:1001010 Always return FALSE (never auto-possessify)1021 Character groups are distinct (possessify if both are OP_PROP)1032 Check character categories in the same group (general or particular)1043 TRUE if the two opcodes are not the same (PROP vs NOTPROP)1051064 Check left general category vs right particular category1075 Check right general category vs left particular category1081096 Left alphanum vs right general category1107 Left space vs right general category1118 Left word vs right general category1121139 Right alphanum vs left general category11410 Right space vs left general category11511 Right word vs left general category11611712 Left alphanum vs right particular category11813 Left space vs right particular category11914 Left word vs right particular category12012115 Right alphanum vs left particular category12216 Right space vs left particular category12317 Right word vs left particular category124*/125126static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {127/* LAMP GC PC SC SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */128{ 3, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_LAMP */129{ 0, 2, 4, 0, 0, 9, 10, 10, 11, 0, 0, 0, 0 }, /* PT_GC */130{ 0, 5, 2, 0, 0, 15, 16, 16, 17, 0, 0, 0, 0 }, /* PT_PC */131{ 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SC */132{ 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SCX */133{ 3, 6, 12, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_ALNUM */134{ 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_SPACE */135{ 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_PXSPACE */136{ 0, 8, 14, 0, 0, 0, 1, 1, 3, 0, 0, 0, 0 }, /* PT_WORD */137{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */138{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0 }, /* PT_UCNC */139{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_BIDICL */140{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* PT_BOOL */141/* PT_ANY does not need a record. */142};143144/* This table is used to check whether auto-possessification is possible145between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one146specifies a general category and the other specifies a particular category. The147row is selected by the general category and the column by the particular148category. The value is 1 if the particular category is not part of the general149category. */150151static const uint8_t catposstab[7][30] = {152/* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */153{ 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */154{ 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */155{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */156{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */157{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */158{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */159{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */160};161162/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against163a general or particular category. The properties in each row are those164that apply to the character set in question. Duplication means that a little165unnecessary work is done when checking, but this keeps things much simpler166because they can all use the same code. For more details see the comment where167this table is used.168169Note: SPACE and PXSPACE used to be different because Perl excluded VT from170"space", but from Perl 5.18 it's included, so both categories are treated the171same here. */172173static const uint8_t posspropstab[3][4] = {174{ ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */175{ ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */176{ ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */177};178#endif /* SUPPORT_UNICODE */179180181182#ifdef SUPPORT_UNICODE183/*************************************************184* Check a character and a property *185*************************************************/186187/* This function is called by compare_opcodes() when a property item is188adjacent to a fixed character.189190Arguments:191c the character192ptype the property type193pdata the data for the type194negated TRUE if it's a negated property (\P or \p{^)195196Returns: TRUE if auto-possessifying is OK197*/198199static BOOL200check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,201BOOL negated)202{203BOOL ok, rc;204const uint32_t *p;205const ucd_record *prop = GET_UCD(c);206207switch(ptype)208{209case PT_LAMP:210return (prop->chartype == ucp_Lu ||211prop->chartype == ucp_Ll ||212prop->chartype == ucp_Lt) == negated;213214case PT_GC:215return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;216217case PT_PC:218return (pdata == prop->chartype) == negated;219220case PT_SC:221return (pdata == prop->script) == negated;222223case PT_SCX:224ok = (pdata == prop->script225|| MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);226return ok == negated;227228/* These are specials */229230case PT_ALNUM:231return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||232PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;233234/* Perl space used to exclude VT, but from Perl 5.18 it is included, which235means that Perl space and POSIX space are now identical. PCRE was changed236at release 8.34. */237238case PT_SPACE: /* Perl space */239case PT_PXSPACE: /* POSIX space */240switch(c)241{242HSPACE_CASES:243VSPACE_CASES:244rc = negated;245break;246247default:248rc = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;249}250return rc;251252case PT_WORD:253return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||254PRIV(ucp_gentype)[prop->chartype] == ucp_N ||255c == CHAR_UNDERSCORE) == negated;256257case PT_CLIST:258p = PRIV(ucd_caseless_sets) + prop->caseset;259for (;;)260{261if (c < *p) return !negated;262if (c == *p++) return negated;263}264/* LCOV_EXCL_START */265PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */266break;267/* LCOV_EXCL_STOP */268269/* Haven't yet thought these through. */270271case PT_BIDICL:272return FALSE;273274case PT_BOOL:275return FALSE;276}277278return FALSE;279}280#endif /* SUPPORT_UNICODE */281282283284/*************************************************285* Base opcode of repeated opcodes *286*************************************************/287288/* Returns the base opcode for repeated single character type opcodes. If the289opcode is not a repeated character type, it returns with the original value.290291Arguments: c opcode292Returns: base opcode for the type293*/294295static PCRE2_UCHAR296get_repeat_base(PCRE2_UCHAR c)297{298return (c > OP_TYPEPOSUPTO)? c :299(c >= OP_TYPESTAR)? OP_TYPESTAR :300(c >= OP_NOTSTARI)? OP_NOTSTARI :301(c >= OP_NOTSTAR)? OP_NOTSTAR :302(c >= OP_STARI)? OP_STARI :303OP_STAR;304}305306307/*************************************************308* Fill the character property list *309*************************************************/310311/* Checks whether the code points to an opcode that can take part in auto-312possessification, and if so, fills a list with its properties.313314Arguments:315code points to start of expression316utf TRUE if in UTF mode317ucp TRUE if in UCP mode318fcc points to the case-flipping table319list points to output list320list[0] will be filled with the opcode321list[1] will be non-zero if this opcode322can match an empty character string323list[2..7] depends on the opcode324325Returns: points to the start of the next opcode if *code is accepted326NULL if *code is not accepted327*/328329static PCRE2_SPTR330get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,331uint32_t *list)332{333PCRE2_UCHAR c = *code;334PCRE2_UCHAR base;335PCRE2_SPTR end;336PCRE2_SPTR class_end;337uint32_t chr;338339#ifdef SUPPORT_UNICODE340uint32_t *clist_dest;341const uint32_t *clist_src;342#else343(void)utf; /* Suppress "unused parameter" compiler warnings */344(void)ucp;345#endif346347list[0] = c;348list[1] = FALSE;349code++;350351if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)352{353base = get_repeat_base(c);354c -= (base - OP_STAR);355356if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)357code += IMM2_SIZE;358359list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT &&360c != OP_POSPLUS);361362switch(base)363{364case OP_STAR:365list[0] = OP_CHAR;366break;367368case OP_STARI:369list[0] = OP_CHARI;370break;371372case OP_NOTSTAR:373list[0] = OP_NOT;374break;375376case OP_NOTSTARI:377list[0] = OP_NOTI;378break;379380case OP_TYPESTAR:381list[0] = *code;382code++;383break;384}385c = list[0];386}387388switch(c)389{390case OP_NOT_DIGIT:391case OP_DIGIT:392case OP_NOT_WHITESPACE:393case OP_WHITESPACE:394case OP_NOT_WORDCHAR:395case OP_WORDCHAR:396case OP_ANY:397case OP_ALLANY:398case OP_ANYNL:399case OP_NOT_HSPACE:400case OP_HSPACE:401case OP_NOT_VSPACE:402case OP_VSPACE:403case OP_EXTUNI:404case OP_EODN:405case OP_EOD:406case OP_DOLL:407case OP_DOLLM:408return code;409410case OP_CHAR:411case OP_NOT:412GETCHARINCTEST(chr, code);413list[2] = chr;414list[3] = NOTACHAR;415return code;416417case OP_CHARI:418case OP_NOTI:419list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;420GETCHARINCTEST(chr, code);421list[2] = chr;422423#ifdef SUPPORT_UNICODE424if (chr < 128 || (chr < 256 && !utf && !ucp))425list[3] = fcc[chr];426else427list[3] = UCD_OTHERCASE(chr);428#elif defined SUPPORT_WIDE_CHARS429list[3] = (chr < 256) ? fcc[chr] : chr;430#else431list[3] = fcc[chr];432#endif433434/* The othercase might be the same value. */435436if (chr == list[3])437list[3] = NOTACHAR;438else439list[4] = NOTACHAR;440return code;441442#ifdef SUPPORT_UNICODE443case OP_PROP:444case OP_NOTPROP:445if (code[0] != PT_CLIST)446{447list[2] = code[0];448list[3] = code[1];449return code + 2;450}451452/* Convert only if we have enough space. */453454clist_src = PRIV(ucd_caseless_sets) + code[1];455clist_dest = list + 2;456code += 2;457458do {459if (clist_dest >= list + MAX_LIST)460{461/* Early return if there is not enough space. GenerateUcd.py462generated a list with more than 5 characters and something463must be done about that going forward. */464PCRE2_DEBUG_UNREACHABLE(); /* Remove if it ever triggers */465list[2] = code[0];466list[3] = code[1];467return code;468}469*clist_dest++ = *clist_src;470}471while(*clist_src++ != NOTACHAR);472473/* All characters are stored. The terminating NOTACHAR is copied from the474clist itself. */475476list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;477return code;478#endif479480case OP_NCLASS:481case OP_CLASS:482#ifdef SUPPORT_WIDE_CHARS483case OP_XCLASS:484case OP_ECLASS:485if (c == OP_XCLASS || c == OP_ECLASS)486end = code + GET(code, 0) - 1;487else488#endif489end = code + 32 / sizeof(PCRE2_UCHAR);490class_end = end;491492switch(*end)493{494case OP_CRSTAR:495case OP_CRMINSTAR:496case OP_CRQUERY:497case OP_CRMINQUERY:498case OP_CRPOSSTAR:499case OP_CRPOSQUERY:500list[1] = TRUE;501end++;502break;503504case OP_CRPLUS:505case OP_CRMINPLUS:506case OP_CRPOSPLUS:507end++;508break;509510case OP_CRRANGE:511case OP_CRMINRANGE:512case OP_CRPOSRANGE:513list[1] = (GET2(end, 1) == 0);514end += 1 + 2 * IMM2_SIZE;515break;516}517list[2] = (uint32_t)(end - code);518list[3] = (uint32_t)(end - class_end);519return end;520}521522return NULL; /* Opcode not accepted */523}524525526527/*************************************************528* Scan further character sets for match *529*************************************************/530531/* Checks whether the base and the current opcode have a common character, in532which case the base cannot be possessified.533534Arguments:535code points to the byte code536utf TRUE in UTF mode537ucp TRUE in UCP mode538cb compile data block539base_list the data list of the base opcode540base_end the end of the base opcode541rec_limit points to recursion depth counter542543Returns: TRUE if the auto-possessification is possible544*/545546static BOOL547compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,548const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)549{550PCRE2_UCHAR c;551uint32_t list[MAX_LIST];552const uint32_t *chr_ptr;553const uint32_t *ochr_ptr;554const uint32_t *list_ptr;555PCRE2_SPTR next_code;556#ifdef SUPPORT_WIDE_CHARS557PCRE2_SPTR xclass_flags;558#endif559const uint8_t *class_bitset;560const uint8_t *set1, *set2, *set_end;561uint32_t chr;562BOOL accepted, invert_bits;563BOOL entered_a_group = FALSE;564565if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */566567/* Note: the base_list[1] contains whether the current opcode has a greedy568(represented by a non-zero value) quantifier. This is a different from569other character type lists, which store here that the character iterator570matches to an empty string (also represented by a non-zero value). */571572for(;;)573{574PCRE2_SPTR bracode;575576/* All operations move the code pointer forward.577Therefore infinite recursions are not possible. */578579c = *code;580581/* Skip over callouts */582583if (c == OP_CALLOUT)584{585code += PRIV(OP_lengths)[c];586continue;587}588589if (c == OP_CALLOUT_STR)590{591code += GET(code, 1 + 2*LINK_SIZE);592continue;593}594595/* At the end of a branch, skip to the end of the group and process it. */596597if (c == OP_ALT)598{599do code += GET(code, 1); while (*code == OP_ALT);600c = *code;601}602603/* Inspect the next opcode. */604605switch(c)606{607/* We can always possessify a greedy iterator at the end of the pattern,608which is reached after skipping over the final OP_KET. A non-greedy609iterator must never be possessified. */610611case OP_END:612return base_list[1] != 0;613614/* When an iterator is at the end of certain kinds of group we can inspect615what follows the group by skipping over the closing ket. Note that this616does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given617iteration is variable (could be another iteration or could be the next618item). As these two opcodes are not listed in the next switch, they will619end up as the next code to inspect, and return FALSE by virtue of being620unsupported. */621622case OP_KET:623case OP_KETRPOS:624/* The non-greedy case cannot be converted to a possessive form. */625626if (base_list[1] == 0) return FALSE;627628/* If the bracket is capturing it might be referenced by an OP_RECURSE629so its last iterator can never be possessified if the pattern contains630recursions. (This could be improved by keeping a list of group numbers that631are called by recursion.) */632633bracode = code - GET(code, 1);634switch(*bracode)635{636case OP_CBRA:637case OP_SCBRA:638case OP_CBRAPOS:639case OP_SCBRAPOS:640if (cb->had_recurse) return FALSE;641break;642643/* A script run might have to backtrack if the iterated item can match644characters from more than one script. So give up unless repeating an645explicit character. */646647case OP_SCRIPT_RUN:648if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI)649return FALSE;650break;651652/* Atomic sub-patterns and forward assertions can always auto-possessify653their last iterator. However, if the group was entered as a result of654checking a previous iterator, this is not possible. */655656case OP_ASSERT:657case OP_ASSERT_NOT:658case OP_ONCE:659return !entered_a_group;660661/* Fixed-length lookbehinds can be treated the same way, but variable662length lookbehinds must not auto-possessify their last iterator. Note663that in order to identify a variable length lookbehind we must check664through all branches, because some may be of fixed length. */665666case OP_ASSERTBACK:667case OP_ASSERTBACK_NOT:668do669{670if (bracode[1+LINK_SIZE] == OP_VREVERSE) return FALSE; /* Variable */671bracode += GET(bracode, 1);672}673while (*bracode == OP_ALT);674return !entered_a_group; /* Not variable length */675676/* Non-atomic assertions - don't possessify last iterator. This needs677more thought. */678679case OP_ASSERT_NA:680case OP_ASSERTBACK_NA:681return FALSE;682}683684/* Skip over the bracket and inspect what comes next. */685686code += PRIV(OP_lengths)[c];687continue;688689/* Handle cases where the next item is a group. */690691case OP_ONCE:692case OP_BRA:693case OP_CBRA:694next_code = code + GET(code, 1);695code += PRIV(OP_lengths)[c];696697/* Check each branch. We have to recurse a level for all but the last698branch. */699700while (*next_code == OP_ALT)701{702if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))703return FALSE;704code = next_code + 1 + LINK_SIZE;705next_code += GET(next_code, 1);706}707708entered_a_group = TRUE;709continue;710711case OP_BRAZERO:712case OP_BRAMINZERO:713714next_code = code + 1;715if (*next_code != OP_BRA && *next_code != OP_CBRA &&716*next_code != OP_ONCE) return FALSE;717718do next_code += GET(next_code, 1); while (*next_code == OP_ALT);719720/* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */721722next_code += 1 + LINK_SIZE;723if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,724rec_limit))725return FALSE;726727code += PRIV(OP_lengths)[c];728continue;729730/* The next opcode does not need special handling; fall through and use it731to see if the base can be possessified. */732733default:734break;735}736737/* We now have the next appropriate opcode to compare with the base. Check738for a supported opcode, and load its properties. */739740code = get_chr_property_list(code, utf, ucp, cb->fcc, list);741if (code == NULL) return FALSE; /* Unsupported */742743/* If either opcode is a small character list, set pointers for comparing744characters from that list with another list, or with a property. */745746if (base_list[0] == OP_CHAR)747{748chr_ptr = base_list + 2;749list_ptr = list;750}751else if (list[0] == OP_CHAR)752{753chr_ptr = list + 2;754list_ptr = base_list;755}756757/* Character bitsets can also be compared to certain opcodes. */758759else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS760#if PCRE2_CODE_UNIT_WIDTH == 8761/* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */762|| (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))763#endif764)765{766#if PCRE2_CODE_UNIT_WIDTH == 8767if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))768#else769if (base_list[0] == OP_CLASS)770#endif771{772set1 = (const uint8_t *)(base_end - base_list[2]);773list_ptr = list;774}775else776{777set1 = (const uint8_t *)(code - list[2]);778list_ptr = base_list;779}780781invert_bits = FALSE;782switch(list_ptr[0])783{784case OP_CLASS:785case OP_NCLASS:786set2 = (const uint8_t *)787((list_ptr == list ? code : base_end) - list_ptr[2]);788break;789790#ifdef SUPPORT_WIDE_CHARS791case OP_XCLASS:792xclass_flags = (list_ptr == list ? code : base_end) -793list_ptr[2] + LINK_SIZE;794if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;795if ((*xclass_flags & XCL_MAP) == 0)796{797/* No bits are set for characters < 256. */798if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0;799/* Might be an empty repeat. */800continue;801}802set2 = (const uint8_t *)(xclass_flags + 1);803break;804#endif805806case OP_NOT_DIGIT:807invert_bits = TRUE;808PCRE2_FALLTHROUGH /* Fall through */809case OP_DIGIT:810set2 = (const uint8_t *)(cb->cbits + cbit_digit);811break;812813case OP_NOT_WHITESPACE:814invert_bits = TRUE;815PCRE2_FALLTHROUGH /* Fall through */816case OP_WHITESPACE:817set2 = (const uint8_t *)(cb->cbits + cbit_space);818break;819820case OP_NOT_WORDCHAR:821invert_bits = TRUE;822PCRE2_FALLTHROUGH /* Fall through */823case OP_WORDCHAR:824set2 = (const uint8_t *)(cb->cbits + cbit_word);825break;826827default:828return FALSE;829}830831/* Because the bit sets are unaligned bytes, we need to perform byte832comparison here. */833834set_end = set1 + 32;835if (invert_bits)836{837do838{839if ((*set1++ & ~(*set2++)) != 0) return FALSE;840}841while (set1 < set_end);842}843else844{845do846{847if ((*set1++ & *set2++) != 0) return FALSE;848}849while (set1 < set_end);850}851852if (list[1] == 0) return TRUE;853/* Might be an empty repeat. */854continue;855}856857/* Some property combinations also acceptable. Unicode property opcodes are858processed specially; the rest can be handled with a lookup table. */859860else861{862uint32_t leftop, rightop;863864leftop = base_list[0];865rightop = list[0];866867#ifdef SUPPORT_UNICODE868accepted = FALSE; /* Always set in non-unicode case. */869if (leftop == OP_PROP || leftop == OP_NOTPROP)870{871if (rightop == OP_EOD)872accepted = TRUE;873else if (rightop == OP_PROP || rightop == OP_NOTPROP)874{875int n;876const uint8_t *p;877BOOL same = leftop == rightop;878BOOL lisprop = leftop == OP_PROP;879BOOL risprop = rightop == OP_PROP;880BOOL bothprop = lisprop && risprop;881882/* There's a table that specifies how each combination is to be883processed:8840 Always return FALSE (never auto-possessify)8851 Character groups are distinct (possessify if both are OP_PROP)8862 Check character categories in the same group (general or particular)8873 Return TRUE if the two opcodes are not the same888... see comments below889*/890891n = propposstab[base_list[2]][list[2]];892switch(n)893{894case 0: break;895case 1: accepted = bothprop; break;896case 2: accepted = (base_list[3] == list[3]) != same; break;897case 3: accepted = !same; break;898899case 4: /* Left general category, right particular category */900accepted = risprop && catposstab[base_list[3]][list[3]] == same;901break;902903case 5: /* Right general category, left particular category */904accepted = lisprop && catposstab[list[3]][base_list[3]] == same;905break;906907/* This code is logically tricky. Think hard before fiddling with it.908The posspropstab table has four entries per row. Each row relates to909one of PCRE's special properties such as ALNUM or SPACE or WORD.910Only WORD actually needs all four entries, but using repeats for the911others means they can all use the same code below.912913The first two entries in each row are Unicode general categories, and914apply always, because all the characters they include are part of the915PCRE character set. The third and fourth entries are a general and a916particular category, respectively, that include one or more relevant917characters. One or the other is used, depending on whether the check918is for a general or a particular category. However, in both cases the919category contains more characters than the specials that are defined920for the property being tested against. Therefore, it cannot be used921in a NOTPROP case.922923Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.924Underscore is covered by ucp_P or ucp_Po. */925926case 6: /* Left alphanum vs right general category */927case 7: /* Left space vs right general category */928case 8: /* Left word vs right general category */929p = posspropstab[n-6];930accepted = risprop && lisprop ==931(list[3] != p[0] &&932list[3] != p[1] &&933(list[3] != p[2] || !lisprop));934break;935936case 9: /* Right alphanum vs left general category */937case 10: /* Right space vs left general category */938case 11: /* Right word vs left general category */939p = posspropstab[n-9];940accepted = lisprop && risprop ==941(base_list[3] != p[0] &&942base_list[3] != p[1] &&943(base_list[3] != p[2] || !risprop));944break;945946case 12: /* Left alphanum vs right particular category */947case 13: /* Left space vs right particular category */948case 14: /* Left word vs right particular category */949p = posspropstab[n-12];950accepted = risprop && lisprop ==951(catposstab[p[0]][list[3]] &&952catposstab[p[1]][list[3]] &&953(list[3] != p[3] || !lisprop));954break;955956case 15: /* Right alphanum vs left particular category */957case 16: /* Right space vs left particular category */958case 17: /* Right word vs left particular category */959p = posspropstab[n-15];960accepted = lisprop && risprop ==961(catposstab[p[0]][base_list[3]] &&962catposstab[p[1]][base_list[3]] &&963(base_list[3] != p[3] || !risprop));964break;965}966}967}968969else970#endif /* SUPPORT_UNICODE */971972accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&973rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&974autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];975976if (!accepted) return FALSE;977978if (list[1] == 0) return TRUE;979/* Might be an empty repeat. */980continue;981}982983/* Control reaches here only if one of the items is a small character list.984All characters are checked against the other side. */985986do987{988chr = *chr_ptr;989990switch(list_ptr[0])991{992case OP_CHAR:993ochr_ptr = list_ptr + 2;994do995{996if (chr == *ochr_ptr) return FALSE;997ochr_ptr++;998}999while(*ochr_ptr != NOTACHAR);1000break;10011002case OP_NOT:1003ochr_ptr = list_ptr + 2;1004do1005{1006if (chr == *ochr_ptr)1007break;1008ochr_ptr++;1009}1010while(*ochr_ptr != NOTACHAR);1011if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */1012break;10131014/* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not*1015set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */10161017case OP_DIGIT:1018if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE;1019break;10201021case OP_NOT_DIGIT:1022if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE;1023break;10241025case OP_WHITESPACE:1026if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE;1027break;10281029case OP_NOT_WHITESPACE:1030if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE;1031break;10321033case OP_WORDCHAR:1034if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE;1035break;10361037case OP_NOT_WORDCHAR:1038if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE;1039break;10401041case OP_HSPACE:1042switch(chr)1043{1044HSPACE_CASES: return FALSE;1045default: break;1046}1047break;10481049case OP_NOT_HSPACE:1050switch(chr)1051{1052HSPACE_CASES: break;1053default: return FALSE;1054}1055break;10561057case OP_ANYNL:1058case OP_VSPACE:1059switch(chr)1060{1061VSPACE_CASES: return FALSE;1062default: break;1063}1064break;10651066case OP_NOT_VSPACE:1067switch(chr)1068{1069VSPACE_CASES: break;1070default: return FALSE;1071}1072break;10731074case OP_DOLL:1075case OP_EODN:1076switch (chr)1077{1078case CHAR_CR:1079case CHAR_LF:1080case CHAR_VT:1081case CHAR_FF:1082case CHAR_NEL:1083#ifndef EBCDIC1084case 0x2028:1085case 0x2029:1086#endif /* Not EBCDIC */1087return FALSE;1088}1089break;10901091case OP_EOD: /* Can always possessify before \z */1092break;10931094#ifdef SUPPORT_UNICODE1095case OP_PROP:1096case OP_NOTPROP:1097if (!check_char_prop(chr, list_ptr[2], list_ptr[3],1098list_ptr[0] == OP_NOTPROP))1099return FALSE;1100break;1101#endif11021103case OP_NCLASS:1104if (chr > 255) return FALSE;1105PCRE2_FALLTHROUGH /* Fall through */11061107case OP_CLASS:1108if (chr > 255) break;1109class_bitset = (const uint8_t *)1110((list_ptr == list ? code : base_end) - list_ptr[2]);1111if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE;1112break;11131114#ifdef SUPPORT_WIDE_CHARS1115case OP_XCLASS:1116if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -1117list_ptr[2] + LINK_SIZE, (const uint8_t*)cb->start_code, utf))1118return FALSE;1119break;11201121case OP_ECLASS:1122if (PRIV(eclass)(chr,1123(list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE,1124(list_ptr == list ? code : base_end) - list_ptr[3],1125(const uint8_t*)cb->start_code, utf))1126return FALSE;1127break;1128#endif /* SUPPORT_WIDE_CHARS */11291130default:1131return FALSE;1132}11331134chr_ptr++;1135}1136while(*chr_ptr != NOTACHAR);11371138/* At least one character must be matched from this opcode. */11391140if (list[1] == 0) return TRUE;1141}11421143/* LCOV_EXCL_START */1144PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */1145return FALSE; /* Avoid compiler warnings */1146/* LCOV_EXCL_STOP */1147}1148114911501151/*************************************************1152* Scan compiled regex for auto-possession *1153*************************************************/11541155/* Replaces single character iterations with their possessive alternatives1156if appropriate. This function modifies the compiled opcode! Hitting a1157non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a1158bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches1159overly complicated or large patterns. In these cases, the check just stops,1160leaving the remainder of the pattern unpossessified.11611162Arguments:1163code points to start of the byte code1164cb compile data block11651166Returns: 0 for success1167-1 if a non-existant opcode is encountered1168*/11691170int1171PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)1172{1173PCRE2_UCHAR c;1174PCRE2_SPTR end;1175PCRE2_UCHAR *repeat_opcode;1176uint32_t list[MAX_LIST];1177int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */1178BOOL utf = (cb->external_options & PCRE2_UTF) != 0;1179BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;11801181for (;;)1182{1183c = *code;11841185/* LCOV_EXCL_START */1186if (c >= OP_TABLE_LENGTH)1187{1188PCRE2_DEBUG_UNREACHABLE();1189return -1; /* Something gone wrong */1190}1191/* LCOV_EXCL_STOP */11921193if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)1194{1195c -= get_repeat_base(c) - OP_STAR;1196end = (c <= OP_MINUPTO) ?1197get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;1198list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;11991200if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,1201&rec_limit))1202{1203switch(c)1204{1205case OP_STAR:1206*code += OP_POSSTAR - OP_STAR;1207break;12081209case OP_MINSTAR:1210*code += OP_POSSTAR - OP_MINSTAR;1211break;12121213case OP_PLUS:1214*code += OP_POSPLUS - OP_PLUS;1215break;12161217case OP_MINPLUS:1218*code += OP_POSPLUS - OP_MINPLUS;1219break;12201221case OP_QUERY:1222*code += OP_POSQUERY - OP_QUERY;1223break;12241225case OP_MINQUERY:1226*code += OP_POSQUERY - OP_MINQUERY;1227break;12281229case OP_UPTO:1230*code += OP_POSUPTO - OP_UPTO;1231break;12321233case OP_MINUPTO:1234*code += OP_POSUPTO - OP_MINUPTO;1235break;1236}1237}1238c = *code;1239}1240else if (c == OP_CLASS || c == OP_NCLASS1241#ifdef SUPPORT_WIDE_CHARS1242|| c == OP_XCLASS || c == OP_ECLASS1243#endif1244)1245{1246#ifdef SUPPORT_WIDE_CHARS1247if (c == OP_XCLASS || c == OP_ECLASS)1248repeat_opcode = code + GET(code, 1);1249else1250#endif1251repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR));12521253c = *repeat_opcode;1254if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)1255{1256/* The return from get_chr_property_list() will never be NULL when1257*code (aka c) is one of the four class opcodes. However, gcc with1258-fanalyzer notes that a NULL return is possible, and grumbles. Hence we1259put in a check. */12601261end = get_chr_property_list(code, utf, ucp, cb->fcc, list);1262list[1] = (c & 1) == 0;12631264if (end != NULL &&1265compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))1266{1267switch (c)1268{1269case OP_CRSTAR:1270case OP_CRMINSTAR:1271*repeat_opcode = OP_CRPOSSTAR;1272break;12731274case OP_CRPLUS:1275case OP_CRMINPLUS:1276*repeat_opcode = OP_CRPOSPLUS;1277break;12781279case OP_CRQUERY:1280case OP_CRMINQUERY:1281*repeat_opcode = OP_CRPOSQUERY;1282break;12831284case OP_CRRANGE:1285case OP_CRMINRANGE:1286*repeat_opcode = OP_CRPOSRANGE;1287break;1288}1289}1290}1291c = *code;1292}12931294switch(c)1295{1296case OP_END:1297return 0;12981299case OP_TYPESTAR:1300case OP_TYPEMINSTAR:1301case OP_TYPEPLUS:1302case OP_TYPEMINPLUS:1303case OP_TYPEQUERY:1304case OP_TYPEMINQUERY:1305case OP_TYPEPOSSTAR:1306case OP_TYPEPOSPLUS:1307case OP_TYPEPOSQUERY:1308if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;1309break;13101311case OP_TYPEUPTO:1312case OP_TYPEMINUPTO:1313case OP_TYPEEXACT:1314case OP_TYPEPOSUPTO:1315if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)1316code += 2;1317break;13181319case OP_CALLOUT_STR:1320code += GET(code, 1 + 2*LINK_SIZE);1321break;13221323#ifdef SUPPORT_WIDE_CHARS1324case OP_XCLASS:1325case OP_ECLASS:1326code += GET(code, 1);1327break;1328#endif13291330case OP_MARK:1331case OP_COMMIT_ARG:1332case OP_PRUNE_ARG:1333case OP_SKIP_ARG:1334case OP_THEN_ARG:1335code += code[1];1336break;1337}13381339/* Add in the fixed length from the table */13401341code += PRIV(OP_lengths)[c];13421343/* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be1344followed by a multi-byte character. The length in the table is a minimum, so1345we have to arrange to skip the extra code units. */13461347#ifdef MAYBE_UTF_MULTI1348if (utf) switch(c)1349{1350case OP_CHAR:1351case OP_CHARI:1352case OP_NOT:1353case OP_NOTI:1354case OP_STAR:1355case OP_MINSTAR:1356case OP_PLUS:1357case OP_MINPLUS:1358case OP_QUERY:1359case OP_MINQUERY:1360case OP_UPTO:1361case OP_MINUPTO:1362case OP_EXACT:1363case OP_POSSTAR:1364case OP_POSPLUS:1365case OP_POSQUERY:1366case OP_POSUPTO:1367case OP_STARI:1368case OP_MINSTARI:1369case OP_PLUSI:1370case OP_MINPLUSI:1371case OP_QUERYI:1372case OP_MINQUERYI:1373case OP_UPTOI:1374case OP_MINUPTOI:1375case OP_EXACTI:1376case OP_POSSTARI:1377case OP_POSPLUSI:1378case OP_POSQUERYI:1379case OP_POSUPTOI:1380case OP_NOTSTAR:1381case OP_NOTMINSTAR:1382case OP_NOTPLUS:1383case OP_NOTMINPLUS:1384case OP_NOTQUERY:1385case OP_NOTMINQUERY:1386case OP_NOTUPTO:1387case OP_NOTMINUPTO:1388case OP_NOTEXACT:1389case OP_NOTPOSSTAR:1390case OP_NOTPOSPLUS:1391case OP_NOTPOSQUERY:1392case OP_NOTPOSUPTO:1393case OP_NOTSTARI:1394case OP_NOTMINSTARI:1395case OP_NOTPLUSI:1396case OP_NOTMINPLUSI:1397case OP_NOTQUERYI:1398case OP_NOTMINQUERYI:1399case OP_NOTUPTOI:1400case OP_NOTMINUPTOI:1401case OP_NOTEXACTI:1402case OP_NOTPOSSTARI:1403case OP_NOTPOSPLUSI:1404case OP_NOTPOSQUERYI:1405case OP_NOTPOSUPTOI:1406if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);1407break;1408}1409#else1410(void)(utf); /* Keep compiler happy by referencing function argument */1411#endif /* SUPPORT_WIDE_CHARS */1412}1413}14141415/* End of pcre2_auto_possess.c */141614171418