Path: blob/master/thirdparty/pcre2/src/pcre2_study.c
21409 views
/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8Original API code Copyright (c) 1997-2012 University of Cambridge9New API code Copyright (c) 2016-2024 University of Cambridge1011-----------------------------------------------------------------------------12Redistribution and use in source and binary forms, with or without13modification, are permitted provided that the following conditions are met:1415* Redistributions of source code must retain the above copyright notice,16this list of conditions and the following disclaimer.1718* Redistributions in binary form must reproduce the above copyright19notice, this list of conditions and the following disclaimer in the20documentation and/or other materials provided with the distribution.2122* Neither the name of the University of Cambridge nor the names of its23contributors may be used to endorse or promote products derived from24this software without specific prior written permission.2526THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE36POSSIBILITY OF SUCH DAMAGE.37-----------------------------------------------------------------------------38*/394041/* This module contains functions for scanning a compiled pattern and42collecting data (e.g. minimum matching length). */434445#include "pcre2_internal.h"46474849/* The maximum remembered capturing brackets minimum. */5051#define MAX_CACHE_BACKREF 1285253/* Set a bit in the starting code unit bit map. */5455#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))5657/* Returns from set_start_bits() */5859enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };606162/*************************************************63* Find the minimum subject length for a group *64*************************************************/6566/* Scan a parenthesized group and compute the minimum length of subject that67is needed to match it. This is a lower bound; it does not mean there is a68string of that length that matches. In UTF mode, the result is in characters69rather than code units. The field in a compiled pattern for storing the minimum70length is 16-bits long (on the grounds that anything longer than that is71pathological), so we give up when we reach that amount. This also means that72integer overflow for really crazy patterns cannot happen.7374Backreference minimum lengths are cached to speed up multiple references. This75function is called only when the highest back reference in the pattern is less76than or equal to MAX_CACHE_BACKREF, which is one less than the size of the77caching vector. The zeroth element contains the number of the highest set78value.7980Arguments:81re compiled pattern block82code pointer to start of group (the bracket)83startcode pointer to start of the whole pattern's code84utf UTF flag85recurses chain of recurse_check to catch mutual recursion86countptr pointer to call count (to catch over complexity)87backref_cache vector for caching back references.8889This function is no longer called when the pattern contains (*ACCEPT); however,90the old code for returning -1 is retained, just in case.9192Returns: the minimum length93-1 \C in UTF-8 mode94or (*ACCEPT)95or pattern too complicated96-2 internal error (missing capturing bracket)97-3 internal error (opcode not listed)98*/99100static int101find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,102PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,103int *backref_cache)104{105int length = -1;106int branchlength = 0;107int prev_cap_recno = -1;108int prev_cap_d = 0;109int prev_recurse_recno = -1;110int prev_recurse_d = 0;111uint32_t once_fudge = 0;112BOOL had_recurse = FALSE;113BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;114PCRE2_SPTR nextbranch = code + GET(code, 1);115PCRE2_SPTR cc = code + 1 + LINK_SIZE;116recurse_check this_recurse;117118/* If this is a "could be empty" group, its minimum length is 0. */119120if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;121122/* Skip over capturing bracket number */123124if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;125126/* A large and/or complex regex can take too long to process. */127128if ((*countptr)++ > 1000) return -1;129130/* Scan along the opcodes for this branch. If we get to the end of the branch,131check the length against that of the other branches. If the accumulated length132passes 16-bits, reset to that value and skip the rest of the branch. */133134for (;;)135{136int d, min, recno;137PCRE2_UCHAR op;138PCRE2_SPTR cs, ce;139140if (branchlength >= (int)UINT16_MAX)141{142branchlength = UINT16_MAX;143cc = nextbranch;144}145146op = *cc;147switch (op)148{149case OP_COND:150case OP_SCOND:151152/* If there is only one branch in a condition, the implied branch has zero153length, so we don't add anything. This covers the DEFINE "condition"154automatically. If there are two branches we can treat it the same as any155other non-capturing subpattern. */156157cs = cc + GET(cc, 1);158if (*cs != OP_ALT)159{160cc = cs + 1 + LINK_SIZE;161break;162}163goto PROCESS_NON_CAPTURE;164165case OP_BRA:166/* There's a special case of OP_BRA, when it is wrapped round a repeated167OP_RECURSE. We'd like to process the latter at this level so that168remembering the value works for repeated cases. So we do nothing, but169set a fudge value to skip over the OP_KET after the recurse. */170171if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)172{173once_fudge = 1 + LINK_SIZE;174cc += 1 + LINK_SIZE;175break;176}177PCRE2_FALLTHROUGH /* Fall through */178179case OP_ONCE:180case OP_SCRIPT_RUN:181case OP_SBRA:182case OP_BRAPOS:183case OP_SBRAPOS:184PROCESS_NON_CAPTURE:185d = find_minlength(re, cc, startcode, utf, recurses, countptr,186backref_cache);187if (d < 0) return d;188branchlength += d;189do cc += GET(cc, 1); while (*cc == OP_ALT);190cc += 1 + LINK_SIZE;191break;192193/* To save time for repeated capturing subpatterns, we remember the194length of the previous one. Unfortunately we can't do the same for195the unnumbered ones above. Nor can we do this if (?| is present in the196pattern because captures with the same number are not then identical. */197198case OP_CBRA:199case OP_SCBRA:200case OP_CBRAPOS:201case OP_SCBRAPOS:202recno = (int)GET2(cc, 1+LINK_SIZE);203if (dupcapused || recno != prev_cap_recno)204{205prev_cap_recno = recno;206prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,207backref_cache);208if (prev_cap_d < 0) return prev_cap_d;209}210branchlength += prev_cap_d;211do cc += GET(cc, 1); while (*cc == OP_ALT);212cc += 1 + LINK_SIZE;213break;214215/* ACCEPT makes things far too complicated; we have to give up. In fact,216from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not217used. However, leave the code in place, just in case. */218219case OP_ACCEPT:220case OP_ASSERT_ACCEPT:221return -1;222223/* Reached end of a branch; if it's a ket it is the end of a nested224call. If it's ALT it is an alternation in a nested call. If it is END it's225the end of the outer call. All can be handled by the same code. If the226length of any branch is zero, there is no need to scan any subsequent227branches. */228229case OP_ALT:230case OP_KET:231case OP_KETRMAX:232case OP_KETRMIN:233case OP_KETRPOS:234case OP_END:235if (length < 0 || (!had_recurse && branchlength < length))236length = branchlength;237if (op != OP_ALT || length == 0) return length;238nextbranch = cc + GET(cc, 1);239cc += 1 + LINK_SIZE;240branchlength = 0;241had_recurse = FALSE;242break;243244/* Skip over assertive subpatterns */245246case OP_ASSERT:247case OP_ASSERT_NOT:248case OP_ASSERTBACK:249case OP_ASSERTBACK_NOT:250case OP_ASSERT_NA:251case OP_ASSERT_SCS:252case OP_ASSERTBACK_NA:253do cc += GET(cc, 1); while (*cc == OP_ALT);254PCRE2_FALLTHROUGH /* Fall through */255256/* Skip over things that don't match chars */257258case OP_REVERSE:259case OP_VREVERSE:260case OP_CREF:261case OP_DNCREF:262case OP_RREF:263case OP_DNRREF:264case OP_FALSE:265case OP_TRUE:266case OP_CALLOUT:267case OP_SOD:268case OP_SOM:269case OP_EOD:270case OP_EODN:271case OP_CIRC:272case OP_CIRCM:273case OP_DOLL:274case OP_DOLLM:275case OP_NOT_WORD_BOUNDARY:276case OP_WORD_BOUNDARY:277case OP_NOT_UCP_WORD_BOUNDARY:278case OP_UCP_WORD_BOUNDARY:279cc += PRIV(OP_lengths)[*cc];280break;281282case OP_CALLOUT_STR:283cc += GET(cc, 1 + 2*LINK_SIZE);284break;285286/* Skip over a subpattern that has a {0} or {0,x} quantifier */287288case OP_BRAZERO:289case OP_BRAMINZERO:290case OP_BRAPOSZERO:291case OP_SKIPZERO:292cc += PRIV(OP_lengths)[*cc];293do cc += GET(cc, 1); while (*cc == OP_ALT);294cc += 1 + LINK_SIZE;295break;296297/* Handle literal characters and + repetitions */298299case OP_CHAR:300case OP_CHARI:301case OP_NOT:302case OP_NOTI:303case OP_PLUS:304case OP_PLUSI:305case OP_MINPLUS:306case OP_MINPLUSI:307case OP_POSPLUS:308case OP_POSPLUSI:309case OP_NOTPLUS:310case OP_NOTPLUSI:311case OP_NOTMINPLUS:312case OP_NOTMINPLUSI:313case OP_NOTPOSPLUS:314case OP_NOTPOSPLUSI:315branchlength++;316cc += 2;317#ifdef SUPPORT_UNICODE318if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);319#endif320break;321322case OP_TYPEPLUS:323case OP_TYPEMINPLUS:324case OP_TYPEPOSPLUS:325branchlength++;326cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;327break;328329/* Handle exact repetitions. The count is already in characters, but we330may need to skip over a multibyte character in UTF mode. */331332case OP_EXACT:333case OP_EXACTI:334case OP_NOTEXACT:335case OP_NOTEXACTI:336branchlength += GET2(cc,1);337cc += 2 + IMM2_SIZE;338#ifdef SUPPORT_UNICODE339if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);340#endif341break;342343case OP_TYPEEXACT:344branchlength += GET2(cc,1);345cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP346|| cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);347break;348349/* Handle single-char non-literal matchers */350351case OP_PROP:352case OP_NOTPROP:353cc += 2;354PCRE2_FALLTHROUGH /* Fall through */355356case OP_NOT_DIGIT:357case OP_DIGIT:358case OP_NOT_WHITESPACE:359case OP_WHITESPACE:360case OP_NOT_WORDCHAR:361case OP_WORDCHAR:362case OP_ANY:363case OP_ALLANY:364case OP_EXTUNI:365case OP_HSPACE:366case OP_NOT_HSPACE:367case OP_VSPACE:368case OP_NOT_VSPACE:369branchlength++;370cc++;371break;372373/* "Any newline" might match two characters, but it also might match just374one. */375376case OP_ANYNL:377branchlength += 1;378cc++;379break;380381/* The single-byte matcher means we can't proceed in UTF mode. (In382non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever383appear, but leave the code, just in case.) */384385case OP_ANYBYTE:386#ifdef SUPPORT_UNICODE387if (utf) return -1;388#endif389branchlength++;390cc++;391break;392393/* For repeated character types, we have to test for \p and \P, which have394an extra two bytes of parameters. */395396case OP_TYPESTAR:397case OP_TYPEMINSTAR:398case OP_TYPEQUERY:399case OP_TYPEMINQUERY:400case OP_TYPEPOSSTAR:401case OP_TYPEPOSQUERY:402if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;403cc += PRIV(OP_lengths)[op];404break;405406case OP_TYPEUPTO:407case OP_TYPEMINUPTO:408case OP_TYPEPOSUPTO:409if (cc[1 + IMM2_SIZE] == OP_PROP410|| cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;411cc += PRIV(OP_lengths)[op];412break;413414/* Check a class for variable quantification */415416case OP_CLASS:417case OP_NCLASS:418#ifdef SUPPORT_WIDE_CHARS419case OP_XCLASS:420case OP_ECLASS:421/* The original code caused an unsigned overflow in 64 bit systems,422so now we use a conditional statement. */423if (op == OP_XCLASS || op == OP_ECLASS)424cc += GET(cc, 1);425else426#endif427cc += PRIV(OP_lengths)[OP_CLASS];428429switch (*cc)430{431case OP_CRPLUS:432case OP_CRMINPLUS:433case OP_CRPOSPLUS:434branchlength++;435PCRE2_FALLTHROUGH /* Fall through */436437case OP_CRSTAR:438case OP_CRMINSTAR:439case OP_CRQUERY:440case OP_CRMINQUERY:441case OP_CRPOSSTAR:442case OP_CRPOSQUERY:443cc++;444break;445446case OP_CRRANGE:447case OP_CRMINRANGE:448case OP_CRPOSRANGE:449branchlength += GET2(cc,1);450cc += 1 + 2 * IMM2_SIZE;451break;452453default:454branchlength++;455break;456}457break;458459/* Backreferences and subroutine calls (OP_RECURSE) are treated in the same460way: we find the minimum length for the subpattern. A recursion461(backreference or subroutine) causes an a flag to be set that causes the462length of this branch to be ignored. The logic is that a recursion can only463make sense if there is another alternative that stops the recursing. That464will provide the minimum length (when no recursion happens).465466If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket467matches an empty string (by default it causes a matching failure), so in468that case we must set the minimum length to zero.469470For backreferenes, if duplicate numbers are present in the pattern we check471for a reference to a duplicate. If it is, we don't know which version will472be referenced, so we have to set the minimum length to zero. */473474/* Duplicate named pattern back reference. */475476case OP_DNREF:477case OP_DNREFI:478if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)479{480int count = GET2(cc, 1+IMM2_SIZE);481PCRE2_SPTR slot =482(PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code)) +483GET2(cc, 1) * re->name_entry_size;484485d = INT_MAX;486487/* Scan all groups with the same name; find the shortest. */488489while (count-- > 0)490{491int dd, i;492recno = GET2(slot, 0);493494if (recno <= backref_cache[0] && backref_cache[recno] >= 0)495dd = backref_cache[recno];496else497{498ce = cs = PRIV(find_bracket)(startcode, utf, recno);499if (cs == NULL) return -2;500do ce += GET(ce, 1); while (*ce == OP_ALT);501502dd = 0;503if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)504{505if (cc > cs && cc < ce) /* Simple recursion */506{507had_recurse = TRUE;508}509else510{511recurse_check *r = recurses;512for (r = recurses; r != NULL; r = r->prev)513if (r->group == cs) break;514if (r != NULL) /* Mutual recursion */515{516had_recurse = TRUE;517}518else519{520this_recurse.prev = recurses; /* No recursion */521this_recurse.group = cs;522dd = find_minlength(re, cs, startcode, utf, &this_recurse,523countptr, backref_cache);524if (dd < 0) return dd;525}526}527}528529backref_cache[recno] = dd;530for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;531backref_cache[0] = recno;532}533534if (dd < d) d = dd;535if (d <= 0) break; /* No point looking at any more */536slot += re->name_entry_size;537}538}539else d = 0;540cc += PRIV(OP_lengths)[*cc];541goto REPEAT_BACK_REFERENCE;542543/* Single back reference by number. References by name are converted to by544number when there is no duplication. */545546case OP_REF:547case OP_REFI:548recno = GET2(cc, 1);549if (recno <= backref_cache[0] && backref_cache[recno] >= 0)550d = backref_cache[recno];551else552{553int i;554d = 0;555556if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)557{558ce = cs = PRIV(find_bracket)(startcode, utf, recno);559if (cs == NULL) return -2;560do ce += GET(ce, 1); while (*ce == OP_ALT);561562if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)563{564if (cc > cs && cc < ce) /* Simple recursion */565{566had_recurse = TRUE;567}568else569{570recurse_check *r = recurses;571for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;572if (r != NULL) /* Mutual recursion */573{574had_recurse = TRUE;575}576else /* No recursion */577{578this_recurse.prev = recurses;579this_recurse.group = cs;580d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,581backref_cache);582if (d < 0) return d;583}584}585}586}587588backref_cache[recno] = d;589for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;590backref_cache[0] = recno;591}592593cc += PRIV(OP_lengths)[*cc];594595/* Handle repeated back references */596597REPEAT_BACK_REFERENCE:598switch (*cc)599{600case OP_CRSTAR:601case OP_CRMINSTAR:602case OP_CRQUERY:603case OP_CRMINQUERY:604case OP_CRPOSSTAR:605case OP_CRPOSQUERY:606min = 0;607cc++;608break;609610case OP_CRPLUS:611case OP_CRMINPLUS:612case OP_CRPOSPLUS:613min = 1;614cc++;615break;616617case OP_CRRANGE:618case OP_CRMINRANGE:619case OP_CRPOSRANGE:620min = GET2(cc, 1);621cc += 1 + 2 * IMM2_SIZE;622break;623624default:625min = 1;626break;627}628629/* Take care not to overflow: (1) min and d are ints, so check that their630product is not greater than INT_MAX. (2) branchlength is limited to631UINT16_MAX (checked at the top of the loop). */632633if ((d > 0 && (INT_MAX/d) < min) || (int)UINT16_MAX - branchlength < min*d)634branchlength = UINT16_MAX;635else branchlength += min * d;636break;637638/* Recursion always refers to the first occurrence of a subpattern with a639given number. Therefore, we can always make use of caching, even when the640pattern contains multiple subpatterns with the same number. */641642case OP_RECURSE:643cs = ce = startcode + GET(cc, 1);644recno = GET2(cs, 1+LINK_SIZE);645if (recno == prev_recurse_recno)646{647branchlength += prev_recurse_d;648}649else650{651do ce += GET(ce, 1); while (*ce == OP_ALT);652if (cc > cs && cc < ce) /* Simple recursion */653had_recurse = TRUE;654else655{656recurse_check *r = recurses;657for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;658if (r != NULL) /* Mutual recursion */659had_recurse = TRUE;660else661{662this_recurse.prev = recurses;663this_recurse.group = cs;664prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,665countptr, backref_cache);666if (prev_recurse_d < 0) return prev_recurse_d;667prev_recurse_recno = recno;668branchlength += prev_recurse_d;669}670}671}672cc += 1 + LINK_SIZE + once_fudge;673once_fudge = 0;674break;675676/* Anything else does not or need not match a character. We can get the677item's length from the table, but for those that can match zero occurrences678of a character, we must take special action for UTF-8 characters. As it679happens, the "NOT" versions of these opcodes are used at present only for680ASCII characters, so they could be omitted from this list. However, in681future that may change, so we include them here so as not to leave a682gotcha for a future maintainer. */683684case OP_UPTO:685case OP_UPTOI:686case OP_NOTUPTO:687case OP_NOTUPTOI:688case OP_MINUPTO:689case OP_MINUPTOI:690case OP_NOTMINUPTO:691case OP_NOTMINUPTOI:692case OP_POSUPTO:693case OP_POSUPTOI:694case OP_NOTPOSUPTO:695case OP_NOTPOSUPTOI:696697case OP_STAR:698case OP_STARI:699case OP_NOTSTAR:700case OP_NOTSTARI:701case OP_MINSTAR:702case OP_MINSTARI:703case OP_NOTMINSTAR:704case OP_NOTMINSTARI:705case OP_POSSTAR:706case OP_POSSTARI:707case OP_NOTPOSSTAR:708case OP_NOTPOSSTARI:709710case OP_QUERY:711case OP_QUERYI:712case OP_NOTQUERY:713case OP_NOTQUERYI:714case OP_MINQUERY:715case OP_MINQUERYI:716case OP_NOTMINQUERY:717case OP_NOTMINQUERYI:718case OP_POSQUERY:719case OP_POSQUERYI:720case OP_NOTPOSQUERY:721case OP_NOTPOSQUERYI:722723cc += PRIV(OP_lengths)[op];724#ifdef SUPPORT_UNICODE725if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);726#endif727break;728729/* Skip these, but we need to add in the name length. */730731case OP_MARK:732case OP_COMMIT_ARG:733case OP_PRUNE_ARG:734case OP_SKIP_ARG:735case OP_THEN_ARG:736cc += PRIV(OP_lengths)[op] + cc[1];737break;738739/* The remaining opcodes are just skipped over. */740741case OP_CLOSE:742case OP_COMMIT:743case OP_FAIL:744case OP_PRUNE:745case OP_SET_SOM:746case OP_SKIP:747case OP_THEN:748cc += PRIV(OP_lengths)[op];749break;750751/* This should not occur: we list all opcodes explicitly so that when752new ones get added they are properly considered. */753754/* LCOV_EXCL_START */755default:756PCRE2_DEBUG_UNREACHABLE();757return -3;758/* LCOV_EXCL_STOP */759}760}761762/* LCOV_EXCL_START */763PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */764return -3; /* Avoid compiler warnings */765/* LCOV_EXCL_STOP */766}767768769770/*************************************************771* Set a bit and maybe its alternate case *772*************************************************/773774/* Given a character, set its first code unit's bit in the table, and also the775corresponding bit for the other version of a letter if we are caseless.776777Arguments:778re points to the regex block779p points to the first code unit of the character780caseless TRUE if caseless781utf TRUE for UTF mode782ucp TRUE for UCP mode783784Returns: pointer after the character785*/786787static PCRE2_SPTR788set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,789BOOL ucp)790{791uint32_t c = *p++; /* First code unit */792793(void)utf; /* Stop compiler warnings when UTF not supported */794(void)ucp;795796/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for7970xff. */798799#if PCRE2_CODE_UNIT_WIDTH != 8800if (c > 0xff) SET_BIT(0xff); else801#endif802803SET_BIT(c);804805/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find806the end of the character, even when caseless. */807808#ifdef SUPPORT_UNICODE809if (utf)810{811#if PCRE2_CODE_UNIT_WIDTH == 8812if (c >= 0xc0) GETUTF8INC(c, p);813#elif PCRE2_CODE_UNIT_WIDTH == 16814if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);815#endif816}817#endif /* SUPPORT_UNICODE */818819/* If caseless, handle the other case of the character. */820821if (caseless)822{823#ifdef SUPPORT_UNICODE824if (utf || ucp)825{826c = UCD_OTHERCASE(c);827#if PCRE2_CODE_UNIT_WIDTH == 8828if (utf)829{830PCRE2_UCHAR buff[6];831(void)PRIV(ord2utf)(c, buff);832SET_BIT(buff[0]);833}834else if (c < 256) SET_BIT(c);835#else /* 16-bit or 32-bit mode */836if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);837#endif838}839840else841#endif /* SUPPORT_UNICODE */842843/* Not UTF or UCP */844845if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);846}847848return p;849}850851852853/*************************************************854* Set bits for a positive character type *855*************************************************/856857/* This function sets starting bits for a character type. In UTF-8 mode, we can858only do a direct setting for bytes less than 128, as otherwise there can be859confusion with bytes in the middle of UTF-8 characters. In a "traditional"860environment, the tables will only recognize ASCII characters anyway, but in at861least one Windows environment, some higher bytes bits were set in the tables.862So we deal with that case by considering the UTF-8 encoding.863864Arguments:865re the regex block866cbit type the type of character wanted867table_limit 32 for non-UTF-8; 16 for UTF-8868869Returns: nothing870*/871872static void873set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)874{875uint32_t c;876for (c = 0; c < table_limit; c++)877re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];878#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8879if (table_limit == 32) return;880for (c = 128; c < 256; c++)881{882if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)883{884PCRE2_UCHAR buff[6];885(void)PRIV(ord2utf)(c, buff);886SET_BIT(buff[0]);887}888}889#endif /* UTF-8 */890}891892893/*************************************************894* Set bits for a negative character type *895*************************************************/896897/* This function sets starting bits for a negative character type such as \D.898In UTF-8 mode, we can only do a direct setting for bytes less than 128, as899otherwise there can be confusion with bytes in the middle of UTF-8 characters.900Unlike in the positive case, where we can set appropriate starting bits for901specific high-valued UTF-8 characters, in this case we have to set the bits for902all high-valued characters. The lowest is 0xc2, but we overkill by starting at9030xc0 (192) for simplicity.904905Arguments:906re the regex block907cbit type the type of character wanted908table_limit 32 for non-UTF-8; 16 for UTF-8909910Returns: nothing911*/912913static void914set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)915{916uint32_t c;917for (c = 0; c < table_limit; c++)918re->start_bitmap[c] |= (uint8_t)(~(re->tables[c+cbits_offset+cbit_type]));919#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8920if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;921#endif922}923924925926#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8927/*************************************************928* Set starting bits for a character list. *929*************************************************/930931/* This function sets starting bits for a character list. It enumerates932all characters and character ranges in the character list, and sets933the starting bits accordingly.934935Arguments:936code pointer to the code937start_bitmap pointer to the starting bitmap938939Returns: nothing940*/941static void942study_char_list(PCRE2_SPTR code, uint8_t *start_bitmap,943const uint8_t *char_lists_end)944{945uint32_t type, list_ind;946uint32_t char_list_add = XCL_CHAR_LIST_LOW_16_ADD;947uint32_t range_start = ~(uint32_t)0, range_end = 0;948const uint8_t *next_char;949PCRE2_UCHAR start_buffer[6], end_buffer[6];950PCRE2_UCHAR start, end;951952/* Only needed in 8-bit mode at the moment. */953type = (uint32_t)(code[0] << 8) | code[1];954code += 2;955956/* Align characters. */957next_char = char_lists_end - (GET(code, 0) << 1);958type &= XCL_TYPE_MASK;959list_ind = 0;960961if ((type & XCL_BEGIN_WITH_RANGE) != 0)962range_start = XCL_CHAR_LIST_LOW_16_START;963964while (type > 0)965{966uint32_t item_count = type & XCL_ITEM_COUNT_MASK;967968if (item_count == XCL_ITEM_COUNT_MASK)969{970if (list_ind <= 1)971{972item_count = *(const uint16_t*)next_char;973next_char += 2;974}975else976{977item_count = *(const uint32_t*)next_char;978next_char += 4;979}980}981982while (item_count > 0)983{984if (list_ind <= 1)985{986range_end = *(const uint16_t*)next_char;987next_char += 2;988}989else990{991range_end = *(const uint32_t*)next_char;992next_char += 4;993}994995if ((range_end & XCL_CHAR_END) != 0)996{997range_end = char_list_add + (range_end >> XCL_CHAR_SHIFT);998999PRIV(ord2utf)(range_end, end_buffer);1000end = end_buffer[0];10011002if (range_start < range_end)1003{1004PRIV(ord2utf)(range_start, start_buffer);1005for (start = start_buffer[0]; start <= end; start++)1006start_bitmap[start / 8] |= (1u << (start & 7));1007}1008else1009start_bitmap[end / 8] |= (1u << (end & 7));10101011range_start = ~(uint32_t)0;1012}1013else1014range_start = char_list_add + (range_end >> XCL_CHAR_SHIFT);10151016item_count--;1017}10181019list_ind++;1020type >>= XCL_TYPE_BIT_LEN;10211022if (range_start == ~(uint32_t)0)1023{1024if ((type & XCL_BEGIN_WITH_RANGE) != 0)1025{1026/* In 8 bit mode XCL_CHAR_LIST_HIGH_32_START is not possible. */1027if (list_ind == 1) range_start = XCL_CHAR_LIST_HIGH_16_START;1028else range_start = XCL_CHAR_LIST_LOW_32_START;1029}1030}1031else if ((type & XCL_BEGIN_WITH_RANGE) == 0)1032{1033PRIV(ord2utf)(range_start, start_buffer);10341035/* In 8 bit mode XCL_CHAR_LIST_LOW_32_END and1036XCL_CHAR_LIST_HIGH_32_END are not possible. */1037if (list_ind == 1) range_end = XCL_CHAR_LIST_LOW_16_END;1038else range_end = XCL_CHAR_LIST_HIGH_16_END;10391040PRIV(ord2utf)(range_end, end_buffer);1041end = end_buffer[0];10421043for (start = start_buffer[0]; start <= end; start++)1044start_bitmap[start / 8] |= (1u << (start & 7));10451046range_start = ~(uint32_t)0;1047}10481049/* In 8 bit mode XCL_CHAR_LIST_HIGH_32_ADD is not possible. */1050if (list_ind == 1) char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;1051else char_list_add = XCL_CHAR_LIST_LOW_32_ADD;1052}1053}1054#endif1055105610571058/*************************************************1059* Create bitmap of starting code units *1060*************************************************/10611062/* This function scans a compiled unanchored expression recursively and1063attempts to build a bitmap of the set of possible starting code units whose1064values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause1065the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode1066we pass a value of 16 rather than 32 as the final argument. (See comments in1067those functions for the reason.)10681069The SSB_CONTINUE return is useful for parenthesized groups in patterns such as1070(a*)b where the group provides some optional starting code units but scanning1071must continue at the outer level to find at least one mandatory code unit. At1072the outermost level, this function fails unless the result is SSB_DONE.10731074We restrict recursion (for nested groups) to 1000 to avoid stack overflow1075issues.10761077Arguments:1078re points to the compiled regex block1079code points to an expression1080utf TRUE if in UTF mode1081ucp TRUE if in UCP mode1082depthptr pointer to recurse depth10831084Returns: SSB_FAIL => Failed to find any starting code units1085SSB_DONE => Found mandatory starting code units1086SSB_CONTINUE => Found optional starting code units1087SSB_UNKNOWN => Hit an unrecognized opcode1088SSB_TOODEEP => Recursion is too deep1089*/10901091static int1092set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,1093int *depthptr)1094{1095uint32_t c;1096int yield = SSB_DONE;10971098#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 81099int table_limit = utf? 16:32;1100#else1101int table_limit = 32;1102#endif11031104*depthptr += 1;1105if (*depthptr > 1000) return SSB_TOODEEP;11061107do1108{1109BOOL try_next = TRUE;1110PCRE2_SPTR tcode = code + 1 + LINK_SIZE;11111112if (*code == OP_CBRA || *code == OP_SCBRA ||1113*code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;11141115while (try_next) /* Loop for items in this branch */1116{1117int rc;1118PCRE2_SPTR ncode;1119const uint8_t *classmap = NULL;1120#ifdef SUPPORT_WIDE_CHARS1121PCRE2_UCHAR xclassflags;1122#endif11231124switch(*tcode)1125{1126/* If we reach something we don't understand, it means a new opcode has1127been created that hasn't been added to this function. Hopefully this1128problem will be discovered during testing. */11291130default:1131return SSB_UNKNOWN;11321133/* Fail for a valid opcode that implies no starting bits. */11341135case OP_ACCEPT:1136case OP_ASSERT_ACCEPT:1137case OP_ALLANY:1138case OP_ANY:1139case OP_ANYBYTE:1140case OP_CIRCM:1141case OP_CLOSE:1142case OP_COMMIT:1143case OP_COMMIT_ARG:1144case OP_COND:1145case OP_CREF:1146case OP_FALSE:1147case OP_TRUE:1148case OP_DNCREF:1149case OP_DNREF:1150case OP_DNREFI:1151case OP_DNRREF:1152case OP_DOLL:1153case OP_DOLLM:1154case OP_END:1155case OP_EOD:1156case OP_EODN:1157case OP_EXTUNI:1158case OP_FAIL:1159case OP_MARK:1160case OP_NOT:1161case OP_NOTEXACT:1162case OP_NOTEXACTI:1163case OP_NOTI:1164case OP_NOTMINPLUS:1165case OP_NOTMINPLUSI:1166case OP_NOTMINQUERY:1167case OP_NOTMINQUERYI:1168case OP_NOTMINSTAR:1169case OP_NOTMINSTARI:1170case OP_NOTMINUPTO:1171case OP_NOTMINUPTOI:1172case OP_NOTPLUS:1173case OP_NOTPLUSI:1174case OP_NOTPOSPLUS:1175case OP_NOTPOSPLUSI:1176case OP_NOTPOSQUERY:1177case OP_NOTPOSQUERYI:1178case OP_NOTPOSSTAR:1179case OP_NOTPOSSTARI:1180case OP_NOTPOSUPTO:1181case OP_NOTPOSUPTOI:1182case OP_NOTPROP:1183case OP_NOTQUERY:1184case OP_NOTQUERYI:1185case OP_NOTSTAR:1186case OP_NOTSTARI:1187case OP_NOTUPTO:1188case OP_NOTUPTOI:1189case OP_NOT_HSPACE:1190case OP_NOT_VSPACE:1191case OP_PRUNE:1192case OP_PRUNE_ARG:1193case OP_RECURSE:1194case OP_REF:1195case OP_REFI:1196case OP_REVERSE:1197case OP_VREVERSE:1198case OP_RREF:1199case OP_SCOND:1200case OP_SET_SOM:1201case OP_SKIP:1202case OP_SKIP_ARG:1203case OP_SOD:1204case OP_SOM:1205case OP_THEN:1206case OP_THEN_ARG:1207return SSB_FAIL;12081209/* OP_CIRC happens only at the start of an anchored branch (multiline ^1210uses OP_CIRCM). Skip over it. */12111212case OP_CIRC:1213tcode += PRIV(OP_lengths)[OP_CIRC];1214break;12151216/* A "real" property test implies no starting bits, but the fake property1217PT_CLIST identifies a list of characters. These lists are short, as they1218are used for characters with more than one "other case", so there is no1219point in recognizing them for OP_NOTPROP. */12201221case OP_PROP:1222if (tcode[1] != PT_CLIST) return SSB_FAIL;1223{1224const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];1225while ((c = *p++) < NOTACHAR)1226{1227#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 81228if (utf)1229{1230PCRE2_UCHAR buff[6];1231(void)PRIV(ord2utf)(c, buff);1232c = buff[0];1233}1234#endif1235if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);1236}1237}1238try_next = FALSE;1239break;12401241/* We can ignore word boundary tests. */12421243case OP_WORD_BOUNDARY:1244case OP_NOT_WORD_BOUNDARY:1245case OP_UCP_WORD_BOUNDARY:1246case OP_NOT_UCP_WORD_BOUNDARY:1247tcode++;1248break;12491250/* For a positive lookahead assertion, inspect what immediately follows,1251ignoring intermediate assertions and callouts. If the next item is one1252that sets a mandatory character, skip this assertion. Otherwise, treat it1253the same as other bracket groups. */12541255case OP_ASSERT:1256case OP_ASSERT_NA:1257ncode = tcode + GET(tcode, 1);1258while (*ncode == OP_ALT) ncode += GET(ncode, 1);1259ncode += 1 + LINK_SIZE;12601261/* Skip irrelevant items */12621263for (BOOL done = FALSE; !done;)1264{1265switch (*ncode)1266{1267case OP_ASSERT:1268case OP_ASSERT_NOT:1269case OP_ASSERTBACK:1270case OP_ASSERTBACK_NOT:1271case OP_ASSERT_NA:1272case OP_ASSERTBACK_NA:1273case OP_ASSERT_SCS:1274ncode += GET(ncode, 1);1275while (*ncode == OP_ALT) ncode += GET(ncode, 1);1276ncode += 1 + LINK_SIZE;1277break;12781279case OP_WORD_BOUNDARY:1280case OP_NOT_WORD_BOUNDARY:1281case OP_UCP_WORD_BOUNDARY:1282case OP_NOT_UCP_WORD_BOUNDARY:1283ncode++;1284break;12851286case OP_CALLOUT:1287ncode += PRIV(OP_lengths)[OP_CALLOUT];1288break;12891290case OP_CALLOUT_STR:1291ncode += GET(ncode, 1 + 2*LINK_SIZE);1292break;12931294default:1295done = TRUE;1296break;1297}1298}12991300/* Now check the next significant item. */13011302switch(*ncode)1303{1304default:1305break;13061307case OP_PROP:1308if (ncode[1] != PT_CLIST) break;1309PCRE2_FALLTHROUGH /* Fall through */1310case OP_ANYNL:1311case OP_CHAR:1312case OP_CHARI:1313case OP_EXACT:1314case OP_EXACTI:1315case OP_HSPACE:1316case OP_MINPLUS:1317case OP_MINPLUSI:1318case OP_PLUS:1319case OP_PLUSI:1320case OP_POSPLUS:1321case OP_POSPLUSI:1322case OP_VSPACE:1323/* Note that these types will only be present in non-UCP mode. */1324case OP_DIGIT:1325case OP_NOT_DIGIT:1326case OP_WORDCHAR:1327case OP_NOT_WORDCHAR:1328case OP_WHITESPACE:1329case OP_NOT_WHITESPACE:1330tcode = ncode;1331continue; /* With the following significant opcode */1332}1333PCRE2_FALLTHROUGH /* Fall through */13341335/* For a group bracket or a positive assertion without an immediately1336following mandatory setting, recurse to set bits from within the1337subpattern. If it can't find anything, we have to give up. If it finds1338some mandatory character(s), we are done for this branch. Otherwise,1339carry on scanning after the subpattern. */13401341case OP_BRA:1342case OP_SBRA:1343case OP_CBRA:1344case OP_SCBRA:1345case OP_BRAPOS:1346case OP_SBRAPOS:1347case OP_CBRAPOS:1348case OP_SCBRAPOS:1349case OP_ONCE:1350case OP_SCRIPT_RUN:1351rc = set_start_bits(re, tcode, utf, ucp, depthptr);1352if (rc == SSB_DONE)1353{1354try_next = FALSE;1355}1356else if (rc == SSB_CONTINUE)1357{1358do tcode += GET(tcode, 1); while (*tcode == OP_ALT);1359tcode += 1 + LINK_SIZE;1360}1361else return rc; /* FAIL, UNKNOWN, or TOODEEP */1362break;13631364/* If we hit ALT or KET, it means we haven't found anything mandatory in1365this branch, though we might have found something optional. For ALT, we1366continue with the next alternative, but we have to arrange that the final1367result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,1368return SSB_CONTINUE: if this is the top level, that indicates failure,1369but after a nested subpattern, it causes scanning to continue. */13701371case OP_ALT:1372yield = SSB_CONTINUE;1373try_next = FALSE;1374break;13751376case OP_KET:1377case OP_KETRMAX:1378case OP_KETRMIN:1379case OP_KETRPOS:1380return SSB_CONTINUE;13811382/* Skip over callout */13831384case OP_CALLOUT:1385tcode += PRIV(OP_lengths)[OP_CALLOUT];1386break;13871388case OP_CALLOUT_STR:1389tcode += GET(tcode, 1 + 2*LINK_SIZE);1390break;13911392/* Skip over lookbehind, negative lookahead, and scan substring1393assertions */13941395case OP_ASSERT_NOT:1396case OP_ASSERTBACK:1397case OP_ASSERTBACK_NOT:1398case OP_ASSERTBACK_NA:1399case OP_ASSERT_SCS:1400do tcode += GET(tcode, 1); while (*tcode == OP_ALT);1401tcode += 1 + LINK_SIZE;1402break;14031404/* BRAZERO does the bracket, but carries on. */14051406case OP_BRAZERO:1407case OP_BRAMINZERO:1408case OP_BRAPOSZERO:1409rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);1410if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;1411do tcode += GET(tcode,1); while (*tcode == OP_ALT);1412tcode += 1 + LINK_SIZE;1413break;14141415/* SKIPZERO skips the bracket. */14161417case OP_SKIPZERO:1418tcode++;1419do tcode += GET(tcode,1); while (*tcode == OP_ALT);1420tcode += 1 + LINK_SIZE;1421break;14221423/* Single-char * or ? sets the bit and tries the next item */14241425case OP_STAR:1426case OP_MINSTAR:1427case OP_POSSTAR:1428case OP_QUERY:1429case OP_MINQUERY:1430case OP_POSQUERY:1431tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);1432break;14331434case OP_STARI:1435case OP_MINSTARI:1436case OP_POSSTARI:1437case OP_QUERYI:1438case OP_MINQUERYI:1439case OP_POSQUERYI:1440tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);1441break;14421443/* Single-char upto sets the bit and tries the next */14441445case OP_UPTO:1446case OP_MINUPTO:1447case OP_POSUPTO:1448tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);1449break;14501451case OP_UPTOI:1452case OP_MINUPTOI:1453case OP_POSUPTOI:1454tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);1455break;14561457/* At least one single char sets the bit and stops */14581459case OP_EXACT:1460tcode += IMM2_SIZE;1461PCRE2_FALLTHROUGH /* Fall through */1462case OP_CHAR:1463case OP_PLUS:1464case OP_MINPLUS:1465case OP_POSPLUS:1466(void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);1467try_next = FALSE;1468break;14691470case OP_EXACTI:1471tcode += IMM2_SIZE;1472PCRE2_FALLTHROUGH /* Fall through */1473case OP_CHARI:1474case OP_PLUSI:1475case OP_MINPLUSI:1476case OP_POSPLUSI:1477(void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);1478try_next = FALSE;1479break;14801481/* Special spacing and line-terminating items. These recognize specific1482lists of characters. The difference between VSPACE and ANYNL is that the1483latter can match the two-character CRLF sequence, but that is not1484relevant for finding the first character, so their code here is1485identical. */14861487case OP_HSPACE:1488SET_BIT(CHAR_HT);1489SET_BIT(CHAR_SPACE);14901491/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set1492the bits for NBSP and for code units >= 255, independently of UTF. */14931494#if PCRE2_CODE_UNIT_WIDTH != 81495SET_BIT(CHAR_NBSP);1496SET_BIT(0xFF);1497#else1498/* For the 8-bit library in UTF-8 mode, set the bits for the first code1499units of horizontal space characters. */15001501#ifdef SUPPORT_UNICODE1502if (utf)1503{1504SET_BIT(0xC2); /* For U+00A0 */1505SET_BIT(0xE1); /* For U+1680, U+180E */1506SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */1507SET_BIT(0xE3); /* For U+3000 */1508}1509else1510#endif1511/* For the 8-bit library not in UTF-8 mode, set the bit for NBSP. */1512{1513SET_BIT(CHAR_NBSP);1514}1515#endif /* 8-bit support */15161517try_next = FALSE;1518break;15191520case OP_ANYNL:1521case OP_VSPACE:1522SET_BIT(CHAR_LF);1523SET_BIT(CHAR_VT);1524SET_BIT(CHAR_FF);1525SET_BIT(CHAR_CR);15261527/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set1528the bits for NEL and for code units >= 255, independently of UTF. */15291530#if PCRE2_CODE_UNIT_WIDTH != 81531SET_BIT(CHAR_NEL);1532SET_BIT(0xFF);1533#else1534/* For the 8-bit library in UTF-8 mode, set the bits for the first code1535units of vertical space characters. */15361537#ifdef SUPPORT_UNICODE1538if (utf)1539{1540SET_BIT(0xC2); /* For U+0085 (NEL) */1541SET_BIT(0xE2); /* For U+2028, U+2029 */1542}1543else1544#endif1545/* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */1546{1547SET_BIT(CHAR_NEL);1548}1549#endif /* 8-bit support */15501551try_next = FALSE;1552break;15531554/* Single character types set the bits and stop. Note that if PCRE2_UCP1555is set, we do not see these opcodes because \d etc are converted to1556properties. Therefore, these apply in the case when only characters less1557than 256 are recognized to match the types. */15581559case OP_NOT_DIGIT:1560set_nottype_bits(re, cbit_digit, table_limit);1561try_next = FALSE;1562break;15631564case OP_DIGIT:1565set_type_bits(re, cbit_digit, table_limit);1566try_next = FALSE;1567break;15681569case OP_NOT_WHITESPACE:1570set_nottype_bits(re, cbit_space, table_limit);1571try_next = FALSE;1572break;15731574case OP_WHITESPACE:1575set_type_bits(re, cbit_space, table_limit);1576try_next = FALSE;1577break;15781579case OP_NOT_WORDCHAR:1580set_nottype_bits(re, cbit_word, table_limit);1581try_next = FALSE;1582break;15831584case OP_WORDCHAR:1585set_type_bits(re, cbit_word, table_limit);1586try_next = FALSE;1587break;15881589/* One or more character type fudges the pointer and restarts, knowing1590it will hit a single character type and stop there. */15911592case OP_TYPEPLUS:1593case OP_TYPEMINPLUS:1594case OP_TYPEPOSPLUS:1595tcode++;1596break;15971598case OP_TYPEEXACT:1599tcode += 1 + IMM2_SIZE;1600break;16011602/* Zero or more repeats of character types set the bits and then1603try again. */16041605case OP_TYPEUPTO:1606case OP_TYPEMINUPTO:1607case OP_TYPEPOSUPTO:1608tcode += IMM2_SIZE;1609PCRE2_FALLTHROUGH /* Fall through */16101611case OP_TYPESTAR:1612case OP_TYPEMINSTAR:1613case OP_TYPEPOSSTAR:1614case OP_TYPEQUERY:1615case OP_TYPEMINQUERY:1616case OP_TYPEPOSQUERY:1617switch(tcode[1])1618{1619default:1620case OP_ANY:1621case OP_ALLANY:1622return SSB_FAIL;16231624case OP_HSPACE:1625SET_BIT(CHAR_HT);1626SET_BIT(CHAR_SPACE);16271628/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set1629the bits for NBSP and for code units >= 255, independently of UTF. */16301631#if PCRE2_CODE_UNIT_WIDTH != 81632SET_BIT(CHAR_NBSP);1633SET_BIT(0xFF);1634#else1635/* For the 8-bit library in UTF-8 mode, set the bits for the first code1636units of horizontal space characters. */16371638#ifdef SUPPORT_UNICODE1639if (utf)1640{1641SET_BIT(0xC2); /* For U+00A0 */1642SET_BIT(0xE1); /* For U+1680, U+180E */1643SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */1644SET_BIT(0xE3); /* For U+3000 */1645}1646else1647#endif1648/* For the 8-bit library not in UTF-8 mode, set the bit for NBSP. */1649{1650SET_BIT(CHAR_NBSP);1651}1652#endif /* 8-bit support */1653break;16541655case OP_ANYNL:1656case OP_VSPACE:1657SET_BIT(CHAR_LF);1658SET_BIT(CHAR_VT);1659SET_BIT(CHAR_FF);1660SET_BIT(CHAR_CR);16611662/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set1663the bits for NEL and for code units >= 255, independently of UTF. */16641665#if PCRE2_CODE_UNIT_WIDTH != 81666SET_BIT(CHAR_NEL);1667SET_BIT(0xFF);1668#else1669/* For the 8-bit library in UTF-8 mode, set the bits for the first code1670units of vertical space characters. */16711672#ifdef SUPPORT_UNICODE1673if (utf)1674{1675SET_BIT(0xC2); /* For U+0085 (NEL) */1676SET_BIT(0xE2); /* For U+2028, U+2029 */1677}1678else1679#endif1680/* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */1681{1682SET_BIT(CHAR_NEL);1683}1684#endif /* 8-bit support */1685break;16861687case OP_NOT_DIGIT:1688set_nottype_bits(re, cbit_digit, table_limit);1689break;16901691case OP_DIGIT:1692set_type_bits(re, cbit_digit, table_limit);1693break;16941695case OP_NOT_WHITESPACE:1696set_nottype_bits(re, cbit_space, table_limit);1697break;16981699case OP_WHITESPACE:1700set_type_bits(re, cbit_space, table_limit);1701break;17021703case OP_NOT_WORDCHAR:1704set_nottype_bits(re, cbit_word, table_limit);1705break;17061707case OP_WORDCHAR:1708set_type_bits(re, cbit_word, table_limit);1709break;1710}17111712tcode += 2;1713break;17141715/* Set-based ECLASS: treat it the same as a "complex" XCLASS; give up. */17161717#ifdef SUPPORT_WIDE_CHARS1718case OP_ECLASS:1719return SSB_FAIL;1720#endif17211722/* Extended class: if there are any property checks, or if this is a1723negative XCLASS without a map, give up. If there are no property checks,1724there must be wide characters on the XCLASS list, because otherwise an1725XCLASS would not have been created. This means that code points >= 2551726are potential starters. In the UTF-8 case we can scan them and set bits1727for the relevant leading bytes. */17281729#ifdef SUPPORT_WIDE_CHARS1730case OP_XCLASS:1731xclassflags = tcode[1 + LINK_SIZE];1732if ((xclassflags & XCL_HASPROP) != 0 ||1733(xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)1734return SSB_FAIL;17351736/* We have a positive XCLASS or a negative one without a map. Set up the1737map pointer if there is one, and fall through. */17381739classmap = ((xclassflags & XCL_MAP) == 0)? NULL :1740(const uint8_t *)(tcode + 1 + LINK_SIZE + 1);17411742/* In UTF-8 mode, scan the character list and set bits for leading bytes,1743then jump to handle the map. */17441745#if PCRE2_CODE_UNIT_WIDTH == 81746if (utf && (xclassflags & XCL_NOT) == 0)1747{1748PCRE2_UCHAR b, e;1749PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);1750tcode += GET(tcode, 1);17511752if (*p >= XCL_LIST)1753{1754study_char_list(p, re->start_bitmap,1755((const uint8_t *)re + re->code_start));1756goto HANDLE_CLASSMAP;1757}17581759for (;;) switch (*p++)1760{1761case XCL_SINGLE:1762b = *p++;1763while ((*p & 0xc0) == 0x80) p++;1764re->start_bitmap[b/8] |= (1u << (b&7));1765break;17661767case XCL_RANGE:1768b = *p++;1769while ((*p & 0xc0) == 0x80) p++;1770e = *p++;1771while ((*p & 0xc0) == 0x80) p++;1772for (; b <= e; b++)1773re->start_bitmap[b/8] |= (1u << (b&7));1774break;17751776case XCL_END:1777goto HANDLE_CLASSMAP;17781779/* LCOV_EXCL_START */1780default:1781PCRE2_DEBUG_UNREACHABLE();1782return SSB_UNKNOWN; /* Internal error, should not occur */1783/* LCOV_EXCL_STOP */1784}1785}1786#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */1787#endif /* SUPPORT_WIDE_CHARS */17881789/* It seems that the fall through comment must be outside the #ifdef if1790it is to avoid the gcc compiler warning. */17911792PCRE2_FALLTHROUGH /* Fall through */17931794/* Enter here for a negative non-XCLASS. In the 8-bit library, if we are1795in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter1796because it starts a character with a value > 255. In 8-bit non-UTF mode,1797there is no difference between CLASS and NCLASS. In all other wide1798character modes, set the 0xFF bit to indicate code units >= 255. */17991800case OP_NCLASS:1801#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 81802if (utf)1803{1804re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */1805memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */1806}1807PCRE2_FALLTHROUGH /* Fall through */1808#elif PCRE2_CODE_UNIT_WIDTH != 81809SET_BIT(0xFF); /* For characters >= 255 */1810PCRE2_FALLTHROUGH /* Fall through */1811#endif18121813/* Enter here for a positive non-XCLASS. If we have fallen through from1814an XCLASS, classmap will already be set; just advance the code pointer.1815Otherwise, set up classmap for a non-XCLASS and advance past it. */18161817case OP_CLASS:1818if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else1819{1820classmap = (const uint8_t *)(++tcode);1821tcode += 32 / sizeof(PCRE2_UCHAR);1822}18231824/* When wide characters are supported, classmap may be NULL. In UTF-81825(sic) mode, the bits in a class bit map correspond to character values,1826not to byte values. However, the bit map we are constructing is for byte1827values. So we have to do a conversion for characters whose code point is1828greater than 127. In fact, there are only two possible starting bytes for1829characters in the range 128 - 255. */18301831#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 81832HANDLE_CLASSMAP:1833#endif1834if (classmap != NULL)1835{1836#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 81837if (utf)1838{1839for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];1840for (c = 128; c < 256; c++)1841{1842if ((classmap[c/8] & (1u << (c&7))) != 0)1843{1844int d = (c >> 6) | 0xc0; /* Set bit for this starter */1845re->start_bitmap[d/8] |= (1u << (d&7)); /* and then skip on to the */1846c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */1847}1848}1849}1850else1851#endif1852/* In all modes except UTF-8, the two bit maps are compatible. */18531854{1855for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];1856}1857}18581859/* Act on what follows the class. For a zero minimum repeat, continue;1860otherwise stop processing. */18611862switch (*tcode)1863{1864case OP_CRSTAR:1865case OP_CRMINSTAR:1866case OP_CRQUERY:1867case OP_CRMINQUERY:1868case OP_CRPOSSTAR:1869case OP_CRPOSQUERY:1870tcode++;1871break;18721873case OP_CRRANGE:1874case OP_CRMINRANGE:1875case OP_CRPOSRANGE:1876if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;1877else try_next = FALSE;1878break;18791880default:1881try_next = FALSE;1882break;1883}1884break; /* End of class handling case */1885} /* End of switch for opcodes */1886} /* End of try_next loop */18871888code += GET(code, 1); /* Advance to next branch */1889}1890while (*code == OP_ALT);18911892return yield;1893}1894189518961897/*************************************************1898* Study a compiled expression *1899*************************************************/19001901/* This function is handed a compiled expression that it must study to produce1902information that will speed up the matching.19031904Argument:1905re points to the compiled expression19061907Returns: 0 normally; non-zero should never normally occur19081 unknown opcode in set_start_bits19092 missing capturing bracket19103 unknown opcode in find_minlength1911*/19121913int1914PRIV(study)(pcre2_real_code *re)1915{1916int count = 0;1917PCRE2_UCHAR *code;1918BOOL utf = (re->overall_options & PCRE2_UTF) != 0;1919BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;19201921/* Find start of compiled code */19221923code = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);19241925/* For a pattern that has a first code unit, or a multiline pattern that1926matches only at "line start", there is no point in seeking a list of starting1927code units. */19281929if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)1930{1931int depth = 0;1932int rc = set_start_bits(re, code, utf, ucp, &depth);1933/* LCOV_EXCL_START */1934if (rc == SSB_UNKNOWN)1935{1936PCRE2_DEBUG_UNREACHABLE();1937return 1;1938}1939/* LCOV_EXCL_STOP */19401941/* If a list of starting code units was set up, scan the list to see if only1942one or two were listed. Having only one listed is rare because usually a1943single starting code unit will have been recognized and PCRE2_FIRSTSET set.1944If two are listed, see if they are caseless versions of the same character;1945if so we can replace the list with a caseless first code unit. This gives1946better performance and is plausibly worth doing for patterns such as [Ww]ord1947or (word|WORD). */19481949if (rc == SSB_DONE)1950{1951int i;1952int a = -1;1953int b = -1;1954uint8_t *p = re->start_bitmap;1955uint32_t flags = PCRE2_FIRSTMAPSET;19561957for (i = 0; i < 256; p++, i += 8)1958{1959uint8_t x = *p;1960if (x != 0)1961{1962int c;1963uint8_t y = x & (~x + 1); /* Least significant bit */1964if (y != x) goto DONE; /* More than one bit set */19651966/* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and1967all wide characters", so we cannot use it here. */19681969#if PCRE2_CODE_UNIT_WIDTH != 81970if (i == 248 && x == 0x80) goto DONE;1971#endif19721973/* Compute the character value */19741975c = i;1976switch (x)1977{1978case 1: break;1979case 2: c += 1; break; case 4: c += 2; break;1980case 8: c += 3; break; case 16: c += 4; break;1981case 32: c += 5; break; case 64: c += 6; break;1982case 128: c += 7; break;1983}19841985/* c contains the code unit value, in the range 0-255. In 8-bit UTF1986mode, only values < 128 can be used. In all the other cases, c is a1987character value. */19881989#if PCRE2_CODE_UNIT_WIDTH == 81990if (utf && c > 127) goto DONE;1991#endif1992if (a < 0) a = c; /* First one found, save in a */1993else if (b < 0) /* Second one found */1994{1995int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);19961997#ifdef SUPPORT_UNICODE1998if (utf || ucp)1999{2000if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */2001if (c > 127) d = UCD_OTHERCASE(c);2002}2003#endif /* SUPPORT_UNICODE */20042005if (d != a) goto DONE; /* Not the other case of a */2006b = c; /* Save second in b */20072008#ifdef EBCDIC2009/* To match ASCII (which puts the uppercase one in a), swap a & b2010if needed. This doesn't really matter, but neatens the tests. */2011if (TABLE_GET((unsigned int)a, re->tables + lcc_offset, a) == a)2012{2013b = a;2014a = c;2015}2016#endif2017}2018else goto DONE; /* More than two characters found */2019}2020}20212022/* Replace the start code unit bits with a first code unit. If it is the2023same as a required later code unit, then clear the required later code2024unit. This is because a search for a required code unit starts after an2025explicit first code unit, but at a code unit found from the bitmap.2026Patterns such as /a*a/ don't work if both the start unit and required2027unit are the same. */20282029if (a >= 0) {2030if ((re->flags & PCRE2_LASTSET) && (re->last_codeunit == (uint32_t)a || (b >= 0 && re->last_codeunit == (uint32_t)b))) {2031re->flags &= ~(PCRE2_LASTSET | PCRE2_LASTCASELESS);2032re->last_codeunit = 0;2033}2034re->first_codeunit = a;2035flags = PCRE2_FIRSTSET;2036if (b >= 0) flags |= PCRE2_FIRSTCASELESS;2037}20382039DONE:2040re->flags |= flags;2041}2042}20432044/* Find the minimum length of subject string. If the pattern can match an empty2045string, the minimum length is already known. If the pattern contains (*ACCEPT)2046all bets are off, and we don't even try to find a minimum length. If there are2047more back references than the size of the vector we are going to cache them in,2048do nothing. A pattern that complicated will probably take a long time to2049analyze and may in any case turn out to be too complicated. Note that back2050reference minima are held as 16-bit numbers. */20512052if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&2053re->top_backref <= MAX_CACHE_BACKREF)2054{2055int min;2056int backref_cache[MAX_CACHE_BACKREF+1];2057backref_cache[0] = 0; /* Highest one that is set */2058min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);2059switch(min)2060{2061case -1: /* \C in UTF mode or over-complex regex */2062break; /* Leave minlength unchanged (will be zero) */20632064/* LCOV_EXCL_START */2065case -2:2066PCRE2_DEBUG_UNREACHABLE();2067return 2; /* missing capturing bracket */2068/* LCOV_EXCL_STOP */20692070/* LCOV_EXCL_START */2071case -3:2072PCRE2_DEBUG_UNREACHABLE();2073return 3; /* unrecognized opcode */2074/* LCOV_EXCL_STOP */20752076default:2077re->minlength = (min > (int)UINT16_MAX)? (int)UINT16_MAX : min;2078break;2079}2080}20812082return 0;2083}20842085/* End of pcre2_study.c */208620872088