/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8Original API code Copyright (c) 1997-2012 University of Cambridge9New API code Copyright (c) 2016-2024 University of Cambridge1011-----------------------------------------------------------------------------12Redistribution and use in source and binary forms, with or without13modification, are permitted provided that the following conditions are met:1415* Redistributions of source code must retain the above copyright notice,16this list of conditions and the following disclaimer.1718* Redistributions in binary form must reproduce the above copyright19notice, this list of conditions and the following disclaimer in the20documentation and/or other materials provided with the distribution.2122* Neither the name of the University of Cambridge nor the names of its23contributors may be used to endorse or promote products derived from24this software without specific prior written permission.2526THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE36POSSIBILITY OF SUCH DAMAGE.37-----------------------------------------------------------------------------38*/3940/* This module contains functions for scanning a compiled pattern and41collecting data (e.g. minimum matching length). */424344#ifdef HAVE_CONFIG_H45#include "config.h"46#endif4748#include "pcre2_internal.h"4950/* The maximum remembered capturing brackets minimum. */5152#define MAX_CACHE_BACKREF 1285354/* Set a bit in the starting code unit bit map. */5556#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))5758/* Returns from set_start_bits() */5960enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };616263/*************************************************64* Find the minimum subject length for a group *65*************************************************/6667/* Scan a parenthesized group and compute the minimum length of subject that68is needed to match it. This is a lower bound; it does not mean there is a69string of that length that matches. In UTF mode, the result is in characters70rather than code units. The field in a compiled pattern for storing the minimum71length is 16-bits long (on the grounds that anything longer than that is72pathological), so we give up when we reach that amount. This also means that73integer overflow for really crazy patterns cannot happen.7475Backreference minimum lengths are cached to speed up multiple references. This76function is called only when the highest back reference in the pattern is less77than or equal to MAX_CACHE_BACKREF, which is one less than the size of the78caching vector. The zeroth element contains the number of the highest set79value.8081Arguments:82re compiled pattern block83code pointer to start of group (the bracket)84startcode pointer to start of the whole pattern's code85utf UTF flag86recurses chain of recurse_check to catch mutual recursion87countptr pointer to call count (to catch over complexity)88backref_cache vector for caching back references.8990This function is no longer called when the pattern contains (*ACCEPT); however,91the old code for returning -1 is retained, just in case.9293Returns: the minimum length94-1 \C in UTF-8 mode95or (*ACCEPT)96or pattern too complicated97-2 internal error (missing capturing bracket)98-3 internal error (opcode not listed)99*/100101static int102find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,103PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,104int *backref_cache)105{106int length = -1;107int branchlength = 0;108int prev_cap_recno = -1;109int prev_cap_d = 0;110int prev_recurse_recno = -1;111int prev_recurse_d = 0;112uint32_t once_fudge = 0;113BOOL had_recurse = FALSE;114BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;115PCRE2_SPTR nextbranch = code + GET(code, 1);116PCRE2_SPTR cc = code + 1 + LINK_SIZE;117recurse_check this_recurse;118119/* If this is a "could be empty" group, its minimum length is 0. */120121if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;122123/* Skip over capturing bracket number */124125if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;126127/* A large and/or complex regex can take too long to process. */128129if ((*countptr)++ > 1000) return -1;130131/* Scan along the opcodes for this branch. If we get to the end of the branch,132check the length against that of the other branches. If the accumulated length133passes 16-bits, reset to that value and skip the rest of the branch. */134135for (;;)136{137int d, min, recno;138PCRE2_UCHAR op;139PCRE2_SPTR cs, ce;140141if (branchlength >= UINT16_MAX)142{143branchlength = UINT16_MAX;144cc = nextbranch;145}146147op = *cc;148switch (op)149{150case OP_COND:151case OP_SCOND:152153/* If there is only one branch in a condition, the implied branch has zero154length, so we don't add anything. This covers the DEFINE "condition"155automatically. If there are two branches we can treat it the same as any156other non-capturing subpattern. */157158cs = cc + GET(cc, 1);159if (*cs != OP_ALT)160{161cc = cs + 1 + LINK_SIZE;162break;163}164goto PROCESS_NON_CAPTURE;165166case OP_BRA:167/* There's a special case of OP_BRA, when it is wrapped round a repeated168OP_RECURSE. We'd like to process the latter at this level so that169remembering the value works for repeated cases. So we do nothing, but170set a fudge value to skip over the OP_KET after the recurse. */171172if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)173{174once_fudge = 1 + LINK_SIZE;175cc += 1 + LINK_SIZE;176break;177}178/* Fall through */179180case OP_ONCE:181case OP_SCRIPT_RUN:182case OP_SBRA:183case OP_BRAPOS:184case OP_SBRAPOS:185PROCESS_NON_CAPTURE:186d = find_minlength(re, cc, startcode, utf, recurses, countptr,187backref_cache);188if (d < 0) return d;189branchlength += d;190do cc += GET(cc, 1); while (*cc == OP_ALT);191cc += 1 + LINK_SIZE;192break;193194/* To save time for repeated capturing subpatterns, we remember the195length of the previous one. Unfortunately we can't do the same for196the unnumbered ones above. Nor can we do this if (?| is present in the197pattern because captures with the same number are not then identical. */198199case OP_CBRA:200case OP_SCBRA:201case OP_CBRAPOS:202case OP_SCBRAPOS:203recno = (int)GET2(cc, 1+LINK_SIZE);204if (dupcapused || recno != prev_cap_recno)205{206prev_cap_recno = recno;207prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,208backref_cache);209if (prev_cap_d < 0) return prev_cap_d;210}211branchlength += prev_cap_d;212do cc += GET(cc, 1); while (*cc == OP_ALT);213cc += 1 + LINK_SIZE;214break;215216/* ACCEPT makes things far too complicated; we have to give up. In fact,217from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not218used. However, leave the code in place, just in case. */219220case OP_ACCEPT:221case OP_ASSERT_ACCEPT:222return -1;223224/* Reached end of a branch; if it's a ket it is the end of a nested225call. If it's ALT it is an alternation in a nested call. If it is END it's226the end of the outer call. All can be handled by the same code. If the227length of any branch is zero, there is no need to scan any subsequent228branches. */229230case OP_ALT:231case OP_KET:232case OP_KETRMAX:233case OP_KETRMIN:234case OP_KETRPOS:235case OP_END:236if (length < 0 || (!had_recurse && branchlength < length))237length = branchlength;238if (op != OP_ALT || length == 0) return length;239nextbranch = cc + GET(cc, 1);240cc += 1 + LINK_SIZE;241branchlength = 0;242had_recurse = FALSE;243break;244245/* Skip over assertive subpatterns */246247case OP_ASSERT:248case OP_ASSERT_NOT:249case OP_ASSERTBACK:250case OP_ASSERTBACK_NOT:251case OP_ASSERT_NA:252case OP_ASSERT_SCS:253case OP_ASSERTBACK_NA:254do cc += GET(cc, 1); while (*cc == OP_ALT);255/* Fall through */256257/* Skip over things that don't match chars */258259case OP_REVERSE:260case OP_VREVERSE:261case OP_CREF:262case OP_DNCREF:263case OP_RREF:264case OP_DNRREF:265case OP_FALSE:266case OP_TRUE:267case OP_CALLOUT:268case OP_SOD:269case OP_SOM:270case OP_EOD:271case OP_EODN:272case OP_CIRC:273case OP_CIRCM:274case OP_DOLL:275case OP_DOLLM:276case OP_NOT_WORD_BOUNDARY:277case OP_WORD_BOUNDARY:278case OP_NOT_UCP_WORD_BOUNDARY:279case OP_UCP_WORD_BOUNDARY:280cc += PRIV(OP_lengths)[*cc];281break;282283case OP_CALLOUT_STR:284cc += GET(cc, 1 + 2*LINK_SIZE);285break;286287/* Skip over a subpattern that has a {0} or {0,x} quantifier */288289case OP_BRAZERO:290case OP_BRAMINZERO:291case OP_BRAPOSZERO:292case OP_SKIPZERO:293cc += PRIV(OP_lengths)[*cc];294do cc += GET(cc, 1); while (*cc == OP_ALT);295cc += 1 + LINK_SIZE;296break;297298/* Handle literal characters and + repetitions */299300case OP_CHAR:301case OP_CHARI:302case OP_NOT:303case OP_NOTI:304case OP_PLUS:305case OP_PLUSI:306case OP_MINPLUS:307case OP_MINPLUSI:308case OP_POSPLUS:309case OP_POSPLUSI:310case OP_NOTPLUS:311case OP_NOTPLUSI:312case OP_NOTMINPLUS:313case OP_NOTMINPLUSI:314case OP_NOTPOSPLUS:315case OP_NOTPOSPLUSI:316branchlength++;317cc += 2;318#ifdef SUPPORT_UNICODE319if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);320#endif321break;322323case OP_TYPEPLUS:324case OP_TYPEMINPLUS:325case OP_TYPEPOSPLUS:326branchlength++;327cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;328break;329330/* Handle exact repetitions. The count is already in characters, but we331may need to skip over a multibyte character in UTF mode. */332333case OP_EXACT:334case OP_EXACTI:335case OP_NOTEXACT:336case OP_NOTEXACTI:337branchlength += GET2(cc,1);338cc += 2 + IMM2_SIZE;339#ifdef SUPPORT_UNICODE340if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);341#endif342break;343344case OP_TYPEEXACT:345branchlength += GET2(cc,1);346cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP347|| cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);348break;349350/* Handle single-char non-literal matchers */351352case OP_PROP:353case OP_NOTPROP:354cc += 2;355/* Fall through */356357case OP_NOT_DIGIT:358case OP_DIGIT:359case OP_NOT_WHITESPACE:360case OP_WHITESPACE:361case OP_NOT_WORDCHAR:362case OP_WORDCHAR:363case OP_ANY:364case OP_ALLANY:365case OP_EXTUNI:366case OP_HSPACE:367case OP_NOT_HSPACE:368case OP_VSPACE:369case OP_NOT_VSPACE:370branchlength++;371cc++;372break;373374/* "Any newline" might match two characters, but it also might match just375one. */376377case OP_ANYNL:378branchlength += 1;379cc++;380break;381382/* The single-byte matcher means we can't proceed in UTF mode. (In383non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever384appear, but leave the code, just in case.) */385386case OP_ANYBYTE:387#ifdef SUPPORT_UNICODE388if (utf) return -1;389#endif390branchlength++;391cc++;392break;393394/* For repeated character types, we have to test for \p and \P, which have395an extra two bytes of parameters. */396397case OP_TYPESTAR:398case OP_TYPEMINSTAR:399case OP_TYPEQUERY:400case OP_TYPEMINQUERY:401case OP_TYPEPOSSTAR:402case OP_TYPEPOSQUERY:403if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;404cc += PRIV(OP_lengths)[op];405break;406407case OP_TYPEUPTO:408case OP_TYPEMINUPTO:409case OP_TYPEPOSUPTO:410if (cc[1 + IMM2_SIZE] == OP_PROP411|| cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;412cc += PRIV(OP_lengths)[op];413break;414415/* Check a class for variable quantification */416417case OP_CLASS:418case OP_NCLASS:419#ifdef SUPPORT_WIDE_CHARS420case OP_XCLASS:421case OP_ECLASS:422/* The original code caused an unsigned overflow in 64 bit systems,423so now we use a conditional statement. */424if (op == OP_XCLASS || op == OP_ECLASS)425cc += GET(cc, 1);426else427#endif428cc += PRIV(OP_lengths)[OP_CLASS];429430switch (*cc)431{432case OP_CRPLUS:433case OP_CRMINPLUS:434case OP_CRPOSPLUS:435branchlength++;436/* Fall through */437438case OP_CRSTAR:439case OP_CRMINSTAR:440case OP_CRQUERY:441case OP_CRMINQUERY:442case OP_CRPOSSTAR:443case OP_CRPOSQUERY:444cc++;445break;446447case OP_CRRANGE:448case OP_CRMINRANGE:449case OP_CRPOSRANGE:450branchlength += GET2(cc,1);451cc += 1 + 2 * IMM2_SIZE;452break;453454default:455branchlength++;456break;457}458break;459460/* Backreferences and subroutine calls (OP_RECURSE) are treated in the same461way: we find the minimum length for the subpattern. A recursion462(backreference or subroutine) causes an a flag to be set that causes the463length of this branch to be ignored. The logic is that a recursion can only464make sense if there is another alternative that stops the recursing. That465will provide the minimum length (when no recursion happens).466467If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket468matches an empty string (by default it causes a matching failure), so in469that case we must set the minimum length to zero.470471For backreferenes, if duplicate numbers are present in the pattern we check472for a reference to a duplicate. If it is, we don't know which version will473be referenced, so we have to set the minimum length to zero. */474475/* Duplicate named pattern back reference. */476477case OP_DNREF:478case OP_DNREFI:479if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)480{481int count = GET2(cc, 1+IMM2_SIZE);482PCRE2_SPTR slot =483(PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code)) +484GET2(cc, 1) * re->name_entry_size;485486d = INT_MAX;487488/* Scan all groups with the same name; find the shortest. */489490while (count-- > 0)491{492int dd, i;493recno = GET2(slot, 0);494495if (recno <= backref_cache[0] && backref_cache[recno] >= 0)496dd = backref_cache[recno];497else498{499ce = cs = PRIV(find_bracket)(startcode, utf, recno);500if (cs == NULL) return -2;501do ce += GET(ce, 1); while (*ce == OP_ALT);502503dd = 0;504if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)505{506if (cc > cs && cc < ce) /* Simple recursion */507{508had_recurse = TRUE;509}510else511{512recurse_check *r = recurses;513for (r = recurses; r != NULL; r = r->prev)514if (r->group == cs) break;515if (r != NULL) /* Mutual recursion */516{517had_recurse = TRUE;518}519else520{521this_recurse.prev = recurses; /* No recursion */522this_recurse.group = cs;523dd = find_minlength(re, cs, startcode, utf, &this_recurse,524countptr, backref_cache);525if (dd < 0) return dd;526}527}528}529530backref_cache[recno] = dd;531for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;532backref_cache[0] = recno;533}534535if (dd < d) d = dd;536if (d <= 0) break; /* No point looking at any more */537slot += re->name_entry_size;538}539}540else d = 0;541cc += PRIV(OP_lengths)[*cc];542goto REPEAT_BACK_REFERENCE;543544/* Single back reference by number. References by name are converted to by545number when there is no duplication. */546547case OP_REF:548case OP_REFI:549recno = GET2(cc, 1);550if (recno <= backref_cache[0] && backref_cache[recno] >= 0)551d = backref_cache[recno];552else553{554int i;555d = 0;556557if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)558{559ce = cs = PRIV(find_bracket)(startcode, utf, recno);560if (cs == NULL) return -2;561do ce += GET(ce, 1); while (*ce == OP_ALT);562563if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)564{565if (cc > cs && cc < ce) /* Simple recursion */566{567had_recurse = TRUE;568}569else570{571recurse_check *r = recurses;572for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;573if (r != NULL) /* Mutual recursion */574{575had_recurse = TRUE;576}577else /* No recursion */578{579this_recurse.prev = recurses;580this_recurse.group = cs;581d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,582backref_cache);583if (d < 0) return d;584}585}586}587}588589backref_cache[recno] = d;590for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;591backref_cache[0] = recno;592}593594cc += PRIV(OP_lengths)[*cc];595596/* Handle repeated back references */597598REPEAT_BACK_REFERENCE:599switch (*cc)600{601case OP_CRSTAR:602case OP_CRMINSTAR:603case OP_CRQUERY:604case OP_CRMINQUERY:605case OP_CRPOSSTAR:606case OP_CRPOSQUERY:607min = 0;608cc++;609break;610611case OP_CRPLUS:612case OP_CRMINPLUS:613case OP_CRPOSPLUS:614min = 1;615cc++;616break;617618case OP_CRRANGE:619case OP_CRMINRANGE:620case OP_CRPOSRANGE:621min = GET2(cc, 1);622cc += 1 + 2 * IMM2_SIZE;623break;624625default:626min = 1;627break;628}629630/* Take care not to overflow: (1) min and d are ints, so check that their631product is not greater than INT_MAX. (2) branchlength is limited to632UINT16_MAX (checked at the top of the loop). */633634if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)635branchlength = UINT16_MAX;636else branchlength += min * d;637break;638639/* Recursion always refers to the first occurrence of a subpattern with a640given number. Therefore, we can always make use of caching, even when the641pattern contains multiple subpatterns with the same number. */642643case OP_RECURSE:644cs = ce = startcode + GET(cc, 1);645recno = GET2(cs, 1+LINK_SIZE);646if (recno == prev_recurse_recno)647{648branchlength += prev_recurse_d;649}650else651{652do ce += GET(ce, 1); while (*ce == OP_ALT);653if (cc > cs && cc < ce) /* Simple recursion */654had_recurse = TRUE;655else656{657recurse_check *r = recurses;658for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;659if (r != NULL) /* Mutual recursion */660had_recurse = TRUE;661else662{663this_recurse.prev = recurses;664this_recurse.group = cs;665prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,666countptr, backref_cache);667if (prev_recurse_d < 0) return prev_recurse_d;668prev_recurse_recno = recno;669branchlength += prev_recurse_d;670}671}672}673cc += 1 + LINK_SIZE + once_fudge;674once_fudge = 0;675break;676677/* Anything else does not or need not match a character. We can get the678item's length from the table, but for those that can match zero occurrences679of a character, we must take special action for UTF-8 characters. As it680happens, the "NOT" versions of these opcodes are used at present only for681ASCII characters, so they could be omitted from this list. However, in682future that may change, so we include them here so as not to leave a683gotcha for a future maintainer. */684685case OP_UPTO:686case OP_UPTOI:687case OP_NOTUPTO:688case OP_NOTUPTOI:689case OP_MINUPTO:690case OP_MINUPTOI:691case OP_NOTMINUPTO:692case OP_NOTMINUPTOI:693case OP_POSUPTO:694case OP_POSUPTOI:695case OP_NOTPOSUPTO:696case OP_NOTPOSUPTOI:697698case OP_STAR:699case OP_STARI:700case OP_NOTSTAR:701case OP_NOTSTARI:702case OP_MINSTAR:703case OP_MINSTARI:704case OP_NOTMINSTAR:705case OP_NOTMINSTARI:706case OP_POSSTAR:707case OP_POSSTARI:708case OP_NOTPOSSTAR:709case OP_NOTPOSSTARI:710711case OP_QUERY:712case OP_QUERYI:713case OP_NOTQUERY:714case OP_NOTQUERYI:715case OP_MINQUERY:716case OP_MINQUERYI:717case OP_NOTMINQUERY:718case OP_NOTMINQUERYI:719case OP_POSQUERY:720case OP_POSQUERYI:721case OP_NOTPOSQUERY:722case OP_NOTPOSQUERYI:723724cc += PRIV(OP_lengths)[op];725#ifdef SUPPORT_UNICODE726if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);727#endif728break;729730/* Skip these, but we need to add in the name length. */731732case OP_MARK:733case OP_COMMIT_ARG:734case OP_PRUNE_ARG:735case OP_SKIP_ARG:736case OP_THEN_ARG:737cc += PRIV(OP_lengths)[op] + cc[1];738break;739740/* The remaining opcodes are just skipped over. */741742case OP_CLOSE:743case OP_COMMIT:744case OP_FAIL:745case OP_PRUNE:746case OP_SET_SOM:747case OP_SKIP:748case OP_THEN:749cc += PRIV(OP_lengths)[op];750break;751752/* This should not occur: we list all opcodes explicitly so that when753new ones get added they are properly considered. */754755default:756PCRE2_DEBUG_UNREACHABLE();757return -3;758}759}760761PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */762return -3; /* Avoid compiler warnings */763}764765766767/*************************************************768* Set a bit and maybe its alternate case *769*************************************************/770771/* Given a character, set its first code unit's bit in the table, and also the772corresponding bit for the other version of a letter if we are caseless.773774Arguments:775re points to the regex block776p points to the first code unit of the character777caseless TRUE if caseless778utf TRUE for UTF mode779ucp TRUE for UCP mode780781Returns: pointer after the character782*/783784static PCRE2_SPTR785set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,786BOOL ucp)787{788uint32_t c = *p++; /* First code unit */789790(void)utf; /* Stop compiler warnings when UTF not supported */791(void)ucp;792793/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for7940xff. */795796#if PCRE2_CODE_UNIT_WIDTH != 8797if (c > 0xff) SET_BIT(0xff); else798#endif799800SET_BIT(c);801802/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find803the end of the character, even when caseless. */804805#ifdef SUPPORT_UNICODE806if (utf)807{808#if PCRE2_CODE_UNIT_WIDTH == 8809if (c >= 0xc0) GETUTF8INC(c, p);810#elif PCRE2_CODE_UNIT_WIDTH == 16811if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);812#endif813}814#endif /* SUPPORT_UNICODE */815816/* If caseless, handle the other case of the character. */817818if (caseless)819{820#ifdef SUPPORT_UNICODE821if (utf || ucp)822{823c = UCD_OTHERCASE(c);824#if PCRE2_CODE_UNIT_WIDTH == 8825if (utf)826{827PCRE2_UCHAR buff[6];828(void)PRIV(ord2utf)(c, buff);829SET_BIT(buff[0]);830}831else if (c < 256) SET_BIT(c);832#else /* 16-bit or 32-bit mode */833if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);834#endif835}836837else838#endif /* SUPPORT_UNICODE */839840/* Not UTF or UCP */841842if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);843}844845return p;846}847848849850/*************************************************851* Set bits for a positive character type *852*************************************************/853854/* This function sets starting bits for a character type. In UTF-8 mode, we can855only do a direct setting for bytes less than 128, as otherwise there can be856confusion with bytes in the middle of UTF-8 characters. In a "traditional"857environment, the tables will only recognize ASCII characters anyway, but in at858least one Windows environment, some higher bytes bits were set in the tables.859So we deal with that case by considering the UTF-8 encoding.860861Arguments:862re the regex block863cbit type the type of character wanted864table_limit 32 for non-UTF-8; 16 for UTF-8865866Returns: nothing867*/868869static void870set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)871{872uint32_t c;873for (c = 0; c < table_limit; c++)874re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];875#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8876if (table_limit == 32) return;877for (c = 128; c < 256; c++)878{879if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)880{881PCRE2_UCHAR buff[6];882(void)PRIV(ord2utf)(c, buff);883SET_BIT(buff[0]);884}885}886#endif /* UTF-8 */887}888889890/*************************************************891* Set bits for a negative character type *892*************************************************/893894/* This function sets starting bits for a negative character type such as \D.895In UTF-8 mode, we can only do a direct setting for bytes less than 128, as896otherwise there can be confusion with bytes in the middle of UTF-8 characters.897Unlike in the positive case, where we can set appropriate starting bits for898specific high-valued UTF-8 characters, in this case we have to set the bits for899all high-valued characters. The lowest is 0xc2, but we overkill by starting at9000xc0 (192) for simplicity.901902Arguments:903re the regex block904cbit type the type of character wanted905table_limit 32 for non-UTF-8; 16 for UTF-8906907Returns: nothing908*/909910static void911set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)912{913uint32_t c;914for (c = 0; c < table_limit; c++)915re->start_bitmap[c] |= (uint8_t)(~(re->tables[c+cbits_offset+cbit_type]));916#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8917if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;918#endif919}920921922923#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8924/*************************************************925* Set starting bits for a character list. *926*************************************************/927928/* This function sets starting bits for a character list. It enumerates929all characters and character ranges in the character list, and sets930the starting bits accordingly.931932Arguments:933code pointer to the code934start_bitmap pointer to the starting bitmap935936Returns: nothing937*/938static void939study_char_list(PCRE2_SPTR code, uint8_t *start_bitmap,940const uint8_t *char_lists_end)941{942uint32_t type, list_ind;943uint32_t char_list_add = XCL_CHAR_LIST_LOW_16_ADD;944uint32_t range_start = ~(uint32_t)0, range_end = 0;945const uint8_t *next_char;946PCRE2_UCHAR start_buffer[6], end_buffer[6];947PCRE2_UCHAR start, end;948949/* Only needed in 8-bit mode at the moment. */950type = (uint32_t)(code[0] << 8) | code[1];951code += 2;952953/* Align characters. */954next_char = char_lists_end - (GET(code, 0) << 1);955type &= XCL_TYPE_MASK;956list_ind = 0;957958if ((type & XCL_BEGIN_WITH_RANGE) != 0)959range_start = XCL_CHAR_LIST_LOW_16_START;960961while (type > 0)962{963uint32_t item_count = type & XCL_ITEM_COUNT_MASK;964965if (item_count == XCL_ITEM_COUNT_MASK)966{967if (list_ind <= 1)968{969item_count = *(const uint16_t*)next_char;970next_char += 2;971}972else973{974item_count = *(const uint32_t*)next_char;975next_char += 4;976}977}978979while (item_count > 0)980{981if (list_ind <= 1)982{983range_end = *(const uint16_t*)next_char;984next_char += 2;985}986else987{988range_end = *(const uint32_t*)next_char;989next_char += 4;990}991992if ((range_end & XCL_CHAR_END) != 0)993{994range_end = char_list_add + (range_end >> XCL_CHAR_SHIFT);995996PRIV(ord2utf)(range_end, end_buffer);997end = end_buffer[0];998999if (range_start < range_end)1000{1001PRIV(ord2utf)(range_start, start_buffer);1002for (start = start_buffer[0]; start <= end; start++)1003start_bitmap[start / 8] |= (1u << (start & 7));1004}1005else1006start_bitmap[end / 8] |= (1u << (end & 7));10071008range_start = ~(uint32_t)0;1009}1010else1011range_start = char_list_add + (range_end >> XCL_CHAR_SHIFT);10121013item_count--;1014}10151016list_ind++;1017type >>= XCL_TYPE_BIT_LEN;10181019if (range_start == ~(uint32_t)0)1020{1021if ((type & XCL_BEGIN_WITH_RANGE) != 0)1022{1023/* In 8 bit mode XCL_CHAR_LIST_HIGH_32_START is not possible. */1024if (list_ind == 1) range_start = XCL_CHAR_LIST_HIGH_16_START;1025else range_start = XCL_CHAR_LIST_LOW_32_START;1026}1027}1028else if ((type & XCL_BEGIN_WITH_RANGE) == 0)1029{1030PRIV(ord2utf)(range_start, start_buffer);10311032/* In 8 bit mode XCL_CHAR_LIST_LOW_32_END and1033XCL_CHAR_LIST_HIGH_32_END are not possible. */1034if (list_ind == 1) range_end = XCL_CHAR_LIST_LOW_16_END;1035else range_end = XCL_CHAR_LIST_HIGH_16_END;10361037PRIV(ord2utf)(range_end, end_buffer);1038end = end_buffer[0];10391040for (start = start_buffer[0]; start <= end; start++)1041start_bitmap[start / 8] |= (1u << (start & 7));10421043range_start = ~(uint32_t)0;1044}10451046/* In 8 bit mode XCL_CHAR_LIST_HIGH_32_ADD is not possible. */1047if (list_ind == 1) char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;1048else char_list_add = XCL_CHAR_LIST_LOW_32_ADD;1049}1050}1051#endif1052105310541055/*************************************************1056* Create bitmap of starting code units *1057*************************************************/10581059/* This function scans a compiled unanchored expression recursively and1060attempts to build a bitmap of the set of possible starting code units whose1061values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause1062the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode1063we pass a value of 16 rather than 32 as the final argument. (See comments in1064those functions for the reason.)10651066The SSB_CONTINUE return is useful for parenthesized groups in patterns such as1067(a*)b where the group provides some optional starting code units but scanning1068must continue at the outer level to find at least one mandatory code unit. At1069the outermost level, this function fails unless the result is SSB_DONE.10701071We restrict recursion (for nested groups) to 1000 to avoid stack overflow1072issues.10731074Arguments:1075re points to the compiled regex block1076code points to an expression1077utf TRUE if in UTF mode1078ucp TRUE if in UCP mode1079depthptr pointer to recurse depth10801081Returns: SSB_FAIL => Failed to find any starting code units1082SSB_DONE => Found mandatory starting code units1083SSB_CONTINUE => Found optional starting code units1084SSB_UNKNOWN => Hit an unrecognized opcode1085SSB_TOODEEP => Recursion is too deep1086*/10871088static int1089set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,1090int *depthptr)1091{1092uint32_t c;1093int yield = SSB_DONE;10941095#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 81096int table_limit = utf? 16:32;1097#else1098int table_limit = 32;1099#endif11001101*depthptr += 1;1102if (*depthptr > 1000) return SSB_TOODEEP;11031104do1105{1106BOOL try_next = TRUE;1107PCRE2_SPTR tcode = code + 1 + LINK_SIZE;11081109if (*code == OP_CBRA || *code == OP_SCBRA ||1110*code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;11111112while (try_next) /* Loop for items in this branch */1113{1114int rc;1115PCRE2_SPTR ncode;1116const uint8_t *classmap = NULL;1117#ifdef SUPPORT_WIDE_CHARS1118PCRE2_UCHAR xclassflags;1119#endif11201121switch(*tcode)1122{1123/* If we reach something we don't understand, it means a new opcode has1124been created that hasn't been added to this function. Hopefully this1125problem will be discovered during testing. */11261127default:1128return SSB_UNKNOWN;11291130/* Fail for a valid opcode that implies no starting bits. */11311132case OP_ACCEPT:1133case OP_ASSERT_ACCEPT:1134case OP_ALLANY:1135case OP_ANY:1136case OP_ANYBYTE:1137case OP_CIRCM:1138case OP_CLOSE:1139case OP_COMMIT:1140case OP_COMMIT_ARG:1141case OP_COND:1142case OP_CREF:1143case OP_FALSE:1144case OP_TRUE:1145case OP_DNCREF:1146case OP_DNREF:1147case OP_DNREFI:1148case OP_DNRREF:1149case OP_DOLL:1150case OP_DOLLM:1151case OP_END:1152case OP_EOD:1153case OP_EODN:1154case OP_EXTUNI:1155case OP_FAIL:1156case OP_MARK:1157case OP_NOT:1158case OP_NOTEXACT:1159case OP_NOTEXACTI:1160case OP_NOTI:1161case OP_NOTMINPLUS:1162case OP_NOTMINPLUSI:1163case OP_NOTMINQUERY:1164case OP_NOTMINQUERYI:1165case OP_NOTMINSTAR:1166case OP_NOTMINSTARI:1167case OP_NOTMINUPTO:1168case OP_NOTMINUPTOI:1169case OP_NOTPLUS:1170case OP_NOTPLUSI:1171case OP_NOTPOSPLUS:1172case OP_NOTPOSPLUSI:1173case OP_NOTPOSQUERY:1174case OP_NOTPOSQUERYI:1175case OP_NOTPOSSTAR:1176case OP_NOTPOSSTARI:1177case OP_NOTPOSUPTO:1178case OP_NOTPOSUPTOI:1179case OP_NOTPROP:1180case OP_NOTQUERY:1181case OP_NOTQUERYI:1182case OP_NOTSTAR:1183case OP_NOTSTARI:1184case OP_NOTUPTO:1185case OP_NOTUPTOI:1186case OP_NOT_HSPACE:1187case OP_NOT_VSPACE:1188case OP_PRUNE:1189case OP_PRUNE_ARG:1190case OP_RECURSE:1191case OP_REF:1192case OP_REFI:1193case OP_REVERSE:1194case OP_VREVERSE:1195case OP_RREF:1196case OP_SCOND:1197case OP_SET_SOM:1198case OP_SKIP:1199case OP_SKIP_ARG:1200case OP_SOD:1201case OP_SOM:1202case OP_THEN:1203case OP_THEN_ARG:1204return SSB_FAIL;12051206/* OP_CIRC happens only at the start of an anchored branch (multiline ^1207uses OP_CIRCM). Skip over it. */12081209case OP_CIRC:1210tcode += PRIV(OP_lengths)[OP_CIRC];1211break;12121213/* A "real" property test implies no starting bits, but the fake property1214PT_CLIST identifies a list of characters. These lists are short, as they1215are used for characters with more than one "other case", so there is no1216point in recognizing them for OP_NOTPROP. */12171218case OP_PROP:1219if (tcode[1] != PT_CLIST) return SSB_FAIL;1220{1221const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];1222while ((c = *p++) < NOTACHAR)1223{1224#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 81225if (utf)1226{1227PCRE2_UCHAR buff[6];1228(void)PRIV(ord2utf)(c, buff);1229c = buff[0];1230}1231#endif1232if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);1233}1234}1235try_next = FALSE;1236break;12371238/* We can ignore word boundary tests. */12391240case OP_WORD_BOUNDARY:1241case OP_NOT_WORD_BOUNDARY:1242case OP_UCP_WORD_BOUNDARY:1243case OP_NOT_UCP_WORD_BOUNDARY:1244tcode++;1245break;12461247/* For a positive lookahead assertion, inspect what immediately follows,1248ignoring intermediate assertions and callouts. If the next item is one1249that sets a mandatory character, skip this assertion. Otherwise, treat it1250the same as other bracket groups. */12511252case OP_ASSERT:1253case OP_ASSERT_NA:1254ncode = tcode + GET(tcode, 1);1255while (*ncode == OP_ALT) ncode += GET(ncode, 1);1256ncode += 1 + LINK_SIZE;12571258/* Skip irrelevant items */12591260for (BOOL done = FALSE; !done;)1261{1262switch (*ncode)1263{1264case OP_ASSERT:1265case OP_ASSERT_NOT:1266case OP_ASSERTBACK:1267case OP_ASSERTBACK_NOT:1268case OP_ASSERT_NA:1269case OP_ASSERTBACK_NA:1270case OP_ASSERT_SCS:1271ncode += GET(ncode, 1);1272while (*ncode == OP_ALT) ncode += GET(ncode, 1);1273ncode += 1 + LINK_SIZE;1274break;12751276case OP_WORD_BOUNDARY:1277case OP_NOT_WORD_BOUNDARY:1278case OP_UCP_WORD_BOUNDARY:1279case OP_NOT_UCP_WORD_BOUNDARY:1280ncode++;1281break;12821283case OP_CALLOUT:1284ncode += PRIV(OP_lengths)[OP_CALLOUT];1285break;12861287case OP_CALLOUT_STR:1288ncode += GET(ncode, 1 + 2*LINK_SIZE);1289break;12901291default:1292done = TRUE;1293break;1294}1295}12961297/* Now check the next significant item. */12981299switch(*ncode)1300{1301default:1302break;13031304case OP_PROP:1305if (ncode[1] != PT_CLIST) break;1306/* Fall through */1307case OP_ANYNL:1308case OP_CHAR:1309case OP_CHARI:1310case OP_EXACT:1311case OP_EXACTI:1312case OP_HSPACE:1313case OP_MINPLUS:1314case OP_MINPLUSI:1315case OP_PLUS:1316case OP_PLUSI:1317case OP_POSPLUS:1318case OP_POSPLUSI:1319case OP_VSPACE:1320/* Note that these types will only be present in non-UCP mode. */1321case OP_DIGIT:1322case OP_NOT_DIGIT:1323case OP_WORDCHAR:1324case OP_NOT_WORDCHAR:1325case OP_WHITESPACE:1326case OP_NOT_WHITESPACE:1327tcode = ncode;1328continue; /* With the following significant opcode */1329}1330/* Fall through */13311332/* For a group bracket or a positive assertion without an immediately1333following mandatory setting, recurse to set bits from within the1334subpattern. If it can't find anything, we have to give up. If it finds1335some mandatory character(s), we are done for this branch. Otherwise,1336carry on scanning after the subpattern. */13371338case OP_BRA:1339case OP_SBRA:1340case OP_CBRA:1341case OP_SCBRA:1342case OP_BRAPOS:1343case OP_SBRAPOS:1344case OP_CBRAPOS:1345case OP_SCBRAPOS:1346case OP_ONCE:1347case OP_SCRIPT_RUN:1348rc = set_start_bits(re, tcode, utf, ucp, depthptr);1349if (rc == SSB_DONE)1350{1351try_next = FALSE;1352}1353else if (rc == SSB_CONTINUE)1354{1355do tcode += GET(tcode, 1); while (*tcode == OP_ALT);1356tcode += 1 + LINK_SIZE;1357}1358else return rc; /* FAIL, UNKNOWN, or TOODEEP */1359break;13601361/* If we hit ALT or KET, it means we haven't found anything mandatory in1362this branch, though we might have found something optional. For ALT, we1363continue with the next alternative, but we have to arrange that the final1364result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,1365return SSB_CONTINUE: if this is the top level, that indicates failure,1366but after a nested subpattern, it causes scanning to continue. */13671368case OP_ALT:1369yield = SSB_CONTINUE;1370try_next = FALSE;1371break;13721373case OP_KET:1374case OP_KETRMAX:1375case OP_KETRMIN:1376case OP_KETRPOS:1377return SSB_CONTINUE;13781379/* Skip over callout */13801381case OP_CALLOUT:1382tcode += PRIV(OP_lengths)[OP_CALLOUT];1383break;13841385case OP_CALLOUT_STR:1386tcode += GET(tcode, 1 + 2*LINK_SIZE);1387break;13881389/* Skip over lookbehind, negative lookahead, and scan substring1390assertions */13911392case OP_ASSERT_NOT:1393case OP_ASSERTBACK:1394case OP_ASSERTBACK_NOT:1395case OP_ASSERTBACK_NA:1396case OP_ASSERT_SCS:1397do tcode += GET(tcode, 1); while (*tcode == OP_ALT);1398tcode += 1 + LINK_SIZE;1399break;14001401/* BRAZERO does the bracket, but carries on. */14021403case OP_BRAZERO:1404case OP_BRAMINZERO:1405case OP_BRAPOSZERO:1406rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);1407if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;1408do tcode += GET(tcode,1); while (*tcode == OP_ALT);1409tcode += 1 + LINK_SIZE;1410break;14111412/* SKIPZERO skips the bracket. */14131414case OP_SKIPZERO:1415tcode++;1416do tcode += GET(tcode,1); while (*tcode == OP_ALT);1417tcode += 1 + LINK_SIZE;1418break;14191420/* Single-char * or ? sets the bit and tries the next item */14211422case OP_STAR:1423case OP_MINSTAR:1424case OP_POSSTAR:1425case OP_QUERY:1426case OP_MINQUERY:1427case OP_POSQUERY:1428tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);1429break;14301431case OP_STARI:1432case OP_MINSTARI:1433case OP_POSSTARI:1434case OP_QUERYI:1435case OP_MINQUERYI:1436case OP_POSQUERYI:1437tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);1438break;14391440/* Single-char upto sets the bit and tries the next */14411442case OP_UPTO:1443case OP_MINUPTO:1444case OP_POSUPTO:1445tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);1446break;14471448case OP_UPTOI:1449case OP_MINUPTOI:1450case OP_POSUPTOI:1451tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);1452break;14531454/* At least one single char sets the bit and stops */14551456case OP_EXACT:1457tcode += IMM2_SIZE;1458/* Fall through */1459case OP_CHAR:1460case OP_PLUS:1461case OP_MINPLUS:1462case OP_POSPLUS:1463(void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);1464try_next = FALSE;1465break;14661467case OP_EXACTI:1468tcode += IMM2_SIZE;1469/* Fall through */1470case OP_CHARI:1471case OP_PLUSI:1472case OP_MINPLUSI:1473case OP_POSPLUSI:1474(void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);1475try_next = FALSE;1476break;14771478/* Special spacing and line-terminating items. These recognize specific1479lists of characters. The difference between VSPACE and ANYNL is that the1480latter can match the two-character CRLF sequence, but that is not1481relevant for finding the first character, so their code here is1482identical. */14831484case OP_HSPACE:1485SET_BIT(CHAR_HT);1486SET_BIT(CHAR_SPACE);14871488/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set1489the bits for 0xA0 and for code units >= 255, independently of UTF. */14901491#if PCRE2_CODE_UNIT_WIDTH != 81492SET_BIT(0xA0);1493SET_BIT(0xFF);1494#else1495/* For the 8-bit library in UTF-8 mode, set the bits for the first code1496units of horizontal space characters. */14971498#ifdef SUPPORT_UNICODE1499if (utf)1500{1501SET_BIT(0xC2); /* For U+00A0 */1502SET_BIT(0xE1); /* For U+1680, U+180E */1503SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */1504SET_BIT(0xE3); /* For U+3000 */1505}1506else1507#endif1508/* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless1509the code is EBCDIC. */1510{1511#ifndef EBCDIC1512SET_BIT(0xA0);1513#endif /* Not EBCDIC */1514}1515#endif /* 8-bit support */15161517try_next = FALSE;1518break;15191520case OP_ANYNL:1521case OP_VSPACE:1522SET_BIT(CHAR_LF);1523SET_BIT(CHAR_VT);1524SET_BIT(CHAR_FF);1525SET_BIT(CHAR_CR);15261527/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set1528the bits for NEL and for code units >= 255, independently of UTF. */15291530#if PCRE2_CODE_UNIT_WIDTH != 81531SET_BIT(CHAR_NEL);1532SET_BIT(0xFF);1533#else1534/* For the 8-bit library in UTF-8 mode, set the bits for the first code1535units of vertical space characters. */15361537#ifdef SUPPORT_UNICODE1538if (utf)1539{1540SET_BIT(0xC2); /* For U+0085 (NEL) */1541SET_BIT(0xE2); /* For U+2028, U+2029 */1542}1543else1544#endif1545/* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */1546{1547SET_BIT(CHAR_NEL);1548}1549#endif /* 8-bit support */15501551try_next = FALSE;1552break;15531554/* Single character types set the bits and stop. Note that if PCRE2_UCP1555is set, we do not see these opcodes because \d etc are converted to1556properties. Therefore, these apply in the case when only characters less1557than 256 are recognized to match the types. */15581559case OP_NOT_DIGIT:1560set_nottype_bits(re, cbit_digit, table_limit);1561try_next = FALSE;1562break;15631564case OP_DIGIT:1565set_type_bits(re, cbit_digit, table_limit);1566try_next = FALSE;1567break;15681569case OP_NOT_WHITESPACE:1570set_nottype_bits(re, cbit_space, table_limit);1571try_next = FALSE;1572break;15731574case OP_WHITESPACE:1575set_type_bits(re, cbit_space, table_limit);1576try_next = FALSE;1577break;15781579case OP_NOT_WORDCHAR:1580set_nottype_bits(re, cbit_word, table_limit);1581try_next = FALSE;1582break;15831584case OP_WORDCHAR:1585set_type_bits(re, cbit_word, table_limit);1586try_next = FALSE;1587break;15881589/* One or more character type fudges the pointer and restarts, knowing1590it will hit a single character type and stop there. */15911592case OP_TYPEPLUS:1593case OP_TYPEMINPLUS:1594case OP_TYPEPOSPLUS:1595tcode++;1596break;15971598case OP_TYPEEXACT:1599tcode += 1 + IMM2_SIZE;1600break;16011602/* Zero or more repeats of character types set the bits and then1603try again. */16041605case OP_TYPEUPTO:1606case OP_TYPEMINUPTO:1607case OP_TYPEPOSUPTO:1608tcode += IMM2_SIZE; /* Fall through */16091610case OP_TYPESTAR:1611case OP_TYPEMINSTAR:1612case OP_TYPEPOSSTAR:1613case OP_TYPEQUERY:1614case OP_TYPEMINQUERY:1615case OP_TYPEPOSQUERY:1616switch(tcode[1])1617{1618default:1619case OP_ANY:1620case OP_ALLANY:1621return SSB_FAIL;16221623case OP_HSPACE:1624SET_BIT(CHAR_HT);1625SET_BIT(CHAR_SPACE);16261627/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set1628the bits for 0xA0 and for code units >= 255, independently of UTF. */16291630#if PCRE2_CODE_UNIT_WIDTH != 81631SET_BIT(0xA0);1632SET_BIT(0xFF);1633#else1634/* For the 8-bit library in UTF-8 mode, set the bits for the first code1635units of horizontal space characters. */16361637#ifdef SUPPORT_UNICODE1638if (utf)1639{1640SET_BIT(0xC2); /* For U+00A0 */1641SET_BIT(0xE1); /* For U+1680, U+180E */1642SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */1643SET_BIT(0xE3); /* For U+3000 */1644}1645else1646#endif1647/* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless1648the code is EBCDIC. */1649{1650#ifndef EBCDIC1651SET_BIT(0xA0);1652#endif /* Not EBCDIC */1653}1654#endif /* 8-bit support */1655break;16561657case OP_ANYNL:1658case OP_VSPACE:1659SET_BIT(CHAR_LF);1660SET_BIT(CHAR_VT);1661SET_BIT(CHAR_FF);1662SET_BIT(CHAR_CR);16631664/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set1665the bits for NEL and for code units >= 255, independently of UTF. */16661667#if PCRE2_CODE_UNIT_WIDTH != 81668SET_BIT(CHAR_NEL);1669SET_BIT(0xFF);1670#else1671/* For the 8-bit library in UTF-8 mode, set the bits for the first code1672units of vertical space characters. */16731674#ifdef SUPPORT_UNICODE1675if (utf)1676{1677SET_BIT(0xC2); /* For U+0085 (NEL) */1678SET_BIT(0xE2); /* For U+2028, U+2029 */1679}1680else1681#endif1682/* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */1683{1684SET_BIT(CHAR_NEL);1685}1686#endif /* 8-bit support */1687break;16881689case OP_NOT_DIGIT:1690set_nottype_bits(re, cbit_digit, table_limit);1691break;16921693case OP_DIGIT:1694set_type_bits(re, cbit_digit, table_limit);1695break;16961697case OP_NOT_WHITESPACE:1698set_nottype_bits(re, cbit_space, table_limit);1699break;17001701case OP_WHITESPACE:1702set_type_bits(re, cbit_space, table_limit);1703break;17041705case OP_NOT_WORDCHAR:1706set_nottype_bits(re, cbit_word, table_limit);1707break;17081709case OP_WORDCHAR:1710set_type_bits(re, cbit_word, table_limit);1711break;1712}17131714tcode += 2;1715break;17161717/* Set-based ECLASS: treat it the same as a "complex" XCLASS; give up. */17181719#ifdef SUPPORT_WIDE_CHARS1720case OP_ECLASS:1721return SSB_FAIL;1722#endif17231724/* Extended class: if there are any property checks, or if this is a1725negative XCLASS without a map, give up. If there are no property checks,1726there must be wide characters on the XCLASS list, because otherwise an1727XCLASS would not have been created. This means that code points >= 2551728are potential starters. In the UTF-8 case we can scan them and set bits1729for the relevant leading bytes. */17301731#ifdef SUPPORT_WIDE_CHARS1732case OP_XCLASS:1733xclassflags = tcode[1 + LINK_SIZE];1734if ((xclassflags & XCL_HASPROP) != 0 ||1735(xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)1736return SSB_FAIL;17371738/* We have a positive XCLASS or a negative one without a map. Set up the1739map pointer if there is one, and fall through. */17401741classmap = ((xclassflags & XCL_MAP) == 0)? NULL :1742(const uint8_t *)(tcode + 1 + LINK_SIZE + 1);17431744/* In UTF-8 mode, scan the character list and set bits for leading bytes,1745then jump to handle the map. */17461747#if PCRE2_CODE_UNIT_WIDTH == 81748if (utf && (xclassflags & XCL_NOT) == 0)1749{1750PCRE2_UCHAR b, e;1751PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);1752tcode += GET(tcode, 1);17531754if (*p >= XCL_LIST)1755{1756study_char_list(p, re->start_bitmap,1757((const uint8_t *)re + re->code_start));1758goto HANDLE_CLASSMAP;1759}17601761for (;;) switch (*p++)1762{1763case XCL_SINGLE:1764b = *p++;1765while ((*p & 0xc0) == 0x80) p++;1766re->start_bitmap[b/8] |= (1u << (b&7));1767break;17681769case XCL_RANGE:1770b = *p++;1771while ((*p & 0xc0) == 0x80) p++;1772e = *p++;1773while ((*p & 0xc0) == 0x80) p++;1774for (; b <= e; b++)1775re->start_bitmap[b/8] |= (1u << (b&7));1776break;17771778case XCL_END:1779goto HANDLE_CLASSMAP;17801781default:1782PCRE2_DEBUG_UNREACHABLE();1783return SSB_UNKNOWN; /* Internal error, should not occur */1784}1785}1786#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */1787#endif /* SUPPORT_WIDE_CHARS */17881789/* It seems that the fall through comment must be outside the #ifdef if1790it is to avoid the gcc compiler warning. */17911792/* Fall through */17931794/* Enter here for a negative non-XCLASS. In the 8-bit library, if we are1795in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter1796because it starts a character with a value > 255. In 8-bit non-UTF mode,1797there is no difference between CLASS and NCLASS. In all other wide1798character modes, set the 0xFF bit to indicate code units >= 255. */17991800case OP_NCLASS:1801#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 81802if (utf)1803{1804re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */1805memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */1806}1807#elif PCRE2_CODE_UNIT_WIDTH != 81808SET_BIT(0xFF); /* For characters >= 255 */1809#endif1810/* Fall through */18111812/* Enter here for a positive non-XCLASS. If we have fallen through from1813an XCLASS, classmap will already be set; just advance the code pointer.1814Otherwise, set up classmap for a a non-XCLASS and advance past it. */18151816case OP_CLASS:1817if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else1818{1819classmap = (const uint8_t *)(++tcode);1820tcode += 32 / sizeof(PCRE2_UCHAR);1821}18221823/* When wide characters are supported, classmap may be NULL. In UTF-81824(sic) mode, the bits in a class bit map correspond to character values,1825not to byte values. However, the bit map we are constructing is for byte1826values. So we have to do a conversion for characters whose code point is1827greater than 127. In fact, there are only two possible starting bytes for1828characters in the range 128 - 255. */18291830#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 81831HANDLE_CLASSMAP:1832#endif1833if (classmap != NULL)1834{1835#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 81836if (utf)1837{1838for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];1839for (c = 128; c < 256; c++)1840{1841if ((classmap[c/8] & (1u << (c&7))) != 0)1842{1843int d = (c >> 6) | 0xc0; /* Set bit for this starter */1844re->start_bitmap[d/8] |= (1u << (d&7)); /* and then skip on to the */1845c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */1846}1847}1848}1849else1850#endif1851/* In all modes except UTF-8, the two bit maps are compatible. */18521853{1854for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];1855}1856}18571858/* Act on what follows the class. For a zero minimum repeat, continue;1859otherwise stop processing. */18601861switch (*tcode)1862{1863case OP_CRSTAR:1864case OP_CRMINSTAR:1865case OP_CRQUERY:1866case OP_CRMINQUERY:1867case OP_CRPOSSTAR:1868case OP_CRPOSQUERY:1869tcode++;1870break;18711872case OP_CRRANGE:1873case OP_CRMINRANGE:1874case OP_CRPOSRANGE:1875if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;1876else try_next = FALSE;1877break;18781879default:1880try_next = FALSE;1881break;1882}1883break; /* End of class handling case */1884} /* End of switch for opcodes */1885} /* End of try_next loop */18861887code += GET(code, 1); /* Advance to next branch */1888}1889while (*code == OP_ALT);18901891return yield;1892}1893189418951896/*************************************************1897* Study a compiled expression *1898*************************************************/18991900/* This function is handed a compiled expression that it must study to produce1901information that will speed up the matching.19021903Argument:1904re points to the compiled expression19051906Returns: 0 normally; non-zero should never normally occur19071 unknown opcode in set_start_bits19082 missing capturing bracket19093 unknown opcode in find_minlength1910*/19111912int1913PRIV(study)(pcre2_real_code *re)1914{1915int count = 0;1916PCRE2_UCHAR *code;1917BOOL utf = (re->overall_options & PCRE2_UTF) != 0;1918BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;19191920/* Find start of compiled code */19211922code = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);19231924/* For a pattern that has a first code unit, or a multiline pattern that1925matches only at "line start", there is no point in seeking a list of starting1926code units. */19271928if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)1929{1930int depth = 0;1931int rc = set_start_bits(re, code, utf, ucp, &depth);1932if (rc == SSB_UNKNOWN)1933{1934PCRE2_DEBUG_UNREACHABLE();1935return 1;1936}19371938/* If a list of starting code units was set up, scan the list to see if only1939one or two were listed. Having only one listed is rare because usually a1940single starting code unit will have been recognized and PCRE2_FIRSTSET set.1941If two are listed, see if they are caseless versions of the same character;1942if so we can replace the list with a caseless first code unit. This gives1943better performance and is plausibly worth doing for patterns such as [Ww]ord1944or (word|WORD). */19451946if (rc == SSB_DONE)1947{1948int i;1949int a = -1;1950int b = -1;1951uint8_t *p = re->start_bitmap;1952uint32_t flags = PCRE2_FIRSTMAPSET;19531954for (i = 0; i < 256; p++, i += 8)1955{1956uint8_t x = *p;1957if (x != 0)1958{1959int c;1960uint8_t y = x & (~x + 1); /* Least significant bit */1961if (y != x) goto DONE; /* More than one bit set */19621963/* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and1964all wide characters", so we cannot use it here. */19651966#if PCRE2_CODE_UNIT_WIDTH != 81967if (i == 248 && x == 0x80) goto DONE;1968#endif19691970/* Compute the character value */19711972c = i;1973switch (x)1974{1975case 1: break;1976case 2: c += 1; break; case 4: c += 2; break;1977case 8: c += 3; break; case 16: c += 4; break;1978case 32: c += 5; break; case 64: c += 6; break;1979case 128: c += 7; break;1980}19811982/* c contains the code unit value, in the range 0-255. In 8-bit UTF1983mode, only values < 128 can be used. In all the other cases, c is a1984character value. */19851986#if PCRE2_CODE_UNIT_WIDTH == 81987if (utf && c > 127) goto DONE;1988#endif1989if (a < 0) a = c; /* First one found, save in a */1990else if (b < 0) /* Second one found */1991{1992int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);19931994#ifdef SUPPORT_UNICODE1995if (utf || ucp)1996{1997if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */1998if (c > 127) d = UCD_OTHERCASE(c);1999}2000#endif /* SUPPORT_UNICODE */20012002if (d != a) goto DONE; /* Not the other case of a */2003b = c; /* Save second in b */2004}2005else goto DONE; /* More than two characters found */2006}2007}20082009/* Replace the start code unit bits with a first code unit. If it is the2010same as a required later code unit, then clear the required later code2011unit. This is because a search for a required code unit starts after an2012explicit first code unit, but at a code unit found from the bitmap.2013Patterns such as /a*a/ don't work if both the start unit and required2014unit are the same. */20152016if (a >= 0) {2017if ((re->flags & PCRE2_LASTSET) && (re->last_codeunit == (uint32_t)a || (b >= 0 && re->last_codeunit == (uint32_t)b))) {2018re->flags &= ~(PCRE2_LASTSET | PCRE2_LASTCASELESS);2019re->last_codeunit = 0;2020}2021re->first_codeunit = a;2022flags = PCRE2_FIRSTSET;2023if (b >= 0) flags |= PCRE2_FIRSTCASELESS;2024}20252026DONE:2027re->flags |= flags;2028}2029}20302031/* Find the minimum length of subject string. If the pattern can match an empty2032string, the minimum length is already known. If the pattern contains (*ACCEPT)2033all bets are off, and we don't even try to find a minimum length. If there are2034more back references than the size of the vector we are going to cache them in,2035do nothing. A pattern that complicated will probably take a long time to2036analyze and may in any case turn out to be too complicated. Note that back2037reference minima are held as 16-bit numbers. */20382039if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&2040re->top_backref <= MAX_CACHE_BACKREF)2041{2042int min;2043int backref_cache[MAX_CACHE_BACKREF+1];2044backref_cache[0] = 0; /* Highest one that is set */2045min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);2046switch(min)2047{2048case -1: /* \C in UTF mode or over-complex regex */2049break; /* Leave minlength unchanged (will be zero) */20502051case -2:2052PCRE2_DEBUG_UNREACHABLE();2053return 2; /* missing capturing bracket */20542055case -3:2056PCRE2_DEBUG_UNREACHABLE();2057return 3; /* unrecognized opcode */20582059default:2060re->minlength = (min > UINT16_MAX)? UINT16_MAX : min;2061break;2062}2063}20642065return 0;2066}20672068/* End of pcre2_study.c */206920702071