Path: blob/master/thirdparty/pcre2/src/pcre2_match.c
21798 views
/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2015-2024 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE36POSSIBILITY OF SUCH DAMAGE.37-----------------------------------------------------------------------------38*/394041#include "pcre2_internal.h"42434445/* These defines enable debugging code */4647/* #define DEBUG_FRAMES_DISPLAY */48/* #define DEBUG_SHOW_OPS */49/* #define DEBUG_SHOW_RMATCH */5051#ifdef DEBUG_FRAMES_DISPLAY52#include <stdarg.h>53#endif5455#ifdef DEBUG_SHOW_OPS56static const char *OP_names[] = { OP_NAME_LIST };57#endif5859/* These defines identify the name of the block containing "static"60information, and fields within it. */6162#define NLBLOCK mb /* Block containing newline information */63#define PSSTART start_subject /* Field containing processed string start */64#define PSEND end_subject /* Field containing processed string end */6566#define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */6768/* Masks for identifying the public options that are permitted at match time. */6970#define PUBLIC_MATCH_OPTIONS \71(PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \72PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \73PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT| \74PCRE2_DISABLE_RECURSELOOP_CHECK)7576#define PUBLIC_JIT_MATCH_OPTIONS \77(PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\78PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD|\79PCRE2_COPY_MATCHED_SUBJECT)8081/* Non-error returns from and within the match() function. 
Error returns are82externally defined PCRE2_ERROR_xxx codes, which are all negative. */8384#define MATCH_MATCH 185#define MATCH_NOMATCH 08687/* Special internal returns used in the match() function. Make them88sufficiently negative to avoid the external error codes. */8990#define MATCH_ACCEPT (-999)91#define MATCH_KETRPOS (-998)92/* The next 5 must be kept together and in sequence so that a test that checks93for any one of them can use a range. */94#define MATCH_COMMIT (-997)95#define MATCH_PRUNE (-996)96#define MATCH_SKIP (-995)97#define MATCH_SKIP_ARG (-994)98#define MATCH_THEN (-993)99#define MATCH_BACKTRACK_MAX MATCH_THEN100#define MATCH_BACKTRACK_MIN MATCH_COMMIT101102/* Group frame type values. Zero means the frame is not a group frame. The103lower 16 bits are used for data (e.g. the capture number). Group frames are104used for most groups so that information about the start is easily available at105the end without having to scan back through intermediate frames (backtrack106points). */107108#define GF_CAPTURE 0x00010000u109#define GF_NOCAPTURE 0x00020000u110#define GF_CONDASSERT 0x00030000u111#define GF_RECURSE 0x00040000u112113/* Masks for the identity and data parts of the group frame type. */114115#define GF_IDMASK(a) ((a) & 0xffff0000u)116#define GF_DATAMASK(a) ((a) & 0x0000ffffu)117118/* Repetition types */119120enum { REPTYPE_MIN, REPTYPE_MAX, REPTYPE_POS };121122/* Min and max values for the common repeats; a maximum of UINT32_MAX =>123infinity. */124125static const uint32_t rep_min[] = {1260, 0, /* * and *? */1271, 1, /* + and +? */1280, 0, /* ? and ?? */1290, 0, /* dummy placefillers for OP_CR[MIN]RANGE */1300, 1, 0 }; /* OP_CRPOS{STAR, PLUS, QUERY} */131132static const uint32_t rep_max[] = {133UINT32_MAX, UINT32_MAX, /* * and *? */134UINT32_MAX, UINT32_MAX, /* + and +? */1351, 1, /* ? and ?? 
*/1360, 0, /* dummy placefillers for OP_CR[MIN]RANGE */137UINT32_MAX, UINT32_MAX, 1 }; /* OP_CRPOS{STAR, PLUS, QUERY} */138139/* Repetition types - must include OP_CRPOSRANGE (not needed above) */140141static const uint32_t rep_typ[] = {142REPTYPE_MAX, REPTYPE_MIN, /* * and *? */143REPTYPE_MAX, REPTYPE_MIN, /* + and +? */144REPTYPE_MAX, REPTYPE_MIN, /* ? and ?? */145REPTYPE_MAX, REPTYPE_MIN, /* OP_CRRANGE and OP_CRMINRANGE */146REPTYPE_POS, REPTYPE_POS, /* OP_CRPOSSTAR, OP_CRPOSPLUS */147REPTYPE_POS, REPTYPE_POS }; /* OP_CRPOSQUERY, OP_CRPOSRANGE */148149/* Numbers for RMATCH calls at backtracking points. When these lists are150changed, the code at RETURN_SWITCH below must be updated in sync. */151152enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,153RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,154RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,155RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39 };156157#ifdef SUPPORT_WIDE_CHARS158enum { RM100=100, RM101, RM102, RM103 };159#endif160161#ifdef SUPPORT_UNICODE162enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207,163RM208, RM209, RM210, RM211, RM212, RM213, RM214, RM215,164RM216, RM217, RM218, RM219, RM220, RM221, RM222, RM223,165RM224 };166#endif167168/* Define short names for general fields in the current backtrack frame, which169is always pointed to by the F variable. Occasional references to fields in170other frames are written out explicitly. There are also some fields in the171current frame whose names start with "temp" that are used for short-term,172localised backtracking memory. These are #defined with Lxxx names at the point173of use and undefined afterwards. 
*/174175#define Fback_frame F->back_frame176#define Fcapture_last F->capture_last177#define Fcurrent_recurse F->current_recurse178#define Fecode F->ecode179#define Feptr F->eptr180#define Fgroup_frame_type F->group_frame_type181#define Flast_group_offset F->last_group_offset182#define Flength F->length183#define Fmark F->mark184#define Frdepth F->rdepth185#define Fstart_match F->start_match186#define Foffset_top F->offset_top187#define Foccu F->occu188#define Fop F->op189#define Fovector F->ovector190#define Freturn_id F->return_id191192193#ifdef DEBUG_FRAMES_DISPLAY194/*************************************************195* Display current frames and contents *196*************************************************/197198/* This debugging function displays the current set of frames and their199contents. It is not called automatically from anywhere, the intention being200that calls can be inserted where necessary when debugging frame-related201problems.202203Arguments:204f the file to write to205F the current top frame206P a previous frame of interest207frame_size the frame size208mb points to the match block209match_data points to the match data block210s identification text211212Returns: nothing213*/214215static void216display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,217match_block *mb, pcre2_match_data *match_data, const char *s, ...)218{219uint32_t i;220heapframe *Q;221va_list ap;222va_start(ap, s);223224fprintf(f, "FRAMES ");225vfprintf(f, s, ap);226va_end(ap);227228if (P != NULL) fprintf(f, " P=%lu",229((char *)P - (char *)(match_data->heapframes))/frame_size);230fprintf(f, "\n");231232for (i = 0, Q = match_data->heapframes;233Q <= F;234i++, Q = (heapframe *)((char *)Q + frame_size))235{236fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d",237i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode),238Q->back_frame, Q->return_id);239240if (Q->last_group_offset == PCRE2_UNSET)241fprintf(f, " 
lgoffset=unset\n");242else243fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size);244}245}246247#endif248249250251/*************************************************252* Process a callout *253*************************************************/254255/* This function is called for all callouts, whether "standalone" or at the256start of a conditional group. Feptr will be pointing to either OP_CALLOUT or257OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized258with fixed values.259260Arguments:261F points to the current backtracking frame262mb points to the match block263lengthptr where to return the length of the callout item264265Returns: the return from the callout266or 0 if no callout function exists267*/268269static int270do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr)271{272int rc;273PCRE2_SIZE save0, save1;274PCRE2_SIZE *callout_ovector;275pcre2_callout_block *cb;276277*lengthptr = (*Fecode == OP_CALLOUT)?278PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);279280if (mb->callout == NULL) return 0; /* No callout function provided */281282/* The original matching code (pre 10.30) worked directly with the ovector283passed by the user, and this was passed to callouts. Now that the working284ovector is in the backtracking frame, it no longer needs to reserve space for285the overall match offsets (which would waste space in the frame). For backward286compatibility, however, we pass capture_top and offset_vector to the callout as287if for the extended ovector, and we ensure that the first two slots are unset288by preserving and restoring their current contents. Picky compilers complain if289references such as Fovector[-2] are use directly, so we set up a separate290pointer. */291292callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;293294/* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields295are set externally. The first 3 never change; the last is updated for each296bumpalong. 
*/297298cb = mb->cb;299cb->capture_top = (uint32_t)Foffset_top/2 + 1;300cb->capture_last = Fcapture_last;301cb->offset_vector = callout_ovector;302cb->mark = mb->nomatch_mark;303cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject);304cb->pattern_position = GET(Fecode, 1);305cb->next_item_length = GET(Fecode, 1 + LINK_SIZE);306307if (*Fecode == OP_CALLOUT) /* Numerical callout */308{309cb->callout_number = Fecode[1 + 2*LINK_SIZE];310cb->callout_string_offset = 0;311cb->callout_string = NULL;312cb->callout_string_length = 0;313}314else /* String callout */315{316cb->callout_number = 0;317cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE);318cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1;319cb->callout_string_length =320*lengthptr - (1 + 4*LINK_SIZE) - 2;321}322323save0 = callout_ovector[0];324save1 = callout_ovector[1];325callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET;326rc = mb->callout(cb, mb->callout_data);327callout_ovector[0] = save0;328callout_ovector[1] = save1;329cb->callout_flags = 0;330return rc;331}332333334335/*************************************************336* Match a back-reference *337*************************************************/338339/* This function is called only when it is known that the offset lies within340the offsets that have so far been used in the match. Note that in caseless341UTF-8 mode, the number of subject bytes matched may be different to the number342of reference bytes. 
(In theory this could also happen in UTF-16 mode, but it343seems unlikely.)344345Arguments:346offset index into the offset vector347caseless TRUE if caseless348caseopts bitmask of REFI_FLAG_XYZ values349F the current backtracking frame pointer350mb points to match block351lengthptr pointer for returning the length matched352353Returns: = 0 sucessful match; number of code units matched is set354< 0 no match355> 0 partial match356*/357358static int359match_ref(PCRE2_SIZE offset, BOOL caseless, int caseopts, heapframe *F,360match_block *mb, PCRE2_SIZE *lengthptr)361{362PCRE2_SPTR p;363PCRE2_SIZE length;364PCRE2_SPTR eptr;365PCRE2_SPTR eptr_start;366367#ifndef SUPPORT_UNICODE368(void)caseopts; /* Avoid compiler warning. */369#endif370371/* Deal with an unset group. The default is no match, but there is an option to372match an empty string. */373374if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET)375{376if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)377{378*lengthptr = 0;379return 0; /* Match */380}381else return -1; /* No match */382}383384/* Separate the caseless and UTF cases for speed. */385386eptr = eptr_start = Feptr;387p = mb->start_subject + Fovector[offset];388length = Fovector[offset+1] - Fovector[offset];389PCRE2_ASSERT(eptr <= mb->end_subject);390391if (caseless)392{393#if defined SUPPORT_UNICODE394BOOL utf = (mb->poptions & PCRE2_UTF) != 0;395BOOL caseless_restrict = (caseopts & REFI_FLAG_CASELESS_RESTRICT) != 0;396BOOL turkish_casing = !caseless_restrict && (caseopts & REFI_FLAG_TURKISH_CASING) != 0;397398if (utf || (mb->poptions & PCRE2_UCP) != 0)399{400PCRE2_SPTR endptr = p + length;401402/* Match characters up to the end of the reference. NOTE: the number of403code units matched may differ, because in UTF-8 there are some characters404whose upper and lower case codes have different numbers of bytes. 
For405example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3406bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a407sequence of two of the latter. It is important, therefore, to check the408length along the reference, not along the subject (earlier code did this409wrong). UCP uses Unicode properties but without UTF encoding. */410411while (p < endptr)412{413uint32_t c, d;414const ucd_record *ur;415if (eptr >= mb->end_subject) return 1; /* Partial match */416417if (utf)418{419GETCHARINC(c, eptr);420GETCHARINC(d, p);421}422else423{424c = *eptr++;425d = *p++;426}427428if (turkish_casing && UCD_ANY_I(d))429{430c = UCD_FOLD_I_TURKISH(c);431d = UCD_FOLD_I_TURKISH(d);432if (c != d) return -1; /* No match */433}434else if (c != d && c != (uint32_t)((int)d + (ur = GET_UCD(d))->other_case))435{436const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;437438/* When PCRE2_EXTRA_CASELESS_RESTRICT is set, ignore any caseless sets439that start with an ASCII character. */440if (caseless_restrict && *pp < 128) return -1; /* No match */441442for (;;)443{444if (c < *pp) return -1; /* No match */445if (c == *pp++) break;446}447}448}449}450else451#endif452453/* Not in UTF or UCP mode */454{455for (; length > 0; length--)456{457uint32_t cc, cp;458if (eptr >= mb->end_subject) return 1; /* Partial match */459cc = UCHAR21TEST(eptr);460cp = UCHAR21TEST(p);461if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc))462return -1; /* No match */463p++;464eptr++;465}466}467}468469/* In the caseful case, we can just compare the code units, whether or not we470are in UTF and/or UCP mode. When partial matching, we have to do this unit by471unit. 
*/472473else474{475if (mb->partial != 0)476{477for (; length > 0; length--)478{479if (eptr >= mb->end_subject) return 1; /* Partial match */480if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */481}482}483484/* Not partial matching */485486else487{488if ((PCRE2_SIZE)(mb->end_subject - eptr) < length ||489memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */490eptr += length;491}492}493494*lengthptr = eptr - eptr_start;495return 0; /* Match */496}497498499500/*************************************************501* Restore offsets after a recurse *502*************************************************/503504/* This function restores the ovector values when505a recursive block reaches its end, and the triggering506recurse has and argument list.507508Arguments:509F the current backtracking frame pointer510P the previous backtracking frame pointer511*/512513static void514recurse_update_offsets(heapframe *F, heapframe *P)515{516PCRE2_SIZE *dst = F->ovector;517PCRE2_SIZE *src = P->ovector;518/* The first bracket has offset 2, because519offset 0 is reserved for the full match. */520PCRE2_SIZE offset = 2;521PCRE2_SIZE offset_top = Foffset_top + 2;522PCRE2_SIZE diff;523PCRE2_SPTR ecode = Fecode;524525do526{527diff = (GET2(ecode, 1) << 1) - offset;528ecode += 1 + IMM2_SIZE;529530if (offset + diff >= offset_top)531{532/* Some OP_CREF opcodes are not533processed, they must be skipped. */534while (*ecode == OP_CREF) ecode += 1 + IMM2_SIZE;535break;536}537538if (diff == 2)539{540dst[0] = src[0];541dst[1] = src[1];542}543else if (diff >= 4)544memcpy(dst, src, diff * sizeof(PCRE2_SIZE));545546/* Skip the unmodified entry. */547diff += 2;548offset += diff;549dst += diff;550src += diff;551}552while (*ecode == OP_CREF);553554diff = offset_top - offset;555if (diff == 2)556{557dst[0] = src[0];558dst[1] = src[1];559}560else if (diff >= 4)561memcpy(dst, src, diff * sizeof(PCRE2_SIZE));562563Fecode = ecode;564Foffset_top = (offset <= P->offset_top) ? 
P->offset_top : (offset - 2);565}566567568569/******************************************************************************570*******************************************************************************571"Recursion" in the match() function572573The original match() function was highly recursive, but this proved to be the574source of a number of problems over the years, mostly because of the relatively575small system stacks that are commonly found. As new features were added to576patterns, various kludges were invented to reduce the amount of stack used,577making the code hard to understand in places.578579A version did exist that used individual frames on the heap instead of calling580match() recursively, but this ran substantially slower. The current version is581a refactoring that uses a vector of frames to remember backtracking points.582This runs no slower, and possibly even a bit faster than the original recursive583implementation.584585At first, an initial vector of size START_FRAMES_SIZE (enough for maybe 50586frames) was allocated on the system stack. If this was not big enough, the heap587was used for a larger vector. However, it turns out that there are environments588where taking as little as 20KiB from the system stack is an embarrassment.589After another refactoring, the heap is used exclusively, but a pointer the590frames vector and its size are cached in the match_data block, so that there is591no new memory allocation if the same match_data block is used for multiple592matches (unless the frames vector has to be extended).593*******************************************************************************594******************************************************************************/595596597598599/*************************************************600* Macros for the match() function *601*************************************************/602603/* These macros pack up tests that are used for partial matching several times604in the code. 
The second one is used when we already know we are past the end of605the subject. We set the "hit end" flag if the pointer is at the end of the606subject and either (a) the pointer is past the earliest inspected character607(i.e. something has been matched, even if not part of the actual matched608string), or (b) the pattern contains a lookbehind. These are the conditions for609which adding more characters may allow the current match to continue.610611For hard partial matching, we immediately return a partial match. Otherwise,612carrying on means that a complete match on the current subject will be sought.613A partial match is returned only if no complete match can be found. */614615#define CHECK_PARTIAL() \616do { \617if (Feptr >= mb->end_subject) \618{ \619SCHECK_PARTIAL(); \620} \621} \622while (0)623624#define SCHECK_PARTIAL() \625do { \626if (mb->partial != 0 && \627(Feptr > mb->start_used_ptr || mb->allowemptypartial)) \628{ \629mb->hitend = TRUE; \630if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \631} \632} \633while (0)634635636/* These macros are used to implement backtracking. They simulate a recursive637call to the match() function by means of a local vector of frames which638remember the backtracking points. */639640#define RMATCH(ra,rb) \641do { \642start_ecode = ra; \643Freturn_id = rb; \644goto MATCH_RECURSE; \645L_##rb:; \646} \647while (0)648649#define RRETURN(ra) \650do { \651rrc = ra; \652goto RETURN_SWITCH; \653} \654while (0)655656657658/*************************************************659* Match from current position *660*************************************************/661662/* This function is called to run one match attempt at a single starting point663in the subject.664665Performance note: It might be tempting to extract commonly used fields from the666mb structure (e.g. end_subject) into individual variables to improve667performance. 
Tests using gcc on a SPARC disproved this; in the first case, it668made performance worse.669670Arguments:671start_eptr starting character in subject672start_ecode starting position in compiled code673top_bracket number of capturing parentheses in the pattern674frame_size size of each backtracking frame675match_data pointer to the match_data block676mb pointer to "static" variables block677678Returns: MATCH_MATCH if matched ) these values are >= 0679MATCH_NOMATCH if failed to match )680negative MATCH_xxx value for PRUNE, SKIP, etc681negative PCRE2_ERROR_xxx value if aborted by an error condition682(e.g. stopped by repeated call or depth limit)683*/684685static int686match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket,687PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)688{689/* Frame-handling variables */690691heapframe *F; /* Current frame pointer */692heapframe *N = NULL; /* Temporary frame pointers */693heapframe *P = NULL;694695heapframe *frames_top; /* End of frames vector */696heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */697PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */698699/* Local variables that do not need to be preserved over calls to RRMATCH(). 
*/700701PCRE2_SPTR branch_end = NULL;702PCRE2_SPTR branch_start;703PCRE2_SPTR bracode; /* Temp pointer to start of group */704PCRE2_SIZE offset; /* Used for group offsets */705PCRE2_SIZE length; /* Used for various length calculations */706707int rrc; /* Return from functions & backtracking "recursions" */708#ifdef SUPPORT_UNICODE709int proptype; /* Type of character property */710#endif711712uint32_t i; /* Used for local loops */713uint32_t fc; /* Character values */714uint32_t number; /* Used for group and other numbers */715uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */716uint32_t group_frame_type; /* Specifies type for new group frames */717718BOOL condition; /* Used in conditional groups */719BOOL cur_is_word; /* Used in "word" tests */720BOOL prev_is_word; /* Used in "word" tests */721722/* UTF and UCP flags */723724#ifdef SUPPORT_UNICODE725BOOL utf = (mb->poptions & PCRE2_UTF) != 0;726BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;727#else728BOOL utf = FALSE; /* Required for convenience even when no Unicode support */729#endif730731/* This is the length of the last part of a backtracking frame that must be732copied when a new frame is created. */733734frame_copy_size = frame_size - offsetof(heapframe, eptr);735736/* Set up the first frame and the end of the frames vector. */737738F = match_data->heapframes;739frames_top = (heapframe *)((char *)F + match_data->heapframes_size);740741Frdepth = 0; /* "Recursion" depth */742Fcapture_last = 0; /* Number of most recent capture */743Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. 
*/744Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */745Fmark = NULL; /* Most recent mark */746Foffset_top = 0; /* End of captures within the frame */747Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */748group_frame_type = 0; /* Not a start of group frame */749goto NEW_FRAME; /* Start processing with this frame */750751/* Come back here when we want to create a new frame for remembering a752backtracking point. */753754MATCH_RECURSE:755756/* Set up a new backtracking frame. If the vector is full, get a new one,757doubling the size, but constrained by the heap limit (which is in KiB). */758759N = (heapframe *)((char *)F + frame_size);760if ((heapframe *)((char *)N + frame_size) >= frames_top)761{762heapframe *new;763PCRE2_SIZE newsize;764PCRE2_SIZE usedsize = (char *)N - (char *)(match_data->heapframes);765766if (match_data->heapframes_size >= PCRE2_SIZE_MAX / 2)767{768if (match_data->heapframes_size == PCRE2_SIZE_MAX - 1)769return PCRE2_ERROR_NOMEMORY;770newsize = PCRE2_SIZE_MAX - 1;771}772else773newsize = match_data->heapframes_size * 2;774775if (newsize / 1024 >= mb->heap_limit)776{777PCRE2_SIZE old_size = match_data->heapframes_size / 1024;778if (mb->heap_limit <= old_size)779return PCRE2_ERROR_HEAPLIMIT;780else781{782PCRE2_SIZE max_delta = 1024 * (mb->heap_limit - old_size);783int over_bytes = match_data->heapframes_size % 1024;784if (over_bytes) max_delta -= (1024 - over_bytes);785newsize = match_data->heapframes_size + max_delta;786}787}788789/* With a heap limit set, the permitted additional size may not be enough for790another frame, so do a final check. 
*/791792if (newsize - usedsize < frame_size) return PCRE2_ERROR_HEAPLIMIT;793new = match_data->memctl.malloc(newsize, match_data->memctl.memory_data);794if (new == NULL) return PCRE2_ERROR_NOMEMORY;795memcpy(new, match_data->heapframes, usedsize);796797N = (heapframe *)((char *)new + usedsize);798F = (heapframe *)((char *)N - frame_size);799800match_data->memctl.free(match_data->heapframes, match_data->memctl.memory_data);801match_data->heapframes = new;802match_data->heapframes_size = newsize;803frames_top = (heapframe *)((char *)new + newsize);804}805806#ifdef DEBUG_SHOW_RMATCH807fprintf(stderr, "++ RMATCH %d frame=%d", Freturn_id, Frdepth + 1);808if (group_frame_type != 0)809{810fprintf(stderr, " type=%x ", group_frame_type);811switch (GF_IDMASK(group_frame_type))812{813case GF_CAPTURE:814fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type));815break;816817case GF_NOCAPTURE:818fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type));819break;820821case GF_CONDASSERT:822fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type));823break;824825case GF_RECURSE:826fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type));827break;828829default:830fprintf(stderr, "*** unknown ***");831break;832}833}834fprintf(stderr, "\n");835#endif836837/* Copy those fields that must be copied into the new frame, increase the838"recursion" depth (i.e. the new frame's index) and then make the new frame839current. */840841memcpy((char *)N + offsetof(heapframe, eptr),842(char *)F + offsetof(heapframe, eptr),843frame_copy_size);844845N->rdepth = Frdepth + 1;846F = N;847848/* Carry on processing with a new frame. */849850NEW_FRAME:851Fgroup_frame_type = group_frame_type;852Fecode = start_ecode; /* Starting code pointer */853Fback_frame = frame_size; /* Default is go back one frame */854855/* If this is a special type of group frame, remember its offset for quick856access at the end of the group. If this is a recursion, set a new current857recursion value. 
*/858859if (group_frame_type != 0)860{861Flast_group_offset = (char *)F - (char *)match_data->heapframes;862if (GF_IDMASK(group_frame_type) == GF_RECURSE)863Fcurrent_recurse = GF_DATAMASK(group_frame_type);864group_frame_type = 0;865}866867868/* ========================================================================= */869/* This is the main processing loop. First check that we haven't recorded too870many backtracks (search tree is too large), or that we haven't exceeded the871recursive depth limit (used too many backtracking frames). If not, process the872opcodes. */873874if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;875if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;876877#ifdef DEBUG_SHOW_OPS878fprintf(stderr, "\n++ New frame: type=0x%x subject offset %ld\n",879GF_IDMASK(Fgroup_frame_type), Feptr - mb->start_subject);880#endif881882for (;;)883{884#ifdef DEBUG_SHOW_OPS885fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,886OP_names[*Fecode]);887#endif888889Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */890switch(Fop)891{892/* ===================================================================== */893/* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close894any currently open capturing brackets. Unlike reaching the end of a group,895where we know the starting frame is at the top of the chained frames, in896this case we have to search back for the relevant frame in case other types897of group that use chained frames have intervened. Multiple OP_CLOSEs always898come innermost first, which matches the chain order. We can ignore this in899a recursion, because captures are not passed out of recursions. */900901case OP_CLOSE:902if (Fcurrent_recurse == RECURSE_UNSET)903{904number = GET2(Fecode, 1);905offset = Flast_group_offset;906for(;;)907{908/* Corrupted heapframes?. 
Trigger an assert and return an error */909PCRE2_ASSERT(offset != PCRE2_UNSET);910if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;911912N = (heapframe *)((char *)match_data->heapframes + offset);913P = (heapframe *)((char *)N - frame_size);914if (N->group_frame_type == (GF_CAPTURE | number)) break;915offset = P->last_group_offset;916}917offset = (number << 1) - 2;918Fcapture_last = number;919Fovector[offset] = P->eptr - mb->start_subject;920Fovector[offset+1] = Feptr - mb->start_subject;921if (offset >= Foffset_top) Foffset_top = offset + 2;922}923Fecode += PRIV(OP_lengths)[*Fecode];924break;925926927/* ===================================================================== */928/* Real or forced end of the pattern, assertion, or recursion. In an929assertion ACCEPT, update the last used pointer and remember the current930frame so that the captures and mark can be fished out of it. */931932case OP_ASSERT_ACCEPT:933if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;934assert_accept_frame = F;935RRETURN(MATCH_ACCEPT);936937/* For ACCEPT within a recursion, we have to find the most recent938recursion. If not in a recursion, fall through to code that is common with939OP_END. */940941case OP_ACCEPT:942if (Fcurrent_recurse != RECURSE_UNSET)943{944#ifdef DEBUG_SHOW_OPS945fprintf(stderr, "++ Accept within recursion\n");946#endif947offset = Flast_group_offset;948for(;;)949{950/* Corrupted heapframes?. Trigger an assert and return an error */951PCRE2_ASSERT(offset != PCRE2_UNSET);952if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;953954N = (heapframe *)((char *)match_data->heapframes + offset);955P = (heapframe *)((char *)N - frame_size);956if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;957offset = P->last_group_offset;958}959960/* N is now the frame of the recursion; the previous frame is at the961OP_RECURSE position. 
Go back there, copying the current subject position962and mark, and the start_match position (\K might have changed it), and963then move on past the OP_RECURSE. */964965P->eptr = Feptr;966P->mark = Fmark;967P->start_match = Fstart_match;968F = P;969Fecode += 1 + LINK_SIZE;970continue;971}972PCRE2_FALLTHROUGH /* Fall through */973974/* OP_END itself can never be reached within a recursion because that is975picked up when the OP_KET that always precedes OP_END is reached. */976977case OP_END:978979/* Fail for an empty string match if either PCRE2_NOTEMPTY is set, or if980PCRE2_NOTEMPTY_ATSTART is set and we have matched at the start of the981subject. In both cases, backtracking will then try other alternatives, if982any. */983984if (Feptr == Fstart_match &&985((mb->moptions & PCRE2_NOTEMPTY) != 0 ||986((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&987Fstart_match == mb->start_subject + mb->start_offset)))988{989#ifdef DEBUG_SHOW_OPS990fprintf(stderr, "++ Backtrack because empty string\n");991#endif992RRETURN(MATCH_NOMATCH);993}994995/* Fail if PCRE2_ENDANCHORED is set and the end of the match is not996the end of the subject. After (*ACCEPT) we fail the entire match (at this997position) but backtrack if we've reached the end of the pattern. This998applies whether or not we are in a recursion. */9991000if (Feptr < mb->end_subject &&1001((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)1002{1003if (Fop == OP_END)1004{1005#ifdef DEBUG_SHOW_OPS1006fprintf(stderr, "++ Backtrack because not at end (endanchored set)\n");1007#endif1008RRETURN(MATCH_NOMATCH);1009}10101011#ifdef DEBUG_SHOW_OPS1012fprintf(stderr, "++ Failed ACCEPT not at end (endanchored set)\n");1013#endif1014return MATCH_NOMATCH; /* (*ACCEPT) */1015}10161017/* Fail if we detect that the start position was moved to be either after1018the end position (\K in lookahead) or before the start offset (\K in1019lookbehind). If this occurs, the pattern must have used \K in a somewhat1020sneaky way (e.g. 
by pattern recursion), because if the \K is actually1021syntactically inside the lookaround, it's blocked at compile-time. */10221023if (Fstart_match < mb->start_subject + mb->start_offset ||1024Fstart_match > Feptr)1025{1026/* The \K expression is fairly rare. We assert it was used so that we1027catch any unexpected invalid data in start_match. */1028PCRE2_ASSERT(mb->hasbsk);10291030if (!mb->allowlookaroundbsk)1031return PCRE2_ERROR_BAD_BACKSLASH_K;1032}10331034/* We have a successful match of the whole pattern. Record the result and1035then do a direct return from the function. If there is space in the offset1036vector, set any pairs that follow the highest-numbered captured string but1037are less than the number of capturing groups in the pattern to PCRE2_UNSET.1038It is documented that this happens. "Gaps" are set to PCRE2_UNSET1039dynamically. It is only those at the end that need setting here. */10401041mb->end_match_ptr = Feptr; /* Record where we ended */1042mb->end_offset_top = Foffset_top; /* and how many extracts were taken */1043mb->mark = Fmark; /* and the last success mark */1044if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;10451046match_data->ovector[0] = Fstart_match - mb->start_subject;1047match_data->ovector[1] = Feptr - mb->start_subject;10481049/* Set i to the smaller of the sizes of the external and frame ovectors. */10501051i = 2 * ((top_bracket + 1 > match_data->oveccount)?1052match_data->oveccount : top_bracket + 1);1053memcpy(match_data->ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));1054while (--i >= Foffset_top + 2) match_data->ovector[i] = PCRE2_UNSET;1055return MATCH_MATCH; /* Note: NOT RRETURN */105610571058/*===================================================================== */1059/* Match any single character type except newline; have to take care with1060CRLF newlines and partial matching. 
*/10611062case OP_ANY:1063if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);1064if (mb->partial != 0 &&1065Feptr == mb->end_subject - 1 &&1066NLBLOCK->nltype == NLTYPE_FIXED &&1067NLBLOCK->nllen == 2 &&1068UCHAR21TEST(Feptr) == NLBLOCK->nl[0])1069{1070mb->hitend = TRUE;1071if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;1072}1073PCRE2_FALLTHROUGH /* Fall through */10741075/* Match any single character whatsoever. */10761077case OP_ALLANY:1078if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */1079{ /* not be updated before SCHECK_PARTIAL. */1080SCHECK_PARTIAL();1081RRETURN(MATCH_NOMATCH);1082}1083Feptr++;1084#ifdef SUPPORT_UNICODE1085if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);1086#endif1087Fecode++;1088break;108910901091/* ===================================================================== */1092/* Match a single code unit, even in UTF mode. This opcode really does1093match any code unit, even newline. (It really should be called ANYCODEUNIT,1094of course - the byte name is from pre-16 bit days.) */10951096case OP_ANYBYTE:1097if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */1098{ /* not be updated before SCHECK_PARTIAL. 
*/1099SCHECK_PARTIAL();1100RRETURN(MATCH_NOMATCH);1101}1102Feptr++;1103Fecode++;1104break;110511061107/* ===================================================================== */1108/* Match a single character, casefully */11091110case OP_CHAR:1111#ifdef SUPPORT_UNICODE1112if (utf)1113{1114Flength = 1;1115Fecode++;1116GETCHARLEN(fc, Fecode, Flength);1117if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr))1118{1119CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */1120RRETURN(MATCH_NOMATCH);1121}1122for (; Flength > 0; Flength--)1123{1124if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH);1125}1126}1127else1128#endif11291130/* Not UTF mode */1131{1132if (mb->end_subject - Feptr < 1)1133{1134SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */1135RRETURN(MATCH_NOMATCH);1136}1137if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH);1138Fecode += 2;1139}1140break;114111421143/* ===================================================================== */1144/* Match a single character, caselessly. If we are at the end of the1145subject, give up immediately. We get here only when the pattern character1146has at most one other case. Characters with more than two cases are coded1147as OP_PROP with the pseudo-property PT_CLIST. */11481149case OP_CHARI:1150if (Feptr >= mb->end_subject)1151{1152SCHECK_PARTIAL();1153RRETURN(MATCH_NOMATCH);1154}11551156#ifdef SUPPORT_UNICODE1157if (utf)1158{1159Flength = 1;1160Fecode++;1161GETCHARLEN(fc, Fecode, Flength);11621163/* If the pattern character's value is < 128, we know that its other case1164(if any) is also < 128 (and therefore only one code unit long in all1165code-unit widths), so we can use the fast lookup table. We checked above1166that there is at least one character left in the subject. 
*/11671168if (fc < 128)1169{1170uint32_t cc = UCHAR21(Feptr);1171if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);1172Fecode++;1173Feptr++;1174}11751176/* Otherwise we must pick up the subject character and use Unicode1177property support to test its other case. Note that we cannot use the1178value of "Flength" to check for sufficient bytes left, because the other1179case of the character may have more or fewer code units. */11801181else1182{1183uint32_t dc;1184GETCHARINC(dc, Feptr);1185Fecode += Flength;1186if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);1187}1188}11891190/* If UCP is set without UTF we must do the same as above, but with one1191character per code unit. */11921193else if (ucp)1194{1195uint32_t cc = UCHAR21(Feptr);1196fc = Fecode[1];1197if (fc < 128)1198{1199if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);1200}1201else1202{1203if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);1204}1205Feptr++;1206Fecode += 2;1207}12081209else1210#endif /* SUPPORT_UNICODE */12111212/* Not UTF or UCP mode; use the table for characters < 256. */1213{1214if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])1215!= TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);1216Feptr++;1217Fecode += 2;1218}1219break;122012211222/* ===================================================================== */1223/* Match not a single character. 
*/12241225case OP_NOT:1226case OP_NOTI:1227if (Feptr >= mb->end_subject)1228{1229SCHECK_PARTIAL();1230RRETURN(MATCH_NOMATCH);1231}12321233#ifdef SUPPORT_UNICODE1234if (utf)1235{1236uint32_t ch;1237Fecode++;1238GETCHARINC(ch, Fecode);1239GETCHARINC(fc, Feptr);1240if (ch == fc)1241{1242RRETURN(MATCH_NOMATCH); /* Caseful match */1243}1244else if (Fop == OP_NOTI) /* If caseless */1245{1246if (ch > 127)1247ch = UCD_OTHERCASE(ch);1248else1249ch = (mb->fcc)[ch];1250if (ch == fc) RRETURN(MATCH_NOMATCH);1251}1252}12531254/* UCP without UTF is as above, but with one character per code unit. */12551256else if (ucp)1257{1258uint32_t ch;1259fc = UCHAR21INC(Feptr);1260ch = Fecode[1];1261Fecode += 2;12621263if (ch == fc)1264{1265RRETURN(MATCH_NOMATCH); /* Caseful match */1266}1267else if (Fop == OP_NOTI) /* If caseless */1268{1269if (ch > 127)1270ch = UCD_OTHERCASE(ch);1271else1272ch = (mb->fcc)[ch];1273if (ch == fc) RRETURN(MATCH_NOMATCH);1274}1275}12761277else1278#endif /* SUPPORT_UNICODE */12791280/* Neither UTF nor UCP is set */12811282{1283uint32_t ch = Fecode[1];1284fc = UCHAR21INC(Feptr);1285if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))1286RRETURN(MATCH_NOMATCH);1287Fecode += 2;1288}1289break;129012911292/* ===================================================================== */1293/* Match a single character repeatedly. 
*/12941295#define Loclength F->temp_size1296#define Lstart_eptr F->temp_sptr[0]1297#define Lcharptr F->temp_sptr[1]1298#define Lmin F->temp_32[0]1299#define Lmax F->temp_32[1]1300#define Lc F->temp_32[2]1301#define Loc F->temp_32[3]13021303case OP_EXACT:1304case OP_EXACTI:1305Lmin = Lmax = GET2(Fecode, 1);1306Fecode += 1 + IMM2_SIZE;1307goto REPEATCHAR;13081309case OP_POSUPTO:1310case OP_POSUPTOI:1311reptype = REPTYPE_POS;1312Lmin = 0;1313Lmax = GET2(Fecode, 1);1314Fecode += 1 + IMM2_SIZE;1315goto REPEATCHAR;13161317case OP_UPTO:1318case OP_UPTOI:1319reptype = REPTYPE_MAX;1320Lmin = 0;1321Lmax = GET2(Fecode, 1);1322Fecode += 1 + IMM2_SIZE;1323goto REPEATCHAR;13241325case OP_MINUPTO:1326case OP_MINUPTOI:1327reptype = REPTYPE_MIN;1328Lmin = 0;1329Lmax = GET2(Fecode, 1);1330Fecode += 1 + IMM2_SIZE;1331goto REPEATCHAR;13321333case OP_POSSTAR:1334case OP_POSSTARI:1335reptype = REPTYPE_POS;1336Lmin = 0;1337Lmax = UINT32_MAX;1338Fecode++;1339goto REPEATCHAR;13401341case OP_POSPLUS:1342case OP_POSPLUSI:1343reptype = REPTYPE_POS;1344Lmin = 1;1345Lmax = UINT32_MAX;1346Fecode++;1347goto REPEATCHAR;13481349case OP_POSQUERY:1350case OP_POSQUERYI:1351reptype = REPTYPE_POS;1352Lmin = 0;1353Lmax = 1;1354Fecode++;1355goto REPEATCHAR;13561357case OP_STAR:1358case OP_STARI:1359case OP_MINSTAR:1360case OP_MINSTARI:1361case OP_PLUS:1362case OP_PLUSI:1363case OP_MINPLUS:1364case OP_MINPLUSI:1365case OP_QUERY:1366case OP_QUERYI:1367case OP_MINQUERY:1368case OP_MINQUERYI:1369fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI);1370Lmin = rep_min[fc];1371Lmax = rep_max[fc];1372reptype = rep_typ[fc];13731374/* Common code for all repeated single-character matches. We first check1375for the minimum number of characters. If the minimum equals the maximum, we1376are done. 
Otherwise, if minimizing, check the rest of the pattern for a1377match; if there isn't one, advance up to the maximum, one character at a1378time.13791380If maximizing, advance up to the maximum number of matching characters,1381until Feptr is past the end of the maximum run. If possessive, we are1382then done (no backing up). Otherwise, match at this position; anything1383other than no match is immediately returned. For nomatch, back up one1384character, unless we are matching \R and the last thing matched was1385\r\n, in which case, back up two code units until we reach the first1386optional character position.13871388The various UTF/non-UTF and caseful/caseless cases are handled separately,1389for speed. */13901391REPEATCHAR:1392#ifdef SUPPORT_UNICODE1393if (utf)1394{1395Flength = 1;1396Lcharptr = Fecode;1397GETCHARLEN(fc, Fecode, Flength);1398Fecode += Flength;13991400/* Handle multi-code-unit character matching, caseful and caseless. */14011402if (Flength > 1)1403{1404uint32_t othercase;14051406if (Fop >= OP_STARI && /* Caseless */1407(othercase = UCD_OTHERCASE(fc)) != fc)1408Loclength = PRIV(ord2utf)(othercase, Foccu);1409else Loclength = 0;14101411for (i = 1; i <= Lmin; i++)1412{1413if (Feptr <= mb->end_subject - Flength &&1414memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;1415else if (Loclength > 0 &&1416Feptr <= mb->end_subject - Loclength &&1417memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)1418Feptr += Loclength;1419else1420{1421CHECK_PARTIAL();1422RRETURN(MATCH_NOMATCH);1423}1424}14251426if (Lmin == Lmax) continue;14271428if (reptype == REPTYPE_MIN)1429{1430for (;;)1431{1432RMATCH(Fecode, RM202);1433if (rrc != MATCH_NOMATCH) RRETURN(rrc);1434if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);1435if (Feptr <= mb->end_subject - Flength &&1436memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;1437else if (Loclength > 0 &&1438Feptr <= mb->end_subject - Loclength &&1439memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)1440Feptr += 
Loclength;1441else1442{1443CHECK_PARTIAL();1444RRETURN(MATCH_NOMATCH);1445}1446}1447PCRE2_UNREACHABLE(); /* Control never reaches here */1448}14491450else /* Maximize */1451{1452Lstart_eptr = Feptr;1453for (i = Lmin; i < Lmax; i++)1454{1455if (Feptr <= mb->end_subject - Flength &&1456memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0)1457Feptr += Flength;1458else if (Loclength > 0 &&1459Feptr <= mb->end_subject - Loclength &&1460memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)1461Feptr += Loclength;1462else1463{1464CHECK_PARTIAL();1465break;1466}1467}14681469/* After \C in UTF mode, Lstart_eptr might be in the middle of a1470Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't1471go too far. */14721473if (reptype != REPTYPE_POS) for(;;)1474{1475if (Feptr <= Lstart_eptr) break;1476RMATCH(Fecode, RM203);1477if (rrc != MATCH_NOMATCH) RRETURN(rrc);1478Feptr--;1479BACKCHAR(Feptr);1480}1481}1482break; /* End of repeated wide character handling */1483}14841485/* Length of UTF character is 1. Put it into the preserved variable and1486fall through to the non-UTF code. */14871488Lc = fc;1489}1490else1491#endif /* SUPPORT_UNICODE */14921493/* When not in UTF mode, load a single-code-unit character. Then proceed as1494above, using Unicode casing if either UTF or UCP is set. */14951496Lc = *Fecode++;14971498/* Caseless comparison */14991500if (Fop >= OP_STARI)1501{1502#if PCRE2_CODE_UNIT_WIDTH == 81503#ifdef SUPPORT_UNICODE1504if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);1505else1506#endif /* SUPPORT_UNICODE */1507/* Lc will be < 128 in UTF-8 mode. 
*/1508Loc = mb->fcc[Lc];1509#else /* 16-bit & 32-bit */1510#ifdef SUPPORT_UNICODE1511if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);1512else1513#endif /* SUPPORT_UNICODE */1514Loc = TABLE_GET(Lc, mb->fcc, Lc);1515#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */15161517for (i = 1; i <= Lmin; i++)1518{1519uint32_t cc; /* Faster than PCRE2_UCHAR */1520if (Feptr >= mb->end_subject)1521{1522SCHECK_PARTIAL();1523RRETURN(MATCH_NOMATCH);1524}1525cc = UCHAR21TEST(Feptr);1526if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);1527Feptr++;1528}1529if (Lmin == Lmax) continue;15301531if (reptype == REPTYPE_MIN)1532{1533for (;;)1534{1535uint32_t cc; /* Faster than PCRE2_UCHAR */1536RMATCH(Fecode, RM25);1537if (rrc != MATCH_NOMATCH) RRETURN(rrc);1538if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);1539if (Feptr >= mb->end_subject)1540{1541SCHECK_PARTIAL();1542RRETURN(MATCH_NOMATCH);1543}1544cc = UCHAR21TEST(Feptr);1545if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);1546Feptr++;1547}1548PCRE2_UNREACHABLE(); /* Control never reaches here */1549}15501551else /* Maximize */1552{1553Lstart_eptr = Feptr;1554for (i = Lmin; i < Lmax; i++)1555{1556uint32_t cc; /* Faster than PCRE2_UCHAR */1557if (Feptr >= mb->end_subject)1558{1559SCHECK_PARTIAL();1560break;1561}1562cc = UCHAR21TEST(Feptr);1563if (Lc != cc && Loc != cc) break;1564Feptr++;1565}1566if (reptype != REPTYPE_POS) for (;;)1567{1568if (Feptr == Lstart_eptr) break;1569RMATCH(Fecode, RM26);1570Feptr--;1571if (rrc != MATCH_NOMATCH) RRETURN(rrc);1572}1573}1574}15751576/* Caseful comparisons (includes all multi-byte characters) */15771578else1579{1580for (i = 1; i <= Lmin; i++)1581{1582if (Feptr >= mb->end_subject)1583{1584SCHECK_PARTIAL();1585RRETURN(MATCH_NOMATCH);1586}1587if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);1588}15891590if (Lmin == Lmax) continue;15911592if (reptype == REPTYPE_MIN)1593{1594for (;;)1595{1596RMATCH(Fecode, RM27);1597if (rrc != MATCH_NOMATCH) RRETURN(rrc);1598if (Lmin++ >= Lmax) 
RRETURN(MATCH_NOMATCH);1599if (Feptr >= mb->end_subject)1600{1601SCHECK_PARTIAL();1602RRETURN(MATCH_NOMATCH);1603}1604if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);1605}1606PCRE2_UNREACHABLE(); /* Control never reaches here */1607}1608else /* Maximize */1609{1610Lstart_eptr = Feptr;1611for (i = Lmin; i < Lmax; i++)1612{1613if (Feptr >= mb->end_subject)1614{1615SCHECK_PARTIAL();1616break;1617}16181619if (Lc != UCHAR21TEST(Feptr)) break;1620Feptr++;1621}16221623if (reptype != REPTYPE_POS) for (;;)1624{1625if (Feptr <= Lstart_eptr) break;1626RMATCH(Fecode, RM28);1627Feptr--;1628if (rrc != MATCH_NOMATCH) RRETURN(rrc);1629}1630}1631}1632break;16331634#undef Loclength1635#undef Lstart_eptr1636#undef Lcharptr1637#undef Lmin1638#undef Lmax1639#undef Lc1640#undef Loc164116421643/* ===================================================================== */1644/* Match a negated single one-byte character repeatedly. This is almost a1645repeat of the code for a repeated single character, but I haven't found a1646nice way of commoning these up that doesn't require a test of the1647positive/negative option for each character match. Maybe that wouldn't add1648very much to the time taken, but character matching *is* what this is all1649about... 
*/16501651#define Lstart_eptr F->temp_sptr[0]1652#define Lmin F->temp_32[0]1653#define Lmax F->temp_32[1]1654#define Lc F->temp_32[2]1655#define Loc F->temp_32[3]16561657case OP_NOTEXACT:1658case OP_NOTEXACTI:1659Lmin = Lmax = GET2(Fecode, 1);1660Fecode += 1 + IMM2_SIZE;1661goto REPEATNOTCHAR;16621663case OP_NOTUPTO:1664case OP_NOTUPTOI:1665Lmin = 0;1666Lmax = GET2(Fecode, 1);1667reptype = REPTYPE_MAX;1668Fecode += 1 + IMM2_SIZE;1669goto REPEATNOTCHAR;16701671case OP_NOTMINUPTO:1672case OP_NOTMINUPTOI:1673Lmin = 0;1674Lmax = GET2(Fecode, 1);1675reptype = REPTYPE_MIN;1676Fecode += 1 + IMM2_SIZE;1677goto REPEATNOTCHAR;16781679case OP_NOTPOSSTAR:1680case OP_NOTPOSSTARI:1681reptype = REPTYPE_POS;1682Lmin = 0;1683Lmax = UINT32_MAX;1684Fecode++;1685goto REPEATNOTCHAR;16861687case OP_NOTPOSPLUS:1688case OP_NOTPOSPLUSI:1689reptype = REPTYPE_POS;1690Lmin = 1;1691Lmax = UINT32_MAX;1692Fecode++;1693goto REPEATNOTCHAR;16941695case OP_NOTPOSQUERY:1696case OP_NOTPOSQUERYI:1697reptype = REPTYPE_POS;1698Lmin = 0;1699Lmax = 1;1700Fecode++;1701goto REPEATNOTCHAR;17021703case OP_NOTPOSUPTO:1704case OP_NOTPOSUPTOI:1705reptype = REPTYPE_POS;1706Lmin = 0;1707Lmax = GET2(Fecode, 1);1708Fecode += 1 + IMM2_SIZE;1709goto REPEATNOTCHAR;17101711case OP_NOTSTAR:1712case OP_NOTSTARI:1713case OP_NOTMINSTAR:1714case OP_NOTMINSTARI:1715case OP_NOTPLUS:1716case OP_NOTPLUSI:1717case OP_NOTMINPLUS:1718case OP_NOTMINPLUSI:1719case OP_NOTQUERY:1720case OP_NOTQUERYI:1721case OP_NOTMINQUERY:1722case OP_NOTMINQUERYI:1723fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);1724Lmin = rep_min[fc];1725Lmax = rep_max[fc];1726reptype = rep_typ[fc];17271728/* Common code for all repeated single-character non-matches. */17291730REPEATNOTCHAR:1731GETCHARINCTEST(Lc, Fecode);17321733/* The code is duplicated for the caseless and caseful cases, for speed,1734since matching characters is likely to be quite common. First, ensure the1735minimum number of matches are present. 
If Lmin = Lmax, we are done.1736Otherwise, if minimizing, keep trying the rest of the expression and1737advancing one matching character if failing, up to the maximum.1738Alternatively, if maximizing, find the maximum number of characters and1739work backwards. */17401741if (Fop >= OP_NOTSTARI) /* Caseless */1742{1743#ifdef SUPPORT_UNICODE1744if ((utf || ucp) && Lc > 127)1745Loc = UCD_OTHERCASE(Lc);1746else1747#endif /* SUPPORT_UNICODE */17481749Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */17501751#ifdef SUPPORT_UNICODE1752if (utf)1753{1754uint32_t d;1755for (i = 1; i <= Lmin; i++)1756{1757if (Feptr >= mb->end_subject)1758{1759SCHECK_PARTIAL();1760RRETURN(MATCH_NOMATCH);1761}1762GETCHARINC(d, Feptr);1763if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);1764}1765}1766else1767#endif /* SUPPORT_UNICODE */17681769/* Not UTF mode */1770{1771for (i = 1; i <= Lmin; i++)1772{1773if (Feptr >= mb->end_subject)1774{1775SCHECK_PARTIAL();1776RRETURN(MATCH_NOMATCH);1777}1778if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);1779Feptr++;1780}1781}17821783if (Lmin == Lmax) continue; /* Finished for exact count */17841785if (reptype == REPTYPE_MIN)1786{1787#ifdef SUPPORT_UNICODE1788if (utf)1789{1790uint32_t d;1791for (;;)1792{1793RMATCH(Fecode, RM204);1794if (rrc != MATCH_NOMATCH) RRETURN(rrc);1795if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);1796if (Feptr >= mb->end_subject)1797{1798SCHECK_PARTIAL();1799RRETURN(MATCH_NOMATCH);1800}1801GETCHARINC(d, Feptr);1802if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);1803}1804}1805else1806#endif /*SUPPORT_UNICODE */18071808/* Not UTF mode */1809{1810for (;;)1811{1812RMATCH(Fecode, RM29);1813if (rrc != MATCH_NOMATCH) RRETURN(rrc);1814if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);1815if (Feptr >= mb->end_subject)1816{1817SCHECK_PARTIAL();1818RRETURN(MATCH_NOMATCH);1819}1820if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);1821Feptr++;1822}1823}1824PCRE2_UNREACHABLE(); /* Control never reaches here */1825}18261827/* 
Maximize case */18281829else1830{1831Lstart_eptr = Feptr;18321833#ifdef SUPPORT_UNICODE1834if (utf)1835{1836uint32_t d;1837for (i = Lmin; i < Lmax; i++)1838{1839int len = 1;1840if (Feptr >= mb->end_subject)1841{1842SCHECK_PARTIAL();1843break;1844}1845GETCHARLEN(d, Feptr, len);1846if (Lc == d || Loc == d) break;1847Feptr += len;1848}18491850/* After \C in UTF mode, Lstart_eptr might be in the middle of a1851Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't1852go too far. */18531854if (reptype != REPTYPE_POS) for(;;)1855{1856if (Feptr <= Lstart_eptr) break;1857RMATCH(Fecode, RM205);1858if (rrc != MATCH_NOMATCH) RRETURN(rrc);1859Feptr--;1860BACKCHAR(Feptr);1861}1862}1863else1864#endif /* SUPPORT_UNICODE */18651866/* Not UTF mode */1867{1868for (i = Lmin; i < Lmax; i++)1869{1870if (Feptr >= mb->end_subject)1871{1872SCHECK_PARTIAL();1873break;1874}1875if (Lc == *Feptr || Loc == *Feptr) break;1876Feptr++;1877}1878if (reptype != REPTYPE_POS) for (;;)1879{1880if (Feptr == Lstart_eptr) break;1881RMATCH(Fecode, RM30);1882if (rrc != MATCH_NOMATCH) RRETURN(rrc);1883Feptr--;1884}1885}1886}1887}18881889/* Caseful comparisons */18901891else1892{1893#ifdef SUPPORT_UNICODE1894if (utf)1895{1896uint32_t d;1897for (i = 1; i <= Lmin; i++)1898{1899if (Feptr >= mb->end_subject)1900{1901SCHECK_PARTIAL();1902RRETURN(MATCH_NOMATCH);1903}1904GETCHARINC(d, Feptr);1905if (Lc == d) RRETURN(MATCH_NOMATCH);1906}1907}1908else1909#endif1910/* Not UTF mode */1911{1912for (i = 1; i <= Lmin; i++)1913{1914if (Feptr >= mb->end_subject)1915{1916SCHECK_PARTIAL();1917RRETURN(MATCH_NOMATCH);1918}1919if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);1920}1921}19221923if (Lmin == Lmax) continue;19241925if (reptype == REPTYPE_MIN)1926{1927#ifdef SUPPORT_UNICODE1928if (utf)1929{1930uint32_t d;1931for (;;)1932{1933RMATCH(Fecode, RM206);1934if (rrc != MATCH_NOMATCH) RRETURN(rrc);1935if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);1936if (Feptr >= 
mb->end_subject)1937{1938SCHECK_PARTIAL();1939RRETURN(MATCH_NOMATCH);1940}1941GETCHARINC(d, Feptr);1942if (Lc == d) RRETURN(MATCH_NOMATCH);1943}1944}1945else1946#endif1947/* Not UTF mode */1948{1949for (;;)1950{1951RMATCH(Fecode, RM31);1952if (rrc != MATCH_NOMATCH) RRETURN(rrc);1953if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);1954if (Feptr >= mb->end_subject)1955{1956SCHECK_PARTIAL();1957RRETURN(MATCH_NOMATCH);1958}1959if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);1960}1961}1962PCRE2_UNREACHABLE(); /* Control never reaches here */1963}19641965/* Maximize case */19661967else1968{1969Lstart_eptr = Feptr;19701971#ifdef SUPPORT_UNICODE1972if (utf)1973{1974uint32_t d;1975for (i = Lmin; i < Lmax; i++)1976{1977int len = 1;1978if (Feptr >= mb->end_subject)1979{1980SCHECK_PARTIAL();1981break;1982}1983GETCHARLEN(d, Feptr, len);1984if (Lc == d) break;1985Feptr += len;1986}19871988/* After \C in UTF mode, Lstart_eptr might be in the middle of a1989Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't1990go too far. */19911992if (reptype != REPTYPE_POS) for(;;)1993{1994if (Feptr <= Lstart_eptr) break;1995RMATCH(Fecode, RM207);1996if (rrc != MATCH_NOMATCH) RRETURN(rrc);1997Feptr--;1998BACKCHAR(Feptr);1999}2000}2001else2002#endif2003/* Not UTF mode */2004{2005for (i = Lmin; i < Lmax; i++)2006{2007if (Feptr >= mb->end_subject)2008{2009SCHECK_PARTIAL();2010break;2011}2012if (Lc == *Feptr) break;2013Feptr++;2014}2015if (reptype != REPTYPE_POS) for (;;)2016{2017if (Feptr == Lstart_eptr) break;2018RMATCH(Fecode, RM32);2019if (rrc != MATCH_NOMATCH) RRETURN(rrc);2020Feptr--;2021}2022}2023}2024}2025break;20262027#undef Lstart_eptr2028#undef Lmin2029#undef Lmax2030#undef Lc2031#undef Loc203220332034/* ===================================================================== */2035/* Match a bit-mapped character class, possibly repeatedly. 
These opcodes2036are used when all the characters in the class have values in the range20370-255, and either the matching is caseful, or the characters are in the2038range 0-127 when UTF processing is enabled. The only difference between2039OP_CLASS and OP_NCLASS occurs when a data character outside the range is2040encountered. */20412042#define Lmin F->temp_32[0]2043#define Lmax F->temp_32[1]2044#define Lstart_eptr F->temp_sptr[0]2045#define Lbyte_map_address F->temp_sptr[1]2046#define Lbyte_map ((const unsigned char *)Lbyte_map_address)20472048case OP_NCLASS:2049case OP_CLASS:2050{2051Lbyte_map_address = Fecode + 1; /* Save for matching */2052Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */20532054/* Look past the end of the item to see if there is repeat information2055following. Then obey similar code to character type repeats. */20562057switch (*Fecode)2058{2059case OP_CRSTAR:2060case OP_CRMINSTAR:2061case OP_CRPLUS:2062case OP_CRMINPLUS:2063case OP_CRQUERY:2064case OP_CRMINQUERY:2065case OP_CRPOSSTAR:2066case OP_CRPOSPLUS:2067case OP_CRPOSQUERY:2068fc = *Fecode++ - OP_CRSTAR;2069Lmin = rep_min[fc];2070Lmax = rep_max[fc];2071reptype = rep_typ[fc];2072break;20732074case OP_CRRANGE:2075case OP_CRMINRANGE:2076case OP_CRPOSRANGE:2077Lmin = GET2(Fecode, 1);2078Lmax = GET2(Fecode, 1 + IMM2_SIZE);2079if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */2080reptype = rep_typ[*Fecode - OP_CRSTAR];2081Fecode += 1 + 2 * IMM2_SIZE;2082break;20832084default: /* No repeat follows */2085Lmin = Lmax = 1;2086break;2087}20882089/* First, ensure the minimum number of matches are present. 
*/20902091#ifdef SUPPORT_UNICODE2092if (utf)2093{2094for (i = 1; i <= Lmin; i++)2095{2096if (Feptr >= mb->end_subject)2097{2098SCHECK_PARTIAL();2099RRETURN(MATCH_NOMATCH);2100}2101GETCHARINC(fc, Feptr);2102if (fc > 255)2103{2104if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);2105}2106else2107if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);2108}2109}2110else2111#endif2112/* Not UTF mode */2113{2114for (i = 1; i <= Lmin; i++)2115{2116if (Feptr >= mb->end_subject)2117{2118SCHECK_PARTIAL();2119RRETURN(MATCH_NOMATCH);2120}2121fc = *Feptr++;2122#if PCRE2_CODE_UNIT_WIDTH != 82123if (fc > 255)2124{2125if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);2126}2127else2128#endif2129if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);2130}2131}21322133/* If Lmax == Lmin we are done. Continue with main loop. */21342135if (Lmin == Lmax) continue;21362137/* If minimizing, keep testing the rest of the expression and advancing2138the pointer while it matches the class. */21392140if (reptype == REPTYPE_MIN)2141{2142#ifdef SUPPORT_UNICODE2143if (utf)2144{2145for (;;)2146{2147RMATCH(Fecode, RM200);2148if (rrc != MATCH_NOMATCH) RRETURN(rrc);2149if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);2150if (Feptr >= mb->end_subject)2151{2152SCHECK_PARTIAL();2153RRETURN(MATCH_NOMATCH);2154}2155GETCHARINC(fc, Feptr);2156if (fc > 255)2157{2158if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);2159}2160else2161if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);2162}2163}2164else2165#endif2166/* Not UTF mode */2167{2168for (;;)2169{2170RMATCH(Fecode, RM23);2171if (rrc != MATCH_NOMATCH) RRETURN(rrc);2172if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);2173if (Feptr >= mb->end_subject)2174{2175SCHECK_PARTIAL();2176RRETURN(MATCH_NOMATCH);2177}2178fc = *Feptr++;2179#if PCRE2_CODE_UNIT_WIDTH != 82180if (fc > 255)2181{2182if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);2183}2184else2185#endif2186if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) 
RRETURN(MATCH_NOMATCH);2187}2188}2189PCRE2_UNREACHABLE(); /* Control never reaches here */2190}21912192/* If maximizing, find the longest possible run, then work backwards. */21932194else2195{2196Lstart_eptr = Feptr;21972198#ifdef SUPPORT_UNICODE2199if (utf)2200{2201for (i = Lmin; i < Lmax; i++)2202{2203int len = 1;2204if (Feptr >= mb->end_subject)2205{2206SCHECK_PARTIAL();2207break;2208}2209GETCHARLEN(fc, Feptr, len);2210if (fc > 255)2211{2212if (Fop == OP_CLASS) break;2213}2214else2215if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;2216Feptr += len;2217}22182219if (reptype == REPTYPE_POS) continue; /* No backtracking */22202221/* After \C in UTF mode, Lstart_eptr might be in the middle of a2222Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't2223go too far. */22242225for (;;)2226{2227RMATCH(Fecode, RM201);2228if (rrc != MATCH_NOMATCH) RRETURN(rrc);2229if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */2230BACKCHAR(Feptr);2231}2232}2233else2234#endif2235/* Not UTF mode */2236{2237for (i = Lmin; i < Lmax; i++)2238{2239if (Feptr >= mb->end_subject)2240{2241SCHECK_PARTIAL();2242break;2243}2244fc = *Feptr;2245#if PCRE2_CODE_UNIT_WIDTH != 82246if (fc > 255)2247{2248if (Fop == OP_CLASS) break;2249}2250else2251#endif2252if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;2253Feptr++;2254}22552256if (reptype == REPTYPE_POS) continue; /* No backtracking */22572258while (Feptr >= Lstart_eptr)2259{2260RMATCH(Fecode, RM24);2261if (rrc != MATCH_NOMATCH) RRETURN(rrc);2262Feptr--;2263}2264}22652266RRETURN(MATCH_NOMATCH);2267}2268}22692270PCRE2_UNREACHABLE(); /* Control never reaches here */22712272#undef Lbyte_map_address2273#undef Lbyte_map2274#undef Lstart_eptr2275#undef Lmin2276#undef Lmax227722782279/* ===================================================================== */2280/* Match an extended character class. In the 8-bit library, this opcode is2281encountered only when UTF-8 mode mode is supported. 
In the 16-bit and228232-bit libraries, codepoints greater than 255 may be encountered even when2283UTF is not supported. */22842285#define Lstart_eptr F->temp_sptr[0]2286#define Lxclass_data F->temp_sptr[1]2287#define Lmin F->temp_32[0]2288#define Lmax F->temp_32[1]22892290#ifdef SUPPORT_WIDE_CHARS2291case OP_XCLASS:2292{2293Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */2294Fecode += GET(Fecode, 1); /* Advance past the item */22952296switch (*Fecode)2297{2298case OP_CRSTAR:2299case OP_CRMINSTAR:2300case OP_CRPLUS:2301case OP_CRMINPLUS:2302case OP_CRQUERY:2303case OP_CRMINQUERY:2304case OP_CRPOSSTAR:2305case OP_CRPOSPLUS:2306case OP_CRPOSQUERY:2307fc = *Fecode++ - OP_CRSTAR;2308Lmin = rep_min[fc];2309Lmax = rep_max[fc];2310reptype = rep_typ[fc];2311break;23122313case OP_CRRANGE:2314case OP_CRMINRANGE:2315case OP_CRPOSRANGE:2316Lmin = GET2(Fecode, 1);2317Lmax = GET2(Fecode, 1 + IMM2_SIZE);2318if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */2319reptype = rep_typ[*Fecode - OP_CRSTAR];2320Fecode += 1 + 2 * IMM2_SIZE;2321break;23222323default: /* No repeat follows */2324Lmin = Lmax = 1;2325break;2326}23272328/* First, ensure the minimum number of matches are present. */23292330for (i = 1; i <= Lmin; i++)2331{2332if (Feptr >= mb->end_subject)2333{2334SCHECK_PARTIAL();2335RRETURN(MATCH_NOMATCH);2336}2337GETCHARINCTEST(fc, Feptr);2338if (!PRIV(xclass)(fc, Lxclass_data,2339(const uint8_t*)mb->start_code, utf))2340RRETURN(MATCH_NOMATCH);2341}23422343/* If Lmax == Lmin we can just continue with the main loop. */23442345if (Lmin == Lmax) continue;23462347/* If minimizing, keep testing the rest of the expression and advancing2348the pointer while it matches the class. 
*/23492350if (reptype == REPTYPE_MIN)2351{2352for (;;)2353{2354RMATCH(Fecode, RM100);2355if (rrc != MATCH_NOMATCH) RRETURN(rrc);2356if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);2357if (Feptr >= mb->end_subject)2358{2359SCHECK_PARTIAL();2360RRETURN(MATCH_NOMATCH);2361}2362GETCHARINCTEST(fc, Feptr);2363if (!PRIV(xclass)(fc, Lxclass_data,2364(const uint8_t*)mb->start_code, utf))2365RRETURN(MATCH_NOMATCH);2366}2367PCRE2_UNREACHABLE(); /* Control never reaches here */2368}23692370/* If maximizing, find the longest possible run, then work backwards. */23712372else2373{2374Lstart_eptr = Feptr;2375for (i = Lmin; i < Lmax; i++)2376{2377int len = 1;2378if (Feptr >= mb->end_subject)2379{2380SCHECK_PARTIAL();2381break;2382}2383#ifdef SUPPORT_UNICODE2384GETCHARLENTEST(fc, Feptr, len);2385#else2386fc = *Feptr;2387#endif2388if (!PRIV(xclass)(fc, Lxclass_data,2389(const uint8_t*)mb->start_code, utf)) break;2390Feptr += len;2391}23922393if (reptype == REPTYPE_POS) continue; /* No backtracking */23942395/* After \C in UTF mode, Lstart_eptr might be in the middle of a2396Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't2397go too far. */23982399for(;;)2400{2401RMATCH(Fecode, RM101);2402if (rrc != MATCH_NOMATCH) RRETURN(rrc);2403if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */2404#ifdef SUPPORT_UNICODE2405if (utf) BACKCHAR(Feptr);2406#endif2407}2408RRETURN(MATCH_NOMATCH);2409}24102411PCRE2_UNREACHABLE(); /* Control never reaches here */2412}2413#endif /* SUPPORT_WIDE_CHARS: end of XCLASS */24142415#undef Lstart_eptr2416#undef Lxclass_data2417#undef Lmin2418#undef Lmax241924202421/* ===================================================================== */2422/* Match a complex, set-based character class. This opcodes are used when2423there is complex nesting or logical operations within the character2424class. 
*/24252426#define Lstart_eptr F->temp_sptr[0]2427#define Leclass_data F->temp_sptr[1]2428#define Leclass_len F->temp_size2429#define Lmin F->temp_32[0]2430#define Lmax F->temp_32[1]24312432#ifdef SUPPORT_WIDE_CHARS2433case OP_ECLASS:2434{2435Leclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */2436Fecode += GET(Fecode, 1); /* Advance past the item */2437Leclass_len = (PCRE2_SIZE)(Fecode - Leclass_data);24382439switch (*Fecode)2440{2441case OP_CRSTAR:2442case OP_CRMINSTAR:2443case OP_CRPLUS:2444case OP_CRMINPLUS:2445case OP_CRQUERY:2446case OP_CRMINQUERY:2447case OP_CRPOSSTAR:2448case OP_CRPOSPLUS:2449case OP_CRPOSQUERY:2450fc = *Fecode++ - OP_CRSTAR;2451Lmin = rep_min[fc];2452Lmax = rep_max[fc];2453reptype = rep_typ[fc];2454break;24552456case OP_CRRANGE:2457case OP_CRMINRANGE:2458case OP_CRPOSRANGE:2459Lmin = GET2(Fecode, 1);2460Lmax = GET2(Fecode, 1 + IMM2_SIZE);2461if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */2462reptype = rep_typ[*Fecode - OP_CRSTAR];2463Fecode += 1 + 2 * IMM2_SIZE;2464break;24652466default: /* No repeat follows */2467Lmin = Lmax = 1;2468break;2469}24702471/* First, ensure the minimum number of matches are present. */24722473for (i = 1; i <= Lmin; i++)2474{2475if (Feptr >= mb->end_subject)2476{2477SCHECK_PARTIAL();2478RRETURN(MATCH_NOMATCH);2479}2480GETCHARINCTEST(fc, Feptr);2481if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len,2482(const uint8_t*)mb->start_code, utf))2483RRETURN(MATCH_NOMATCH);2484}24852486/* If Lmax == Lmin we can just continue with the main loop. */24872488if (Lmin == Lmax) continue;24892490/* If minimizing, keep testing the rest of the expression and advancing2491the pointer while it matches the class. 
*/24922493if (reptype == REPTYPE_MIN)2494{2495for (;;)2496{2497RMATCH(Fecode, RM102);2498if (rrc != MATCH_NOMATCH) RRETURN(rrc);2499if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);2500if (Feptr >= mb->end_subject)2501{2502SCHECK_PARTIAL();2503RRETURN(MATCH_NOMATCH);2504}2505GETCHARINCTEST(fc, Feptr);2506if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len,2507(const uint8_t*)mb->start_code, utf))2508RRETURN(MATCH_NOMATCH);2509}2510PCRE2_UNREACHABLE(); /* Control never reaches here */2511}25122513/* If maximizing, find the longest possible run, then work backwards. */25142515else2516{2517Lstart_eptr = Feptr;2518for (i = Lmin; i < Lmax; i++)2519{2520int len = 1;2521if (Feptr >= mb->end_subject)2522{2523SCHECK_PARTIAL();2524break;2525}2526#ifdef SUPPORT_UNICODE2527GETCHARLENTEST(fc, Feptr, len);2528#else2529fc = *Feptr;2530#endif2531if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len,2532(const uint8_t*)mb->start_code, utf))2533break;2534Feptr += len;2535}25362537if (reptype == REPTYPE_POS) continue; /* No backtracking */25382539/* After \C in UTF mode, Lstart_eptr might be in the middle of a2540Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't2541go too far. */25422543for(;;)2544{2545RMATCH(Fecode, RM103);2546if (rrc != MATCH_NOMATCH) RRETURN(rrc);2547if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */2548#ifdef SUPPORT_UNICODE2549if (utf) BACKCHAR(Feptr);2550#endif2551}2552RRETURN(MATCH_NOMATCH);2553}25542555PCRE2_UNREACHABLE(); /* Control never reaches here */2556}2557#endif /* SUPPORT_WIDE_CHARS: end of ECLASS */25582559#undef Lstart_eptr2560#undef Leclass_data2561#undef Leclass_len2562#undef Lmin2563#undef Lmax256425652566/* ===================================================================== */2567/* Match various character types when PCRE2_UCP is not set. These opcodes2568are not generated when PCRE2_UCP is set - instead appropriate property2569tests are compiled. 
*/25702571case OP_NOT_DIGIT:2572if (Feptr >= mb->end_subject)2573{2574SCHECK_PARTIAL();2575RRETURN(MATCH_NOMATCH);2576}2577GETCHARINCTEST(fc, Feptr);2578if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)2579RRETURN(MATCH_NOMATCH);2580Fecode++;2581break;25822583case OP_DIGIT:2584if (Feptr >= mb->end_subject)2585{2586SCHECK_PARTIAL();2587RRETURN(MATCH_NOMATCH);2588}2589GETCHARINCTEST(fc, Feptr);2590if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)2591RRETURN(MATCH_NOMATCH);2592Fecode++;2593break;25942595case OP_NOT_WHITESPACE:2596if (Feptr >= mb->end_subject)2597{2598SCHECK_PARTIAL();2599RRETURN(MATCH_NOMATCH);2600}2601GETCHARINCTEST(fc, Feptr);2602if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)2603RRETURN(MATCH_NOMATCH);2604Fecode++;2605break;26062607case OP_WHITESPACE:2608if (Feptr >= mb->end_subject)2609{2610SCHECK_PARTIAL();2611RRETURN(MATCH_NOMATCH);2612}2613GETCHARINCTEST(fc, Feptr);2614if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)2615RRETURN(MATCH_NOMATCH);2616Fecode++;2617break;26182619case OP_NOT_WORDCHAR:2620if (Feptr >= mb->end_subject)2621{2622SCHECK_PARTIAL();2623RRETURN(MATCH_NOMATCH);2624}2625GETCHARINCTEST(fc, Feptr);2626if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)2627RRETURN(MATCH_NOMATCH);2628Fecode++;2629break;26302631case OP_WORDCHAR:2632if (Feptr >= mb->end_subject)2633{2634SCHECK_PARTIAL();2635RRETURN(MATCH_NOMATCH);2636}2637GETCHARINCTEST(fc, Feptr);2638if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)2639RRETURN(MATCH_NOMATCH);2640Fecode++;2641break;26422643case OP_ANYNL:2644if (Feptr >= mb->end_subject)2645{2646SCHECK_PARTIAL();2647RRETURN(MATCH_NOMATCH);2648}2649GETCHARINCTEST(fc, Feptr);2650switch(fc)2651{2652default: RRETURN(MATCH_NOMATCH);26532654case CHAR_CR:2655if (Feptr >= mb->end_subject)2656{2657SCHECK_PARTIAL();2658}2659else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++;2660break;26612662case CHAR_LF:2663break;26642665case CHAR_VT:2666case CHAR_FF:2667case CHAR_NEL:2668#ifndef 
EBCDIC2669case 0x2028:2670case 0x2029:2671#endif /* Not EBCDIC */2672if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);2673break;2674}2675Fecode++;2676break;26772678case OP_NOT_HSPACE:2679if (Feptr >= mb->end_subject)2680{2681SCHECK_PARTIAL();2682RRETURN(MATCH_NOMATCH);2683}2684GETCHARINCTEST(fc, Feptr);2685switch(fc)2686{2687HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */2688default: break;2689}2690Fecode++;2691break;26922693case OP_HSPACE:2694if (Feptr >= mb->end_subject)2695{2696SCHECK_PARTIAL();2697RRETURN(MATCH_NOMATCH);2698}2699GETCHARINCTEST(fc, Feptr);2700switch(fc)2701{2702HSPACE_CASES: break; /* Byte and multibyte cases */2703default: RRETURN(MATCH_NOMATCH);2704}2705Fecode++;2706break;27072708case OP_NOT_VSPACE:2709if (Feptr >= mb->end_subject)2710{2711SCHECK_PARTIAL();2712RRETURN(MATCH_NOMATCH);2713}2714GETCHARINCTEST(fc, Feptr);2715switch(fc)2716{2717VSPACE_CASES: RRETURN(MATCH_NOMATCH);2718default: break;2719}2720Fecode++;2721break;27222723case OP_VSPACE:2724if (Feptr >= mb->end_subject)2725{2726SCHECK_PARTIAL();2727RRETURN(MATCH_NOMATCH);2728}2729GETCHARINCTEST(fc, Feptr);2730switch(fc)2731{2732VSPACE_CASES: break;2733default: RRETURN(MATCH_NOMATCH);2734}2735Fecode++;2736break;273727382739#ifdef SUPPORT_UNICODE27402741/* ===================================================================== */2742/* Check the next character by Unicode property. We will get here only2743if the support is in the binary; otherwise a compile-time error occurs. 
*/27442745case OP_PROP:2746case OP_NOTPROP:2747if (Feptr >= mb->end_subject)2748{2749SCHECK_PARTIAL();2750RRETURN(MATCH_NOMATCH);2751}2752GETCHARINCTEST(fc, Feptr);2753{2754const uint32_t *cp;2755uint32_t chartype;2756const ucd_record *prop = GET_UCD(fc);2757BOOL notmatch = Fop == OP_NOTPROP;27582759switch(Fecode[1])2760{2761case PT_LAMP:2762chartype = prop->chartype;2763if ((chartype == ucp_Lu ||2764chartype == ucp_Ll ||2765chartype == ucp_Lt) == notmatch)2766RRETURN(MATCH_NOMATCH);2767break;27682769case PT_GC:2770if ((Fecode[2] == PRIV(ucp_gentype)[prop->chartype]) == notmatch)2771RRETURN(MATCH_NOMATCH);2772break;27732774case PT_PC:2775if ((Fecode[2] == prop->chartype) == notmatch)2776RRETURN(MATCH_NOMATCH);2777break;27782779case PT_SC:2780if ((Fecode[2] == prop->script) == notmatch)2781RRETURN(MATCH_NOMATCH);2782break;27832784case PT_SCX:2785{2786BOOL ok = (Fecode[2] == prop->script ||2787MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Fecode[2]) != 0);2788if (ok == notmatch) RRETURN(MATCH_NOMATCH);2789}2790break;27912792/* These are specials */27932794case PT_ALNUM:2795chartype = prop->chartype;2796if ((PRIV(ucp_gentype)[chartype] == ucp_L ||2797PRIV(ucp_gentype)[chartype] == ucp_N) == notmatch)2798RRETURN(MATCH_NOMATCH);2799break;28002801/* Perl space used to exclude VT, but from Perl 5.18 it is included,2802which means that Perl space and POSIX space are now identical. PCRE2803was changed at release 8.34. 
*/28042805case PT_SPACE: /* Perl space */2806case PT_PXSPACE: /* POSIX space */2807switch(fc)2808{2809HSPACE_CASES:2810VSPACE_CASES:2811if (notmatch) RRETURN(MATCH_NOMATCH);2812break;28132814default:2815if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == notmatch)2816RRETURN(MATCH_NOMATCH);2817break;2818}2819break;28202821case PT_WORD:2822chartype = prop->chartype;2823if ((PRIV(ucp_gentype)[chartype] == ucp_L ||2824PRIV(ucp_gentype)[chartype] == ucp_N ||2825chartype == ucp_Mn ||2826chartype == ucp_Pc) == notmatch)2827RRETURN(MATCH_NOMATCH);2828break;28292830case PT_CLIST:2831#if PCRE2_CODE_UNIT_WIDTH == 322832if (fc > MAX_UTF_CODE_POINT)2833{2834if (notmatch) break;;2835RRETURN(MATCH_NOMATCH);2836}2837#endif2838cp = PRIV(ucd_caseless_sets) + Fecode[2];2839for (;;)2840{2841if (fc < *cp)2842{ if (notmatch) break; else { RRETURN(MATCH_NOMATCH); } }2843if (fc == *cp++)2844{ if (notmatch) { RRETURN(MATCH_NOMATCH); } else break; }2845}2846break;28472848case PT_UCNC:2849if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||2850fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||2851fc >= 0xe000) == notmatch)2852RRETURN(MATCH_NOMATCH);2853break;28542855case PT_BIDICL:2856if ((UCD_BIDICLASS_PROP(prop) == Fecode[2]) == notmatch)2857RRETURN(MATCH_NOMATCH);2858break;28592860case PT_BOOL:2861{2862BOOL ok = MAPBIT(PRIV(ucd_boolprop_sets) +2863UCD_BPROPS_PROP(prop), Fecode[2]) != 0;2864if (ok == notmatch) RRETURN(MATCH_NOMATCH);2865}2866break;28672868/* This should never occur */28692870/* LCOV_EXCL_START */2871default:2872PCRE2_DEBUG_UNREACHABLE();2873return PCRE2_ERROR_INTERNAL;2874/* LCOV_EXCL_STOP */2875}28762877Fecode += 3;2878}2879break;288028812882/* ===================================================================== */2883/* Match an extended Unicode sequence. We will get here only if the support2884is in the binary; otherwise a compile-time error occurs. 
*/28852886case OP_EXTUNI:2887if (Feptr >= mb->end_subject)2888{2889SCHECK_PARTIAL();2890RRETURN(MATCH_NOMATCH);2891}2892else2893{2894GETCHARINCTEST(fc, Feptr);2895Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf,2896NULL);2897}2898CHECK_PARTIAL();2899Fecode++;2900break;29012902#endif /* SUPPORT_UNICODE */290329042905/* ===================================================================== */2906/* Match a single character type repeatedly. Note that the property type2907does not need to be in a stack frame as it is not used within an RMATCH()2908loop. */29092910#define Lstart_eptr F->temp_sptr[0]2911#define Lmin F->temp_32[0]2912#define Lmax F->temp_32[1]2913#define Lctype F->temp_32[2]2914#define Lpropvalue F->temp_32[3]29152916case OP_TYPEEXACT:2917Lmin = Lmax = GET2(Fecode, 1);2918Fecode += 1 + IMM2_SIZE;2919goto REPEATTYPE;29202921case OP_TYPEUPTO:2922case OP_TYPEMINUPTO:2923Lmin = 0;2924Lmax = GET2(Fecode, 1);2925reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX;2926Fecode += 1 + IMM2_SIZE;2927goto REPEATTYPE;29282929case OP_TYPEPOSSTAR:2930reptype = REPTYPE_POS;2931Lmin = 0;2932Lmax = UINT32_MAX;2933Fecode++;2934goto REPEATTYPE;29352936case OP_TYPEPOSPLUS:2937reptype = REPTYPE_POS;2938Lmin = 1;2939Lmax = UINT32_MAX;2940Fecode++;2941goto REPEATTYPE;29422943case OP_TYPEPOSQUERY:2944reptype = REPTYPE_POS;2945Lmin = 0;2946Lmax = 1;2947Fecode++;2948goto REPEATTYPE;29492950case OP_TYPEPOSUPTO:2951reptype = REPTYPE_POS;2952Lmin = 0;2953Lmax = GET2(Fecode, 1);2954Fecode += 1 + IMM2_SIZE;2955goto REPEATTYPE;29562957case OP_TYPESTAR:2958case OP_TYPEMINSTAR:2959case OP_TYPEPLUS:2960case OP_TYPEMINPLUS:2961case OP_TYPEQUERY:2962case OP_TYPEMINQUERY:2963fc = *Fecode++ - OP_TYPESTAR;2964Lmin = rep_min[fc];2965Lmax = rep_max[fc];2966reptype = rep_typ[fc];29672968/* Common code for all repeated character type matches. 
*/29692970REPEATTYPE:2971Lctype = *Fecode++; /* Code for the character type */29722973#ifdef SUPPORT_UNICODE2974if (Lctype == OP_PROP || Lctype == OP_NOTPROP)2975{2976proptype = *Fecode++;2977Lpropvalue = *Fecode++;2978}2979else proptype = -1;2980#endif29812982/* First, ensure the minimum number of matches are present. Use inline2983code for maximizing the speed, and do the type test once at the start2984(i.e. keep it out of the loops). As there are no calls to RMATCH in the2985loops, we can use an ordinary variable for "notmatch". The code for UTF2986mode is separated out for tidiness, except for Unicode property tests. */29872988if (Lmin > 0)2989{2990#ifdef SUPPORT_UNICODE2991if (proptype >= 0) /* Property tests in all modes */2992{2993BOOL notmatch = Lctype == OP_NOTPROP;2994switch(proptype)2995{2996case PT_LAMP:2997for (i = 1; i <= Lmin; i++)2998{2999int chartype;3000if (Feptr >= mb->end_subject)3001{3002SCHECK_PARTIAL();3003RRETURN(MATCH_NOMATCH);3004}3005GETCHARINCTEST(fc, Feptr);3006chartype = UCD_CHARTYPE(fc);3007if ((chartype == ucp_Lu ||3008chartype == ucp_Ll ||3009chartype == ucp_Lt) == notmatch)3010RRETURN(MATCH_NOMATCH);3011}3012break;30133014case PT_GC:3015for (i = 1; i <= Lmin; i++)3016{3017if (Feptr >= mb->end_subject)3018{3019SCHECK_PARTIAL();3020RRETURN(MATCH_NOMATCH);3021}3022GETCHARINCTEST(fc, Feptr);3023if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch)3024RRETURN(MATCH_NOMATCH);3025}3026break;30273028case PT_PC:3029for (i = 1; i <= Lmin; i++)3030{3031if (Feptr >= mb->end_subject)3032{3033SCHECK_PARTIAL();3034RRETURN(MATCH_NOMATCH);3035}3036GETCHARINCTEST(fc, Feptr);3037if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch)3038RRETURN(MATCH_NOMATCH);3039}3040break;30413042case PT_SC:3043for (i = 1; i <= Lmin; i++)3044{3045if (Feptr >= mb->end_subject)3046{3047SCHECK_PARTIAL();3048RRETURN(MATCH_NOMATCH);3049}3050GETCHARINCTEST(fc, Feptr);3051if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch)3052RRETURN(MATCH_NOMATCH);3053}3054break;30553056case 
PT_SCX:3057for (i = 1; i <= Lmin; i++)3058{3059BOOL ok;3060const ucd_record *prop;3061if (Feptr >= mb->end_subject)3062{3063SCHECK_PARTIAL();3064RRETURN(MATCH_NOMATCH);3065}3066GETCHARINCTEST(fc, Feptr);3067prop = GET_UCD(fc);3068ok = (prop->script == Lpropvalue ||3069MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);3070if (ok == notmatch)3071RRETURN(MATCH_NOMATCH);3072}3073break;30743075case PT_ALNUM:3076for (i = 1; i <= Lmin; i++)3077{3078int category;3079if (Feptr >= mb->end_subject)3080{3081SCHECK_PARTIAL();3082RRETURN(MATCH_NOMATCH);3083}3084GETCHARINCTEST(fc, Feptr);3085category = UCD_CATEGORY(fc);3086if ((category == ucp_L || category == ucp_N) == notmatch)3087RRETURN(MATCH_NOMATCH);3088}3089break;30903091/* Perl space used to exclude VT, but from Perl 5.18 it is included,3092which means that Perl space and POSIX space are now identical. PCRE3093was changed at release 8.34. */30943095case PT_SPACE: /* Perl space */3096case PT_PXSPACE: /* POSIX space */3097for (i = 1; i <= Lmin; i++)3098{3099if (Feptr >= mb->end_subject)3100{3101SCHECK_PARTIAL();3102RRETURN(MATCH_NOMATCH);3103}3104GETCHARINCTEST(fc, Feptr);3105switch(fc)3106{3107HSPACE_CASES:3108VSPACE_CASES:3109if (notmatch) RRETURN(MATCH_NOMATCH);3110break;31113112default:3113if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)3114RRETURN(MATCH_NOMATCH);3115break;3116}3117}3118break;31193120case PT_WORD:3121for (i = 1; i <= Lmin; i++)3122{3123int chartype, category;3124if (Feptr >= mb->end_subject)3125{3126SCHECK_PARTIAL();3127RRETURN(MATCH_NOMATCH);3128}3129GETCHARINCTEST(fc, Feptr);3130chartype = UCD_CHARTYPE(fc);3131category = PRIV(ucp_gentype)[chartype];3132if ((category == ucp_L || category == ucp_N ||3133chartype == ucp_Mn || chartype == ucp_Pc) == notmatch)3134RRETURN(MATCH_NOMATCH);3135}3136break;31373138case PT_CLIST:3139for (i = 1; i <= Lmin; i++)3140{3141const uint32_t *cp;3142if (Feptr >= 
mb->end_subject)3143{3144SCHECK_PARTIAL();3145RRETURN(MATCH_NOMATCH);3146}3147GETCHARINCTEST(fc, Feptr);3148#if PCRE2_CODE_UNIT_WIDTH == 323149if (fc > MAX_UTF_CODE_POINT)3150{3151if (notmatch) continue;3152RRETURN(MATCH_NOMATCH);3153}3154#endif3155cp = PRIV(ucd_caseless_sets) + Lpropvalue;3156for (;;)3157{3158if (fc < *cp)3159{3160if (notmatch) break;3161RRETURN(MATCH_NOMATCH);3162}3163if (fc == *cp++)3164{3165if (notmatch) RRETURN(MATCH_NOMATCH);3166break;3167}3168}3169}3170break;31713172case PT_UCNC:3173for (i = 1; i <= Lmin; i++)3174{3175if (Feptr >= mb->end_subject)3176{3177SCHECK_PARTIAL();3178RRETURN(MATCH_NOMATCH);3179}3180GETCHARINCTEST(fc, Feptr);3181if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||3182fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||3183fc >= 0xe000) == notmatch)3184RRETURN(MATCH_NOMATCH);3185}3186break;31873188case PT_BIDICL:3189for (i = 1; i <= Lmin; i++)3190{3191if (Feptr >= mb->end_subject)3192{3193SCHECK_PARTIAL();3194RRETURN(MATCH_NOMATCH);3195}3196GETCHARINCTEST(fc, Feptr);3197if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch)3198RRETURN(MATCH_NOMATCH);3199}3200break;32013202case PT_BOOL:3203for (i = 1; i <= Lmin; i++)3204{3205BOOL ok;3206const ucd_record *prop;3207if (Feptr >= mb->end_subject)3208{3209SCHECK_PARTIAL();3210RRETURN(MATCH_NOMATCH);3211}3212GETCHARINCTEST(fc, Feptr);3213prop = GET_UCD(fc);3214ok = MAPBIT(PRIV(ucd_boolprop_sets) +3215UCD_BPROPS_PROP(prop), Lpropvalue) != 0;3216if (ok == notmatch)3217RRETURN(MATCH_NOMATCH);3218}3219break;32203221/* This should not occur */32223223/* LCOV_EXCL_START */3224default:3225PCRE2_DEBUG_UNREACHABLE();3226return PCRE2_ERROR_INTERNAL;3227/* LCOV_EXCL_STOP */3228}3229}32303231/* Match extended Unicode sequences. We will get here only if the3232support is in the binary; otherwise a compile-time error occurs. 
*/32333234else if (Lctype == OP_EXTUNI)3235{3236for (i = 1; i <= Lmin; i++)3237{3238if (Feptr >= mb->end_subject)3239{3240SCHECK_PARTIAL();3241RRETURN(MATCH_NOMATCH);3242}3243else3244{3245GETCHARINCTEST(fc, Feptr);3246Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject,3247mb->end_subject, utf, NULL);3248}3249CHECK_PARTIAL();3250}3251}3252else3253#endif /* SUPPORT_UNICODE */32543255/* Handle all other cases in UTF mode */32563257#ifdef SUPPORT_UNICODE3258if (utf) switch(Lctype)3259{3260case OP_ANY:3261for (i = 1; i <= Lmin; i++)3262{3263if (Feptr >= mb->end_subject)3264{3265SCHECK_PARTIAL();3266RRETURN(MATCH_NOMATCH);3267}3268if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);3269if (mb->partial != 0 &&3270Feptr + 1 >= mb->end_subject &&3271NLBLOCK->nltype == NLTYPE_FIXED &&3272NLBLOCK->nllen == 2 &&3273UCHAR21(Feptr) == NLBLOCK->nl[0])3274{3275mb->hitend = TRUE;3276if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;3277}3278Feptr++;3279ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);3280}3281break;32823283case OP_ALLANY:3284for (i = 1; i <= Lmin; i++)3285{3286if (Feptr >= mb->end_subject)3287{3288SCHECK_PARTIAL();3289RRETURN(MATCH_NOMATCH);3290}3291Feptr++;3292ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);3293}3294break;32953296case OP_ANYBYTE:3297if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH);3298Feptr += Lmin;3299break;33003301case OP_ANYNL:3302for (i = 1; i <= Lmin; i++)3303{3304if (Feptr >= mb->end_subject)3305{3306SCHECK_PARTIAL();3307RRETURN(MATCH_NOMATCH);3308}3309GETCHARINC(fc, Feptr);3310switch(fc)3311{3312default: RRETURN(MATCH_NOMATCH);33133314case CHAR_CR:3315if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;3316break;33173318case CHAR_LF:3319break;33203321case CHAR_VT:3322case CHAR_FF:3323case CHAR_NEL:3324#ifndef EBCDIC3325case 0x2028:3326case 0x2029:3327#endif /* Not EBCDIC */3328if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);3329break;3330}3331}3332break;33333334case OP_NOT_HSPACE:3335for (i = 1; i 
<= Lmin; i++)3336{3337if (Feptr >= mb->end_subject)3338{3339SCHECK_PARTIAL();3340RRETURN(MATCH_NOMATCH);3341}3342GETCHARINC(fc, Feptr);3343switch(fc)3344{3345HSPACE_CASES: RRETURN(MATCH_NOMATCH);3346default: break;3347}3348}3349break;33503351case OP_HSPACE:3352for (i = 1; i <= Lmin; i++)3353{3354if (Feptr >= mb->end_subject)3355{3356SCHECK_PARTIAL();3357RRETURN(MATCH_NOMATCH);3358}3359GETCHARINC(fc, Feptr);3360switch(fc)3361{3362HSPACE_CASES: break;3363default: RRETURN(MATCH_NOMATCH);3364}3365}3366break;33673368case OP_NOT_VSPACE:3369for (i = 1; i <= Lmin; i++)3370{3371if (Feptr >= mb->end_subject)3372{3373SCHECK_PARTIAL();3374RRETURN(MATCH_NOMATCH);3375}3376GETCHARINC(fc, Feptr);3377switch(fc)3378{3379VSPACE_CASES: RRETURN(MATCH_NOMATCH);3380default: break;3381}3382}3383break;33843385case OP_VSPACE:3386for (i = 1; i <= Lmin; i++)3387{3388if (Feptr >= mb->end_subject)3389{3390SCHECK_PARTIAL();3391RRETURN(MATCH_NOMATCH);3392}3393GETCHARINC(fc, Feptr);3394switch(fc)3395{3396VSPACE_CASES: break;3397default: RRETURN(MATCH_NOMATCH);3398}3399}3400break;34013402case OP_NOT_DIGIT:3403for (i = 1; i <= Lmin; i++)3404{3405if (Feptr >= mb->end_subject)3406{3407SCHECK_PARTIAL();3408RRETURN(MATCH_NOMATCH);3409}3410GETCHARINC(fc, Feptr);3411if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0)3412RRETURN(MATCH_NOMATCH);3413}3414break;34153416case OP_DIGIT:3417for (i = 1; i <= Lmin; i++)3418{3419uint32_t cc;3420if (Feptr >= mb->end_subject)3421{3422SCHECK_PARTIAL();3423RRETURN(MATCH_NOMATCH);3424}3425cc = UCHAR21(Feptr);3426if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0)3427RRETURN(MATCH_NOMATCH);3428Feptr++;3429/* No need to skip more code units - we know it has only one. 
*/3430}3431break;34323433case OP_NOT_WHITESPACE:3434for (i = 1; i <= Lmin; i++)3435{3436uint32_t cc;3437if (Feptr >= mb->end_subject)3438{3439SCHECK_PARTIAL();3440RRETURN(MATCH_NOMATCH);3441}3442cc = UCHAR21(Feptr);3443if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0)3444RRETURN(MATCH_NOMATCH);3445Feptr++;3446ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);3447}3448break;34493450case OP_WHITESPACE:3451for (i = 1; i <= Lmin; i++)3452{3453uint32_t cc;3454if (Feptr >= mb->end_subject)3455{3456SCHECK_PARTIAL();3457RRETURN(MATCH_NOMATCH);3458}3459cc = UCHAR21(Feptr);3460if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0)3461RRETURN(MATCH_NOMATCH);3462Feptr++;3463/* No need to skip more code units - we know it has only one. */3464}3465break;34663467case OP_NOT_WORDCHAR:3468for (i = 1; i <= Lmin; i++)3469{3470uint32_t cc;3471if (Feptr >= mb->end_subject)3472{3473SCHECK_PARTIAL();3474RRETURN(MATCH_NOMATCH);3475}3476cc = UCHAR21(Feptr);3477if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0)3478RRETURN(MATCH_NOMATCH);3479Feptr++;3480ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);3481}3482break;34833484case OP_WORDCHAR:3485for (i = 1; i <= Lmin; i++)3486{3487uint32_t cc;3488if (Feptr >= mb->end_subject)3489{3490SCHECK_PARTIAL();3491RRETURN(MATCH_NOMATCH);3492}3493cc = UCHAR21(Feptr);3494if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0)3495RRETURN(MATCH_NOMATCH);3496Feptr++;3497/* No need to skip more code units - we know it has only one. */3498}3499break;35003501/* LCOV_EXCL_START */3502default:3503PCRE2_DEBUG_UNREACHABLE();3504return PCRE2_ERROR_INTERNAL;3505/* LCOV_EXCL_STOP */3506} /* End switch(Lctype) */35073508else3509#endif /* SUPPORT_UNICODE */35103511/* Code for the non-UTF case for minimum matching of operators other3512than OP_PROP and OP_NOTPROP. 
*/35133514switch(Lctype)3515{3516case OP_ANY:3517for (i = 1; i <= Lmin; i++)3518{3519if (Feptr >= mb->end_subject)3520{3521SCHECK_PARTIAL();3522RRETURN(MATCH_NOMATCH);3523}3524if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);3525if (mb->partial != 0 &&3526Feptr + 1 >= mb->end_subject &&3527NLBLOCK->nltype == NLTYPE_FIXED &&3528NLBLOCK->nllen == 2 &&3529*Feptr == NLBLOCK->nl[0])3530{3531mb->hitend = TRUE;3532if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;3533}3534Feptr++;3535}3536break;35373538case OP_ALLANY:3539if (Feptr > mb->end_subject - Lmin)3540{3541SCHECK_PARTIAL();3542RRETURN(MATCH_NOMATCH);3543}3544Feptr += Lmin;3545break;35463547/* This OP_ANYBYTE case will never be reached because \C gets turned3548into OP_ALLANY in non-UTF mode. Cut out the code so that coverage3549reports don't complain about it's never being used. */35503551/* case OP_ANYBYTE:3552* if (Feptr > mb->end_subject - Lmin)3553* {3554* SCHECK_PARTIAL();3555* RRETURN(MATCH_NOMATCH);3556* }3557* Feptr += Lmin;3558* break;3559*/3560case OP_ANYNL:3561for (i = 1; i <= Lmin; i++)3562{3563if (Feptr >= mb->end_subject)3564{3565SCHECK_PARTIAL();3566RRETURN(MATCH_NOMATCH);3567}3568switch(*Feptr++)3569{3570default: RRETURN(MATCH_NOMATCH);35713572case CHAR_CR:3573if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;3574break;35753576case CHAR_LF:3577break;35783579case CHAR_VT:3580case CHAR_FF:3581case CHAR_NEL:3582#if PCRE2_CODE_UNIT_WIDTH != 83583case 0x2028:3584case 0x2029:3585#endif3586if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);3587break;3588}3589}3590break;35913592case OP_NOT_HSPACE:3593for (i = 1; i <= Lmin; i++)3594{3595if (Feptr >= mb->end_subject)3596{3597SCHECK_PARTIAL();3598RRETURN(MATCH_NOMATCH);3599}3600switch(*Feptr++)3601{3602default: break;3603HSPACE_BYTE_CASES:3604#if PCRE2_CODE_UNIT_WIDTH != 83605HSPACE_MULTIBYTE_CASES:3606#endif3607RRETURN(MATCH_NOMATCH);3608}3609}3610break;36113612case OP_HSPACE:3613for (i = 1; i <= Lmin; i++)3614{3615if (Feptr >= 
mb->end_subject)3616{3617SCHECK_PARTIAL();3618RRETURN(MATCH_NOMATCH);3619}3620switch(*Feptr++)3621{3622default: RRETURN(MATCH_NOMATCH);3623HSPACE_BYTE_CASES:3624#if PCRE2_CODE_UNIT_WIDTH != 83625HSPACE_MULTIBYTE_CASES:3626#endif3627break;3628}3629}3630break;36313632case OP_NOT_VSPACE:3633for (i = 1; i <= Lmin; i++)3634{3635if (Feptr >= mb->end_subject)3636{3637SCHECK_PARTIAL();3638RRETURN(MATCH_NOMATCH);3639}3640switch(*Feptr++)3641{3642VSPACE_BYTE_CASES:3643#if PCRE2_CODE_UNIT_WIDTH != 83644VSPACE_MULTIBYTE_CASES:3645#endif3646RRETURN(MATCH_NOMATCH);3647default: break;3648}3649}3650break;36513652case OP_VSPACE:3653for (i = 1; i <= Lmin; i++)3654{3655if (Feptr >= mb->end_subject)3656{3657SCHECK_PARTIAL();3658RRETURN(MATCH_NOMATCH);3659}3660switch(*Feptr++)3661{3662default: RRETURN(MATCH_NOMATCH);3663VSPACE_BYTE_CASES:3664#if PCRE2_CODE_UNIT_WIDTH != 83665VSPACE_MULTIBYTE_CASES:3666#endif3667break;3668}3669}3670break;36713672case OP_NOT_DIGIT:3673for (i = 1; i <= Lmin; i++)3674{3675if (Feptr >= mb->end_subject)3676{3677SCHECK_PARTIAL();3678RRETURN(MATCH_NOMATCH);3679}3680if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)3681RRETURN(MATCH_NOMATCH);3682Feptr++;3683}3684break;36853686case OP_DIGIT:3687for (i = 1; i <= Lmin; i++)3688{3689if (Feptr >= mb->end_subject)3690{3691SCHECK_PARTIAL();3692RRETURN(MATCH_NOMATCH);3693}3694if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)3695RRETURN(MATCH_NOMATCH);3696Feptr++;3697}3698break;36993700case OP_NOT_WHITESPACE:3701for (i = 1; i <= Lmin; i++)3702{3703if (Feptr >= mb->end_subject)3704{3705SCHECK_PARTIAL();3706RRETURN(MATCH_NOMATCH);3707}3708if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)3709RRETURN(MATCH_NOMATCH);3710Feptr++;3711}3712break;37133714case OP_WHITESPACE:3715for (i = 1; i <= Lmin; i++)3716{3717if (Feptr >= mb->end_subject)3718{3719SCHECK_PARTIAL();3720RRETURN(MATCH_NOMATCH);3721}3722if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 
0)3723RRETURN(MATCH_NOMATCH);3724Feptr++;3725}3726break;37273728case OP_NOT_WORDCHAR:3729for (i = 1; i <= Lmin; i++)3730{3731if (Feptr >= mb->end_subject)3732{3733SCHECK_PARTIAL();3734RRETURN(MATCH_NOMATCH);3735}3736if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)3737RRETURN(MATCH_NOMATCH);3738Feptr++;3739}3740break;37413742case OP_WORDCHAR:3743for (i = 1; i <= Lmin; i++)3744{3745if (Feptr >= mb->end_subject)3746{3747SCHECK_PARTIAL();3748RRETURN(MATCH_NOMATCH);3749}3750if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)3751RRETURN(MATCH_NOMATCH);3752Feptr++;3753}3754break;37553756/* LCOV_EXCL_START */3757default:3758PCRE2_DEBUG_UNREACHABLE();3759return PCRE2_ERROR_INTERNAL;3760/* LCOV_EXCL_STOP */3761}3762}37633764/* If Lmin = Lmax we are done. Continue with the main loop. */37653766if (Lmin == Lmax) continue;37673768/* If minimizing, we have to test the rest of the pattern before each3769subsequent match. This means we cannot use a local "notmatch" variable as3770in the other cases. As all 4 temporary 32-bit values in the frame are3771already in use, just test the type each time. 
*/37723773if (reptype == REPTYPE_MIN)3774{3775#ifdef SUPPORT_UNICODE3776if (proptype >= 0)3777{3778switch(proptype)3779{3780case PT_LAMP:3781for (;;)3782{3783int chartype;3784RMATCH(Fecode, RM208);3785if (rrc != MATCH_NOMATCH) RRETURN(rrc);3786if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3787if (Feptr >= mb->end_subject)3788{3789SCHECK_PARTIAL();3790RRETURN(MATCH_NOMATCH);3791}3792GETCHARINCTEST(fc, Feptr);3793chartype = UCD_CHARTYPE(fc);3794if ((chartype == ucp_Lu ||3795chartype == ucp_Ll ||3796chartype == ucp_Lt) == (Lctype == OP_NOTPROP))3797RRETURN(MATCH_NOMATCH);3798}3799PCRE2_UNREACHABLE(); /* Control never reaches here */38003801case PT_GC:3802for (;;)3803{3804RMATCH(Fecode, RM209);3805if (rrc != MATCH_NOMATCH) RRETURN(rrc);3806if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3807if (Feptr >= mb->end_subject)3808{3809SCHECK_PARTIAL();3810RRETURN(MATCH_NOMATCH);3811}3812GETCHARINCTEST(fc, Feptr);3813if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))3814RRETURN(MATCH_NOMATCH);3815}3816PCRE2_UNREACHABLE(); /* Control never reaches here */38173818case PT_PC:3819for (;;)3820{3821RMATCH(Fecode, RM210);3822if (rrc != MATCH_NOMATCH) RRETURN(rrc);3823if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3824if (Feptr >= mb->end_subject)3825{3826SCHECK_PARTIAL();3827RRETURN(MATCH_NOMATCH);3828}3829GETCHARINCTEST(fc, Feptr);3830if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))3831RRETURN(MATCH_NOMATCH);3832}3833PCRE2_UNREACHABLE(); /* Control never reaches here */38343835case PT_SC:3836for (;;)3837{3838RMATCH(Fecode, RM211);3839if (rrc != MATCH_NOMATCH) RRETURN(rrc);3840if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3841if (Feptr >= mb->end_subject)3842{3843SCHECK_PARTIAL();3844RRETURN(MATCH_NOMATCH);3845}3846GETCHARINCTEST(fc, Feptr);3847if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))3848RRETURN(MATCH_NOMATCH);3849}3850PCRE2_UNREACHABLE(); /* Control never reaches here */38513852case PT_SCX:3853for (;;)3854{3855BOOL ok;3856const ucd_record 
*prop;3857RMATCH(Fecode, RM224);3858if (rrc != MATCH_NOMATCH) RRETURN(rrc);3859if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3860if (Feptr >= mb->end_subject)3861{3862SCHECK_PARTIAL();3863RRETURN(MATCH_NOMATCH);3864}3865GETCHARINCTEST(fc, Feptr);3866prop = GET_UCD(fc);3867ok = (prop->script == Lpropvalue3868|| MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);3869if (ok == (Lctype == OP_NOTPROP))3870RRETURN(MATCH_NOMATCH);3871}3872PCRE2_UNREACHABLE(); /* Control never reaches here */38733874case PT_ALNUM:3875for (;;)3876{3877int category;3878RMATCH(Fecode, RM212);3879if (rrc != MATCH_NOMATCH) RRETURN(rrc);3880if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3881if (Feptr >= mb->end_subject)3882{3883SCHECK_PARTIAL();3884RRETURN(MATCH_NOMATCH);3885}3886GETCHARINCTEST(fc, Feptr);3887category = UCD_CATEGORY(fc);3888if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP))3889RRETURN(MATCH_NOMATCH);3890}3891PCRE2_UNREACHABLE(); /* Control never reaches here */38923893/* Perl space used to exclude VT, but from Perl 5.18 it is included,3894which means that Perl space and POSIX space are now identical. PCRE3895was changed at release 8.34. 
*/38963897case PT_SPACE: /* Perl space */3898case PT_PXSPACE: /* POSIX space */3899for (;;)3900{3901RMATCH(Fecode, RM213);3902if (rrc != MATCH_NOMATCH) RRETURN(rrc);3903if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3904if (Feptr >= mb->end_subject)3905{3906SCHECK_PARTIAL();3907RRETURN(MATCH_NOMATCH);3908}3909GETCHARINCTEST(fc, Feptr);3910switch(fc)3911{3912HSPACE_CASES:3913VSPACE_CASES:3914if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);3915break;39163917default:3918if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))3919RRETURN(MATCH_NOMATCH);3920break;3921}3922}3923PCRE2_UNREACHABLE(); /* Control never reaches here */39243925case PT_WORD:3926for (;;)3927{3928int chartype, category;3929RMATCH(Fecode, RM214);3930if (rrc != MATCH_NOMATCH) RRETURN(rrc);3931if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3932if (Feptr >= mb->end_subject)3933{3934SCHECK_PARTIAL();3935RRETURN(MATCH_NOMATCH);3936}3937GETCHARINCTEST(fc, Feptr);3938chartype = UCD_CHARTYPE(fc);3939category = PRIV(ucp_gentype)[chartype];3940if ((category == ucp_L ||3941category == ucp_N ||3942chartype == ucp_Mn ||3943chartype == ucp_Pc) == (Lctype == OP_NOTPROP))3944RRETURN(MATCH_NOMATCH);3945}3946PCRE2_UNREACHABLE(); /* Control never reaches here */39473948case PT_CLIST:3949for (;;)3950{3951const uint32_t *cp;3952RMATCH(Fecode, RM215);3953if (rrc != MATCH_NOMATCH) RRETURN(rrc);3954if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3955if (Feptr >= mb->end_subject)3956{3957SCHECK_PARTIAL();3958RRETURN(MATCH_NOMATCH);3959}3960GETCHARINCTEST(fc, Feptr);3961#if PCRE2_CODE_UNIT_WIDTH == 323962if (fc > MAX_UTF_CODE_POINT)3963{3964if (Lctype == OP_NOTPROP) continue;3965RRETURN(MATCH_NOMATCH);3966}3967#endif3968cp = PRIV(ucd_caseless_sets) + Lpropvalue;3969for (;;)3970{3971if (fc < *cp)3972{3973if (Lctype == OP_NOTPROP) break;3974RRETURN(MATCH_NOMATCH);3975}3976if (fc == *cp++)3977{3978if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);3979break;3980}3981}3982}3983PCRE2_UNREACHABLE(); /* Control never reaches here 
*/39843985case PT_UCNC:3986for (;;)3987{3988RMATCH(Fecode, RM216);3989if (rrc != MATCH_NOMATCH) RRETURN(rrc);3990if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3991if (Feptr >= mb->end_subject)3992{3993SCHECK_PARTIAL();3994RRETURN(MATCH_NOMATCH);3995}3996GETCHARINCTEST(fc, Feptr);3997if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||3998fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||3999fc >= 0xe000) == (Lctype == OP_NOTPROP))4000RRETURN(MATCH_NOMATCH);4001}4002PCRE2_UNREACHABLE(); /* Control never reaches here */40034004case PT_BIDICL:4005for (;;)4006{4007RMATCH(Fecode, RM223);4008if (rrc != MATCH_NOMATCH) RRETURN(rrc);4009if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);4010if (Feptr >= mb->end_subject)4011{4012SCHECK_PARTIAL();4013RRETURN(MATCH_NOMATCH);4014}4015GETCHARINCTEST(fc, Feptr);4016if ((UCD_BIDICLASS(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))4017RRETURN(MATCH_NOMATCH);4018}4019PCRE2_UNREACHABLE(); /* Control never reaches here */40204021case PT_BOOL:4022for (;;)4023{4024BOOL ok;4025const ucd_record *prop;4026RMATCH(Fecode, RM222);4027if (rrc != MATCH_NOMATCH) RRETURN(rrc);4028if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);4029if (Feptr >= mb->end_subject)4030{4031SCHECK_PARTIAL();4032RRETURN(MATCH_NOMATCH);4033}4034GETCHARINCTEST(fc, Feptr);4035prop = GET_UCD(fc);4036ok = MAPBIT(PRIV(ucd_boolprop_sets) +4037UCD_BPROPS_PROP(prop), Lpropvalue) != 0;4038if (ok == (Lctype == OP_NOTPROP))4039RRETURN(MATCH_NOMATCH);4040}4041PCRE2_UNREACHABLE(); /* Control never reaches here */40424043/* This should never occur */40444045/* LCOV_EXCL_START */4046default:4047PCRE2_DEBUG_UNREACHABLE();4048return PCRE2_ERROR_INTERNAL;4049/* LCOV_EXCL_STOP */4050}4051}40524053/* Match extended Unicode sequences. We will get here only if the4054support is in the binary; otherwise a compile-time error occurs. 
*/40554056else if (Lctype == OP_EXTUNI)4057{4058for (;;)4059{4060RMATCH(Fecode, RM217);4061if (rrc != MATCH_NOMATCH) RRETURN(rrc);4062if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);4063if (Feptr >= mb->end_subject)4064{4065SCHECK_PARTIAL();4066RRETURN(MATCH_NOMATCH);4067}4068else4069{4070GETCHARINCTEST(fc, Feptr);4071Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,4072utf, NULL);4073}4074CHECK_PARTIAL();4075}4076}4077else4078#endif /* SUPPORT_UNICODE */40794080/* UTF mode for non-property testing character types. */40814082#ifdef SUPPORT_UNICODE4083if (utf)4084{4085for (;;)4086{4087RMATCH(Fecode, RM218);4088if (rrc != MATCH_NOMATCH) RRETURN(rrc);4089if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);4090if (Feptr >= mb->end_subject)4091{4092SCHECK_PARTIAL();4093RRETURN(MATCH_NOMATCH);4094}4095if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);4096GETCHARINC(fc, Feptr);4097switch(Lctype)4098{4099case OP_ANY: /* This is the non-NL case */4100if (mb->partial != 0 && /* Take care with CRLF partial */4101Feptr >= mb->end_subject &&4102NLBLOCK->nltype == NLTYPE_FIXED &&4103NLBLOCK->nllen == 2 &&4104fc == NLBLOCK->nl[0])4105{4106mb->hitend = TRUE;4107if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;4108}4109break;41104111case OP_ALLANY:4112case OP_ANYBYTE:4113break;41144115case OP_ANYNL:4116switch(fc)4117{4118default: RRETURN(MATCH_NOMATCH);41194120case CHAR_CR:4121if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;4122break;41234124case CHAR_LF:4125break;41264127case CHAR_VT:4128case CHAR_FF:4129case CHAR_NEL:4130#ifndef EBCDIC4131case 0x2028:4132case 0x2029:4133#endif /* Not EBCDIC */4134if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)4135RRETURN(MATCH_NOMATCH);4136break;4137}4138break;41394140case OP_NOT_HSPACE:4141switch(fc)4142{4143HSPACE_CASES: RRETURN(MATCH_NOMATCH);4144default: break;4145}4146break;41474148case OP_HSPACE:4149switch(fc)4150{4151HSPACE_CASES: break;4152default: RRETURN(MATCH_NOMATCH);4153}4154break;41554156case 
OP_NOT_VSPACE:4157switch(fc)4158{4159VSPACE_CASES: RRETURN(MATCH_NOMATCH);4160default: break;4161}4162break;41634164case OP_VSPACE:4165switch(fc)4166{4167VSPACE_CASES: break;4168default: RRETURN(MATCH_NOMATCH);4169}4170break;41714172case OP_NOT_DIGIT:4173if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0)4174RRETURN(MATCH_NOMATCH);4175break;41764177case OP_DIGIT:4178if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0)4179RRETURN(MATCH_NOMATCH);4180break;41814182case OP_NOT_WHITESPACE:4183if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0)4184RRETURN(MATCH_NOMATCH);4185break;41864187case OP_WHITESPACE:4188if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0)4189RRETURN(MATCH_NOMATCH);4190break;41914192case OP_NOT_WORDCHAR:4193if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0)4194RRETURN(MATCH_NOMATCH);4195break;41964197case OP_WORDCHAR:4198if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0)4199RRETURN(MATCH_NOMATCH);4200break;42014202/* LCOV_EXCL_START */4203default:4204PCRE2_DEBUG_UNREACHABLE();4205return PCRE2_ERROR_INTERNAL;4206/* LCOV_EXCL_STOP */4207}4208}4209}4210else4211#endif /* SUPPORT_UNICODE */42124213/* Not UTF mode */4214{4215for (;;)4216{4217RMATCH(Fecode, RM33);4218if (rrc != MATCH_NOMATCH) RRETURN(rrc);4219if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);4220if (Feptr >= mb->end_subject)4221{4222SCHECK_PARTIAL();4223RRETURN(MATCH_NOMATCH);4224}4225if (Lctype == OP_ANY && IS_NEWLINE(Feptr))4226RRETURN(MATCH_NOMATCH);4227fc = *Feptr++;4228switch(Lctype)4229{4230case OP_ANY: /* This is the non-NL case */4231if (mb->partial != 0 && /* Take care with CRLF partial */4232Feptr >= mb->end_subject &&4233NLBLOCK->nltype == NLTYPE_FIXED &&4234NLBLOCK->nllen == 2 &&4235fc == NLBLOCK->nl[0])4236{4237mb->hitend = TRUE;4238if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;4239}4240break;42414242case OP_ALLANY:4243case OP_ANYBYTE:4244break;42454246case OP_ANYNL:4247switch(fc)4248{4249default: RRETURN(MATCH_NOMATCH);42504251case CHAR_CR:4252if (Feptr < mb->end_subject 
&& *Feptr == CHAR_LF) Feptr++;4253break;42544255case CHAR_LF:4256break;42574258case CHAR_VT:4259case CHAR_FF:4260case CHAR_NEL:4261#if PCRE2_CODE_UNIT_WIDTH != 84262case 0x2028:4263case 0x2029:4264#endif4265if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)4266RRETURN(MATCH_NOMATCH);4267break;4268}4269break;42704271case OP_NOT_HSPACE:4272switch(fc)4273{4274default: break;4275HSPACE_BYTE_CASES:4276#if PCRE2_CODE_UNIT_WIDTH != 84277HSPACE_MULTIBYTE_CASES:4278#endif4279RRETURN(MATCH_NOMATCH);4280}4281break;42824283case OP_HSPACE:4284switch(fc)4285{4286default: RRETURN(MATCH_NOMATCH);4287HSPACE_BYTE_CASES:4288#if PCRE2_CODE_UNIT_WIDTH != 84289HSPACE_MULTIBYTE_CASES:4290#endif4291break;4292}4293break;42944295case OP_NOT_VSPACE:4296switch(fc)4297{4298default: break;4299VSPACE_BYTE_CASES:4300#if PCRE2_CODE_UNIT_WIDTH != 84301VSPACE_MULTIBYTE_CASES:4302#endif4303RRETURN(MATCH_NOMATCH);4304}4305break;43064307case OP_VSPACE:4308switch(fc)4309{4310default: RRETURN(MATCH_NOMATCH);4311VSPACE_BYTE_CASES:4312#if PCRE2_CODE_UNIT_WIDTH != 84313VSPACE_MULTIBYTE_CASES:4314#endif4315break;4316}4317break;43184319case OP_NOT_DIGIT:4320if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)4321RRETURN(MATCH_NOMATCH);4322break;43234324case OP_DIGIT:4325if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)4326RRETURN(MATCH_NOMATCH);4327break;43284329case OP_NOT_WHITESPACE:4330if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)4331RRETURN(MATCH_NOMATCH);4332break;43334334case OP_WHITESPACE:4335if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)4336RRETURN(MATCH_NOMATCH);4337break;43384339case OP_NOT_WORDCHAR:4340if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)4341RRETURN(MATCH_NOMATCH);4342break;43434344case OP_WORDCHAR:4345if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)4346RRETURN(MATCH_NOMATCH);4347break;43484349/* LCOV_EXCL_START */4350default:4351PCRE2_DEBUG_UNREACHABLE();4352return PCRE2_ERROR_INTERNAL;4353/* LCOV_EXCL_STOP 
*/4354}4355}4356}43574358PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */4359}43604361/* If maximizing, it is worth using inline code for speed, doing the type4362test once at the start (i.e. keep it out of the loops). Once again,4363"notmatch" can be an ordinary local variable because the loops do not call4364RMATCH. */43654366else4367{4368Lstart_eptr = Feptr; /* Remember where we started */43694370#ifdef SUPPORT_UNICODE4371if (proptype >= 0)4372{4373BOOL notmatch = Lctype == OP_NOTPROP;4374switch(proptype)4375{4376case PT_LAMP:4377for (i = Lmin; i < Lmax; i++)4378{4379int chartype;4380int len = 1;4381if (Feptr >= mb->end_subject)4382{4383SCHECK_PARTIAL();4384break;4385}4386GETCHARLENTEST(fc, Feptr, len);4387chartype = UCD_CHARTYPE(fc);4388if ((chartype == ucp_Lu ||4389chartype == ucp_Ll ||4390chartype == ucp_Lt) == notmatch)4391break;4392Feptr+= len;4393}4394break;43954396case PT_GC:4397for (i = Lmin; i < Lmax; i++)4398{4399int len = 1;4400if (Feptr >= mb->end_subject)4401{4402SCHECK_PARTIAL();4403break;4404}4405GETCHARLENTEST(fc, Feptr, len);4406if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) break;4407Feptr+= len;4408}4409break;44104411case PT_PC:4412for (i = Lmin; i < Lmax; i++)4413{4414int len = 1;4415if (Feptr >= mb->end_subject)4416{4417SCHECK_PARTIAL();4418break;4419}4420GETCHARLENTEST(fc, Feptr, len);4421if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) break;4422Feptr+= len;4423}4424break;44254426case PT_SC:4427for (i = Lmin; i < Lmax; i++)4428{4429int len = 1;4430if (Feptr >= mb->end_subject)4431{4432SCHECK_PARTIAL();4433break;4434}4435GETCHARLENTEST(fc, Feptr, len);4436if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) break;4437Feptr+= len;4438}4439break;44404441case PT_SCX:4442for (i = Lmin; i < Lmax; i++)4443{4444BOOL ok;4445const ucd_record *prop;4446int len = 1;4447if (Feptr >= mb->end_subject)4448{4449SCHECK_PARTIAL();4450break;4451}4452GETCHARLENTEST(fc, Feptr, len);4453prop = GET_UCD(fc);4454ok = (prop->script == Lpropvalue 
||4455MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);4456if (ok == notmatch) break;4457Feptr+= len;4458}4459break;44604461case PT_ALNUM:4462for (i = Lmin; i < Lmax; i++)4463{4464int category;4465int len = 1;4466if (Feptr >= mb->end_subject)4467{4468SCHECK_PARTIAL();4469break;4470}4471GETCHARLENTEST(fc, Feptr, len);4472category = UCD_CATEGORY(fc);4473if ((category == ucp_L || category == ucp_N) == notmatch)4474break;4475Feptr+= len;4476}4477break;44784479/* Perl space used to exclude VT, but from Perl 5.18 it is included,4480which means that Perl space and POSIX space are now identical. PCRE4481was changed at release 8.34. */44824483case PT_SPACE: /* Perl space */4484case PT_PXSPACE: /* POSIX space */4485for (i = Lmin; i < Lmax; i++)4486{4487int len = 1;4488if (Feptr >= mb->end_subject)4489{4490SCHECK_PARTIAL();4491break;4492}4493GETCHARLENTEST(fc, Feptr, len);4494switch(fc)4495{4496HSPACE_CASES:4497VSPACE_CASES:4498if (notmatch) goto ENDLOOP99; /* Break the loop */4499break;45004501default:4502if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)4503goto ENDLOOP99; /* Break the loop */4504break;4505}4506Feptr+= len;4507}4508ENDLOOP99:4509break;45104511case PT_WORD:4512for (i = Lmin; i < Lmax; i++)4513{4514int chartype, category;4515int len = 1;4516if (Feptr >= mb->end_subject)4517{4518SCHECK_PARTIAL();4519break;4520}4521GETCHARLENTEST(fc, Feptr, len);4522chartype = UCD_CHARTYPE(fc);4523category = PRIV(ucp_gentype)[chartype];4524if ((category == ucp_L ||4525category == ucp_N ||4526chartype == ucp_Mn ||4527chartype == ucp_Pc) == notmatch)4528break;4529Feptr+= len;4530}4531break;45324533case PT_CLIST:4534for (i = Lmin; i < Lmax; i++)4535{4536const uint32_t *cp;4537int len = 1;4538if (Feptr >= mb->end_subject)4539{4540SCHECK_PARTIAL();4541break;4542}4543GETCHARLENTEST(fc, Feptr, len);4544#if PCRE2_CODE_UNIT_WIDTH == 324545if (fc > MAX_UTF_CODE_POINT)4546{4547if (!notmatch) goto GOT_MAX;4548}4549else4550#endif4551{4552cp = PRIV(ucd_caseless_sets) + 
Lpropvalue;4553for (;;)4554{4555if (fc < *cp)4556{ if (notmatch) break; else goto GOT_MAX; }4557if (fc == *cp++)4558{ if (notmatch) goto GOT_MAX; else break; }4559}4560}45614562Feptr += len;4563}4564GOT_MAX:4565break;45664567case PT_UCNC:4568for (i = Lmin; i < Lmax; i++)4569{4570int len = 1;4571if (Feptr >= mb->end_subject)4572{4573SCHECK_PARTIAL();4574break;4575}4576GETCHARLENTEST(fc, Feptr, len);4577if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||4578fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||4579fc >= 0xe000) == notmatch)4580break;4581Feptr += len;4582}4583break;45844585case PT_BIDICL:4586for (i = Lmin; i < Lmax; i++)4587{4588int len = 1;4589if (Feptr >= mb->end_subject)4590{4591SCHECK_PARTIAL();4592break;4593}4594GETCHARLENTEST(fc, Feptr, len);4595if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) break;4596Feptr+= len;4597}4598break;45994600case PT_BOOL:4601for (i = Lmin; i < Lmax; i++)4602{4603BOOL ok;4604const ucd_record *prop;4605int len = 1;4606if (Feptr >= mb->end_subject)4607{4608SCHECK_PARTIAL();4609break;4610}4611GETCHARLENTEST(fc, Feptr, len);4612prop = GET_UCD(fc);4613ok = MAPBIT(PRIV(ucd_boolprop_sets) +4614UCD_BPROPS_PROP(prop), Lpropvalue) != 0;4615if (ok == notmatch) break;4616Feptr+= len;4617}4618break;46194620/* LCOV_EXCL_START */4621default:4622PCRE2_DEBUG_UNREACHABLE();4623return PCRE2_ERROR_INTERNAL;4624/* LCOV_EXCL_STOP */4625}46264627/* Feptr is now past the end of the maximum run */46284629if (reptype == REPTYPE_POS) continue; /* No backtracking */46304631/* After \C in UTF mode, Lstart_eptr might be in the middle of a4632Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't4633go too far. */46344635for(;;)4636{4637if (Feptr <= Lstart_eptr) break;4638RMATCH(Fecode, RM221);4639if (rrc != MATCH_NOMATCH) RRETURN(rrc);4640Feptr--;4641if (utf) BACKCHAR(Feptr);4642}4643}46444645/* Match extended Unicode grapheme clusters. 
We will get here only if the4646support is in the binary; otherwise a compile-time error occurs. */46474648else if (Lctype == OP_EXTUNI)4649{4650for (i = Lmin; i < Lmax; i++)4651{4652if (Feptr >= mb->end_subject)4653{4654SCHECK_PARTIAL();4655break;4656}4657else4658{4659GETCHARINCTEST(fc, Feptr);4660Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,4661utf, NULL);4662}4663CHECK_PARTIAL();4664}46654666/* Feptr is now past the end of the maximum run */46674668if (reptype == REPTYPE_POS) continue; /* No backtracking */46694670/* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start4671of the run while backtracking because the use of \C in UTF mode can4672cause BACKCHAR to move back past Lstart_eptr. This is just palliative;4673the use of \C in UTF mode is fraught with danger. */46744675for(;;)4676{4677int lgb, rgb;4678PCRE2_SPTR fptr;46794680if (Feptr <= Lstart_eptr) break; /* At start of char run */4681RMATCH(Fecode, RM219);4682if (rrc != MATCH_NOMATCH) RRETURN(rrc);46834684/* Backtracking over an extended grapheme cluster involves inspecting4685the previous two characters (if present) to see if a break is4686permitted between them. 
*/46874688Feptr--;4689if (!utf) fc = *Feptr; else4690{4691BACKCHAR(Feptr);4692GETCHAR(fc, Feptr);4693}4694rgb = UCD_GRAPHBREAK(fc);46954696for (;;)4697{4698if (Feptr <= Lstart_eptr) break; /* At start of char run */4699fptr = Feptr - 1;4700if (!utf) fc = *fptr; else4701{4702BACKCHAR(fptr);4703GETCHAR(fc, fptr);4704}4705lgb = UCD_GRAPHBREAK(fc);4706if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;4707Feptr = fptr;4708rgb = lgb;4709}4710}4711}47124713else4714#endif /* SUPPORT_UNICODE */47154716#ifdef SUPPORT_UNICODE4717if (utf)4718{4719switch(Lctype)4720{4721case OP_ANY:4722for (i = Lmin; i < Lmax; i++)4723{4724if (Feptr >= mb->end_subject)4725{4726SCHECK_PARTIAL();4727break;4728}4729if (IS_NEWLINE(Feptr)) break;4730if (mb->partial != 0 && /* Take care with CRLF partial */4731Feptr + 1 >= mb->end_subject &&4732NLBLOCK->nltype == NLTYPE_FIXED &&4733NLBLOCK->nllen == 2 &&4734UCHAR21(Feptr) == NLBLOCK->nl[0])4735{4736mb->hitend = TRUE;4737if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;4738}4739Feptr++;4740ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);4741}4742break;47434744case OP_ALLANY:4745if (Lmax < UINT32_MAX)4746{4747for (i = Lmin; i < Lmax; i++)4748{4749if (Feptr >= mb->end_subject)4750{4751SCHECK_PARTIAL();4752break;4753}4754Feptr++;4755ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);4756}4757}4758else4759{4760Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */4761SCHECK_PARTIAL();4762}4763break;47644765/* The "byte" (i.e. 
"code unit") case is the same as non-UTF */47664767case OP_ANYBYTE:4768fc = Lmax - Lmin;4769if (fc > (uint32_t)(mb->end_subject - Feptr))4770{4771Feptr = mb->end_subject;4772SCHECK_PARTIAL();4773}4774else Feptr += fc;4775break;47764777case OP_ANYNL:4778for (i = Lmin; i < Lmax; i++)4779{4780int len = 1;4781if (Feptr >= mb->end_subject)4782{4783SCHECK_PARTIAL();4784break;4785}4786GETCHARLEN(fc, Feptr, len);4787if (fc == CHAR_CR)4788{4789if (++Feptr >= mb->end_subject) break;4790if (UCHAR21(Feptr) == CHAR_LF) Feptr++;4791}4792else4793{4794if (fc != CHAR_LF &&4795(mb->bsr_convention == PCRE2_BSR_ANYCRLF ||4796(fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL4797#ifndef EBCDIC4798&& fc != 0x2028 && fc != 0x20294799#endif /* Not EBCDIC */4800)))4801break;4802Feptr += len;4803}4804}4805break;48064807case OP_NOT_HSPACE:4808case OP_HSPACE:4809for (i = Lmin; i < Lmax; i++)4810{4811BOOL gotspace;4812int len = 1;4813if (Feptr >= mb->end_subject)4814{4815SCHECK_PARTIAL();4816break;4817}4818GETCHARLEN(fc, Feptr, len);4819switch(fc)4820{4821HSPACE_CASES: gotspace = TRUE; break;4822default: gotspace = FALSE; break;4823}4824if (gotspace == (Lctype == OP_NOT_HSPACE)) break;4825Feptr += len;4826}4827break;48284829case OP_NOT_VSPACE:4830case OP_VSPACE:4831for (i = Lmin; i < Lmax; i++)4832{4833BOOL gotspace;4834int len = 1;4835if (Feptr >= mb->end_subject)4836{4837SCHECK_PARTIAL();4838break;4839}4840GETCHARLEN(fc, Feptr, len);4841switch(fc)4842{4843VSPACE_CASES: gotspace = TRUE; break;4844default: gotspace = FALSE; break;4845}4846if (gotspace == (Lctype == OP_NOT_VSPACE)) break;4847Feptr += len;4848}4849break;48504851case OP_NOT_DIGIT:4852for (i = Lmin; i < Lmax; i++)4853{4854int len = 1;4855if (Feptr >= mb->end_subject)4856{4857SCHECK_PARTIAL();4858break;4859}4860GETCHARLEN(fc, Feptr, len);4861if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break;4862Feptr+= len;4863}4864break;48654866case OP_DIGIT:4867for (i = Lmin; i < Lmax; i++)4868{4869int len = 1;4870if (Feptr >= 
mb->end_subject)4871{4872SCHECK_PARTIAL();4873break;4874}4875GETCHARLEN(fc, Feptr, len);4876if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break;4877Feptr+= len;4878}4879break;48804881case OP_NOT_WHITESPACE:4882for (i = Lmin; i < Lmax; i++)4883{4884int len = 1;4885if (Feptr >= mb->end_subject)4886{4887SCHECK_PARTIAL();4888break;4889}4890GETCHARLEN(fc, Feptr, len);4891if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break;4892Feptr+= len;4893}4894break;48954896case OP_WHITESPACE:4897for (i = Lmin; i < Lmax; i++)4898{4899int len = 1;4900if (Feptr >= mb->end_subject)4901{4902SCHECK_PARTIAL();4903break;4904}4905GETCHARLEN(fc, Feptr, len);4906if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break;4907Feptr+= len;4908}4909break;49104911case OP_NOT_WORDCHAR:4912for (i = Lmin; i < Lmax; i++)4913{4914int len = 1;4915if (Feptr >= mb->end_subject)4916{4917SCHECK_PARTIAL();4918break;4919}4920GETCHARLEN(fc, Feptr, len);4921if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break;4922Feptr+= len;4923}4924break;49254926case OP_WORDCHAR:4927for (i = Lmin; i < Lmax; i++)4928{4929int len = 1;4930if (Feptr >= mb->end_subject)4931{4932SCHECK_PARTIAL();4933break;4934}4935GETCHARLEN(fc, Feptr, len);4936if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break;4937Feptr+= len;4938}4939break;49404941/* LCOV_EXCL_START */4942default:4943PCRE2_DEBUG_UNREACHABLE();4944return PCRE2_ERROR_INTERNAL;4945/* LCOV_EXCL_STOP */4946}49474948if (reptype == REPTYPE_POS) continue; /* No backtracking */49494950/* After \C in UTF mode, Lstart_eptr might be in the middle of a4951Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go4952too far. 
*/49534954for(;;)4955{4956if (Feptr <= Lstart_eptr) break;4957RMATCH(Fecode, RM220);4958if (rrc != MATCH_NOMATCH) RRETURN(rrc);4959Feptr--;4960BACKCHAR(Feptr);4961if (Lctype == OP_ANYNL && Feptr > Lstart_eptr &&4962UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR)4963Feptr--;4964}4965}4966else4967#endif /* SUPPORT_UNICODE */49684969/* Not UTF mode */4970{4971switch(Lctype)4972{4973case OP_ANY:4974for (i = Lmin; i < Lmax; i++)4975{4976if (Feptr >= mb->end_subject)4977{4978SCHECK_PARTIAL();4979break;4980}4981if (IS_NEWLINE(Feptr)) break;4982if (mb->partial != 0 && /* Take care with CRLF partial */4983Feptr + 1 >= mb->end_subject &&4984NLBLOCK->nltype == NLTYPE_FIXED &&4985NLBLOCK->nllen == 2 &&4986*Feptr == NLBLOCK->nl[0])4987{4988mb->hitend = TRUE;4989if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;4990}4991Feptr++;4992}4993break;49944995case OP_ALLANY:4996case OP_ANYBYTE:4997fc = Lmax - Lmin;4998if (fc > (uint32_t)(mb->end_subject - Feptr))4999{5000Feptr = mb->end_subject;5001SCHECK_PARTIAL();5002}5003else Feptr += fc;5004break;50055006case OP_ANYNL:5007for (i = Lmin; i < Lmax; i++)5008{5009if (Feptr >= mb->end_subject)5010{5011SCHECK_PARTIAL();5012break;5013}5014fc = *Feptr;5015if (fc == CHAR_CR)5016{5017if (++Feptr >= mb->end_subject) break;5018if (*Feptr == CHAR_LF) Feptr++;5019}5020else5021{5022if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||5023(fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL5024#if PCRE2_CODE_UNIT_WIDTH != 85025&& fc != 0x2028 && fc != 0x20295026#endif5027))) break;5028Feptr++;5029}5030}5031break;50325033case OP_NOT_HSPACE:5034for (i = Lmin; i < Lmax; i++)5035{5036if (Feptr >= mb->end_subject)5037{5038SCHECK_PARTIAL();5039break;5040}5041switch(*Feptr)5042{5043default: Feptr++; break;5044HSPACE_BYTE_CASES:5045#if PCRE2_CODE_UNIT_WIDTH != 85046HSPACE_MULTIBYTE_CASES:5047#endif5048goto ENDLOOP00;5049}5050}5051ENDLOOP00:5052break;50535054case OP_HSPACE:5055for (i = Lmin; i < Lmax; i++)5056{5057if (Feptr >= 
mb->end_subject)5058{5059SCHECK_PARTIAL();5060break;5061}5062switch(*Feptr)5063{5064default: goto ENDLOOP01;5065HSPACE_BYTE_CASES:5066#if PCRE2_CODE_UNIT_WIDTH != 85067HSPACE_MULTIBYTE_CASES:5068#endif5069Feptr++; break;5070}5071}5072ENDLOOP01:5073break;50745075case OP_NOT_VSPACE:5076for (i = Lmin; i < Lmax; i++)5077{5078if (Feptr >= mb->end_subject)5079{5080SCHECK_PARTIAL();5081break;5082}5083switch(*Feptr)5084{5085default: Feptr++; break;5086VSPACE_BYTE_CASES:5087#if PCRE2_CODE_UNIT_WIDTH != 85088VSPACE_MULTIBYTE_CASES:5089#endif5090goto ENDLOOP02;5091}5092}5093ENDLOOP02:5094break;50955096case OP_VSPACE:5097for (i = Lmin; i < Lmax; i++)5098{5099if (Feptr >= mb->end_subject)5100{5101SCHECK_PARTIAL();5102break;5103}5104switch(*Feptr)5105{5106default: goto ENDLOOP03;5107VSPACE_BYTE_CASES:5108#if PCRE2_CODE_UNIT_WIDTH != 85109VSPACE_MULTIBYTE_CASES:5110#endif5111Feptr++; break;5112}5113}5114ENDLOOP03:5115break;51165117case OP_NOT_DIGIT:5118for (i = Lmin; i < Lmax; i++)5119{5120if (Feptr >= mb->end_subject)5121{5122SCHECK_PARTIAL();5123break;5124}5125if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)5126break;5127Feptr++;5128}5129break;51305131case OP_DIGIT:5132for (i = Lmin; i < Lmax; i++)5133{5134if (Feptr >= mb->end_subject)5135{5136SCHECK_PARTIAL();5137break;5138}5139if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)5140break;5141Feptr++;5142}5143break;51445145case OP_NOT_WHITESPACE:5146for (i = Lmin; i < Lmax; i++)5147{5148if (Feptr >= mb->end_subject)5149{5150SCHECK_PARTIAL();5151break;5152}5153if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)5154break;5155Feptr++;5156}5157break;51585159case OP_WHITESPACE:5160for (i = Lmin; i < Lmax; i++)5161{5162if (Feptr >= mb->end_subject)5163{5164SCHECK_PARTIAL();5165break;5166}5167if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)5168break;5169Feptr++;5170}5171break;51725173case OP_NOT_WORDCHAR:5174for (i = Lmin; i < Lmax; i++)5175{5176if (Feptr >= 
mb->end_subject)5177{5178SCHECK_PARTIAL();5179break;5180}5181if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)5182break;5183Feptr++;5184}5185break;51865187case OP_WORDCHAR:5188for (i = Lmin; i < Lmax; i++)5189{5190if (Feptr >= mb->end_subject)5191{5192SCHECK_PARTIAL();5193break;5194}5195if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)5196break;5197Feptr++;5198}5199break;52005201/* LCOV_EXCL_START */5202default:5203PCRE2_DEBUG_UNREACHABLE();5204return PCRE2_ERROR_INTERNAL;5205/* LCOV_EXCL_STOP */5206}52075208if (reptype == REPTYPE_POS) continue; /* No backtracking */52095210for (;;)5211{5212if (Feptr == Lstart_eptr) break;5213RMATCH(Fecode, RM34);5214if (rrc != MATCH_NOMATCH) RRETURN(rrc);5215Feptr--;5216if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF &&5217Feptr[-1] == CHAR_CR) Feptr--;5218}5219}5220}5221break; /* End of repeat character type processing */52225223#undef Lstart_eptr5224#undef Lmin5225#undef Lmax5226#undef Lctype5227#undef Lpropvalue522852295230/* ===================================================================== */5231/* Match a back reference, possibly repeatedly. Look past the end of the5232item to see if there is repeat information following. The OP_REF and5233OP_REFI opcodes are used for a reference to a numbered group or to a5234non-duplicated named group. For a duplicated named group, OP_DNREF and5235OP_DNREFI are used. In this case we must scan the list of groups to which5236the name refers, and use the first one that is set. */52375238#define Lmin F->temp_32[0]5239#define Lmax F->temp_32[1]5240#define Lcaseless F->temp_32[2]5241#define Lcaseopts F->temp_32[3]5242#define Lstart F->temp_sptr[0]5243#define Loffset F->temp_size52445245case OP_DNREF:5246case OP_DNREFI:5247Lcaseless = (Fop == OP_DNREFI);5248Lcaseopts = (Fop == OP_DNREFI)? 
Fecode[1 + 2*IMM2_SIZE] : 0;5249{5250int count = GET2(Fecode, 1+IMM2_SIZE);5251PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;5252Fecode += 1 + 2*IMM2_SIZE + (Fop == OP_DNREFI? 1 : 0);52535254while (count-- > 0)5255{5256Loffset = (GET2(slot, 0) << 1) - 2;5257if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break;5258slot += mb->name_entry_size;5259}5260}5261goto REF_REPEAT;52625263case OP_REF:5264case OP_REFI:5265Lcaseless = (Fop == OP_REFI);5266Lcaseopts = (Fop == OP_REFI)? Fecode[1 + IMM2_SIZE] : 0;5267Loffset = (GET2(Fecode, 1) << 1) - 2;5268Fecode += 1 + IMM2_SIZE + (Fop == OP_REFI? 1 : 0);52695270/* Set up for repetition, or handle the non-repeated case. The maximum and5271minimum must be in the heap frame, but as they are short-term values, we5272use temporary fields. */52735274REF_REPEAT:5275switch (*Fecode)5276{5277case OP_CRSTAR:5278case OP_CRMINSTAR:5279case OP_CRPLUS:5280case OP_CRMINPLUS:5281case OP_CRQUERY:5282case OP_CRMINQUERY:5283fc = *Fecode++ - OP_CRSTAR;5284Lmin = rep_min[fc];5285Lmax = rep_max[fc];5286reptype = rep_typ[fc];5287break;52885289case OP_CRRANGE:5290case OP_CRMINRANGE:5291Lmin = GET2(Fecode, 1);5292Lmax = GET2(Fecode, 1 + IMM2_SIZE);5293reptype = rep_typ[*Fecode - OP_CRSTAR];5294if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */5295Fecode += 1 + 2 * IMM2_SIZE;5296break;52975298default: /* No repeat follows */5299{5300rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &length);5301if (rrc != 0)5302{5303if (rrc > 0) Feptr = mb->end_subject; /* Partial match */5304CHECK_PARTIAL();5305RRETURN(MATCH_NOMATCH);5306}5307}5308Feptr += length;5309continue; /* With the main loop */5310}53115312/* Handle repeated back references. If a set group has length zero, just5313continue with the main loop, because it matches however many times. For an5314unset reference, if the minimum is zero, we can also just continue. 
We can5315also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes unset5316group behave as a zero-length group. For any other unset cases, carrying5317on will result in NOMATCH. */53185319if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET)5320{5321if (Fovector[Loffset] == Fovector[Loffset + 1]) continue;5322}5323else /* Group is not set */5324{5325if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)5326continue;5327}53285329/* First, ensure the minimum number of matches are present. */53305331for (i = 1; i <= Lmin; i++)5332{5333PCRE2_SIZE slength;5334rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);5335if (rrc != 0)5336{5337if (rrc > 0) Feptr = mb->end_subject; /* Partial match */5338CHECK_PARTIAL();5339RRETURN(MATCH_NOMATCH);5340}5341Feptr += slength;5342}53435344/* If min = max, we are done. They are not both allowed to be zero. */53455346if (Lmin == Lmax) continue;53475348/* If minimizing, keep trying and advancing the pointer. */53495350if (reptype == REPTYPE_MIN)5351{5352for (;;)5353{5354PCRE2_SIZE slength;5355RMATCH(Fecode, RM20);5356if (rrc != MATCH_NOMATCH) RRETURN(rrc);5357if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);5358rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);5359if (rrc != 0)5360{5361if (rrc > 0) Feptr = mb->end_subject; /* Partial match */5362CHECK_PARTIAL();5363RRETURN(MATCH_NOMATCH);5364}5365Feptr += slength;5366}53675368PCRE2_UNREACHABLE(); /* Control never reaches here */5369}53705371/* If maximizing, find the longest string and work backwards, as long as5372the matched lengths for each iteration are the same. 
*/53735374else5375{5376BOOL samelengths = TRUE;5377Lstart = Feptr; /* Starting position */5378Flength = Fovector[Loffset+1] - Fovector[Loffset];53795380for (i = Lmin; i < Lmax; i++)5381{5382PCRE2_SIZE slength;5383rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);5384if (rrc != 0)5385{5386/* Can't use CHECK_PARTIAL because we don't want to update Feptr in5387the soft partial matching case. */53885389if (rrc > 0 && mb->partial != 0 &&5390mb->end_subject > mb->start_used_ptr)5391{5392mb->hitend = TRUE;5393if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;5394}5395break;5396}53975398if (slength != Flength) samelengths = FALSE;5399Feptr += slength;5400}54015402/* If the length matched for each repetition is the same as the length of5403the captured group, we can easily work backwards. This is the normal5404case. However, in caseless UTF-8 mode there are pairs of case-equivalent5405characters whose lengths (in terms of code units) differ. However, this5406is very rare, so we handle it by re-matching fewer and fewer times. */54075408if (samelengths)5409{5410while (Feptr >= Lstart)5411{5412RMATCH(Fecode, RM21);5413if (rrc != MATCH_NOMATCH) RRETURN(rrc);5414Feptr -= Flength;5415}5416}54175418/* The rare case of non-matching lengths. Re-scan the repetition for each5419iteration. We know that match_ref() will succeed every time. 
*/54205421else5422{5423Lmax = i;5424for (;;)5425{5426RMATCH(Fecode, RM22);5427if (rrc != MATCH_NOMATCH) RRETURN(rrc);5428if (Feptr == Lstart) break; /* Failed after minimal repetition */5429Feptr = Lstart;5430Lmax--;5431for (i = Lmin; i < Lmax; i++)5432{5433PCRE2_SIZE slength;5434(void)match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);5435Feptr += slength;5436}5437}5438}54395440RRETURN(MATCH_NOMATCH);5441}54425443PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */54445445#undef Lcaseless5446#undef Lmin5447#undef Lmax5448#undef Lstart5449#undef Loffset5450545154525453/* ========================================================================= */5454/* Opcodes for the start of various parenthesized items */5455/* ========================================================================= */54565457/* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the5458(*THEN) is within the current branch by comparing the address of OP_THEN5459that is passed back with the end of the branch. If (*THEN) is within the5460current branch, and the branch is one of two or more alternatives (it5461either starts or ends with OP_ALT), we have reached the limit of THEN's5462action, so convert the return code to NOMATCH, which will cause normal5463backtracking to happen from now on. Otherwise, THEN is passed back to an5464outer alternative. This implements Perl's treatment of parenthesized5465groups, where a group not containing | does not affect the current5466alternative, that is, (X) is NOT the same as (X|(*F)). */546754685469/* ===================================================================== */5470/* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive5471bracket group, indicating that it may occur zero times. It may repeat5472infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in5473the pattern. 
Brackets with fixed upper repeat limits are compiled as a5474number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO.5475Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */54765477#define Lnext_ecode F->temp_sptr[0]54785479case OP_BRAZERO:5480Lnext_ecode = Fecode + 1;5481RMATCH(Lnext_ecode, RM9);5482if (rrc != MATCH_NOMATCH) RRETURN(rrc);5483do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);5484Fecode = Lnext_ecode + 1 + LINK_SIZE;5485break;54865487case OP_BRAMINZERO:5488Lnext_ecode = Fecode + 1;5489do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);5490RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10);5491if (rrc != MATCH_NOMATCH) RRETURN(rrc);5492Fecode++;5493break;54945495#undef Lnext_ecode54965497case OP_SKIPZERO:5498Fecode++;5499do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);5500Fecode += 1 + LINK_SIZE;5501break;550255035504/* ===================================================================== */5505/* Handle possessive brackets with an unlimited repeat. The end of these5506brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without5507going further in the pattern. 
*/55085509#define Lframe_type F->temp_32[0]5510#define Lmatched_once F->temp_32[1]5511#define Lzero_allowed F->temp_32[2]5512#define Lstart_eptr F->temp_sptr[0]5513#define Lstart_group F->temp_sptr[1]55145515case OP_BRAPOSZERO:5516Lzero_allowed = TRUE; /* Zero repeat is allowed */5517Fecode += 1;5518if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS)5519goto POSSESSIVE_CAPTURE;5520goto POSSESSIVE_NON_CAPTURE;55215522case OP_BRAPOS:5523case OP_SBRAPOS:5524Lzero_allowed = FALSE; /* Zero repeat not allowed */55255526POSSESSIVE_NON_CAPTURE:5527Lframe_type = GF_NOCAPTURE; /* Remembered frame type */5528goto POSSESSIVE_GROUP;55295530case OP_CBRAPOS:5531case OP_SCBRAPOS:5532Lzero_allowed = FALSE; /* Zero repeat not allowed */55335534POSSESSIVE_CAPTURE:5535number = GET2(Fecode, 1+LINK_SIZE);5536Lframe_type = GF_CAPTURE | number; /* Remembered frame type */55375538POSSESSIVE_GROUP:5539Lmatched_once = FALSE; /* Never matched */5540Lstart_group = Fecode; /* Start of this group */55415542for (;;)5543{5544Lstart_eptr = Feptr; /* Position at group start */5545group_frame_type = Lframe_type;5546RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8);5547if (rrc == MATCH_KETRPOS)5548{5549Lmatched_once = TRUE; /* Matched at least once */5550if (Feptr == Lstart_eptr) /* Empty match; skip to end */5551{5552do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);5553break;5554}55555556Fecode = Lstart_group;5557continue;5558}55595560/* See comment above about handling THEN. 
*/55615562if (rrc == MATCH_THEN)5563{5564PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);5565if (mb->verb_ecode_ptr < next_ecode &&5566(*Fecode == OP_ALT || *next_ecode == OP_ALT))5567rrc = MATCH_NOMATCH;5568}55695570if (rrc != MATCH_NOMATCH) RRETURN(rrc);5571Fecode += GET(Fecode, 1);5572if (*Fecode != OP_ALT) break;5573}55745575/* Success if matched something or zero repeat allowed */55765577if (Lmatched_once || Lzero_allowed)5578{5579Fecode += 1 + LINK_SIZE;5580break;5581}55825583RRETURN(MATCH_NOMATCH);55845585#undef Lmatched_once5586#undef Lzero_allowed5587#undef Lframe_type5588#undef Lstart_eptr5589#undef Lstart_group559055915592/* ===================================================================== */5593/* Handle non-capturing brackets that cannot match an empty string. When we5594get to the final alternative within the brackets, as long as there are no5595THEN's in the pattern, we can optimize by not recording a new backtracking5596point. (Ideally we should test for a THEN within this group, but we don't5597have that information.) Don't do this if we are at the very top level,5598however, because that would make handling assertions and once-only brackets5599messier when there is nothing to go back to. */56005601#define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */5602#define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */56035604case OP_BRA:5605if (mb->hasthen || Frdepth == 0)5606{5607Lframe_type = 0;5608goto GROUPLOOP;5609}56105611for (;;)5612{5613Lnext_branch = Fecode + GET(Fecode, 1);5614if (*Lnext_branch != OP_ALT) break;56155616/* This is never the final branch. We do not need to test for MATCH_THEN5617here because this code is not used when there is a THEN in the pattern. */56185619RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1);5620if (rrc != MATCH_NOMATCH) RRETURN(rrc);5621Fecode = Lnext_branch;5622}56235624/* Hit the start of the final branch. Continue at this level. 
*/56255626Fecode += PRIV(OP_lengths)[*Fecode];5627break;56285629#undef Lnext_branch563056315632/* ===================================================================== */5633/* Handle a capturing bracket, other than those that are possessive with an5634unlimited repeat. */56355636case OP_CBRA:5637case OP_SCBRA:5638Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE);5639goto GROUPLOOP;564056415642/* ===================================================================== */5643/* Atomic groups and non-capturing brackets that can match an empty string5644must record a backtracking point and also set up a chained frame. */56455646case OP_ONCE:5647case OP_SCRIPT_RUN:5648case OP_SBRA:5649Lframe_type = GF_NOCAPTURE | Fop;56505651GROUPLOOP:5652for (;;)5653{5654group_frame_type = Lframe_type;5655RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2);5656if (rrc == MATCH_THEN)5657{5658PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);5659if (mb->verb_ecode_ptr < next_ecode &&5660(*Fecode == OP_ALT || *next_ecode == OP_ALT))5661rrc = MATCH_NOMATCH;5662}5663if (rrc != MATCH_NOMATCH) RRETURN(rrc);5664Fecode += GET(Fecode, 1);5665if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);5666}5667PCRE2_UNREACHABLE(); /* Control never reaches here */56685669#undef Lframe_type567056715672/* ===================================================================== */5673/* Pattern recursion either matches the current regex, or some5674subexpression. The offset data is the offset to the starting bracket from5675the start of the whole pattern. This is so that it works from duplicated5676subpatterns. For a whole-pattern recursion, we have to infer the number5677zero. */56785679#define Lframe_type F->temp_32[0]5680#define Lstart_branch F->temp_sptr[0]56815682case OP_RECURSE:5683bracode = mb->start_code + GET(Fecode, 1);5684number = (bracode == mb->start_code)? 
0 : GET2(bracode, 1 + LINK_SIZE);56855686/* If we are already in a pattern recursion, check for repeating the same5687one without changing the subject pointer or the last referenced character5688in the subject. This should catch convoluted mutual recursions; some5689simple cases are caught at compile time. However, there are rare cases when5690this check needs to be turned off. In this case, actual recursion loops5691will be caught by the match or heap limits. */56925693if (Fcurrent_recurse != RECURSE_UNSET)5694{5695offset = Flast_group_offset;5696while (offset != PCRE2_UNSET)5697{5698N = (heapframe *)((char *)match_data->heapframes + offset);5699P = (heapframe *)((char *)N - frame_size);5700if (N->group_frame_type == (GF_RECURSE | number))5701{5702if (Feptr == P->eptr && mb->last_used_ptr == P->recurse_last_used &&5703(mb->moptions & PCRE2_DISABLE_RECURSELOOP_CHECK) == 0)5704return PCRE2_ERROR_RECURSELOOP;5705break;5706}5707offset = P->last_group_offset;5708}5709}57105711/* Remember the current last referenced character and then run the5712recursion branch by branch. */57135714F->recurse_last_used = mb->last_used_ptr;5715Lstart_branch = bracode;5716Lframe_type = GF_RECURSE | number;57175718for (;;)5719{5720PCRE2_SPTR next_ecode;57215722group_frame_type = Lframe_type;5723RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);5724next_ecode = Lstart_branch + GET(Lstart_branch,1);57255726/* Handle backtracking verbs, which are defined in a range that can5727easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to5728escape beyond a recursion; they cause a NOMATCH for the entire recursion.57295730When one of these verbs triggers, the current recursion group number is5731recorded. If it matches the recursion we are processing, the verb5732happened within the recursion and we must deal with it. Otherwise it must5733have happened after the recursion completed, and so has to be passed5734back. See comment above about handling THEN. 
*/57355736if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&5737mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))5738{5739if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&5740(*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))5741rrc = MATCH_NOMATCH;5742else RRETURN(MATCH_NOMATCH);5743}57445745/* Note that carrying on after (*ACCEPT) in a recursion is handled in the5746OP_ACCEPT code. Nothing needs to be done here. */57475748if (rrc != MATCH_NOMATCH) RRETURN(rrc);5749Lstart_branch = next_ecode;5750if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);5751}5752PCRE2_UNREACHABLE(); /* Control never reaches here */57535754#undef Lframe_type5755#undef Lstart_branch575657575758/* ===================================================================== */5759/* Positive assertions are like other groups except that PCRE doesn't allow5760the effect of (*THEN) to escape beyond an assertion; it is therefore5761treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its5762captures and mark retained. Any other return is an error. 
*/57635764#define Lframe_type F->temp_32[0]57655766case OP_ASSERT:5767case OP_ASSERTBACK:5768case OP_ASSERT_NA:5769case OP_ASSERTBACK_NA:5770Lframe_type = GF_NOCAPTURE | Fop;5771for (;;)5772{5773group_frame_type = Lframe_type;5774RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3);5775if (rrc == MATCH_ACCEPT)5776{5777memcpy(Fovector,5778(char *)assert_accept_frame + offsetof(heapframe, ovector),5779assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));5780Foffset_top = assert_accept_frame->offset_top;5781Fmark = assert_accept_frame->mark;5782break;5783}5784if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);5785Fecode += GET(Fecode, 1);5786if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);5787}57885789do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);5790Fecode += 1 + LINK_SIZE;5791break;57925793#undef Lframe_type579457955796/* ===================================================================== */5797/* Handle negative assertions. Loop for each non-matching branch as for5798positive assertions. */57995800#define Lframe_type F->temp_32[0]58015802case OP_ASSERT_NOT:5803case OP_ASSERTBACK_NOT:5804Lframe_type = GF_NOCAPTURE | Fop;58055806for (;;)5807{5808group_frame_type = Lframe_type;5809RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4);5810switch(rrc)5811{5812case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */5813case MATCH_MATCH:5814RRETURN (MATCH_NOMATCH);58155816case MATCH_NOMATCH: /* Branch failed, try next if present. */5817case MATCH_THEN:5818Fecode += GET(Fecode, 1);5819if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED;5820break;58215822case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */5823case MATCH_SKIP:5824case MATCH_PRUNE:5825do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);5826goto ASSERT_NOT_FAILED;58275828default: /* Pass back any other return */5829RRETURN(rrc);5830}5831}58325833/* None of the branches have matched or there was a backtrack to (*COMMIT),5834(*SKIP), (*PRUNE), or (*THEN) in the last branch. 
This is success for a5835negative assertion, so carry on. */58365837ASSERT_NOT_FAILED:5838Fecode += 1 + LINK_SIZE;5839break;58405841#undef Lframe_type58425843/* ===================================================================== */5844/* Handle scan substring operation. */58455846#define Lframe_type F->temp_32[0]5847#define Lextra_size F->temp_32[1]5848#define Lsaved_moptions F->temp_32[2]5849#define Lsaved_end_subject F->temp_sptr[0]5850#define Lsaved_eptr F->temp_sptr[1]5851#define Ltrue_end_extra F->temp_size58525853case OP_ASSERT_SCS:5854{5855PCRE2_SPTR ecode = Fecode + 1 + LINK_SIZE;5856uint32_t extra_size = 0;5857int count;5858PCRE2_SPTR slot;58595860/* Disable compiler warning. */5861offset = 0;5862(void)offset;58635864for (;;)5865{5866if (*ecode == OP_CREF)5867{5868extra_size += 1+IMM2_SIZE;5869offset = (GET2(ecode, 1) << 1) - 2;5870ecode += 1+IMM2_SIZE;5871if (offset < Foffset_top && Fovector[offset] != PCRE2_UNSET)5872goto SCS_OFFSET_FOUND;5873continue;5874}58755876if (*ecode != OP_DNCREF) RRETURN(MATCH_NOMATCH);58775878count = GET2(ecode, 1 + IMM2_SIZE);5879slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size;5880extra_size += 1+2*IMM2_SIZE;5881ecode += 1+2*IMM2_SIZE;58825883while (count > 0)5884{5885offset = (GET2(slot, 0) << 1) - 2;5886if (offset < Foffset_top && Fovector[offset] != PCRE2_UNSET)5887goto SCS_OFFSET_FOUND;5888slot += mb->name_entry_size;5889count--;5890}5891}58925893SCS_OFFSET_FOUND:58945895/* Skip remaining options. 
*/5896for (;;)5897{5898if (*ecode == OP_CREF)5899{5900extra_size += 1+IMM2_SIZE;5901ecode += 1+IMM2_SIZE;5902}5903else if (*ecode == OP_DNCREF)5904{5905extra_size += 1+2*IMM2_SIZE;5906ecode += 1+2*IMM2_SIZE;5907}5908else break;5909}59105911Lextra_size = extra_size;5912}59135914Lsaved_end_subject = mb->end_subject;5915Ltrue_end_extra = mb->true_end_subject - mb->end_subject;5916Lsaved_eptr = Feptr;5917Lsaved_moptions = mb->moptions;59185919Feptr = mb->start_subject + Fovector[offset];5920mb->true_end_subject = mb->end_subject =5921mb->start_subject + Fovector[offset + 1];5922mb->moptions &= ~PCRE2_NOTEOL;59235924Lframe_type = GF_NOCAPTURE | Fop;5925for (;;)5926{5927group_frame_type = Lframe_type;5928RMATCH(Fecode + 1 + LINK_SIZE + Lextra_size, RM38);5929if (rrc == MATCH_ACCEPT)5930{5931memcpy(Fovector,5932(char *)assert_accept_frame + offsetof(heapframe, ovector),5933assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));5934Foffset_top = assert_accept_frame->offset_top;5935Fmark = assert_accept_frame->mark;5936mb->end_subject = Lsaved_end_subject;5937mb->true_end_subject = mb->end_subject + Ltrue_end_extra;5938mb->moptions = Lsaved_moptions;5939break;5940}59415942if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)5943{5944mb->end_subject = Lsaved_end_subject;5945mb->true_end_subject = mb->end_subject + Ltrue_end_extra;5946mb->moptions = Lsaved_moptions;5947RRETURN(rrc);5948}59495950Fecode += GET(Fecode, 1);5951if (*Fecode != OP_ALT)5952{5953mb->end_subject = Lsaved_end_subject;5954mb->true_end_subject = mb->end_subject + Ltrue_end_extra;5955mb->moptions = Lsaved_moptions;5956RRETURN(MATCH_NOMATCH);5957}5958Lextra_size = 0;5959}59605961do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);5962Fecode += 1 + LINK_SIZE;5963Feptr = Lsaved_eptr;5964break;59655966#undef Lframe_type5967#undef Lextra_size5968#undef Lsaved_end_subject5969#undef Lsaved_eptr5970#undef Ltrue_end_extra5971#undef Lsave_moptions59725973/* 
===================================================================== */5974/* The callout item calls an external function, if one is provided, passing5975details of the match so far. This is mainly for debugging, though the5976function is able to force a failure. */59775978case OP_CALLOUT:5979case OP_CALLOUT_STR:5980rrc = do_callout(F, mb, &length);5981if (rrc > 0) RRETURN(MATCH_NOMATCH);5982if (rrc < 0) RRETURN(rrc);5983Fecode += length;5984break;598559865987/* ===================================================================== */5988/* Conditional group: compilation checked that there are no more than two5989branches. If the condition is false, skipping the first branch takes us5990past the end of the item if there is only one branch, but that's exactly5991what we want. */59925993case OP_COND:5994case OP_SCOND:59955996/* The variable Flength will be added to Fecode when the condition is5997false, to get to the second branch. Setting it to the offset to the ALT or5998KET, then incrementing Fecode achieves this effect. However, if the second5999branch is non-existent, we must point to the KET so that the end of the6000group is correctly processed. We now have Fecode pointing to the condition6001or callout. */60026003Flength = GET(Fecode, 1); /* Offset to the second branch */6004if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE;6005Fecode += 1 + LINK_SIZE; /* From this opcode */60066007/* Because of the way auto-callout works during compile, a callout item is6008inserted between OP_COND and an assertion condition. Such a callout can6009also be inserted manually. */60106011if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR)6012{6013rrc = do_callout(F, mb, &length);6014if (rrc > 0) RRETURN(MATCH_NOMATCH);6015if (rrc < 0) RRETURN(rrc);60166017/* Advance Fecode past the callout, so it now points to the condition. We6018must adjust Flength so that the value of Fecode+Flength is unchanged. 
*/60196020Fecode += length;6021Flength -= length;6022}60236024/* Test the various possible conditions */60256026condition = FALSE;6027switch(*Fecode)6028{6029case OP_RREF: /* Group recursion test */6030if (Fcurrent_recurse != RECURSE_UNSET)6031{6032number = GET2(Fecode, 1);6033condition = (number == RREF_ANY || number == Fcurrent_recurse);6034}6035break;60366037case OP_DNRREF: /* Duplicate named group recursion test */6038if (Fcurrent_recurse != RECURSE_UNSET)6039{6040int count = GET2(Fecode, 1 + IMM2_SIZE);6041PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;6042while (count-- > 0)6043{6044number = GET2(slot, 0);6045condition = number == Fcurrent_recurse;6046if (condition) break;6047slot += mb->name_entry_size;6048}6049}6050break;60516052case OP_CREF: /* Numbered group used test */6053offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */6054condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;6055break;60566057case OP_DNCREF: /* Duplicate named group used test */6058{6059int count = GET2(Fecode, 1 + IMM2_SIZE);6060PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;6061while (count-- > 0)6062{6063offset = (GET2(slot, 0) << 1) - 2;6064condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;6065if (condition) break;6066slot += mb->name_entry_size;6067}6068}6069break;60706071case OP_FALSE:6072case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */6073break;60746075case OP_TRUE:6076condition = TRUE;6077break;60786079/* The condition is an assertion. Run code similar to the assertion code6080above. 
*/60816082#define Lpositive F->temp_32[0]6083#define Lstart_branch F->temp_sptr[0]60846085default:6086Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK);6087Lstart_branch = Fecode;60886089for (;;)6090{6091group_frame_type = GF_CONDASSERT | *Fecode;6092RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5);60936094switch(rrc)6095{6096case MATCH_ACCEPT: /* Save captures */6097memcpy(Fovector,6098(char *)assert_accept_frame + offsetof(heapframe, ovector),6099assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));6100Foffset_top = assert_accept_frame->offset_top;61016102PCRE2_FALLTHROUGH /* Fall through */6103/* In the case of a match, the captures have already been put into6104the current frame. */61056106case MATCH_MATCH:6107condition = Lpositive; /* TRUE for positive assertion */6108break;61096110/* PCRE doesn't allow the effect of (*THEN) to escape beyond an6111assertion; it is therefore always treated as NOMATCH. */61126113case MATCH_NOMATCH:6114case MATCH_THEN:6115Lstart_branch += GET(Lstart_branch, 1);6116if (*Lstart_branch == OP_ALT) continue; /* Try next branch */6117condition = !Lpositive; /* TRUE for negative assertion */6118break;61196120/* These force no match without checking other branches. */61216122case MATCH_COMMIT:6123case MATCH_SKIP:6124case MATCH_PRUNE:6125condition = !Lpositive;6126break;61276128default:6129RRETURN(rrc);6130}6131break; /* Out of the branch loop */6132}61336134/* If the condition is true, find the end of the assertion so that6135advancing past it gets us to the start of the first branch. */61366137if (condition)6138{6139do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);6140}6141break; /* End of assertion condition */6142}61436144#undef Lpositive6145#undef Lstart_branch61466147/* Choose branch according to the condition. */61486149Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength;61506151/* If the opcode is OP_SCOND it means we are at a repeated conditional6152group that might match an empty string. 
We must therefore descend a level6153so that the start is remembered for checking. For OP_COND we can just6154continue at this level. */61556156if (Fop == OP_SCOND)6157{6158group_frame_type = GF_NOCAPTURE | Fop;6159RMATCH(Fecode, RM35);6160RRETURN(rrc);6161}6162break;6163616461656166/* ========================================================================= */6167/* End of start of parenthesis opcodes */6168/* ========================================================================= */616961706171/* ===================================================================== */6172/* Move the subject pointer back by one fixed amount. This occurs at the6173start of each branch that has a fixed length in a lookbehind assertion. If6174we are too close to the start to move back, fail. When working with UTF-86175we move back a number of characters, not bytes. */61766177case OP_REVERSE:6178number = GET2(Fecode, 1);6179#ifdef SUPPORT_UNICODE6180if (utf)6181{6182/* We used to do a simpler `while (number-- > 0)` but that triggers6183clang's unsigned integer overflow sanitizer. */6184while (number > 0)6185{6186--number;6187if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH);6188Feptr--;6189BACKCHAR(Feptr);6190}6191}6192else6193#endif61946195/* No UTF support, or not in UTF mode: count is code unit count */61966197{6198if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);6199Feptr -= number;6200}62016202/* Save the earliest consulted character, then skip to next opcode */62036204if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;6205Fecode += 1 + IMM2_SIZE;6206break;620762086209/* ===================================================================== */6210/* Move the subject pointer back by a variable amount. This occurs at the6211start of each branch of a lookbehind assertion when the branch has a6212variable, but limited, length. A loop is needed to try matching the branch6213after moving back different numbers of characters. 
If we are too close to6214the start to move back even the minimum amount, fail. When working with6215UTF-8 we move back a number of characters, not bytes. */62166217#define Lmin F->temp_32[0]6218#define Lmax F->temp_32[1]6219#define Leptr F->temp_sptr[0]62206221case OP_VREVERSE:6222Lmin = GET2(Fecode, 1);6223Lmax = GET2(Fecode, 1 + IMM2_SIZE);6224Leptr = Feptr;62256226/* Move back by the maximum branch length and then work forwards. This6227ensures that items such as \d{3,5} get the maximum length, which is6228relevant for captures, and makes for Perl compatibility. */62296230#ifdef SUPPORT_UNICODE6231if (utf)6232{6233for (i = 0; i < Lmax; i++)6234{6235if (Feptr == mb->start_subject)6236{6237if (i < Lmin) RRETURN(MATCH_NOMATCH);6238Lmax = i;6239break;6240}6241Feptr--;6242BACKCHAR(Feptr);6243}6244}6245else6246#endif62476248/* No UTF support or not in UTF mode */62496250{6251ptrdiff_t diff = Feptr - mb->start_subject;6252uint32_t available = (diff > 65535)? 65535 : ((diff > 0)? (int)diff : 0);6253if (Lmin > available) RRETURN(MATCH_NOMATCH);6254if (Lmax > available) Lmax = available;6255Feptr -= Lmax;6256}62576258/* Now try matching, moving forward one character on failure, until we6259reach the minimum back length. */62606261for (;;)6262{6263RMATCH(Fecode + 1 + 2 * IMM2_SIZE, RM37);6264if (rrc != MATCH_NOMATCH) RRETURN(rrc);6265if (Lmax-- <= Lmin) RRETURN(MATCH_NOMATCH);6266Feptr++;6267#ifdef SUPPORT_UNICODE6268if (utf) { FORWARDCHARTEST(Feptr, mb->end_subject); }6269#endif6270}6271PCRE2_UNREACHABLE(); /* Control never reaches here */62726273#undef Lmin6274#undef Lmax6275#undef Leptr62766277/* ===================================================================== */6278/* An alternation is the end of a branch; scan along to find the end of the6279bracketed group. 
*/62806281case OP_ALT:6282branch_end = Fecode;6283do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);6284break;628562866287/* ===================================================================== */6288/* The end of a parenthesized group. For all but OP_BRA and OP_COND, the6289starting frame was added to the chained frames in order to remember the6290starting subject position for the group. (Not true for OP_BRA when it's a6291whole pattern recursion, but that is handled separately below.)*/62926293case OP_KET:6294case OP_KETRMIN:6295case OP_KETRMAX:6296case OP_KETRPOS:62976298bracode = Fecode - GET(Fecode, 1);62996300if (branch_end == NULL) branch_end = Fecode;6301branch_start = bracode;6302while (branch_start + GET(branch_start, 1) != branch_end)6303branch_start += GET(branch_start, 1);6304branch_end = NULL;63056306/* Point N to the frame at the start of the most recent group, and P to its6307predecessor. Remember the subject pointer at the start of the group. */63086309if (*bracode != OP_BRA && *bracode != OP_COND)6310{6311N = (heapframe *)((char *)match_data->heapframes + Flast_group_offset);6312P = (heapframe *)((char *)N - frame_size);6313Flast_group_offset = P->last_group_offset;63146315#ifdef DEBUG_SHOW_RMATCH6316fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n",6317N->rdepth, N->group_frame_type,6318(char *)P->eptr - (char *)mb->start_subject);6319#endif63206321/* If we are at the end of an assertion that is a condition, first check6322to see if we are at the end of a variable-length branch in a lookbehind.6323If this is the case and we have not landed on the current character,6324return no match. Compare code below for non-condition lookbehinds. In6325other cases, return a match, discarding any intermediate backtracking6326points. Copy back the mark setting and the captures into the frame before6327N so that they are set on return. Doing this for all assertions, both6328positive and negative, seems to match what Perl does. 
*/63296330if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT)6331{6332if ((*bracode == OP_ASSERTBACK || *bracode == OP_ASSERTBACK_NOT) &&6333branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)6334RRETURN(MATCH_NOMATCH);6335memcpy((char *)P + offsetof(heapframe, ovector), Fovector,6336Foffset_top * sizeof(PCRE2_SIZE));6337P->offset_top = Foffset_top;6338P->mark = Fmark;6339Fback_frame = (char *)F - (char *)P;6340RRETURN(MATCH_MATCH);6341}6342}6343else P = NULL; /* Indicates starting frame not recorded */63446345/* The group was not a conditional assertion. */63466347switch (*bracode)6348{6349/* Whole pattern recursion is handled as a recursion into group 0, but6350the entire pattern is wrapped in OP_BRA/OP_KET rather than a capturing6351group - a design mistake: it should perhaps have been capture group 0.6352Anyway, that means the end of such recursion must be handled here. It is6353detected by checking for an immediately following OP_END when we are6354recursing in group 0. If this is not the end of a whole-pattern6355recursion, there is nothing to be done. */63566357case OP_BRA:6358if (Fcurrent_recurse != 0 || Fecode[1+LINK_SIZE] != OP_END) break;63596360/* It is the end of whole-pattern recursion. */63616362offset = Flast_group_offset;63636364/* Corrupted heapframes?. Trigger an assert and return an error */6365PCRE2_ASSERT(offset != PCRE2_UNSET);6366if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;63676368N = (heapframe *)((char *)match_data->heapframes + offset);6369P = (heapframe *)((char *)N - frame_size);6370Flast_group_offset = P->last_group_offset;63716372/* Reinstate the previous set of captures and then carry on after the6373recursion call. 
*/63746375Fecode = P->ecode + 1 + LINK_SIZE;63766377if (*Fecode != OP_CREF)6378{6379memcpy(F->ovector, P->ovector, Foffset_top * sizeof(PCRE2_SIZE));6380Foffset_top = P->offset_top;6381}6382else6383recurse_update_offsets(F, P);63846385Fcapture_last = P->capture_last;6386Fcurrent_recurse = P->current_recurse;6387continue; /* With next opcode */63886389case OP_COND: /* No need to do anything for these */6390case OP_SCOND:6391break;63926393/* Non-atomic positive assertions are like OP_BRA, except that the6394subject pointer must be put back to where it was at the start of the6395assertion. For a variable lookbehind, check its end point. */63966397case OP_ASSERTBACK_NA:6398if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)6399RRETURN(MATCH_NOMATCH);6400PCRE2_FALLTHROUGH /* Fall through */64016402case OP_ASSERT_NA:6403if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;6404Feptr = P->eptr;6405break;64066407/* Atomic positive assertions are like OP_ONCE, except that in addition6408the subject pointer must be put back to where it was at the start of the6409assertion. For a variable lookbehind, check its end point. */64106411case OP_ASSERTBACK:6412if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)6413RRETURN(MATCH_NOMATCH);6414PCRE2_FALLTHROUGH /* Fall through */64156416case OP_ASSERT:6417if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;6418Feptr = P->eptr;6419PCRE2_FALLTHROUGH /* Fall through */64206421/* For an atomic group, discard internal backtracking points. We must6422also ensure that any remaining branches within the top-level of the group6423are not tried. Do this by adjusting the code pointer within the backtrack6424frame so that it points to the final branch. 
*/64256426case OP_ONCE:6427Fback_frame = ((char *)F - (char *)P);6428for (;;)6429{6430uint32_t y = GET(P->ecode,1);6431if ((P->ecode)[y] != OP_ALT) break;6432P->ecode += y;6433}6434break;64356436/* A matching negative assertion returns MATCH, which is turned into6437NOMATCH at the assertion level. For a variable lookbehind, check its end6438point. */64396440case OP_ASSERTBACK_NOT:6441if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)6442RRETURN(MATCH_NOMATCH);6443PCRE2_FALLTHROUGH /* Fall through */64446445case OP_ASSERT_NOT:6446RRETURN(MATCH_MATCH);64476448/* A scan substring group must preserve the current end_subject,6449and restore it before the backtracking is performed into its sub6450pattern. */64516452case OP_ASSERT_SCS:6453F->temp_sptr[0] = mb->end_subject;6454mb->end_subject = P->temp_sptr[0];6455mb->true_end_subject = mb->end_subject + P->temp_size;6456Feptr = P->temp_sptr[1];64576458RMATCH(Fecode + 1 + LINK_SIZE, RM39);64596460mb->end_subject = F->temp_sptr[0];6461mb->true_end_subject = mb->end_subject;6462RRETURN(rrc);6463break;64646465/* At the end of a script run, apply the script-checking rules. This code6466will never be exercised if Unicode support is not compiled, because in6467that environment script runs cause an error at compile time. */64686469case OP_SCRIPT_RUN:6470if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);6471break;64726473/* Whole-pattern recursion is coded as a recurse into group 0, and is6474handled with OP_BRA above. Other recursion is handled here. */64756476case OP_CBRA:6477case OP_CBRAPOS:6478case OP_SCBRA:6479case OP_SCBRAPOS:6480number = GET2(bracode, 1+LINK_SIZE);64816482/* Handle a recursively called group. We reinstate the previous set of6483captures and then carry on after the recursion call. 
*/64846485if (Fcurrent_recurse == number)6486{6487P = (heapframe *)((char *)N - frame_size);6488Fecode = P->ecode + 1 + LINK_SIZE;64896490if (*Fecode != OP_CREF)6491{6492memcpy(F->ovector, P->ovector, Foffset_top * sizeof(PCRE2_SIZE));6493Foffset_top = P->offset_top;6494}6495else6496recurse_update_offsets(F, P);64976498Fcapture_last = P->capture_last;6499Fcurrent_recurse = P->current_recurse;6500continue; /* With next opcode */6501}65026503/* Deal with actual capturing. */65046505offset = (number << 1) - 2;6506Fcapture_last = number;6507Fovector[offset] = P->eptr - mb->start_subject;6508Fovector[offset+1] = Feptr - mb->start_subject;6509if (offset >= Foffset_top) Foffset_top = offset + 2;6510break;6511} /* End actions relating to the starting opcode */65126513/* OP_KETRPOS is a possessive repeating ket. Remember the current position,6514and return the MATCH_KETRPOS. This makes it possible to do the repeats one6515at a time from the outer level. This must precede the empty string test -6516in this case that test is done at the outer level. */65176518if (*Fecode == OP_KETRPOS)6519{6520memcpy((char *)P + offsetof(heapframe, eptr),6521(char *)F + offsetof(heapframe, eptr),6522frame_copy_size);6523RRETURN(MATCH_KETRPOS);6524}65256526/* Handle the different kinds of closing brackets. A non-repeating ket6527needs no special action, just continuing at this level. This also happens6528for the repeating kets if the group matched no characters, in order to6529forcibly break infinite loops. Otherwise, the repeating kets try the rest6530of the pattern or restart from the preceding bracket, in the appropriate6531order. 
*/65326533if (Fop != OP_KET && (P == NULL || Feptr != P->eptr))6534{6535if (Fop == OP_KETRMIN)6536{6537RMATCH(Fecode + 1 + LINK_SIZE, RM6);6538if (rrc != MATCH_NOMATCH) RRETURN(rrc);6539Fecode -= GET(Fecode, 1);6540break; /* End of ket processing */6541}65426543/* Repeat the maximum number of times (KETRMAX) */65446545RMATCH(bracode, RM7);6546if (rrc != MATCH_NOMATCH) RRETURN(rrc);6547}65486549/* Carry on at this level for a non-repeating ket, or after matching an6550empty string, or after repeating for a maximum number of times. */65516552Fecode += 1 + LINK_SIZE;6553break;655465556556/* ===================================================================== */6557/* Start and end of line assertions, not multiline mode. */65586559case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */6560if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0)6561RRETURN(MATCH_NOMATCH);6562Fecode++;6563break;65646565case OP_SOD: /* Unconditional start of subject */6566if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH);6567Fecode++;6568break;65696570/* When PCRE2_NOTEOL is unset, assert before the subject end, or a6571terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */65726573case OP_DOLL:6574if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);6575if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;65766577PCRE2_FALLTHROUGH /* Fall through */6578/* Unconditional end of subject assertion (\z). 
*/65796580case OP_EOD:6581if (Feptr < mb->true_end_subject) RRETURN(MATCH_NOMATCH);6582if (mb->partial != 0)6583{6584mb->hitend = TRUE;6585if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;6586}6587Fecode++;6588break;65896590/* End of subject or ending \n assertion (\Z) */65916592case OP_EODN:6593ASSERT_NL_OR_EOS:6594if (Feptr < mb->true_end_subject &&6595(!IS_NEWLINE(Feptr) || Feptr != mb->true_end_subject - mb->nllen))6596{6597if (mb->partial != 0 &&6598Feptr + 1 >= mb->end_subject &&6599NLBLOCK->nltype == NLTYPE_FIXED &&6600NLBLOCK->nllen == 2 &&6601UCHAR21TEST(Feptr) == NLBLOCK->nl[0])6602{6603mb->hitend = TRUE;6604if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;6605}6606RRETURN(MATCH_NOMATCH);6607}66086609/* Either at end of string or \n before end. */66106611if (mb->partial != 0)6612{6613mb->hitend = TRUE;6614if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;6615}6616Fecode++;6617break;661866196620/* ===================================================================== */6621/* Start and end of line assertions, multiline mode. */66226623/* Start of subject unless notbol, or after any newline except for one at6624the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */66256626case OP_CIRCM:6627if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject)6628RRETURN(MATCH_NOMATCH);6629if (Feptr != mb->start_subject &&6630((Feptr == mb->end_subject &&6631(mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||6632!WAS_NEWLINE(Feptr)))6633RRETURN(MATCH_NOMATCH);6634Fecode++;6635break;66366637/* Assert before any newline, or before end of subject unless noteol is6638set. 
*/66396640case OP_DOLLM:6641if (Feptr < mb->end_subject)6642{6643if (!IS_NEWLINE(Feptr))6644{6645if (mb->partial != 0 &&6646Feptr + 1 >= mb->end_subject &&6647NLBLOCK->nltype == NLTYPE_FIXED &&6648NLBLOCK->nllen == 2 &&6649UCHAR21TEST(Feptr) == NLBLOCK->nl[0])6650{6651mb->hitend = TRUE;6652if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;6653}6654RRETURN(MATCH_NOMATCH);6655}6656}6657else6658{6659if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);6660SCHECK_PARTIAL();6661}6662Fecode++;6663break;666466656666/* ===================================================================== */6667/* Start of match assertion */66686669case OP_SOM:6670if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH);6671Fecode++;6672break;667366746675/* ===================================================================== */6676/* Reset the start of match point */66776678case OP_SET_SOM:6679Fstart_match = Feptr;6680Fecode++;6681break;668266836684/* ===================================================================== */6685/* Word boundary assertions. Find out if the previous and current6686characters are "word" characters. It takes a bit more work in UTF mode.6687Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is6688not set. When it is set, use Unicode properties if available, even when not6689in UTF mode. Remember the earliest and latest consulted characters. 
*/66906691case OP_NOT_WORD_BOUNDARY:6692case OP_WORD_BOUNDARY:6693case OP_NOT_UCP_WORD_BOUNDARY:6694case OP_UCP_WORD_BOUNDARY:6695if (Feptr == mb->check_subject) prev_is_word = FALSE; else6696{6697PCRE2_SPTR lastptr = Feptr - 1;6698#ifdef SUPPORT_UNICODE6699if (utf)6700{6701BACKCHAR(lastptr);6702GETCHAR(fc, lastptr);6703}6704else6705#endif /* SUPPORT_UNICODE */6706fc = *lastptr;6707if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;6708#ifdef SUPPORT_UNICODE6709if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)6710{6711int chartype = UCD_CHARTYPE(fc);6712int category = PRIV(ucp_gentype)[chartype];6713prev_is_word = (category == ucp_L || category == ucp_N ||6714chartype == ucp_Mn || chartype == ucp_Pc);6715}6716else6717#endif /* SUPPORT_UNICODE */6718prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;6719}67206721/* Get status of next character */67226723if (Feptr >= mb->end_subject)6724{6725SCHECK_PARTIAL();6726cur_is_word = FALSE;6727}6728else6729{6730PCRE2_SPTR nextptr = Feptr + 1;6731#ifdef SUPPORT_UNICODE6732if (utf)6733{6734FORWARDCHARTEST(nextptr, mb->end_subject);6735GETCHAR(fc, Feptr);6736}6737else6738#endif /* SUPPORT_UNICODE */6739fc = *Feptr;6740if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;6741#ifdef SUPPORT_UNICODE6742if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)6743{6744int chartype = UCD_CHARTYPE(fc);6745int category = PRIV(ucp_gentype)[chartype];6746cur_is_word = (category == ucp_L || category == ucp_N ||6747chartype == ucp_Mn || chartype == ucp_Pc);6748}6749else6750#endif /* SUPPORT_UNICODE */6751cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;6752}67536754/* Now see if the situation is what we want */67556756if ((*Fecode++ == OP_WORD_BOUNDARY || Fop == OP_UCP_WORD_BOUNDARY)?6757cur_is_word == prev_is_word : cur_is_word != prev_is_word)6758RRETURN(MATCH_NOMATCH);6759break;676067616762/* 
===================================================================== */6763/* Backtracking (*VERB)s, with and without arguments. Note that if the6764pattern is successfully matched, we do not come back from RMATCH. */67656766case OP_MARK:6767Fmark = mb->nomatch_mark = Fecode + 2;6768RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12);67696770/* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an6771argument, and we must check whether that argument matches this MARK's6772argument. It is passed back in mb->verb_skip_ptr. If it does match, we6773return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject6774position that corresponds to this mark. Otherwise, pass back the return6775code unaltered. */67766777if (rrc == MATCH_SKIP_ARG &&6778PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0)6779{6780mb->verb_skip_ptr = Feptr; /* Pass back current position */6781RRETURN(MATCH_SKIP);6782}6783RRETURN(rrc);67846785case OP_FAIL:6786RRETURN(MATCH_NOMATCH);67876788/* Record the current recursing group number in mb->verb_current_recurse6789when a backtracking return such as MATCH_COMMIT is given. This enables the6790recurse processing to catch verbs from within the recursion. 
*/67916792case OP_COMMIT:6793RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);6794if (rrc != MATCH_NOMATCH) RRETURN(rrc);6795mb->verb_current_recurse = Fcurrent_recurse;6796RRETURN(MATCH_COMMIT);67976798case OP_COMMIT_ARG:6799Fmark = mb->nomatch_mark = Fecode + 2;6800RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36);6801if (rrc != MATCH_NOMATCH) RRETURN(rrc);6802mb->verb_current_recurse = Fcurrent_recurse;6803RRETURN(MATCH_COMMIT);68046805case OP_PRUNE:6806RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);6807if (rrc != MATCH_NOMATCH) RRETURN(rrc);6808mb->verb_current_recurse = Fcurrent_recurse;6809RRETURN(MATCH_PRUNE);68106811case OP_PRUNE_ARG:6812Fmark = mb->nomatch_mark = Fecode + 2;6813RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);6814if (rrc != MATCH_NOMATCH) RRETURN(rrc);6815mb->verb_current_recurse = Fcurrent_recurse;6816RRETURN(MATCH_PRUNE);68176818case OP_SKIP:6819RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);6820if (rrc != MATCH_NOMATCH) RRETURN(rrc);6821mb->verb_skip_ptr = Feptr; /* Pass back current position */6822mb->verb_current_recurse = Fcurrent_recurse;6823RRETURN(MATCH_SKIP);68246825/* Note that, for Perl compatibility, SKIP with an argument does NOT set6826nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was6827not a matching mark, we have to re-run the match, ignoring the SKIP_ARG6828that failed and any that precede it (either they also failed, or were not6829triggered). To do this, we maintain a count of executed SKIP_ARGs. If a6830SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg6831set to the count of the one that failed. 
*/68326833case OP_SKIP_ARG:6834mb->skip_arg_count++;6835if (mb->skip_arg_count <= mb->ignore_skip_arg)6836{6837Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1];6838break;6839}6840RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17);6841if (rrc != MATCH_NOMATCH) RRETURN(rrc);68426843/* Pass back the current skip name and return the special MATCH_SKIP_ARG6844return code. This will either be caught by a matching MARK, or get to the6845top, where it causes a rematch with mb->ignore_skip_arg set to the value of6846mb->skip_arg_count. */68476848mb->verb_skip_ptr = Fecode + 2;6849mb->verb_current_recurse = Fcurrent_recurse;6850RRETURN(MATCH_SKIP_ARG);68516852/* For THEN (and THEN_ARG) we pass back the address of the opcode, so that6853the branch in which it occurs can be determined. */68546855case OP_THEN:6856RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);6857if (rrc != MATCH_NOMATCH) RRETURN(rrc);6858mb->verb_ecode_ptr = Fecode;6859mb->verb_current_recurse = Fcurrent_recurse;6860RRETURN(MATCH_THEN);68616862case OP_THEN_ARG:6863Fmark = mb->nomatch_mark = Fecode + 2;6864RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);6865if (rrc != MATCH_NOMATCH) RRETURN(rrc);6866mb->verb_ecode_ptr = Fecode;6867mb->verb_current_recurse = Fcurrent_recurse;6868RRETURN(MATCH_THEN);686968706871/* ===================================================================== */6872/* There's been some horrible disaster. Arrival here can only mean there is6873something seriously wrong in the code above or the OP_xxx definitions. */68746875/* LCOV_EXCL_START */6876default:6877PCRE2_DEBUG_UNREACHABLE();6878return PCRE2_ERROR_INTERNAL;6879/* LCOV_EXCL_STOP */6880}68816882/* Do not insert any code in here without much thought; it is assumed6883that "continue" in the code above comes out to here to repeat the main6884loop. 
*/68856886} /* End of main loop */68876888PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */68896890/* ========================================================================= */6891/* The RRETURN() macro jumps here. The number that is saved in Freturn_id6892indicates which label we actually want to return to. The value in Frdepth is6893the index number of the frame in the vector. The return value has been placed6894in rrc. */68956896#define LBL(val) case val: goto L_RM##val;68976898RETURN_SWITCH:6899if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;6900if (Frdepth == 0) return rrc; /* Exit from the top level */6901F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */6902mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */69036904#ifdef DEBUG_SHOW_RMATCH6905fprintf(stderr, "++ RETURN %d to RM%d\n", rrc, Freturn_id);6906#endif69076908switch (Freturn_id)6909{6910LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)6911LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)6912LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)6913LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)6914LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39)69156916#ifdef SUPPORT_WIDE_CHARS6917LBL(100) LBL(101) LBL(102) LBL(103)6918#endif69196920#ifdef SUPPORT_UNICODE6921LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206)6922LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213)6923LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220)6924LBL(221) LBL(222) LBL(223) LBL(224)6925#endif69266927/* LCOV_EXCL_START */6928default:6929PCRE2_DEBUG_UNREACHABLE();6930return PCRE2_ERROR_INTERNAL;6931/* LCOV_EXCL_STOP */6932}6933#undef LBL6934}693569366937/*************************************************6938* Match a Regular Expression *6939*************************************************/69406941/* This function applies a compiled pattern to a subject string and picks 
out6942portions of the string if it matches. Two elements in the vector are set for6943each substring: the offsets to the start and end of the substring.69446945Arguments:6946code points to the compiled expression6947subject points to the subject string6948length length of subject string (may contain binary zeros)6949start_offset where to start in the subject string6950options option bits6951match_data points to a match_data block6952mcontext points to a PCRE2 context69536954Returns: > 0 => success; value is the number of ovector pairs filled6955= 0 => success, but ovector is not big enough6956= -1 => failed to match (PCRE2_ERROR_NOMATCH)6957= -2 => partial match (PCRE2_ERROR_PARTIAL)6958< -2 => some kind of unexpected problem6959*/69606961PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION6962pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,6963PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,6964pcre2_match_context *mcontext)6965{6966int rc;6967const uint8_t *start_bits = NULL;6968const pcre2_real_code *re = (const pcre2_real_code *)code;6969uint32_t original_options = options;69706971BOOL anchored;6972BOOL firstline;6973BOOL has_first_cu = FALSE;6974BOOL has_req_cu = FALSE;6975BOOL startline;69766977#if PCRE2_CODE_UNIT_WIDTH == 86978PCRE2_SPTR memchr_found_first_cu;6979PCRE2_SPTR memchr_found_first_cu2;6980#endif69816982PCRE2_UCHAR first_cu = 0;6983PCRE2_UCHAR first_cu2 = 0;6984PCRE2_UCHAR req_cu = 0;6985PCRE2_UCHAR req_cu2 = 0;69866987PCRE2_UCHAR null_str[1] = { 0xcd };6988PCRE2_SPTR original_subject = subject;6989PCRE2_SPTR bumpalong_limit;6990PCRE2_SPTR end_subject;6991PCRE2_SPTR true_end_subject;6992PCRE2_SPTR start_match;6993PCRE2_SPTR req_cu_ptr;6994PCRE2_SPTR start_partial;6995PCRE2_SPTR match_partial;69966997#ifdef SUPPORT_JIT6998BOOL use_jit;6999#endif70007001/* This flag is needed even when Unicode is not supported for convenience7002(it is used by the IS_NEWLINE macro). 
*/70037004BOOL utf = FALSE;70057006#ifdef SUPPORT_UNICODE7007BOOL ucp = FALSE;7008BOOL allow_invalid;7009uint32_t fragment_options = 0;7010#ifdef SUPPORT_JIT7011BOOL jit_checked_utf = FALSE;7012#endif7013#endif /* SUPPORT_UNICODE */70147015PCRE2_SIZE frame_size;7016PCRE2_SIZE heapframes_size;70177018/* We need to have mb as a pointer to a match block, because the IS_NEWLINE7019macro is used below, and it expects NLBLOCK to be defined as a pointer. */70207021pcre2_callout_block cb;7022match_block actual_match_block;7023match_block *mb = &actual_match_block;70247025/* Recognize NULL, length 0 as an empty string. */70267027if (subject == NULL && length == 0) subject = null_str;70287029/* Plausibility checks */70307031if (match_data == NULL) return PCRE2_ERROR_NULL;7032if (code == NULL || subject == NULL)7033return match_data->rc = PCRE2_ERROR_NULL;7034if ((options & ~PUBLIC_MATCH_OPTIONS) != 0)7035return match_data->rc = PCRE2_ERROR_BADOPTION;70367037start_match = subject + start_offset;7038req_cu_ptr = start_match - 1;7039if (length == PCRE2_ZERO_TERMINATED)7040{7041length = PRIV(strlen)(subject);7042}7043true_end_subject = end_subject = subject + length;70447045if (start_offset > length) return match_data->rc = PCRE2_ERROR_BADOFFSET;70467047/* Check that the first field in the block is the magic number. */70487049if (re->magic_number != MAGIC_NUMBER)7050return match_data->rc = PCRE2_ERROR_BADMAGIC;70517052/* Check the code unit width. */70537054if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)7055return match_data->rc = PCRE2_ERROR_BADMODE;70567057/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the7058options variable for this function. Users of PCRE2 who are not calling the7059function directly would like to have a way of setting these flags, in the same7060way that they can set pcre2_compile() flags like PCRE2_NO_AUTO_POSSESS with7061constructions like (*NO_AUTOPOSSESS). 
To enable this, (*NOTEMPTY) and7062(*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which we now7063transfer to the options for this function. The bits are guaranteed to be7064adjacent, but do not have the same values. This bit of Boolean trickery assumes7065that the match-time bits are not more significant than the flag bits. If by7066accident this is not the case, a compile-time division by zero error will7067occur. */70687069#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)7070#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)7071options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));7072#undef FF7073#undef OO70747075/* If the pattern was successfully studied with JIT support, we will run the7076JIT executable instead of the rest of this function. Most options must be set7077at compile time for the JIT code to be usable. */70787079#ifdef SUPPORT_JIT7080use_jit = (re->executable_jit != NULL &&7081(options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);7082#endif70837084/* Initialize UTF/UCP parameters. */70857086#ifdef SUPPORT_UNICODE7087utf = (re->overall_options & PCRE2_UTF) != 0;7088allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;7089ucp = (re->overall_options & PCRE2_UCP) != 0;7090#endif /* SUPPORT_UNICODE */70917092/* Convert the partial matching flags into an integer. */70937094mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :7095((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;70967097/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same7098time. */70997100if (mb->partial != 0 &&7101((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)7102return match_data->rc = PCRE2_ERROR_BADOPTION;71037104/* It is an error to set an offset limit without setting the flag at compile7105time. 
*/71067107if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET &&7108(re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)7109return match_data->rc = PCRE2_ERROR_BADOFFSETLIMIT;71107111/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,7112free the memory that was obtained. Set the field to NULL for match error7113cases. */71147115if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)7116{7117match_data->memctl.free((void *)match_data->subject,7118match_data->memctl.memory_data);7119match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;7120}7121match_data->subject = NULL;71227123/* Zero the error offset in case the first code unit is invalid UTF. */71247125match_data->startchar = 0;712671277128/* ============================= JIT matching ============================== */71297130/* Prepare for JIT matching. Check a UTF string for validity unless no check is7131requested or invalid UTF can be handled. We check only the portion of the7132subject that might be inspected during matching - from the offset minus the7133maximum lookbehind to the given length. This saves time when a small part of a7134large subject is being matched by the use of a starting offset. Note that the7135maximum lookbehind is a number of characters, not code units. */71367137#ifdef SUPPORT_JIT7138if (use_jit)7139{7140#ifdef SUPPORT_UNICODE7141if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid)7142{71437144/* For 8-bit and 16-bit UTF, check that the first code unit is a valid7145character start. 
*/71467147#if PCRE2_CODE_UNIT_WIDTH != 327148if (start_match < end_subject && NOT_FIRSTCU(*start_match))7149{7150if (start_offset > 0) return match_data->rc = PCRE2_ERROR_BADUTFOFFSET;7151#if PCRE2_CODE_UNIT_WIDTH == 87152return match_data->rc = PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */7153#else7154return match_data->rc = PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */7155#endif7156}7157#endif /* WIDTH != 32 */71587159/* Move back by the maximum lookbehind, just in case it happens at the very7160start of matching. */71617162#if PCRE2_CODE_UNIT_WIDTH != 327163for (unsigned int i = re->max_lookbehind; i > 0 && start_match > subject; i--)7164{7165start_match--;7166while (start_match > subject &&7167#if PCRE2_CODE_UNIT_WIDTH == 87168(*start_match & 0xc0) == 0x80)7169#else /* 16-bit */7170(*start_match & 0xfc00) == 0xdc00)7171#endif7172start_match--;7173}7174#else /* PCRE2_CODE_UNIT_WIDTH != 32 */71757176/* In the 32-bit library, one code unit equals one character. However,7177we cannot just subtract the lookbehind and then compare pointers, because7178a very large lookbehind could create an invalid pointer. */71797180if (start_offset >= re->max_lookbehind)7181start_match -= re->max_lookbehind;7182else7183start_match = subject;7184#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */71857186/* Validate the relevant portion of the subject. Adjust the offset of an7187invalid code point to be an absolute offset in the whole string. */71887189rc = PRIV(valid_utf)(start_match,7190length - (start_match - subject), &(match_data->startchar));7191if (rc != 0)7192{7193match_data->startchar += start_match - subject;7194return match_data->rc = rc;7195}7196jit_checked_utf = TRUE;7197}7198#endif /* SUPPORT_UNICODE */71997200/* If JIT returns BADOPTION, which means that the selected complete or7201partial matching mode was not compiled, fall through to the interpreter. 
*/72027203rc = pcre2_jit_match(code, subject, length, start_offset, options,7204match_data, mcontext);7205if (rc != PCRE2_ERROR_JIT_BADOPTION)7206{7207match_data->options = original_options;7208if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0)7209{7210if (length != 0)7211{7212match_data->subject = match_data->memctl.malloc(CU2BYTES(length),7213match_data->memctl.memory_data);7214if (match_data->subject == NULL)7215return match_data->rc = PCRE2_ERROR_NOMEMORY;7216memcpy((void *)match_data->subject, subject, CU2BYTES(length));7217}7218else7219match_data->subject = NULL;7220match_data->flags |= PCRE2_MD_COPIED_SUBJECT;7221}7222else7223{7224/* When pcre2_jit_match sets the subject, it doesn't know what the7225original passed-in pointer was. */7226if (match_data->subject != NULL) match_data->subject = original_subject;7227}7228return rc;7229}7230}7231#endif /* SUPPORT_JIT */72327233/* ========================= End of JIT matching ========================== */723472357236/* Proceed with non-JIT matching. The default is to allow lookbehinds to the7237start of the subject. A UTF check when there is a non-zero offset may change7238this. */72397240mb->check_subject = subject;72417242/* If a UTF subject string was not checked for validity in the JIT code above,7243check it here, and handle support for invalid UTF strings. The check above7244happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset.7245If we get here in those circumstances, it means the subject string is valid,7246but for some reason JIT matching was not successful. There is no need to check7247the subject again.72487249We check only the portion of the subject that might be inspected during7250matching - from the offset minus the maximum lookbehind to the given length.7251This saves time when a small part of a large subject is being matched by the7252use of a starting offset. 
Note that the maximum lookbehind is a number of7253characters, not code units.72547255Note also that support for invalid UTF forces a check, overriding the setting7256of PCRE2_NO_UTF_CHECK. */72577258#ifdef SUPPORT_UNICODE7259if (utf &&7260#ifdef SUPPORT_JIT7261!jit_checked_utf &&7262#endif7263((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid))7264{7265#if PCRE2_CODE_UNIT_WIDTH != 327266BOOL skipped_bad_start = FALSE;7267#endif72687269/* For 8-bit and 16-bit UTF, check that the first code unit is a valid7270character start. If we are handling invalid UTF, just skip over such code7271units. Otherwise, give an appropriate error. */72727273#if PCRE2_CODE_UNIT_WIDTH != 327274if (allow_invalid)7275{7276while (start_match < end_subject && NOT_FIRSTCU(*start_match))7277{7278start_match++;7279skipped_bad_start = TRUE;7280}7281}7282else if (start_match < end_subject && NOT_FIRSTCU(*start_match))7283{7284if (start_offset > 0) return match_data->rc = PCRE2_ERROR_BADUTFOFFSET;7285#if PCRE2_CODE_UNIT_WIDTH == 87286return match_data->rc = PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */7287#else7288return match_data->rc = PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */7289#endif7290}7291#endif /* WIDTH != 32 */72927293/* The mb->check_subject field points to the start of UTF checking;7294lookbehinds can go back no further than this. */72957296mb->check_subject = start_match;72977298/* Move back by the maximum lookbehind, just in case it happens at the very7299start of matching, but don't do this if we skipped bad 8-bit or 16-bit code7300units above. 
*/73017302#if PCRE2_CODE_UNIT_WIDTH != 327303if (!skipped_bad_start)7304{7305unsigned int i;7306for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--)7307{7308mb->check_subject--;7309while (mb->check_subject > subject &&7310#if PCRE2_CODE_UNIT_WIDTH == 87311(*mb->check_subject & 0xc0) == 0x80)7312#else /* 16-bit */7313(*mb->check_subject & 0xfc00) == 0xdc00)7314#endif7315mb->check_subject--;7316}7317}7318#else /* PCRE2_CODE_UNIT_WIDTH != 32 */73197320/* In the 32-bit library, one code unit equals one character. However,7321we cannot just subtract the lookbehind and then compare pointers, because7322a very large lookbehind could create an invalid pointer. */73237324if (start_offset >= re->max_lookbehind)7325mb->check_subject -= re->max_lookbehind;7326else7327mb->check_subject = subject;7328#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */73297330/* Validate the relevant portion of the subject. There's a loop in case we7331encounter bad UTF in the characters preceding start_match which we are7332scanning because of a lookbehind. */73337334for (;;)7335{7336rc = PRIV(valid_utf)(mb->check_subject,7337length - (mb->check_subject - subject), &(match_data->startchar));73387339if (rc == 0) break; /* Valid UTF string */73407341/* Invalid UTF string. Adjust the offset to be an absolute offset in the7342whole string. If we are handling invalid UTF strings, set end_subject to7343stop before the bad code unit, and set the options to "not end of line".7344Otherwise return the error. */73457346match_data->startchar += mb->check_subject - subject;7347if (!allow_invalid || rc > 0) return match_data->rc = rc;7348end_subject = subject + match_data->startchar;73497350/* If the end precedes start_match, it means there is invalid UTF in the7351extra code units we reversed over because of a lookbehind. Advance past the7352first bad code unit, and then skip invalid character starting code units in73538-bit and 16-bit modes, and try again with the original end point. 
*/73547355if (end_subject < start_match)7356{7357mb->check_subject = end_subject + 1;7358#if PCRE2_CODE_UNIT_WIDTH != 327359while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject))7360mb->check_subject++;7361#endif7362end_subject = true_end_subject;7363}73647365/* Otherwise, set the not end of line option, and do the match. */73667367else7368{7369fragment_options = PCRE2_NOTEOL;7370break;7371}7372}7373}7374#endif /* SUPPORT_UNICODE */73757376/* A NULL match context means "use a default context", but we take the memory7377control functions from the pattern. */73787379if (mcontext == NULL)7380{7381mcontext = (pcre2_match_context *)(&PRIV(default_match_context));7382mb->memctl = re->memctl;7383}7384else mb->memctl = mcontext->memctl;73857386anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;7387firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;7388startline = (re->flags & PCRE2_STARTLINE) != 0;7389bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?7390true_end_subject : subject + mcontext->offset_limit;73917392/* Initialize and set up the fixed fields in the callout block, with a pointer7393in the match block. */73947395mb->cb = &cb;7396cb.version = 2;7397cb.subject = subject;7398cb.subject_length = (PCRE2_SIZE)(end_subject - subject);7399cb.callout_flags = 0;74007401/* Fill in the remaining fields in the match block, except for moptions, which7402gets set later. 
*/74037404mb->callout = mcontext->callout;7405mb->callout_data = mcontext->callout_data;74067407mb->start_subject = subject;7408mb->start_offset = start_offset;7409mb->end_subject = end_subject;7410mb->true_end_subject = true_end_subject;7411mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;7412mb->hasbsk = (re->flags & PCRE2_HASBSK) != 0;7413mb->allowemptypartial = (re->max_lookbehind > 0) ||7414(re->flags & PCRE2_MATCH_EMPTY) != 0;7415mb->allowlookaroundbsk =7416(re->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) != 0;7417mb->poptions = re->overall_options; /* Pattern options */7418mb->ignore_skip_arg = 0;7419mb->mark = mb->nomatch_mark = NULL; /* In case never set */74207421/* The name table is needed for finding all the numbers associated with a7422given name, for condition testing. The code follows the name table. */74237424mb->name_table = (PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code));7425mb->name_count = re->name_count;7426mb->name_entry_size = re->name_entry_size;7427mb->start_code = (PCRE2_SPTR)((const uint8_t *)re + re->code_start);74287429/* Process the \R and newline settings. 
*/74307431mb->bsr_convention = re->bsr_convention;7432mb->nltype = NLTYPE_FIXED;7433switch(re->newline_convention)7434{7435case PCRE2_NEWLINE_CR:7436mb->nllen = 1;7437mb->nl[0] = CHAR_CR;7438break;74397440case PCRE2_NEWLINE_LF:7441mb->nllen = 1;7442mb->nl[0] = CHAR_NL;7443break;74447445case PCRE2_NEWLINE_NUL:7446mb->nllen = 1;7447mb->nl[0] = CHAR_NUL;7448break;74497450case PCRE2_NEWLINE_CRLF:7451mb->nllen = 2;7452mb->nl[0] = CHAR_CR;7453mb->nl[1] = CHAR_NL;7454break;74557456case PCRE2_NEWLINE_ANY:7457mb->nltype = NLTYPE_ANY;7458break;74597460case PCRE2_NEWLINE_ANYCRLF:7461mb->nltype = NLTYPE_ANYCRLF;7462break;74637464/* LCOV_EXCL_START */7465default:7466PCRE2_DEBUG_UNREACHABLE();7467return match_data->rc = PCRE2_ERROR_INTERNAL;7468/* LCOV_EXCL_STOP */7469}74707471/* The backtracking frames have fixed data at the front, and a PCRE2_SIZE7472vector at the end, whose size depends on the number of capturing parentheses in7473the pattern. It is not used at all if there are no capturing parentheses.74747475frame_size is the total size of each frame7476match_data->heapframes is the pointer to the frames vector7477match_data->heapframes_size is the allocated size of the vector74787479We must pad the frame_size for alignment to ensure subsequent frames are as7480aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE7481array, that does not guarantee it is suitably aligned for pointers, as some7482architectures have pointers that are larger than a size_t. */74837484frame_size = (offsetof(heapframe, ovector) +7485re->top_bracket * 2 * sizeof(PCRE2_SIZE) + HEAPFRAME_ALIGNMENT - 1) &7486~(HEAPFRAME_ALIGNMENT - 1);74877488/* Limits set in the pattern override the match context only if they are7489smaller. 
*/74907491mb->heap_limit = ((mcontext->heap_limit < re->limit_heap)?7492mcontext->heap_limit : re->limit_heap);74937494mb->match_limit = (mcontext->match_limit < re->limit_match)?7495mcontext->match_limit : re->limit_match;74967497mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?7498mcontext->depth_limit : re->limit_depth;74997500/* If a pattern has very many capturing parentheses, the frame size may be very7501large. Set the initial frame vector size to ensure that there are at least 107502available frames, but enforce a minimum of START_FRAMES_SIZE. If this is7503greater than the heap limit, get as large a vector as possible. */75047505heapframes_size = frame_size * 10;7506if (heapframes_size < START_FRAMES_SIZE) heapframes_size = START_FRAMES_SIZE;7507if (heapframes_size / 1024 > mb->heap_limit)7508{7509PCRE2_SIZE max_size = 1024 * mb->heap_limit;7510if (max_size < frame_size) return match_data->rc = PCRE2_ERROR_HEAPLIMIT;7511heapframes_size = max_size;7512}75137514/* If an existing frame vector in the match_data block is large enough, we can7515use it. Otherwise, free any pre-existing vector and get a new one. */75167517if (match_data->heapframes_size < heapframes_size)7518{7519match_data->memctl.free(match_data->heapframes,7520match_data->memctl.memory_data);7521match_data->heapframes = match_data->memctl.malloc(heapframes_size,7522match_data->memctl.memory_data);7523if (match_data->heapframes == NULL)7524{7525match_data->heapframes_size = 0;7526return match_data->rc = PCRE2_ERROR_NOMEMORY;7527}7528match_data->heapframes_size = heapframes_size;7529}75307531/* Write to the ovector within the first frame to mark every capture unset and7532to avoid uninitialized memory read errors when it is copied to a new frame. 
*/75337534memset((char *)(match_data->heapframes) + offsetof(heapframe, ovector), 0xff,7535frame_size - offsetof(heapframe, ovector));75367537/* Pointers to the individual character tables */75387539mb->lcc = re->tables + lcc_offset;7540mb->fcc = re->tables + fcc_offset;7541mb->ctypes = re->tables + ctypes_offset;75427543/* Set up the first code unit to match, if available. If there's no first code7544unit there may be a bitmap of possible first characters. */75457546if ((re->flags & PCRE2_FIRSTSET) != 0)7547{7548has_first_cu = TRUE;7549first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);7550if ((re->flags & PCRE2_FIRSTCASELESS) != 0)7551{7552first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);7553#ifdef SUPPORT_UNICODE7554#if PCRE2_CODE_UNIT_WIDTH == 87555if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);7556#else7557if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);7558#endif7559#endif /* SUPPORT_UNICODE */7560}7561}7562else7563if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)7564start_bits = re->start_bitmap;75657566/* There may also be a "last known required character" set. */75677568if ((re->flags & PCRE2_LASTSET) != 0)7569{7570has_req_cu = TRUE;7571req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);7572if ((re->flags & PCRE2_LASTCASELESS) != 0)7573{7574req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);7575#ifdef SUPPORT_UNICODE7576#if PCRE2_CODE_UNIT_WIDTH == 87577if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);7578#else7579if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);7580#endif7581#endif /* SUPPORT_UNICODE */7582}7583}758475857586/* ==========================================================================*/75877588/* Loop for handling unanchored repeated matching attempts; for anchored regexs7589the loop runs just once. 
*/75907591#ifdef SUPPORT_UNICODE7592FRAGMENT_RESTART:7593#endif75947595start_partial = match_partial = NULL;7596mb->hitend = FALSE;75977598#if PCRE2_CODE_UNIT_WIDTH == 87599memchr_found_first_cu = NULL;7600memchr_found_first_cu2 = NULL;7601#endif76027603for(;;)7604{7605PCRE2_SPTR new_start_match;76067607/* ----------------- Start of match optimizations ---------------- */76087609/* There are some optimizations that avoid running the match if a known7610starting point is not found, or if a known later code unit is not present.7611However, there is an option (settable at compile time) that disables these,7612for testing and for ensuring that all callouts do actually occur. */76137614if ((re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)7615{7616/* If firstline is TRUE, the start of the match is constrained to the first7617line of a multiline string. That is, the match must be before or at the7618first newline following the start of matching. Temporarily adjust7619end_subject so that we stop the scans for a first code unit at a newline.7620If the match fails at the newline, later code breaks the loop. */76217622if (firstline)7623{7624PCRE2_SPTR t = start_match;7625#ifdef SUPPORT_UNICODE7626if (utf)7627{7628while (t < end_subject && !IS_NEWLINE(t))7629{7630t++;7631ACROSSCHAR(t < end_subject, t, t++);7632}7633}7634else7635#endif7636while (t < end_subject && !IS_NEWLINE(t)) t++;7637end_subject = t;7638}76397640/* Anchored: check the first code unit if one is recorded. This may seem7641pointless but it can help in detecting a no match case without scanning for7642the required code unit. 
*/76437644if (anchored)7645{7646if (has_first_cu || start_bits != NULL)7647{7648BOOL ok = start_match < end_subject;7649if (ok)7650{7651PCRE2_UCHAR c = UCHAR21TEST(start_match);7652ok = has_first_cu && (c == first_cu || c == first_cu2);7653if (!ok && start_bits != NULL)7654{7655#if PCRE2_CODE_UNIT_WIDTH != 87656if (c > 255) c = 255;7657#endif7658ok = (start_bits[c/8] & (1u << (c&7))) != 0;7659}7660}7661if (!ok)7662{7663rc = MATCH_NOMATCH;7664break;7665}7666}7667}76687669/* Not anchored. Advance to a unique first code unit if there is one. */76707671else7672{7673if (has_first_cu)7674{7675if (first_cu != first_cu2) /* Caseless */7676{7677/* In 16-bit and 32_bit modes we have to do our own search, so can7678look for both cases at once. */76797680#if PCRE2_CODE_UNIT_WIDTH != 87681PCRE2_UCHAR smc;7682while (start_match < end_subject &&7683(smc = UCHAR21TEST(start_match)) != first_cu &&7684smc != first_cu2)7685start_match++;7686#else7687/* In 8-bit mode, the use of memchr() gives a big speed up, even7688though we have to call it twice in order to find the earliest7689occurrence of the code unit in either of its cases. Caching is used7690to remember the positions of previously found code units. This can7691make a huge difference when the strings are very long and only one7692case is actually present. */76937694PCRE2_SPTR pp1 = NULL;7695PCRE2_SPTR pp2 = NULL;7696PCRE2_SIZE searchlength = end_subject - start_match;76977698/* If we haven't got a previously found position for first_cu, or if7699the current starting position is later, we need to do a search. If7700the code unit is not found, set it to the end. */77017702if (memchr_found_first_cu == NULL ||7703start_match > memchr_found_first_cu)7704{7705pp1 = memchr(start_match, first_cu, searchlength);7706memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;7707}77087709/* If the start is before a previously found position, use the7710previous position, or NULL if a previous search failed. 
*/77117712else pp1 = (memchr_found_first_cu == end_subject)? NULL :7713memchr_found_first_cu;77147715/* Do the same thing for the other case. */77167717if (memchr_found_first_cu2 == NULL ||7718start_match > memchr_found_first_cu2)7719{7720pp2 = memchr(start_match, first_cu2, searchlength);7721memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;7722}77237724else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :7725memchr_found_first_cu2;77267727/* Set the start to the end of the subject if neither case was found.7728Otherwise, use the earlier found point. */77297730if (pp1 == NULL)7731start_match = (pp2 == NULL)? end_subject : pp2;7732else7733start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;77347735#endif /* 8-bit handling */7736}77377738/* The caseful case is much simpler. */77397740else7741{7742#if PCRE2_CODE_UNIT_WIDTH != 87743while (start_match < end_subject && UCHAR21TEST(start_match) !=7744first_cu)7745start_match++;7746#else7747start_match = memchr(start_match, first_cu, end_subject - start_match);7748if (start_match == NULL) start_match = end_subject;7749#endif7750}77517752/* If we can't find the required first code unit, having reached the7753true end of the subject, break the bumpalong loop, to force a match7754failure, except when doing partial matching, when we let the next cycle7755run at the end of the subject. To see why, consider the pattern7756/(?<=abc)def/, which partially matches "abc", even though the string7757does not contain the starting character "d". If we have not reached the7758true end of the subject (PCRE2_FIRSTLINE caused end_subject to be7759temporarily modified) we also let the cycle run, because the matching7760string is legitimately allowed to start with the first code unit of a7761newline. */77627763if (mb->partial == 0 && start_match >= mb->end_subject)7764{7765rc = MATCH_NOMATCH;7766break;7767}7768}77697770/* If there's no first code unit, advance to just after a linebreak for a7771multiline match if required. 
*/77727773else if (startline)7774{7775if (start_match > mb->start_subject + start_offset)7776{7777#ifdef SUPPORT_UNICODE7778if (utf)7779{7780while (start_match < end_subject && !WAS_NEWLINE(start_match))7781{7782start_match++;7783ACROSSCHAR(start_match < end_subject, start_match, start_match++);7784}7785}7786else7787#endif7788while (start_match < end_subject && !WAS_NEWLINE(start_match))7789start_match++;77907791/* If we have just passed a CR and the newline option is ANY or7792ANYCRLF, and we are now at a LF, advance the match position by one7793more code unit. */77947795if (start_match[-1] == CHAR_CR &&7796(mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&7797start_match < end_subject &&7798UCHAR21TEST(start_match) == CHAR_NL)7799start_match++;7800}7801}78027803/* If there's no first code unit or a requirement for a multiline line7804start, advance to a non-unique first code unit if any have been7805identified. The bitmap contains only 256 bits. When code units are 16 or780632 bits wide, all code units greater than 254 set the 255 bit. */78077808else if (start_bits != NULL)7809{7810while (start_match < end_subject)7811{7812uint32_t c = UCHAR21TEST(start_match);7813#if PCRE2_CODE_UNIT_WIDTH != 87814if (c > 255) c = 255;7815#endif7816if ((start_bits[c/8] & (1u << (c&7))) != 0) break;7817start_match++;7818}78197820/* See comment above in first_cu checking about the next few lines. */78217822if (mb->partial == 0 && start_match >= mb->end_subject)7823{7824rc = MATCH_NOMATCH;7825break;7826}7827}7828} /* End first code unit handling */78297830/* Restore fudged end_subject */78317832end_subject = mb->end_subject;78337834/* The following two optimizations must be disabled for partial matching. */78357836if (mb->partial == 0)7837{7838PCRE2_SPTR p;78397840/* The minimum matching length is a lower bound; no string of that length7841may actually match the pattern. 
Although the value is, strictly, in7842characters, we treat it as code units to avoid spending too much time in7843this optimization. */78447845if (end_subject - start_match < re->minlength)7846{7847rc = MATCH_NOMATCH;7848break;7849}78507851/* If req_cu is set, we know that that code unit must appear in the7852subject for the (non-partial) match to succeed. If the first code unit is7853set, req_cu must be later in the subject; otherwise the test starts at7854the match point. This optimization can save a huge amount of backtracking7855in patterns with nested unlimited repeats that aren't going to match.7856Writing separate code for caseful/caseless versions makes it go faster,7857as does using an autoincrement and backing off on a match. As in the case7858of the first code unit, using memchr() in the 8-bit library gives a big7859speed up. Unlike the first_cu check above, we do not need to call7860memchr() twice in the caseless case because we only need to check for the7861presence of the character in either case, not find the first occurrence.78627863The search can be skipped if the code unit was found later than the7864current starting point in a previous iteration of the bumpalong loop.78657866HOWEVER: when the subject string is very, very long, searching to its end7867can take a long time, and give bad performance on quite ordinary7868anchored patterns. This showed up when somebody was matching something7869like /^\d+C/ on a 32-megabyte string... so we don't do this when the7870string is sufficiently long, but it's worth searching a lot more for7871unanchored patterns. */78727873p = start_match + (has_first_cu? 
1:0);7874if (has_req_cu && p > req_cu_ptr)7875{7876PCRE2_SIZE check_length = end_subject - start_match;78777878if (check_length < REQ_CU_MAX ||7879(!anchored && check_length < REQ_CU_MAX * 1000))7880{7881if (req_cu != req_cu2) /* Caseless */7882{7883#if PCRE2_CODE_UNIT_WIDTH != 87884while (p < end_subject)7885{7886uint32_t pp = UCHAR21INCTEST(p);7887if (pp == req_cu || pp == req_cu2) { p--; break; }7888}7889#else /* 8-bit code units */7890PCRE2_SPTR pp = p;7891p = memchr(pp, req_cu, end_subject - pp);7892if (p == NULL)7893{7894p = memchr(pp, req_cu2, end_subject - pp);7895if (p == NULL) p = end_subject;7896}7897#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */7898}78997900/* The caseful case */79017902else7903{7904#if PCRE2_CODE_UNIT_WIDTH != 87905while (p < end_subject)7906{7907if (UCHAR21INCTEST(p) == req_cu) { p--; break; }7908}79097910#else /* 8-bit code units */7911p = memchr(p, req_cu, end_subject - p);7912if (p == NULL) p = end_subject;7913#endif7914}79157916/* If we can't find the required code unit, break the bumpalong loop,7917forcing a match failure. */79187919if (p >= end_subject)7920{7921rc = MATCH_NOMATCH;7922break;7923}79247925/* If we have found the required code unit, save the point where we7926found it, so that we don't search again next time round the bumpalong7927loop if the start hasn't yet passed this code unit. */79287929req_cu_ptr = p;7930}7931}7932}7933}79347935/* ------------ End of start of match optimizations ------------ */79367937/* Give no match if we have passed the bumpalong limit. */79387939if (start_match > bumpalong_limit)7940{7941rc = MATCH_NOMATCH;7942break;7943}79447945/* OK, we can now run the match. If "hitend" is set afterwards, remember the7946first starting point for which a partial match was found. 
*/79477948cb.start_match = (PCRE2_SIZE)(start_match - subject);7949cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;79507951mb->start_used_ptr = start_match;7952mb->last_used_ptr = start_match;7953#ifdef SUPPORT_UNICODE7954mb->moptions = options | fragment_options;7955#else7956mb->moptions = options;7957#endif7958mb->match_call_count = 0;7959mb->end_offset_top = 0;7960mb->skip_arg_count = 0;79617962#ifdef DEBUG_SHOW_OPS7963fprintf(stderr, "++ Calling match()\n");7964#endif79657966rc = match(start_match, mb->start_code, re->top_bracket, frame_size,7967match_data, mb);79687969#ifdef DEBUG_SHOW_OPS7970fprintf(stderr, "++ match() returned %d\n\n", rc);7971#endif79727973if (mb->hitend && start_partial == NULL)7974{7975start_partial = mb->start_used_ptr;7976match_partial = start_match;7977}79787979switch(rc)7980{7981/* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched7982the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP7983entirely. The only way we can do that is to re-do the match at the same7984point, with a flag to force SKIP with an argument to be ignored. Just7985treating this case as NOMATCH does not work because it does not check other7986alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */79877988case MATCH_SKIP_ARG:7989new_start_match = start_match;7990mb->ignore_skip_arg = mb->skip_arg_count;7991break;79927993/* SKIP passes back the next starting point explicitly, but if it is no7994greater than the match we have just done, treat it as NOMATCH. */79957996case MATCH_SKIP:7997if (mb->verb_skip_ptr > start_match)7998{7999new_start_match = mb->verb_skip_ptr;8000break;8001}8002PCRE2_FALLTHROUGH /* Fall through */80038004/* NOMATCH and PRUNE advance by one character. THEN at this level acts8005exactly like PRUNE. Unset ignore SKIP-with-argument. 
*/80068007case MATCH_NOMATCH:8008case MATCH_PRUNE:8009case MATCH_THEN:8010mb->ignore_skip_arg = 0;8011new_start_match = start_match + 1;8012#ifdef SUPPORT_UNICODE8013if (utf)8014ACROSSCHAR(new_start_match < end_subject, new_start_match,8015new_start_match++);8016#endif8017break;80188019/* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */80208021case MATCH_COMMIT:8022rc = MATCH_NOMATCH;8023goto ENDLOOP;80248025/* Any other return is either a match, or some kind of error. */80268027default:8028goto ENDLOOP;8029}80308031/* Control reaches here for the various types of "no match at this point"8032result. Reset the code to MATCH_NOMATCH for subsequent checking. */80338034rc = MATCH_NOMATCH;80358036/* If PCRE2_FIRSTLINE is set, the match must happen before or at the first8037newline in the subject (though it may continue over the newline). Therefore,8038if we have just failed to match, starting at a newline, do not continue. */80398040if (firstline && IS_NEWLINE(start_match)) break;80418042/* Advance to new matching position */80438044start_match = new_start_match;80458046/* Break the loop if the pattern is anchored or if we have passed the end of8047the subject. */80488049if (anchored || start_match > end_subject) break;80508051/* If we have just passed a CR and we are now at a LF, and the pattern does8052not contain any explicit matches for \r or \n, and the newline option is CRLF8053or ANY or ANYCRLF, advance the match position by one more code unit. In8054normal matching start_match will aways be greater than the first position at8055this stage, but a failed *SKIP can cause a return at the same point, which is8056why the first test exists. 
*/80578058if (start_match > subject + start_offset &&8059start_match[-1] == CHAR_CR &&8060start_match < end_subject &&8061*start_match == CHAR_NL &&8062(re->flags & PCRE2_HASCRORLF) == 0 &&8063(mb->nltype == NLTYPE_ANY ||8064mb->nltype == NLTYPE_ANYCRLF ||8065mb->nllen == 2))8066start_match++;80678068mb->mark = NULL; /* Reset for start of next match attempt */8069} /* End of for(;;) "bumpalong" loop */80708071/* ==========================================================================*/80728073/* When we reach here, one of the following stopping conditions is true:80748075(1) The match succeeded, either completely, or partially;80768077(2) The pattern is anchored or the match was failed after (*COMMIT);80788079(3) We are past the end of the subject or the bumpalong limit;80808081(4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because8082this option requests that a match occur at or before the first newline in8083the subject.80848085(5) Some kind of error occurred.80868087*/80888089ENDLOOP:80908091/* If end_subject != true_end_subject, it means we are handling invalid UTF,8092and have just processed a non-terminal fragment. If this resulted in no match8093or a partial match we must carry on to the next fragment (a partial match is8094returned to the caller only at the very end of the subject). A loop is used to8095avoid trying to match against empty fragments; if the pattern can match an8096empty string it would have done so already. */80978098#ifdef SUPPORT_UNICODE8099if (utf && end_subject != true_end_subject &&8100(rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL))8101{8102for (;;)8103{8104/* Advance past the first bad code unit, and then skip invalid character8105starting code units in 8-bit and 16-bit modes. 
*/81068107start_match = end_subject + 1;81088109#if PCRE2_CODE_UNIT_WIDTH != 328110while (start_match < true_end_subject && NOT_FIRSTCU(*start_match))8111start_match++;8112#endif81138114/* If we have hit the end of the subject, there isn't another non-empty8115fragment, so give up. */81168117if (start_match >= true_end_subject)8118{8119rc = MATCH_NOMATCH; /* In case it was partial */8120match_partial = NULL;8121break;8122}81238124/* Check the rest of the subject */81258126mb->check_subject = start_match;8127rc = PRIV(valid_utf)(start_match, length - (start_match - subject),8128&(match_data->startchar));81298130/* The rest of the subject is valid UTF. */81318132if (rc == 0)8133{8134mb->end_subject = end_subject = true_end_subject;8135fragment_options = PCRE2_NOTBOL;8136goto FRAGMENT_RESTART;8137}81388139/* A subsequent UTF error has been found; if the next fragment is8140non-empty, set up to process it. Otherwise, let the loop advance. */81418142else if (rc < 0)8143{8144mb->end_subject = end_subject = start_match + match_data->startchar;8145if (end_subject > start_match)8146{8147fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL;8148goto FRAGMENT_RESTART;8149}8150}8151}8152}8153#endif /* SUPPORT_UNICODE */81548155/* Fill in fields that are always returned in the match data. */81568157match_data->code = re;8158match_data->mark = mb->mark;8159match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER;8160match_data->options = original_options;81618162/* Handle a fully successful match. Set the return code to the number of8163captured strings, or 0 if there were too many to fit into the ovector, and then8164set the remaining returned values before returning. Make a copy of the subject8165string if requested. 
*/81668167if (rc == MATCH_MATCH)8168{8169match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)?81700 : (int)mb->end_offset_top/2 + 1;8171match_data->subject_length = length;8172match_data->start_offset = start_offset;8173match_data->startchar = start_match - subject;8174match_data->leftchar = mb->start_used_ptr - subject;8175match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?8176mb->last_used_ptr : mb->end_match_ptr) - subject;8177if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0)8178{8179if (length != 0)8180{8181match_data->subject = match_data->memctl.malloc(CU2BYTES(length),8182match_data->memctl.memory_data);8183if (match_data->subject == NULL)8184return match_data->rc = PCRE2_ERROR_NOMEMORY;8185memcpy((void *)match_data->subject, subject, CU2BYTES(length));8186}8187else8188match_data->subject = NULL;8189match_data->flags |= PCRE2_MD_COPIED_SUBJECT;8190}8191else match_data->subject = original_subject;81928193return match_data->rc;8194}81958196/* Control gets here if there has been a partial match, an error, or if the8197overall match attempt has failed at all permitted starting positions. Any mark8198data is in the nomatch_mark field. */81998200match_data->mark = mb->nomatch_mark;82018202/* For anything other than nomatch or partial match, just return the code. */82038204if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc;82058206/* Handle a partial match. If a "soft" partial match was requested, searching8207for a complete match will have continued, and the value of rc at this point8208will be MATCH_NOMATCH. For a "hard" partial match, it will already be8209PCRE2_ERROR_PARTIAL. 
*/82108211else if (match_partial != NULL)8212{8213match_data->subject = original_subject;8214match_data->subject_length = length;8215match_data->start_offset = start_offset;8216match_data->ovector[0] = match_partial - subject;8217match_data->ovector[1] = end_subject - subject;8218match_data->startchar = match_partial - subject;8219match_data->leftchar = start_partial - subject;8220match_data->rightchar = end_subject - subject;8221match_data->rc = PCRE2_ERROR_PARTIAL;8222}82238224/* Else this is the classic nomatch case. */82258226else8227{8228match_data->subject = original_subject;8229match_data->subject_length = length;8230match_data->start_offset = start_offset;8231match_data->rc = PCRE2_ERROR_NOMATCH;8232}82338234return match_data->rc;8235}82368237/* These #undefs are here to enable unity builds with CMake. */82388239#undef NLBLOCK /* Block containing newline information */8240#undef PSSTART /* Field containing processed string start */8241#undef PSEND /* Field containing processed string end */82428243/* End of pcre2_match.c */824482458246