Path: blob/master/thirdparty/pcre2/src/pcre2_dfa_match.c
9898 views
/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2024 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/


/* This module contains the external function pcre2_dfa_match(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */


/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
the performance of his patterns greatly. I could not use it as it stood, as it
was not thread safe, and made assumptions about pattern sizes. Also, it caused
test 7 to loop, and test 9 to crash with a segfault.

The issue is the check for duplicate states, which is done by a simple linear
search up the state list. (Grep for "duplicate" below to find the code.) For
many patterns, there will never be many states active at one time, so a simple
linear search is fine. In patterns that have many active states, it might be a
bottleneck. The suggested code used an indexing scheme to remember which states
had previously been used for each character, and avoided the linear search when
it knew there was no chance of a duplicate. This was implemented when adding
states to the state lists.

I wrote some thread-safe, not-limited code to try something similar at the time
of checking for duplicates (instead of when adding states), using index vectors
on the stack.
It did give a 13% improvement with one specially constructed64pattern for certain subject strings, but on other strings and on many of the65simpler patterns in the test suite it did worse. The major problem, I think,66was the extra time to initialize the index. This had to be done for each call67of internal_dfa_match(). (The supplied patch used a static vector, initialized68only once - I suspect this was the cause of the problems with the tests.)6970Overall, I concluded that the gains in some cases did not outweigh the losses71in others, so I abandoned this code. */727374#ifdef HAVE_CONFIG_H75#include "config.h"76#endif7778#define NLBLOCK mb /* Block containing newline information */79#define PSSTART start_subject /* Field containing processed string start */80#define PSEND end_subject /* Field containing processed string end */8182#include "pcre2_internal.h"8384#define PUBLIC_DFA_MATCH_OPTIONS \85(PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \86PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \87PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART| \88PCRE2_COPY_MATCHED_SUBJECT)899091/*************************************************92* Code parameters and static tables *93*************************************************/9495/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes96into others, under special conditions. A gap of 20 between the blocks should be97enough. The resulting opcodes don't have to be less than 256 because they are98never stored, so we push them well clear of the normal opcodes. */99100#define OP_PROP_EXTRA 300101#define OP_EXTUNI_EXTRA 320102#define OP_ANYNL_EXTRA 340103#define OP_HSPACE_EXTRA 360104#define OP_VSPACE_EXTRA 380105106107/* This table identifies those opcodes that are followed immediately by a108character that is to be tested in some way. This makes it possible to109centralize the loading of these characters. 
In the case of Type * etc, the110"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a111small value. Non-zero values in the table are the offsets from the opcode where112the character is to be found. ***NOTE*** If the start of this table is113modified, the three tables that follow must also be modified. */114115static const uint8_t coptable[] = {1160, /* End */1170, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */1180, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */1190, 0, 0, /* Any, AllAny, Anybyte */1200, 0, /* \P, \p */1210, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */1220, /* \X */1230, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */1241, /* Char */1251, /* Chari */1261, /* not */1271, /* noti */128/* Positive single-char repeats */1291, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */1301+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */1311+IMM2_SIZE, /* exact */1321, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */1331, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */1341+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */1351+IMM2_SIZE, /* exact I */1361, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */137/* Negative single-char repeats - only for chars < 256 */1381, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */1391+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */1401+IMM2_SIZE, /* NOT exact */1411, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */1421, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */1431+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */1441+IMM2_SIZE, /* NOT exact I */1451, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */146/* Positive type repeats */1471, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */1481+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */1491+IMM2_SIZE, /* Type exact */1501, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */151/* Character class & ref repeats */1520, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? 
*/1530, 0, /* CRRANGE, CRMINRANGE */1540, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */1550, /* CLASS */1560, /* NCLASS */1570, /* XCLASS - variable length */1580, /* ECLASS - variable length */1590, /* REF */1600, /* REFI */1610, /* DNREF */1620, /* DNREFI */1630, /* RECURSE */1640, /* CALLOUT */1650, /* CALLOUT_STR */1660, /* Alt */1670, /* Ket */1680, /* KetRmax */1690, /* KetRmin */1700, /* KetRpos */1710, 0, /* Reverse, Vreverse */1720, /* Assert */1730, /* Assert not */1740, /* Assert behind */1750, /* Assert behind not */1760, /* NA assert */1770, /* NA assert behind */1780, /* Assert scan substring */1790, /* ONCE */1800, /* SCRIPT_RUN */1810, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */1820, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */1830, 0, /* CREF, DNCREF */1840, 0, /* RREF, DNRREF */1850, 0, /* FALSE, TRUE */1860, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */1870, 0, 0, /* MARK, PRUNE, PRUNE_ARG */1880, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */1890, 0, /* COMMIT, COMMIT_ARG */1900, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */1910, 0, 0, /* CLOSE, SKIPZERO, DEFINE */1920, 0, /* \B and \b in UCP mode */193};194195/* This table identifies those opcodes that inspect a character. It is used to196remember the fact that a character could have been inspected when the end of197the subject is reached. ***NOTE*** If the start of this table is modified, the198two tables that follow must also be modified. */199200static const uint8_t poptable[] = {2010, /* End */2020, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */2031, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */2041, 1, 1, /* Any, AllAny, Anybyte */2051, 1, /* \P, \p */2061, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */2071, /* \X */2080, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */2091, /* Char */2101, /* Chari */2111, /* not */2121, /* noti */213/* Positive single-char repeats */2141, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? 
*/2151, 1, 1, /* upto, minupto, exact */2161, 1, 1, 1, /* *+, ++, ?+, upto+ */2171, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */2181, 1, 1, /* upto I, minupto I, exact I */2191, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */220/* Negative single-char repeats - only for chars < 256 */2211, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */2221, 1, 1, /* NOT upto, minupto, exact */2231, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */2241, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */2251, 1, 1, /* NOT upto I, minupto I, exact I */2261, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */227/* Positive type repeats */2281, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */2291, 1, 1, /* Type upto, minupto, exact */2301, 1, 1, 1, /* Type *+, ++, ?+, upto+ */231/* Character class & ref repeats */2321, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */2331, 1, /* CRRANGE, CRMINRANGE */2341, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */2351, /* CLASS */2361, /* NCLASS */2371, /* XCLASS - variable length */2381, /* ECLASS - variable length */2390, /* REF */2400, /* REFI */2410, /* DNREF */2420, /* DNREFI */2430, /* RECURSE */2440, /* CALLOUT */2450, /* CALLOUT_STR */2460, /* Alt */2470, /* Ket */2480, /* KetRmax */2490, /* KetRmin */2500, /* KetRpos */2510, 0, /* Reverse, Vreverse */2520, /* Assert */2530, /* Assert not */2540, /* Assert behind */2550, /* Assert behind not */2560, /* NA assert */2570, /* NA assert behind */2580, /* Assert scan substring */2590, /* ONCE */2600, /* SCRIPT_RUN */2610, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */2620, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */2630, 0, /* CREF, DNCREF */2640, 0, /* RREF, DNRREF */2650, 0, /* FALSE, TRUE */2660, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */2670, 0, 0, /* MARK, PRUNE, PRUNE_ARG */2680, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */2690, 0, /* COMMIT, COMMIT_ARG */2700, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */2710, 0, 0, /* CLOSE, SKIPZERO, DEFINE */2721, 1, /* \B and \b in UCP mode */273};274275/* Compile-time check that 
these tables have the correct size. */276STATIC_ASSERT(sizeof(coptable) == OP_TABLE_LENGTH, coptable);277STATIC_ASSERT(sizeof(poptable) == OP_TABLE_LENGTH, poptable);278279/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,280and \w */281282static const uint8_t toptable1[] = {2830, 0, 0, 0, 0, 0,284ctype_digit, ctype_digit,285ctype_space, ctype_space,286ctype_word, ctype_word,2870, 0 /* OP_ANY, OP_ALLANY */288};289290static const uint8_t toptable2[] = {2910, 0, 0, 0, 0, 0,292ctype_digit, 0,293ctype_space, 0,294ctype_word, 0,2951, 1 /* OP_ANY, OP_ALLANY */296};297298299/* Structure for holding data about a particular state, which is in effect the300current data for an active path through the match tree. It must consist301entirely of ints because the working vector we are passed, and which we put302these structures in, is a vector of ints. */303304typedef struct stateblock {305int offset; /* Offset to opcode (-ve has meaning) */306int count; /* Count for repeats */307int data; /* Some use extra data */308} stateblock;309310#define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int))311312313/* Before version 10.32 the recursive calls of internal_dfa_match() were passed314local working space and output vectors that were created on the stack. This has315caused issues for some patterns, especially in small-stack environments such as316Windows. A new scheme is now in use which sets up a vector on the stack, but if317this is too small, heap memory is used, up to the heap_limit. The main318parameters are all numbers of ints because the workspace is a vector of ints.319320The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is321defined in pcre2_internal.h so as to be available to pcre2test when it is322finding the minimum heap requirement for a match. 
*/323324#define OVEC_UNIT (sizeof(PCRE2_SIZE)/sizeof(int))325326#define RWS_BASE_SIZE (DFA_START_RWS_SIZE/sizeof(int)) /* Stack vector */327#define RWS_RSIZE 1000 /* Work size for recursion */328#define RWS_OVEC_RSIZE (1000*OVEC_UNIT) /* Ovector for recursion */329#define RWS_OVEC_OSIZE (2*OVEC_UNIT) /* Ovector in other cases */330331/* This structure is at the start of each workspace block. */332333typedef struct RWS_anchor {334struct RWS_anchor *next;335uint32_t size; /* Number of ints */336uint32_t free; /* Number of ints */337} RWS_anchor;338339#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))340341342343/*************************************************344* Process a callout *345*************************************************/346347/* This function is called to perform a callout.348349Arguments:350code current code pointer351offsets points to current capture offsets352current_subject start of current subject match353ptr current position in subject354mb the match block355extracode extra code offset when called from condition356lengthptr where to return the callout length357358Returns: the return from the callout359*/360361static int362do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,363PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,364PCRE2_SIZE *lengthptr)365{366pcre2_callout_block *cb = mb->cb;367368*lengthptr = (code[extracode] == OP_CALLOUT)?369(PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :370(PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);371372if (mb->callout == NULL) return 0; /* No callout provided */373374/* Fixed fields in the callout block are set once and for all at the start of375matching. 
*/376377cb->offset_vector = offsets;378cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject);379cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);380cb->pattern_position = GET(code, 1 + extracode);381cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);382383if (code[extracode] == OP_CALLOUT)384{385cb->callout_number = code[1 + 2*LINK_SIZE + extracode];386cb->callout_string_offset = 0;387cb->callout_string = NULL;388cb->callout_string_length = 0;389}390else391{392cb->callout_number = 0;393cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);394cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;395cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;396}397398return (mb->callout)(cb, mb->callout_data);399}400401402403/*************************************************404* Expand local workspace memory *405*************************************************/406407/* This function is called when internal_dfa_match() is about to be called408recursively and there is insufficient working space left in the current409workspace block. If there's an existing next block, use it; otherwise get a new410block unless the heap limit is reached.411412Arguments:413rwsptr pointer to block pointer (updated)414ovecsize space needed for an ovector415mb the match block416417Returns: 0 rwsptr has been updated418!0 an error code419*/420421static int422more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)423{424RWS_anchor *rws = *rwsptr;425RWS_anchor *new;426427if (rws->next != NULL)428{429new = rws->next;430}431432/* Sizes in the RWS_anchor blocks are in units of sizeof(int), but433mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid434overflow. */435436else437{438uint32_t newsize = (rws->size >= UINT32_MAX/(sizeof(int)*2))? 
UINT32_MAX/sizeof(int) : rws->size * 2;439uint32_t newsizeK = newsize/(1024/sizeof(int));440441if (newsizeK + mb->heap_used > mb->heap_limit)442newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);443newsize = newsizeK*(1024/sizeof(int));444445if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)446return PCRE2_ERROR_HEAPLIMIT;447new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);448if (new == NULL) return PCRE2_ERROR_NOMEMORY;449mb->heap_used += newsizeK;450new->next = NULL;451new->size = newsize;452rws->next = new;453}454455new->free = new->size - RWS_ANCHOR_SIZE;456*rwsptr = new;457return 0;458}459460461462/*************************************************463* Match a Regular Expression - DFA engine *464*************************************************/465466/* This internal function applies a compiled pattern to a subject string,467starting at a given point, using a DFA engine. This function is called from the468external one, possibly multiple times if the pattern is not anchored. The469function calls itself recursively for some kinds of subpattern.470471Arguments:472mb the match_data block with fixed information473this_start_code the opening bracket of this subexpression's code474current_subject where we currently are in the subject string475start_offset start offset in the subject string476offsets vector to contain the matching string offsets477offsetcount size of same478workspace vector of workspace479wscount size of same480rlevel function call recursion level481482Returns: > 0 => number of match offset pairs placed in offsets483= 0 => offsets overflowed; longest matches are present484-1 => failed to match485< -1 => some kind of unexpected problem486487The following macros are used for adding states to the two state vectors (one488for the current character, one for the following character). 
*/489490#define ADD_ACTIVE(x,y) \491if (active_count++ < wscount) \492{ \493next_active_state->offset = (x); \494next_active_state->count = (y); \495next_active_state++; \496} \497else return PCRE2_ERROR_DFA_WSSIZE498499#define ADD_ACTIVE_DATA(x,y,z) \500if (active_count++ < wscount) \501{ \502next_active_state->offset = (x); \503next_active_state->count = (y); \504next_active_state->data = (z); \505next_active_state++; \506} \507else return PCRE2_ERROR_DFA_WSSIZE508509#define ADD_NEW(x,y) \510if (new_count++ < wscount) \511{ \512next_new_state->offset = (x); \513next_new_state->count = (y); \514next_new_state++; \515} \516else return PCRE2_ERROR_DFA_WSSIZE517518#define ADD_NEW_DATA(x,y,z) \519if (new_count++ < wscount) \520{ \521next_new_state->offset = (x); \522next_new_state->count = (y); \523next_new_state->data = (z); \524next_new_state++; \525} \526else return PCRE2_ERROR_DFA_WSSIZE527528/* And now, here is the code */529530static int531internal_dfa_match(532dfa_match_block *mb,533PCRE2_SPTR this_start_code,534PCRE2_SPTR current_subject,535PCRE2_SIZE start_offset,536PCRE2_SIZE *offsets,537uint32_t offsetcount,538int *workspace,539int wscount,540uint32_t rlevel,541int *RWS)542{543stateblock *active_states, *new_states, *temp_states;544stateblock *next_active_state, *next_new_state;545const uint8_t *ctypes, *lcc, *fcc;546PCRE2_SPTR ptr;547PCRE2_SPTR end_code;548dfa_recursion_info new_recursive;549int active_count, new_count, match_count;550551/* Some fields in the mb block are frequently referenced, so we load them into552independent variables in the hope that this will perform better. 
*/553554PCRE2_SPTR start_subject = mb->start_subject;555PCRE2_SPTR end_subject = mb->end_subject;556PCRE2_SPTR start_code = mb->start_code;557558#ifdef SUPPORT_UNICODE559BOOL utf = (mb->poptions & PCRE2_UTF) != 0;560BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;561#else562BOOL utf = FALSE;563#endif564565BOOL reset_could_continue = FALSE;566567if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;568if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;569offsetcount &= (uint32_t)(-2); /* Round down */570571wscount -= 2;572wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /573(2 * INTS_PER_STATEBLOCK);574575ctypes = mb->tables + ctypes_offset;576lcc = mb->tables + lcc_offset;577fcc = mb->tables + fcc_offset;578579match_count = PCRE2_ERROR_NOMATCH; /* A negative number */580581active_states = (stateblock *)(workspace + 2);582next_new_state = new_states = active_states + wscount;583new_count = 0;584585/* The first thing in any (sub) pattern is a bracket of some sort. Push all586the alternative states onto the list, and find out where the end is. This587makes is possible to use this function recursively, when we want to stop at a588matching internal ket rather than at the end.589590If we are dealing with a backward assertion we have to find out the maximum591amount to move back, and set up each alternative appropriately. */592593if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)594{595size_t max_back = 0;596size_t gone_back;597598end_code = this_start_code;599do600{601size_t back = (size_t)GET2(end_code, 2+LINK_SIZE);602if (back > max_back) max_back = back;603end_code += GET(end_code, 1);604}605while (*end_code == OP_ALT);606607/* If we can't go back the amount required for the longest lookbehind608pattern, go back as far as we can; some alternatives may still be viable. 
*/609610#ifdef SUPPORT_UNICODE611/* In character mode we have to step back character by character */612613if (utf)614{615for (gone_back = 0; gone_back < max_back; gone_back++)616{617if (current_subject <= start_subject) break;618current_subject--;619ACROSSCHAR(current_subject > start_subject, current_subject,620current_subject--);621}622}623else624#endif625626/* In byte-mode we can do this quickly. */627628{629size_t current_offset = (size_t)(current_subject - start_subject);630gone_back = (current_offset < max_back)? current_offset : max_back;631current_subject -= gone_back;632}633634/* Save the earliest consulted character */635636if (current_subject < mb->start_used_ptr)637mb->start_used_ptr = current_subject;638639/* Now we can process the individual branches. There will be an OP_REVERSE at640the start of each branch, except when the length of the branch is zero. */641642end_code = this_start_code;643do644{645uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + IMM2_SIZE : 0;646size_t back = (revlen == 0)? 0 : (size_t)GET2(end_code, 2+LINK_SIZE);647if (back <= gone_back)648{649int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);650ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));651}652end_code += GET(end_code, 1);653}654while (*end_code == OP_ALT);655}656657/* This is the code for a "normal" subpattern (not a backward assertion). The658start of a whole pattern is always one of these. If we are at the top level,659we may be asked to restart matching from the same point that we reached for a660previous partial match. We still have to scan through the top-level branches to661find the end state. 
*/662663else664{665end_code = this_start_code;666667/* Restarting */668669if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)670{671do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);672new_count = workspace[1];673if (!workspace[0])674memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));675}676677/* Not restarting */678679else680{681int length = 1 + LINK_SIZE +682((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||683*this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)684? IMM2_SIZE:0);685do686{687ADD_NEW((int)(end_code - start_code + length), 0);688end_code += GET(end_code, 1);689length = 1 + LINK_SIZE;690}691while (*end_code == OP_ALT);692}693}694695workspace[0] = 0; /* Bit indicating which vector is current */696697/* Loop for scanning the subject */698699ptr = current_subject;700for (;;)701{702int i, j;703int clen, dlen;704uint32_t c, d;705BOOL partial_newline = FALSE;706BOOL could_continue = reset_could_continue;707reset_could_continue = FALSE;708709if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;710711/* Make the new state list into the active state list and empty the712new state list. */713714temp_states = active_states;715active_states = new_states;716new_states = temp_states;717active_count = new_count;718new_count = 0;719720workspace[0] ^= 1; /* Remember for the restarting feature */721workspace[1] = active_count;722723/* Set the pointers for adding new states */724725next_active_state = active_states + active_count;726next_new_state = new_states;727728/* Load the current character from the subject outside the loop, as many729different states may want to look at it, and we assume that at least one730will. 
*/731732if (ptr < end_subject)733{734clen = 1; /* Number of data items in the character */735#ifdef SUPPORT_UNICODE736GETCHARLENTEST(c, ptr, clen);737#else738c = *ptr;739#endif /* SUPPORT_UNICODE */740}741else742{743clen = 0; /* This indicates the end of the subject */744c = NOTACHAR; /* This value should never actually be used */745}746747/* Scan up the active states and act on each one. The result of an action748may be to add more states to the currently active list (e.g. on hitting a749parenthesis) or it may be to put states on the new list, for considering750when we move the character pointer on. */751752for (i = 0; i < active_count; i++)753{754stateblock *current_state = active_states + i;755BOOL caseless = FALSE;756PCRE2_SPTR code;757uint32_t codevalue;758int state_offset = current_state->offset;759int rrc;760int count;761762/* A negative offset is a special case meaning "hold off going to this763(negated) state until the number of characters in the data field have764been skipped". If the could_continue flag was passed over from a previous765state, arrange for it to passed on. */766767if (state_offset < 0)768{769if (current_state->data > 0)770{771ADD_NEW_DATA(state_offset, current_state->count,772current_state->data - 1);773if (could_continue) reset_could_continue = TRUE;774continue;775}776else777{778current_state->offset = state_offset = -state_offset;779}780}781782/* Check for a duplicate state with the same count, and skip if found.783See the note at the head of this module about the possibility of improving784performance here. */785786for (j = 0; j < i; j++)787{788if (active_states[j].offset == state_offset &&789active_states[j].count == current_state->count)790goto NEXT_ACTIVE_STATE;791}792793/* The state offset is the offset to the opcode */794795code = start_code + state_offset;796codevalue = *code;797798/* If this opcode inspects a character, but we are at the end of the799subject, remember the fact for use when testing for a partial match. 
*/800801if (clen == 0 && poptable[codevalue] != 0)802could_continue = TRUE;803804/* If this opcode is followed by an inline character, load it. It is805tempting to test for the presence of a subject character here, but that806is wrong, because sometimes zero repetitions of the subject are807permitted.808809We also use this mechanism for opcodes such as OP_TYPEPLUS that take an810argument that is not a data character - but is always one byte long because811the values are small. We have to take special action to deal with \P, \p,812\H, \h, \V, \v and \X in this case. To keep the other cases fast, convert813these ones to new opcodes. */814815if (coptable[codevalue] > 0)816{817dlen = 1;818#ifdef SUPPORT_UNICODE819if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else820#endif /* SUPPORT_UNICODE */821d = code[coptable[codevalue]];822if (codevalue >= OP_TYPESTAR)823{824switch(d)825{826case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;827case OP_NOTPROP:828case OP_PROP: codevalue += OP_PROP_EXTRA; break;829case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;830case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;831case OP_NOT_HSPACE:832case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;833case OP_NOT_VSPACE:834case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;835default: break;836}837}838}839else840{841dlen = 0; /* Not strictly necessary, but compilers moan */842d = NOTACHAR; /* if these variables are not set. */843}844845846/* Now process the individual opcodes */847848switch (codevalue)849{850/* ========================================================================== */851/* Reached a closing bracket. If not at the end of the pattern, carry852on with the next opcode. For repeating opcodes, also add the repeat853state. Note that KETRPOS will always be encountered at the end of the854subpattern, because the possessive subpattern repeats are always handled855using recursive calls. 
Thus, it never adds any new states.856857At the end of the (sub)pattern, unless we have an empty string and858PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the859start of the subject, save the match data, shifting up all previous860matches so we always have the longest first. */861862case OP_KET:863case OP_KETRMIN:864case OP_KETRMAX:865case OP_KETRPOS:866if (code != end_code)867{868ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);869if (codevalue != OP_KET)870{871ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);872}873}874else875{876if (ptr > current_subject ||877((mb->moptions & PCRE2_NOTEMPTY) == 0 &&878((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||879current_subject > start_subject + mb->start_offset)))880{881if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;882else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)883match_count = 0;884count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;885if (count > 0) (void)memmove(offsets + 2, offsets,886(size_t)count * sizeof(PCRE2_SIZE));887if (offsetcount >= 2)888{889offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);890offsets[1] = (PCRE2_SIZE)(ptr - start_subject);891}892if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;893}894}895break;896897/* ========================================================================== */898/* These opcodes add to the current list of states without looking899at the current character. 
*/900901/*-----------------------------------------------------------------*/902case OP_ALT:903do { code += GET(code, 1); } while (*code == OP_ALT);904ADD_ACTIVE((int)(code - start_code), 0);905break;906907/*-----------------------------------------------------------------*/908case OP_BRA:909case OP_SBRA:910do911{912ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);913code += GET(code, 1);914}915while (*code == OP_ALT);916break;917918/*-----------------------------------------------------------------*/919case OP_CBRA:920case OP_SCBRA:921ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);922code += GET(code, 1);923while (*code == OP_ALT)924{925ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);926code += GET(code, 1);927}928break;929930/*-----------------------------------------------------------------*/931case OP_BRAZERO:932case OP_BRAMINZERO:933ADD_ACTIVE(state_offset + 1, 0);934code += 1 + GET(code, 2);935while (*code == OP_ALT) code += GET(code, 1);936ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);937break;938939/*-----------------------------------------------------------------*/940case OP_SKIPZERO:941code += 1 + GET(code, 2);942while (*code == OP_ALT) code += GET(code, 1);943ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);944break;945946/*-----------------------------------------------------------------*/947case OP_CIRC:948if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)949{ ADD_ACTIVE(state_offset + 1, 0); }950break;951952/*-----------------------------------------------------------------*/953case OP_CIRCM:954if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||955((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )956&& WAS_NEWLINE(ptr)))957{ ADD_ACTIVE(state_offset + 1, 0); }958break;959960/*-----------------------------------------------------------------*/961case OP_EOD:962if (ptr >= end_subject)963{964if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)965return 
PCRE2_ERROR_PARTIAL;966else { ADD_ACTIVE(state_offset + 1, 0); }967}968break;969970/*-----------------------------------------------------------------*/971case OP_SOD:972if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }973break;974975/*-----------------------------------------------------------------*/976case OP_SOM:977if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }978break;979980981/* ========================================================================== */982/* These opcodes inspect the next subject character, and sometimes983the previous one as well, but do not have an argument. The variable984clen contains the length of the current character and is zero if we are985at the end of the subject. */986987/*-----------------------------------------------------------------*/988case OP_ANY:989if (clen > 0 && !IS_NEWLINE(ptr))990{991if (ptr + 1 >= mb->end_subject &&992(mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&993NLBLOCK->nltype == NLTYPE_FIXED &&994NLBLOCK->nllen == 2 &&995c == NLBLOCK->nl[0])996{997could_continue = partial_newline = TRUE;998}999else1000{1001ADD_NEW(state_offset + 1, 0);1002}1003}1004break;10051006/*-----------------------------------------------------------------*/1007case OP_ALLANY:1008if (clen > 0)1009{ ADD_NEW(state_offset + 1, 0); }1010break;10111012/*-----------------------------------------------------------------*/1013case OP_EODN:1014if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))1015{1016if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)1017return PCRE2_ERROR_PARTIAL;1018ADD_ACTIVE(state_offset + 1, 0);1019}1020break;10211022/*-----------------------------------------------------------------*/1023case OP_DOLL:1024if ((mb->moptions & PCRE2_NOTEOL) == 0)1025{1026if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)1027could_continue = TRUE;1028else if (clen == 0 ||1029((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&1030(ptr == end_subject - 
mb->nllen)1031))1032{ ADD_ACTIVE(state_offset + 1, 0); }1033else if (ptr + 1 >= mb->end_subject &&1034(mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&1035NLBLOCK->nltype == NLTYPE_FIXED &&1036NLBLOCK->nllen == 2 &&1037c == NLBLOCK->nl[0])1038{1039if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)1040{1041reset_could_continue = TRUE;1042ADD_NEW_DATA(-(state_offset + 1), 0, 1);1043}1044else could_continue = partial_newline = TRUE;1045}1046}1047break;10481049/*-----------------------------------------------------------------*/1050case OP_DOLLM:1051if ((mb->moptions & PCRE2_NOTEOL) == 0)1052{1053if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)1054could_continue = TRUE;1055else if (clen == 0 ||1056((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))1057{ ADD_ACTIVE(state_offset + 1, 0); }1058else if (ptr + 1 >= mb->end_subject &&1059(mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&1060NLBLOCK->nltype == NLTYPE_FIXED &&1061NLBLOCK->nllen == 2 &&1062c == NLBLOCK->nl[0])1063{1064if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)1065{1066reset_could_continue = TRUE;1067ADD_NEW_DATA(-(state_offset + 1), 0, 1);1068}1069else could_continue = partial_newline = TRUE;1070}1071}1072else if (IS_NEWLINE(ptr))1073{ ADD_ACTIVE(state_offset + 1, 0); }1074break;10751076/*-----------------------------------------------------------------*/10771078case OP_DIGIT:1079case OP_WHITESPACE:1080case OP_WORDCHAR:1081if (clen > 0 && c < 256 &&1082((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)1083{ ADD_NEW(state_offset + 1, 0); }1084break;10851086/*-----------------------------------------------------------------*/1087case OP_NOT_DIGIT:1088case OP_NOT_WHITESPACE:1089case OP_NOT_WORDCHAR:1090if (clen > 0 && (c >= 256 ||1091((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))1092{ ADD_NEW(state_offset + 1, 0); }1093break;10941095/*-----------------------------------------------------------------*/1096case 
OP_WORD_BOUNDARY:1097case OP_NOT_WORD_BOUNDARY:1098case OP_NOT_UCP_WORD_BOUNDARY:1099case OP_UCP_WORD_BOUNDARY:1100{1101int left_word, right_word;11021103if (ptr > start_subject)1104{1105PCRE2_SPTR temp = ptr - 1;1106if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;1107#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321108if (utf) { BACKCHAR(temp); }1109#endif1110GETCHARTEST(d, temp);1111#ifdef SUPPORT_UNICODE1112if (codevalue == OP_UCP_WORD_BOUNDARY ||1113codevalue == OP_NOT_UCP_WORD_BOUNDARY)1114{1115int chartype = UCD_CHARTYPE(d);1116int category = PRIV(ucp_gentype)[chartype];1117left_word = (category == ucp_L || category == ucp_N ||1118chartype == ucp_Mn || chartype == ucp_Pc);1119}1120else1121#endif1122left_word = d < 256 && (ctypes[d] & ctype_word) != 0;1123}1124else left_word = FALSE;11251126if (clen > 0)1127{1128if (ptr >= mb->last_used_ptr)1129{1130PCRE2_SPTR temp = ptr + 1;1131#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321132if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }1133#endif1134mb->last_used_ptr = temp;1135}1136#ifdef SUPPORT_UNICODE1137if (codevalue == OP_UCP_WORD_BOUNDARY ||1138codevalue == OP_NOT_UCP_WORD_BOUNDARY)1139{1140int chartype = UCD_CHARTYPE(c);1141int category = PRIV(ucp_gentype)[chartype];1142right_word = (category == ucp_L || category == ucp_N ||1143chartype == ucp_Mn || chartype == ucp_Pc);1144}1145else1146#endif1147right_word = c < 256 && (ctypes[c] & ctype_word) != 0;1148}1149else right_word = FALSE;11501151if ((left_word == right_word) ==1152(codevalue == OP_NOT_WORD_BOUNDARY ||1153codevalue == OP_NOT_UCP_WORD_BOUNDARY))1154{ ADD_ACTIVE(state_offset + 1, 0); }1155}1156break;115711581159/*-----------------------------------------------------------------*/1160/* Check the next character by Unicode property. 
We will get here only1161if the support is in the binary; otherwise a compile-time error occurs.1162*/11631164#ifdef SUPPORT_UNICODE1165case OP_PROP:1166case OP_NOTPROP:1167if (clen > 0)1168{1169BOOL OK;1170int chartype;1171const uint32_t *cp;1172const ucd_record * prop = GET_UCD(c);1173switch(code[1])1174{1175case PT_LAMP:1176chartype = prop->chartype;1177OK = chartype == ucp_Lu || chartype == ucp_Ll ||1178chartype == ucp_Lt;1179break;11801181case PT_GC:1182OK = PRIV(ucp_gentype)[prop->chartype] == code[2];1183break;11841185case PT_PC:1186OK = prop->chartype == code[2];1187break;11881189case PT_SC:1190OK = prop->script == code[2];1191break;11921193case PT_SCX:1194OK = (prop->script == code[2] ||1195MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0);1196break;11971198/* These are specials for combination cases. */11991200case PT_ALNUM:1201chartype = prop->chartype;1202OK = PRIV(ucp_gentype)[chartype] == ucp_L ||1203PRIV(ucp_gentype)[chartype] == ucp_N;1204break;12051206/* Perl space used to exclude VT, but from Perl 5.18 it is included,1207which means that Perl space and POSIX space are now identical. PCRE1208was changed at release 8.34. 
*/12091210case PT_SPACE: /* Perl space */1211case PT_PXSPACE: /* POSIX space */1212switch(c)1213{1214HSPACE_CASES:1215VSPACE_CASES:1216OK = TRUE;1217break;12181219default:1220OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;1221break;1222}1223break;12241225case PT_WORD:1226chartype = prop->chartype;1227OK = PRIV(ucp_gentype)[chartype] == ucp_L ||1228PRIV(ucp_gentype)[chartype] == ucp_N ||1229chartype == ucp_Mn || chartype == ucp_Pc;1230break;12311232case PT_CLIST:1233#if PCRE2_CODE_UNIT_WIDTH == 321234if (c > MAX_UTF_CODE_POINT)1235{1236OK = FALSE;1237break;1238}1239#endif1240cp = PRIV(ucd_caseless_sets) + code[2];1241for (;;)1242{1243if (c < *cp) { OK = FALSE; break; }1244if (c == *cp++) { OK = TRUE; break; }1245}1246break;12471248case PT_UCNC:1249OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||1250c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||1251c >= 0xe000;1252break;12531254case PT_BIDICL:1255OK = UCD_BIDICLASS(c) == code[2];1256break;12571258case PT_BOOL:1259OK = MAPBIT(PRIV(ucd_boolprop_sets) +1260UCD_BPROPS_PROP(prop), code[2]) != 0;1261break;12621263/* Should never occur, but keep compilers from grumbling. */12641265default:1266OK = codevalue != OP_PROP;1267break;1268}12691270if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }1271}1272break;1273#endif1274127512761277/* ========================================================================== */1278/* These opcodes likewise inspect the subject character, but have an1279argument that is not a data character. It is one of these opcodes:1280OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,1281OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. 
*/12821283case OP_TYPEPLUS:1284case OP_TYPEMINPLUS:1285case OP_TYPEPOSPLUS:1286count = current_state->count; /* Already matched */1287if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }1288if (clen > 0)1289{1290if (d == OP_ANY && ptr + 1 >= mb->end_subject &&1291(mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&1292NLBLOCK->nltype == NLTYPE_FIXED &&1293NLBLOCK->nllen == 2 &&1294c == NLBLOCK->nl[0])1295{1296could_continue = partial_newline = TRUE;1297}1298else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||1299(c < 256 &&1300(d != OP_ANY || !IS_NEWLINE(ptr)) &&1301((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))1302{1303if (count > 0 && codevalue == OP_TYPEPOSPLUS)1304{1305active_count--; /* Remove non-match possibility */1306next_active_state--;1307}1308count++;1309ADD_NEW(state_offset, count);1310}1311}1312break;13131314/*-----------------------------------------------------------------*/1315case OP_TYPEQUERY:1316case OP_TYPEMINQUERY:1317case OP_TYPEPOSQUERY:1318ADD_ACTIVE(state_offset + 2, 0);1319if (clen > 0)1320{1321if (d == OP_ANY && ptr + 1 >= mb->end_subject &&1322(mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&1323NLBLOCK->nltype == NLTYPE_FIXED &&1324NLBLOCK->nllen == 2 &&1325c == NLBLOCK->nl[0])1326{1327could_continue = partial_newline = TRUE;1328}1329else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||1330(c < 256 &&1331(d != OP_ANY || !IS_NEWLINE(ptr)) &&1332((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))1333{1334if (codevalue == OP_TYPEPOSQUERY)1335{1336active_count--; /* Remove non-match possibility */1337next_active_state--;1338}1339ADD_NEW(state_offset + 2, 0);1340}1341}1342break;13431344/*-----------------------------------------------------------------*/1345case OP_TYPESTAR:1346case OP_TYPEMINSTAR:1347case OP_TYPEPOSSTAR:1348ADD_ACTIVE(state_offset + 2, 0);1349if (clen > 0)1350{1351if (d == OP_ANY && ptr + 1 >= mb->end_subject &&1352(mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 
&&1353NLBLOCK->nltype == NLTYPE_FIXED &&1354NLBLOCK->nllen == 2 &&1355c == NLBLOCK->nl[0])1356{1357could_continue = partial_newline = TRUE;1358}1359else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||1360(c < 256 &&1361(d != OP_ANY || !IS_NEWLINE(ptr)) &&1362((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))1363{1364if (codevalue == OP_TYPEPOSSTAR)1365{1366active_count--; /* Remove non-match possibility */1367next_active_state--;1368}1369ADD_NEW(state_offset, 0);1370}1371}1372break;13731374/*-----------------------------------------------------------------*/1375case OP_TYPEEXACT:1376count = current_state->count; /* Number already matched */1377if (clen > 0)1378{1379if (d == OP_ANY && ptr + 1 >= mb->end_subject &&1380(mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&1381NLBLOCK->nltype == NLTYPE_FIXED &&1382NLBLOCK->nllen == 2 &&1383c == NLBLOCK->nl[0])1384{1385could_continue = partial_newline = TRUE;1386}1387else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||1388(c < 256 &&1389(d != OP_ANY || !IS_NEWLINE(ptr)) &&1390((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))1391{1392if (++count >= (int)GET2(code, 1))1393{ ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }1394else1395{ ADD_NEW(state_offset, count); }1396}1397}1398break;13991400/*-----------------------------------------------------------------*/1401case OP_TYPEUPTO:1402case OP_TYPEMINUPTO:1403case OP_TYPEPOSUPTO:1404ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);1405count = current_state->count; /* Number already matched */1406if (clen > 0)1407{1408if (d == OP_ANY && ptr + 1 >= mb->end_subject &&1409(mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&1410NLBLOCK->nltype == NLTYPE_FIXED &&1411NLBLOCK->nllen == 2 &&1412c == NLBLOCK->nl[0])1413{1414could_continue = partial_newline = TRUE;1415}1416else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||1417(c < 256 &&1418(d != OP_ANY || !IS_NEWLINE(ptr)) &&1419((ctypes[c] & toptable1[d]) ^ 
toptable2[d]) != 0))1420{1421if (codevalue == OP_TYPEPOSUPTO)1422{1423active_count--; /* Remove non-match possibility */1424next_active_state--;1425}1426if (++count >= (int)GET2(code, 1))1427{ ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }1428else1429{ ADD_NEW(state_offset, count); }1430}1431}1432break;14331434/* ========================================================================== */1435/* These are virtual opcodes that are used when something like1436OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its1437argument. It keeps the code above fast for the other cases. The argument1438is in the d variable. */14391440#ifdef SUPPORT_UNICODE1441case OP_PROP_EXTRA + OP_TYPEPLUS:1442case OP_PROP_EXTRA + OP_TYPEMINPLUS:1443case OP_PROP_EXTRA + OP_TYPEPOSPLUS:1444count = current_state->count; /* Already matched */1445if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }1446if (clen > 0)1447{1448BOOL OK;1449int chartype;1450const uint32_t *cp;1451const ucd_record * prop = GET_UCD(c);1452switch(code[2])1453{1454case PT_LAMP:1455chartype = prop->chartype;1456OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;1457break;14581459case PT_GC:1460OK = PRIV(ucp_gentype)[prop->chartype] == code[3];1461break;14621463case PT_PC:1464OK = prop->chartype == code[3];1465break;14661467case PT_SC:1468OK = prop->script == code[3];1469break;14701471case PT_SCX:1472OK = (prop->script == code[3] ||1473MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);1474break;14751476/* These are specials for combination cases. */14771478case PT_ALNUM:1479chartype = prop->chartype;1480OK = PRIV(ucp_gentype)[chartype] == ucp_L ||1481PRIV(ucp_gentype)[chartype] == ucp_N;1482break;14831484/* Perl space used to exclude VT, but from Perl 5.18 it is included,1485which means that Perl space and POSIX space are now identical. PCRE1486was changed at release 8.34. 
*/14871488case PT_SPACE: /* Perl space */1489case PT_PXSPACE: /* POSIX space */1490switch(c)1491{1492HSPACE_CASES:1493VSPACE_CASES:1494OK = TRUE;1495break;14961497default:1498OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;1499break;1500}1501break;15021503case PT_WORD:1504chartype = prop->chartype;1505OK = PRIV(ucp_gentype)[chartype] == ucp_L ||1506PRIV(ucp_gentype)[chartype] == ucp_N ||1507chartype == ucp_Mn || chartype == ucp_Pc;1508break;15091510case PT_CLIST:1511#if PCRE2_CODE_UNIT_WIDTH == 321512if (c > MAX_UTF_CODE_POINT)1513{1514OK = FALSE;1515break;1516}1517#endif1518cp = PRIV(ucd_caseless_sets) + code[3];1519for (;;)1520{1521if (c < *cp) { OK = FALSE; break; }1522if (c == *cp++) { OK = TRUE; break; }1523}1524break;15251526case PT_UCNC:1527OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||1528c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||1529c >= 0xe000;1530break;15311532case PT_BIDICL:1533OK = UCD_BIDICLASS(c) == code[3];1534break;15351536case PT_BOOL:1537OK = MAPBIT(PRIV(ucd_boolprop_sets) +1538UCD_BPROPS_PROP(prop), code[3]) != 0;1539break;15401541/* Should never occur, but keep compilers from grumbling. 
*/15421543default:1544OK = codevalue != OP_PROP;1545break;1546}15471548if (OK == (d == OP_PROP))1549{1550if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)1551{1552active_count--; /* Remove non-match possibility */1553next_active_state--;1554}1555count++;1556ADD_NEW(state_offset, count);1557}1558}1559break;15601561/*-----------------------------------------------------------------*/1562case OP_EXTUNI_EXTRA + OP_TYPEPLUS:1563case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:1564case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:1565count = current_state->count; /* Already matched */1566if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }1567if (clen > 0)1568{1569int ncount = 0;1570if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)1571{1572active_count--; /* Remove non-match possibility */1573next_active_state--;1574}1575(void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,1576&ncount);1577count++;1578ADD_NEW_DATA(-state_offset, count, ncount);1579}1580break;1581#endif15821583/*-----------------------------------------------------------------*/1584case OP_ANYNL_EXTRA + OP_TYPEPLUS:1585case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:1586case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:1587count = current_state->count; /* Already matched */1588if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }1589if (clen > 0)1590{1591int ncount = 0;1592switch (c)1593{1594case CHAR_VT:1595case CHAR_FF:1596case CHAR_NEL:1597#ifndef EBCDIC1598case 0x2028:1599case 0x2029:1600#endif /* Not EBCDIC */1601if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;1602goto ANYNL01;16031604case CHAR_CR:1605if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;1606/* Fall through */16071608ANYNL01:1609case CHAR_LF:1610if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)1611{1612active_count--; /* Remove non-match possibility */1613next_active_state--;1614}1615count++;1616ADD_NEW_DATA(-state_offset, count, 
ncount);1617break;16181619default:1620break;1621}1622}1623break;16241625/*-----------------------------------------------------------------*/1626case OP_VSPACE_EXTRA + OP_TYPEPLUS:1627case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:1628case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:1629count = current_state->count; /* Already matched */1630if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }1631if (clen > 0)1632{1633BOOL OK;1634switch (c)1635{1636VSPACE_CASES:1637OK = TRUE;1638break;16391640default:1641OK = FALSE;1642break;1643}16441645if (OK == (d == OP_VSPACE))1646{1647if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)1648{1649active_count--; /* Remove non-match possibility */1650next_active_state--;1651}1652count++;1653ADD_NEW_DATA(-state_offset, count, 0);1654}1655}1656break;16571658/*-----------------------------------------------------------------*/1659case OP_HSPACE_EXTRA + OP_TYPEPLUS:1660case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:1661case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:1662count = current_state->count; /* Already matched */1663if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }1664if (clen > 0)1665{1666BOOL OK;1667switch (c)1668{1669HSPACE_CASES:1670OK = TRUE;1671break;16721673default:1674OK = FALSE;1675break;1676}16771678if (OK == (d == OP_HSPACE))1679{1680if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)1681{1682active_count--; /* Remove non-match possibility */1683next_active_state--;1684}1685count++;1686ADD_NEW_DATA(-state_offset, count, 0);1687}1688}1689break;16901691/*-----------------------------------------------------------------*/1692#ifdef SUPPORT_UNICODE1693case OP_PROP_EXTRA + OP_TYPEQUERY:1694case OP_PROP_EXTRA + OP_TYPEMINQUERY:1695case OP_PROP_EXTRA + OP_TYPEPOSQUERY:1696count = 4;1697goto QS1;16981699case OP_PROP_EXTRA + OP_TYPESTAR:1700case OP_PROP_EXTRA + OP_TYPEMINSTAR:1701case OP_PROP_EXTRA + OP_TYPEPOSSTAR:1702count = 0;17031704QS1:17051706ADD_ACTIVE(state_offset + 4, 0);1707if (clen > 0)1708{1709BOOL OK;1710int chartype;1711const 
uint32_t *cp;1712const ucd_record * prop = GET_UCD(c);1713switch(code[2])1714{1715case PT_LAMP:1716chartype = prop->chartype;1717OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;1718break;17191720case PT_GC:1721OK = PRIV(ucp_gentype)[prop->chartype] == code[3];1722break;17231724case PT_PC:1725OK = prop->chartype == code[3];1726break;17271728case PT_SC:1729OK = prop->script == code[3];1730break;17311732case PT_SCX:1733OK = (prop->script == code[3] ||1734MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);1735break;17361737/* These are specials for combination cases. */17381739case PT_ALNUM:1740chartype = prop->chartype;1741OK = PRIV(ucp_gentype)[chartype] == ucp_L ||1742PRIV(ucp_gentype)[chartype] == ucp_N;1743break;17441745/* Perl space used to exclude VT, but from Perl 5.18 it is included,1746which means that Perl space and POSIX space are now identical. PCRE1747was changed at release 8.34. */17481749case PT_SPACE: /* Perl space */1750case PT_PXSPACE: /* POSIX space */1751switch(c)1752{1753HSPACE_CASES:1754VSPACE_CASES:1755OK = TRUE;1756break;17571758default:1759OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;1760break;1761}1762break;17631764case PT_WORD:1765chartype = prop->chartype;1766OK = PRIV(ucp_gentype)[chartype] == ucp_L ||1767PRIV(ucp_gentype)[chartype] == ucp_N ||1768chartype == ucp_Mn || chartype == ucp_Pc;1769break;17701771case PT_CLIST:1772#if PCRE2_CODE_UNIT_WIDTH == 321773if (c > MAX_UTF_CODE_POINT)1774{1775OK = FALSE;1776break;1777}1778#endif1779cp = PRIV(ucd_caseless_sets) + code[3];1780for (;;)1781{1782if (c < *cp) { OK = FALSE; break; }1783if (c == *cp++) { OK = TRUE; break; }1784}1785break;17861787case PT_UCNC:1788OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||1789c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||1790c >= 0xe000;1791break;17921793case PT_BIDICL:1794OK = UCD_BIDICLASS(c) == code[3];1795break;17961797case PT_BOOL:1798OK = MAPBIT(PRIV(ucd_boolprop_sets) +1799UCD_BPROPS_PROP(prop), 
code[3]) != 0;1800break;18011802/* Should never occur, but keep compilers from grumbling. */18031804default:1805OK = codevalue != OP_PROP;1806break;1807}18081809if (OK == (d == OP_PROP))1810{1811if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||1812codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)1813{1814active_count--; /* Remove non-match possibility */1815next_active_state--;1816}1817ADD_NEW(state_offset + count, 0);1818}1819}1820break;18211822/*-----------------------------------------------------------------*/1823case OP_EXTUNI_EXTRA + OP_TYPEQUERY:1824case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:1825case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:1826count = 2;1827goto QS2;18281829case OP_EXTUNI_EXTRA + OP_TYPESTAR:1830case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:1831case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:1832count = 0;18331834QS2:18351836ADD_ACTIVE(state_offset + 2, 0);1837if (clen > 0)1838{1839int ncount = 0;1840if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||1841codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)1842{1843active_count--; /* Remove non-match possibility */1844next_active_state--;1845}1846(void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,1847&ncount);1848ADD_NEW_DATA(-(state_offset + count), 0, ncount);1849}1850break;1851#endif18521853/*-----------------------------------------------------------------*/1854case OP_ANYNL_EXTRA + OP_TYPEQUERY:1855case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:1856case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:1857count = 2;1858goto QS3;18591860case OP_ANYNL_EXTRA + OP_TYPESTAR:1861case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:1862case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:1863count = 0;18641865QS3:1866ADD_ACTIVE(state_offset + 2, 0);1867if (clen > 0)1868{1869int ncount = 0;1870switch (c)1871{1872case CHAR_VT:1873case CHAR_FF:1874case CHAR_NEL:1875#ifndef EBCDIC1876case 0x2028:1877case 0x2029:1878#endif /* Not EBCDIC */1879if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;1880goto ANYNL02;18811882case CHAR_CR:1883if (ptr + 1 < end_subject && 
UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;1884/* Fall through */18851886ANYNL02:1887case CHAR_LF:1888if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||1889codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)1890{1891active_count--; /* Remove non-match possibility */1892next_active_state--;1893}1894ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);1895break;18961897default:1898break;1899}1900}1901break;19021903/*-----------------------------------------------------------------*/1904case OP_VSPACE_EXTRA + OP_TYPEQUERY:1905case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:1906case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:1907count = 2;1908goto QS4;19091910case OP_VSPACE_EXTRA + OP_TYPESTAR:1911case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:1912case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:1913count = 0;19141915QS4:1916ADD_ACTIVE(state_offset + 2, 0);1917if (clen > 0)1918{1919BOOL OK;1920switch (c)1921{1922VSPACE_CASES:1923OK = TRUE;1924break;19251926default:1927OK = FALSE;1928break;1929}1930if (OK == (d == OP_VSPACE))1931{1932if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||1933codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)1934{1935active_count--; /* Remove non-match possibility */1936next_active_state--;1937}1938ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);1939}1940}1941break;19421943/*-----------------------------------------------------------------*/1944case OP_HSPACE_EXTRA + OP_TYPEQUERY:1945case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:1946case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:1947count = 2;1948goto QS5;19491950case OP_HSPACE_EXTRA + OP_TYPESTAR:1951case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:1952case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:1953count = 0;19541955QS5:1956ADD_ACTIVE(state_offset + 2, 0);1957if (clen > 0)1958{1959BOOL OK;1960switch (c)1961{1962HSPACE_CASES:1963OK = TRUE;1964break;19651966default:1967OK = FALSE;1968break;1969}19701971if (OK == (d == OP_HSPACE))1972{1973if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||1974codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)1975{1976active_count--; 
/* Remove non-match possibility */1977next_active_state--;1978}1979ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);1980}1981}1982break;19831984/*-----------------------------------------------------------------*/1985#ifdef SUPPORT_UNICODE1986case OP_PROP_EXTRA + OP_TYPEEXACT:1987case OP_PROP_EXTRA + OP_TYPEUPTO:1988case OP_PROP_EXTRA + OP_TYPEMINUPTO:1989case OP_PROP_EXTRA + OP_TYPEPOSUPTO:1990if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)1991{ ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }1992count = current_state->count; /* Number already matched */1993if (clen > 0)1994{1995BOOL OK;1996int chartype;1997const uint32_t *cp;1998const ucd_record * prop = GET_UCD(c);1999switch(code[1 + IMM2_SIZE + 1])2000{2001case PT_LAMP:2002chartype = prop->chartype;2003OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;2004break;20052006case PT_GC:2007OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];2008break;20092010case PT_PC:2011OK = prop->chartype == code[1 + IMM2_SIZE + 2];2012break;20132014case PT_SC:2015OK = prop->script == code[1 + IMM2_SIZE + 2];2016break;20172018case PT_SCX:2019OK = (prop->script == code[1 + IMM2_SIZE + 2] ||2020MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),2021code[1 + IMM2_SIZE + 2]) != 0);2022break;20232024/* These are specials for combination cases. */20252026case PT_ALNUM:2027chartype = prop->chartype;2028OK = PRIV(ucp_gentype)[chartype] == ucp_L ||2029PRIV(ucp_gentype)[chartype] == ucp_N;2030break;20312032/* Perl space used to exclude VT, but from Perl 5.18 it is included,2033which means that Perl space and POSIX space are now identical. PCRE2034was changed at release 8.34. 
*/20352036case PT_SPACE: /* Perl space */2037case PT_PXSPACE: /* POSIX space */2038switch(c)2039{2040HSPACE_CASES:2041VSPACE_CASES:2042OK = TRUE;2043break;20442045default:2046OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;2047break;2048}2049break;20502051case PT_WORD:2052chartype = prop->chartype;2053OK = PRIV(ucp_gentype)[chartype] == ucp_L ||2054PRIV(ucp_gentype)[chartype] == ucp_N ||2055chartype == ucp_Mn || chartype == ucp_Pc;2056break;20572058case PT_CLIST:2059#if PCRE2_CODE_UNIT_WIDTH == 322060if (c > MAX_UTF_CODE_POINT)2061{2062OK = FALSE;2063break;2064}2065#endif2066cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];2067for (;;)2068{2069if (c < *cp) { OK = FALSE; break; }2070if (c == *cp++) { OK = TRUE; break; }2071}2072break;20732074case PT_UCNC:2075OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||2076c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||2077c >= 0xe000;2078break;20792080case PT_BIDICL:2081OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2];2082break;20832084case PT_BOOL:2085OK = MAPBIT(PRIV(ucd_boolprop_sets) +2086UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0;2087break;20882089/* Should never occur, but keep compilers from grumbling. 
*/20902091default:2092OK = codevalue != OP_PROP;2093break;2094}20952096if (OK == (d == OP_PROP))2097{2098if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)2099{2100active_count--; /* Remove non-match possibility */2101next_active_state--;2102}2103if (++count >= (int)GET2(code, 1))2104{ ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }2105else2106{ ADD_NEW(state_offset, count); }2107}2108}2109break;21102111/*-----------------------------------------------------------------*/2112case OP_EXTUNI_EXTRA + OP_TYPEEXACT:2113case OP_EXTUNI_EXTRA + OP_TYPEUPTO:2114case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:2115case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:2116if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)2117{ ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }2118count = current_state->count; /* Number already matched */2119if (clen > 0)2120{2121PCRE2_SPTR nptr;2122int ncount = 0;2123if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)2124{2125active_count--; /* Remove non-match possibility */2126next_active_state--;2127}2128nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,2129&ncount);2130if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)2131reset_could_continue = TRUE;2132if (++count >= (int)GET2(code, 1))2133{ ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }2134else2135{ ADD_NEW_DATA(-state_offset, count, ncount); }2136}2137break;2138#endif21392140/*-----------------------------------------------------------------*/2141case OP_ANYNL_EXTRA + OP_TYPEEXACT:2142case OP_ANYNL_EXTRA + OP_TYPEUPTO:2143case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:2144case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:2145if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)2146{ ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }2147count = current_state->count; /* Number already matched */2148if (clen > 0)2149{2150int ncount = 0;2151switch (c)2152{2153case CHAR_VT:2154case CHAR_FF:2155case CHAR_NEL:2156#ifndef EBCDIC2157case 0x2028:2158case 0x2029:2159#endif /* Not EBCDIC */2160if 
(mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;2161goto ANYNL03;21622163case CHAR_CR:2164if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;2165/* Fall through */21662167ANYNL03:2168case CHAR_LF:2169if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)2170{2171active_count--; /* Remove non-match possibility */2172next_active_state--;2173}2174if (++count >= (int)GET2(code, 1))2175{ ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }2176else2177{ ADD_NEW_DATA(-state_offset, count, ncount); }2178break;21792180default:2181break;2182}2183}2184break;21852186/*-----------------------------------------------------------------*/2187case OP_VSPACE_EXTRA + OP_TYPEEXACT:2188case OP_VSPACE_EXTRA + OP_TYPEUPTO:2189case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:2190case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:2191if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)2192{ ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }2193count = current_state->count; /* Number already matched */2194if (clen > 0)2195{2196BOOL OK;2197switch (c)2198{2199VSPACE_CASES:2200OK = TRUE;2201break;22022203default:2204OK = FALSE;2205}22062207if (OK == (d == OP_VSPACE))2208{2209if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)2210{2211active_count--; /* Remove non-match possibility */2212next_active_state--;2213}2214if (++count >= (int)GET2(code, 1))2215{ ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }2216else2217{ ADD_NEW_DATA(-state_offset, count, 0); }2218}2219}2220break;22212222/*-----------------------------------------------------------------*/2223case OP_HSPACE_EXTRA + OP_TYPEEXACT:2224case OP_HSPACE_EXTRA + OP_TYPEUPTO:2225case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:2226case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:2227if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)2228{ ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }2229count = current_state->count; /* Number already matched */2230if (clen > 0)2231{2232BOOL OK;2233switch (c)2234{2235HSPACE_CASES:2236OK = TRUE;2237break;22382239default:2240OK = 
FALSE;2241break;2242}22432244if (OK == (d == OP_HSPACE))2245{2246if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)2247{2248active_count--; /* Remove non-match possibility */2249next_active_state--;2250}2251if (++count >= (int)GET2(code, 1))2252{ ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }2253else2254{ ADD_NEW_DATA(-state_offset, count, 0); }2255}2256}2257break;22582259/* ========================================================================== */2260/* These opcodes are followed by a character that is usually compared2261to the current subject character; it is loaded into d. We still get2262here even if there is no subject character, because in some cases zero2263repetitions are permitted. */22642265/*-----------------------------------------------------------------*/2266case OP_CHAR:2267if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }2268break;22692270/*-----------------------------------------------------------------*/2271case OP_CHARI:2272if (clen == 0) break;22732274#ifdef SUPPORT_UNICODE2275if (utf_or_ucp)2276{2277if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else2278{2279unsigned int othercase;2280if (c < 128)2281othercase = fcc[c];2282else2283othercase = UCD_OTHERCASE(c);2284if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }2285}2286}2287else2288#endif /* SUPPORT_UNICODE */2289/* Not UTF or UCP mode */2290{2291if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))2292{ ADD_NEW(state_offset + 2, 0); }2293}2294break;229522962297#ifdef SUPPORT_UNICODE2298/*-----------------------------------------------------------------*/2299/* This is a tricky one because it can match more than one character.2300Find out how many characters to skip, and then set up a negative state2301to wait for them to pass before continuing. 
*/23022303case OP_EXTUNI:2304if (clen > 0)2305{2306int ncount = 0;2307PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,2308end_subject, utf, &ncount);2309if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)2310reset_could_continue = TRUE;2311ADD_NEW_DATA(-(state_offset + 1), 0, ncount);2312}2313break;2314#endif23152316/*-----------------------------------------------------------------*/2317/* This is a tricky like EXTUNI because it too can match more than one2318character (when CR is followed by LF). In this case, set up a negative2319state to wait for one character to pass before continuing. */23202321case OP_ANYNL:2322if (clen > 0) switch(c)2323{2324case CHAR_VT:2325case CHAR_FF:2326case CHAR_NEL:2327#ifndef EBCDIC2328case 0x2028:2329case 0x2029:2330#endif /* Not EBCDIC */2331if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;2332/* Fall through */23332334case CHAR_LF:2335ADD_NEW(state_offset + 1, 0);2336break;23372338case CHAR_CR:2339if (ptr + 1 >= end_subject)2340{2341ADD_NEW(state_offset + 1, 0);2342if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)2343reset_could_continue = TRUE;2344}2345else if (UCHAR21TEST(ptr + 1) == CHAR_LF)2346{2347ADD_NEW_DATA(-(state_offset + 1), 0, 1);2348}2349else2350{2351ADD_NEW(state_offset + 1, 0);2352}2353break;2354}2355break;23562357/*-----------------------------------------------------------------*/2358case OP_NOT_VSPACE:2359if (clen > 0) switch(c)2360{2361VSPACE_CASES:2362break;23632364default:2365ADD_NEW(state_offset + 1, 0);2366break;2367}2368break;23692370/*-----------------------------------------------------------------*/2371case OP_VSPACE:2372if (clen > 0) switch(c)2373{2374VSPACE_CASES:2375ADD_NEW(state_offset + 1, 0);2376break;23772378default:2379break;2380}2381break;23822383/*-----------------------------------------------------------------*/2384case OP_NOT_HSPACE:2385if (clen > 0) switch(c)2386{2387HSPACE_CASES:2388break;23892390default:2391ADD_NEW(state_offset + 1, 
0);2392break;2393}2394break;23952396/*-----------------------------------------------------------------*/2397case OP_HSPACE:2398if (clen > 0) switch(c)2399{2400HSPACE_CASES:2401ADD_NEW(state_offset + 1, 0);2402break;24032404default:2405break;2406}2407break;24082409/*-----------------------------------------------------------------*/2410/* Match a negated single character casefully. */24112412case OP_NOT:2413if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }2414break;24152416/*-----------------------------------------------------------------*/2417/* Match a negated single character caselessly. */24182419case OP_NOTI:2420if (clen > 0)2421{2422uint32_t otherd;2423#ifdef SUPPORT_UNICODE2424if (utf_or_ucp && d >= 128)2425otherd = UCD_OTHERCASE(d);2426else2427#endif /* SUPPORT_UNICODE */2428otherd = TABLE_GET(d, fcc, d);2429if (c != d && c != otherd)2430{ ADD_NEW(state_offset + dlen + 1, 0); }2431}2432break;24332434/*-----------------------------------------------------------------*/2435case OP_PLUSI:2436case OP_MINPLUSI:2437case OP_POSPLUSI:2438case OP_NOTPLUSI:2439case OP_NOTMINPLUSI:2440case OP_NOTPOSPLUSI:2441caseless = TRUE;2442codevalue -= OP_STARI - OP_STAR;24432444/* Fall through */2445case OP_PLUS:2446case OP_MINPLUS:2447case OP_POSPLUS:2448case OP_NOTPLUS:2449case OP_NOTMINPLUS:2450case OP_NOTPOSPLUS:2451count = current_state->count; /* Already matched */2452if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }2453if (clen > 0)2454{2455uint32_t otherd = NOTACHAR;2456if (caseless)2457{2458#ifdef SUPPORT_UNICODE2459if (utf_or_ucp && d >= 128)2460otherd = UCD_OTHERCASE(d);2461else2462#endif /* SUPPORT_UNICODE */2463otherd = TABLE_GET(d, fcc, d);2464}2465if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))2466{2467if (count > 0 &&2468(codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))2469{2470active_count--; /* Remove non-match possibility */2471next_active_state--;2472}2473count++;2474ADD_NEW(state_offset, 
count);2475}2476}2477break;24782479/*-----------------------------------------------------------------*/2480case OP_QUERYI:2481case OP_MINQUERYI:2482case OP_POSQUERYI:2483case OP_NOTQUERYI:2484case OP_NOTMINQUERYI:2485case OP_NOTPOSQUERYI:2486caseless = TRUE;2487codevalue -= OP_STARI - OP_STAR;2488/* Fall through */2489case OP_QUERY:2490case OP_MINQUERY:2491case OP_POSQUERY:2492case OP_NOTQUERY:2493case OP_NOTMINQUERY:2494case OP_NOTPOSQUERY:2495ADD_ACTIVE(state_offset + dlen + 1, 0);2496if (clen > 0)2497{2498uint32_t otherd = NOTACHAR;2499if (caseless)2500{2501#ifdef SUPPORT_UNICODE2502if (utf_or_ucp && d >= 128)2503otherd = UCD_OTHERCASE(d);2504else2505#endif /* SUPPORT_UNICODE */2506otherd = TABLE_GET(d, fcc, d);2507}2508if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))2509{2510if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)2511{2512active_count--; /* Remove non-match possibility */2513next_active_state--;2514}2515ADD_NEW(state_offset + dlen + 1, 0);2516}2517}2518break;25192520/*-----------------------------------------------------------------*/2521case OP_STARI:2522case OP_MINSTARI:2523case OP_POSSTARI:2524case OP_NOTSTARI:2525case OP_NOTMINSTARI:2526case OP_NOTPOSSTARI:2527caseless = TRUE;2528codevalue -= OP_STARI - OP_STAR;2529/* Fall through */2530case OP_STAR:2531case OP_MINSTAR:2532case OP_POSSTAR:2533case OP_NOTSTAR:2534case OP_NOTMINSTAR:2535case OP_NOTPOSSTAR:2536ADD_ACTIVE(state_offset + dlen + 1, 0);2537if (clen > 0)2538{2539uint32_t otherd = NOTACHAR;2540if (caseless)2541{2542#ifdef SUPPORT_UNICODE2543if (utf_or_ucp && d >= 128)2544otherd = UCD_OTHERCASE(d);2545else2546#endif /* SUPPORT_UNICODE */2547otherd = TABLE_GET(d, fcc, d);2548}2549if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))2550{2551if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)2552{2553active_count--; /* Remove non-match possibility */2554next_active_state--;2555}2556ADD_NEW(state_offset, 
0);2557}2558}2559break;25602561/*-----------------------------------------------------------------*/2562case OP_EXACTI:2563case OP_NOTEXACTI:2564caseless = TRUE;2565codevalue -= OP_STARI - OP_STAR;2566/* Fall through */2567case OP_EXACT:2568case OP_NOTEXACT:2569count = current_state->count; /* Number already matched */2570if (clen > 0)2571{2572uint32_t otherd = NOTACHAR;2573if (caseless)2574{2575#ifdef SUPPORT_UNICODE2576if (utf_or_ucp && d >= 128)2577otherd = UCD_OTHERCASE(d);2578else2579#endif /* SUPPORT_UNICODE */2580otherd = TABLE_GET(d, fcc, d);2581}2582if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))2583{2584if (++count >= (int)GET2(code, 1))2585{ ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }2586else2587{ ADD_NEW(state_offset, count); }2588}2589}2590break;25912592/*-----------------------------------------------------------------*/2593case OP_UPTOI:2594case OP_MINUPTOI:2595case OP_POSUPTOI:2596case OP_NOTUPTOI:2597case OP_NOTMINUPTOI:2598case OP_NOTPOSUPTOI:2599caseless = TRUE;2600codevalue -= OP_STARI - OP_STAR;2601/* Fall through */2602case OP_UPTO:2603case OP_MINUPTO:2604case OP_POSUPTO:2605case OP_NOTUPTO:2606case OP_NOTMINUPTO:2607case OP_NOTPOSUPTO:2608ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);2609count = current_state->count; /* Number already matched */2610if (clen > 0)2611{2612uint32_t otherd = NOTACHAR;2613if (caseless)2614{2615#ifdef SUPPORT_UNICODE2616if (utf_or_ucp && d >= 128)2617otherd = UCD_OTHERCASE(d);2618else2619#endif /* SUPPORT_UNICODE */2620otherd = TABLE_GET(d, fcc, d);2621}2622if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))2623{2624if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)2625{2626active_count--; /* Remove non-match possibility */2627next_active_state--;2628}2629if (++count >= (int)GET2(code, 1))2630{ ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }2631else2632{ ADD_NEW(state_offset, count); }2633}2634}2635break;263626372638/* 
========================================================================== */2639/* These are the class-handling opcodes */26402641case OP_CLASS:2642case OP_NCLASS:2643#ifdef SUPPORT_WIDE_CHARS2644case OP_XCLASS:2645case OP_ECLASS:2646#endif2647{2648BOOL isinclass = FALSE;2649int next_state_offset;2650PCRE2_SPTR ecode;26512652#ifdef SUPPORT_WIDE_CHARS2653/* An extended class may have a table or a list of single characters,2654ranges, or both, and it may be positive or negative. There's a2655function that sorts all this out. */26562657if (codevalue == OP_XCLASS)2658{2659ecode = code + GET(code, 1);2660if (clen > 0)2661isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE,2662(const uint8_t*)mb->start_code, utf);2663}26642665/* A nested set-based class has internal opcodes for performing2666set operations. */26672668else if (codevalue == OP_ECLASS)2669{2670ecode = code + GET(code, 1);2671if (clen > 0)2672isinclass = PRIV(eclass)(c, code + 1 + LINK_SIZE, ecode,2673(const uint8_t*)mb->start_code, utf);2674}26752676else2677#endif /* SUPPORT_WIDE_CHARS */26782679/* For a simple class, there is always just a 32-byte table, and we2680can set isinclass from it. */26812682{2683ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));2684if (clen > 0)2685{2686isinclass = (c > 255)? (codevalue == OP_NCLASS) :2687((((const uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);2688}2689}26902691/* At this point, isinclass is set for all kinds of class, and ecode2692points to the byte after the end of the class. If there is a2693quantifier, this is where it will be. 
*/26942695next_state_offset = (int)(ecode - start_code);26962697switch (*ecode)2698{2699case OP_CRSTAR:2700case OP_CRMINSTAR:2701case OP_CRPOSSTAR:2702ADD_ACTIVE(next_state_offset + 1, 0);2703if (isinclass)2704{2705if (*ecode == OP_CRPOSSTAR)2706{2707active_count--; /* Remove non-match possibility */2708next_active_state--;2709}2710ADD_NEW(state_offset, 0);2711}2712break;27132714case OP_CRPLUS:2715case OP_CRMINPLUS:2716case OP_CRPOSPLUS:2717count = current_state->count; /* Already matched */2718if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }2719if (isinclass)2720{2721if (count > 0 && *ecode == OP_CRPOSPLUS)2722{2723active_count--; /* Remove non-match possibility */2724next_active_state--;2725}2726count++;2727ADD_NEW(state_offset, count);2728}2729break;27302731case OP_CRQUERY:2732case OP_CRMINQUERY:2733case OP_CRPOSQUERY:2734ADD_ACTIVE(next_state_offset + 1, 0);2735if (isinclass)2736{2737if (*ecode == OP_CRPOSQUERY)2738{2739active_count--; /* Remove non-match possibility */2740next_active_state--;2741}2742ADD_NEW(next_state_offset + 1, 0);2743}2744break;27452746case OP_CRRANGE:2747case OP_CRMINRANGE:2748case OP_CRPOSRANGE:2749count = current_state->count; /* Already matched */2750if (count >= (int)GET2(ecode, 1))2751{ ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }2752if (isinclass)2753{2754int max = (int)GET2(ecode, 1 + IMM2_SIZE);27552756if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))2757{2758active_count--; /* Remove non-match possibility */2759next_active_state--;2760}27612762if (++count >= max && max != 0) /* Max 0 => no limit */2763{ ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }2764else2765{ ADD_NEW(state_offset, count); }2766}2767break;27682769default:2770if (isinclass) { ADD_NEW(next_state_offset, 0); }2771break;2772}2773}2774break;27752776/* ========================================================================== */2777/* These are the opcodes for fancy brackets of various kinds. 
We have2778to use recursion in order to handle them. The "always failing" assertion2779(?!) is optimised to OP_FAIL when compiling, so we have to support that,2780though the other "backtracking verbs" are not supported. */27812782case OP_FAIL:2783break;27842785case OP_ASSERT:2786case OP_ASSERT_NOT:2787case OP_ASSERTBACK:2788case OP_ASSERTBACK_NOT:2789{2790int rc;2791int *local_workspace;2792PCRE2_SIZE *local_offsets;2793PCRE2_SPTR endasscode = code + GET(code, 1);2794RWS_anchor *rws = (RWS_anchor *)RWS;27952796if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)2797{2798rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);2799if (rc != 0) return rc;2800RWS = (int *)rws;2801}28022803local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);2804local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;2805rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;28062807while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);28082809rc = internal_dfa_match(2810mb, /* static match data */2811code, /* this subexpression's code */2812ptr, /* where we currently are */2813(PCRE2_SIZE)(ptr - start_subject), /* start offset */2814local_offsets, /* offset vector */2815RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */2816local_workspace, /* workspace vector */2817RWS_RSIZE, /* size of same */2818rlevel, /* function recursion level */2819RWS); /* recursion workspace */28202821rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;28222823if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;2824if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))2825{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }2826}2827break;28282829/*-----------------------------------------------------------------*/2830case OP_COND:2831case OP_SCOND:2832{2833int codelink = (int)GET(code, 1);2834PCRE2_UCHAR condcode;28352836/* Because of the way auto-callout works during compile, a callout item2837is inserted between OP_COND and an assertion condition. This does not2838happen for the other conditions. 
*/28392840if (code[LINK_SIZE + 1] == OP_CALLOUT2841|| code[LINK_SIZE + 1] == OP_CALLOUT_STR)2842{2843PCRE2_SIZE callout_length;2844rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,28451 + LINK_SIZE, &callout_length);2846if (rrc < 0) return rrc; /* Abandon */2847if (rrc > 0) break; /* Fail this thread */2848code += callout_length; /* Skip callout data */2849}28502851condcode = code[LINK_SIZE+1];28522853/* Back reference conditions and duplicate named recursion conditions2854are not supported */28552856if (condcode == OP_CREF || condcode == OP_DNCREF ||2857condcode == OP_DNRREF)2858return PCRE2_ERROR_DFA_UCOND;28592860/* The DEFINE condition is always false, and the assertion (?!) is2861converted to OP_FAIL. */28622863if (condcode == OP_FALSE || condcode == OP_FAIL)2864{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }28652866/* There is also an always-true condition */28672868else if (condcode == OP_TRUE)2869{ ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }28702871/* The only supported version of OP_RREF is for the value RREF_ANY,2872which means "test if in any recursion". We can't test for specifically2873recursed groups. 
*/28742875else if (condcode == OP_RREF)2876{2877unsigned int value = GET2(code, LINK_SIZE + 2);2878if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;2879if (mb->recursive != NULL)2880{ ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }2881else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }2882}28832884/* Otherwise, the condition is an assertion */28852886else2887{2888int rc;2889int *local_workspace;2890PCRE2_SIZE *local_offsets;2891PCRE2_SPTR asscode = code + LINK_SIZE + 1;2892PCRE2_SPTR endasscode = asscode + GET(asscode, 1);2893RWS_anchor *rws = (RWS_anchor *)RWS;28942895if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)2896{2897rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);2898if (rc != 0) return rc;2899RWS = (int *)rws;2900}29012902local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);2903local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;2904rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;29052906while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);29072908rc = internal_dfa_match(2909mb, /* fixed match data */2910asscode, /* this subexpression's code */2911ptr, /* where we currently are */2912(PCRE2_SIZE)(ptr - start_subject), /* start offset */2913local_offsets, /* offset vector */2914RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */2915local_workspace, /* workspace vector */2916RWS_RSIZE, /* size of same */2917rlevel, /* function recursion level */2918RWS); /* recursion workspace */29192920rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;29212922if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;2923if ((rc >= 0) ==2924(condcode == OP_ASSERT || condcode == OP_ASSERTBACK))2925{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }2926else2927{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }2928}2929}2930break;29312932/*-----------------------------------------------------------------*/2933case OP_RECURSE:2934{2935int rc;2936int *local_workspace;2937PCRE2_SIZE *local_offsets;2938RWS_anchor *rws = (RWS_anchor 
*)RWS;2939PCRE2_SPTR callpat = start_code + GET(code, 1);2940uint32_t recno = (callpat == mb->start_code)? 0 :2941GET2(callpat, 1 + LINK_SIZE);29422943if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)2944{2945rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);2946if (rc != 0) return rc;2947RWS = (int *)rws;2948}29492950local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);2951local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;2952rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;29532954/* Check for repeating a recursion without advancing the subject2955pointer or last used character. This should catch convoluted mutual2956recursions. (Some simple cases are caught at compile time.) */29572958for (dfa_recursion_info *ri = mb->recursive;2959ri != NULL;2960ri = ri->prevrec)2961{2962if (recno == ri->group_num && ptr == ri->subject_position &&2963mb->last_used_ptr == ri->last_used_ptr)2964return PCRE2_ERROR_RECURSELOOP;2965}29662967/* Remember this recursion and where we started it so as to2968catch infinite loops. 
*/29692970new_recursive.group_num = recno;2971new_recursive.subject_position = ptr;2972new_recursive.last_used_ptr = mb->last_used_ptr;2973new_recursive.prevrec = mb->recursive;2974mb->recursive = &new_recursive;29752976rc = internal_dfa_match(2977mb, /* fixed match data */2978callpat, /* this subexpression's code */2979ptr, /* where we currently are */2980(PCRE2_SIZE)(ptr - start_subject), /* start offset */2981local_offsets, /* offset vector */2982RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */2983local_workspace, /* workspace vector */2984RWS_RSIZE, /* size of same */2985rlevel, /* function recursion level */2986RWS); /* recursion workspace */29872988rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;2989mb->recursive = new_recursive.prevrec; /* Done this recursion */29902991/* Ran out of internal offsets */29922993if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;29942995/* For each successful matched substring, set up the next state with a2996count of characters to skip before trying it. Note that the count is in2997characters, not bytes. 
*/29982999if (rc > 0)3000{3001for (rc = rc*2 - 2; rc >= 0; rc -= 2)3002{3003PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];3004#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 323005if (utf)3006{3007PCRE2_SPTR p = start_subject + local_offsets[rc];3008PCRE2_SPTR pp = start_subject + local_offsets[rc+1];3009while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;3010}3011#endif3012if (charcount > 0)3013{3014ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,3015(int)(charcount - 1));3016}3017else3018{3019ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);3020}3021}3022}3023else if (rc != PCRE2_ERROR_NOMATCH) return rc;3024}3025break;30263027/*-----------------------------------------------------------------*/3028case OP_BRAPOS:3029case OP_SBRAPOS:3030case OP_CBRAPOS:3031case OP_SCBRAPOS:3032case OP_BRAPOSZERO:3033{3034int rc;3035int *local_workspace;3036PCRE2_SIZE *local_offsets;3037PCRE2_SIZE charcount, matched_count;3038PCRE2_SPTR local_ptr = ptr;3039RWS_anchor *rws = (RWS_anchor *)RWS;3040BOOL allow_zero;30413042if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)3043{3044rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);3045if (rc != 0) return rc;3046RWS = (int *)rws;3047}30483049local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);3050local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;3051rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;30523053if (codevalue == OP_BRAPOSZERO)3054{3055allow_zero = TRUE;3056++code; /* The following opcode will be one of the above BRAs */3057}3058else allow_zero = FALSE;30593060/* Loop to match the subpattern as many times as possible as if it were3061a complete pattern. 
*/30623063for (matched_count = 0;; matched_count++)3064{3065rc = internal_dfa_match(3066mb, /* fixed match data */3067code, /* this subexpression's code */3068local_ptr, /* where we currently are */3069(PCRE2_SIZE)(ptr - start_subject), /* start offset */3070local_offsets, /* offset vector */3071RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */3072local_workspace, /* workspace vector */3073RWS_RSIZE, /* size of same */3074rlevel, /* function recursion level */3075RWS); /* recursion workspace */30763077/* Failed to match */30783079if (rc < 0)3080{3081if (rc != PCRE2_ERROR_NOMATCH) return rc;3082break;3083}30843085/* Matched: break the loop if zero characters matched. */30863087charcount = local_offsets[1] - local_offsets[0];3088if (charcount == 0) break;3089local_ptr += charcount; /* Advance temporary position ptr */3090}30913092rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;30933094/* At this point we have matched the subpattern matched_count3095times, and local_ptr is pointing to the character after the end of the3096last match. */30973098if (matched_count > 0 || allow_zero)3099{3100PCRE2_SPTR end_subpattern = code;3101int next_state_offset;31023103do { end_subpattern += GET(end_subpattern, 1); }3104while (*end_subpattern == OP_ALT);3105next_state_offset =3106(int)(end_subpattern - start_code + LINK_SIZE + 1);31073108/* Optimization: if there are no more active states, and there3109are no new states yet set up, then skip over the subject string3110right here, to save looping. Otherwise, set up the new state to swing3111into action when the end of the matched substring is reached. 
*/31123113if (i + 1 >= active_count && new_count == 0)3114{3115ptr = local_ptr;3116clen = 0;3117ADD_NEW(next_state_offset, 0);3118}3119else3120{3121PCRE2_SPTR p = ptr;3122PCRE2_SPTR pp = local_ptr;3123charcount = (PCRE2_SIZE)(pp - p);3124#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 323125if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;3126#endif3127ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));3128}3129}3130}3131break;31323133/*-----------------------------------------------------------------*/3134case OP_ONCE:3135{3136int rc;3137int *local_workspace;3138PCRE2_SIZE *local_offsets;3139RWS_anchor *rws = (RWS_anchor *)RWS;31403141if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)3142{3143rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);3144if (rc != 0) return rc;3145RWS = (int *)rws;3146}31473148local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);3149local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;3150rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;31513152rc = internal_dfa_match(3153mb, /* fixed match data */3154code, /* this subexpression's code */3155ptr, /* where we currently are */3156(PCRE2_SIZE)(ptr - start_subject), /* start offset */3157local_offsets, /* offset vector */3158RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */3159local_workspace, /* workspace vector */3160RWS_RSIZE, /* size of same */3161rlevel, /* function recursion level */3162RWS); /* recursion workspace */31633164rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;31653166if (rc >= 0)3167{3168PCRE2_SPTR end_subpattern = code;3169PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];3170int next_state_offset, repeat_state_offset;31713172do { end_subpattern += GET(end_subpattern, 1); }3173while (*end_subpattern == OP_ALT);3174next_state_offset =3175(int)(end_subpattern - start_code + LINK_SIZE + 1);31763177/* If the end of this subpattern is KETRMAX or KETRMIN, we must3178arrange for the repeat state also to be added to the relevant list.3179Calculate the offset, or set -1 
for no repeat. */31803181repeat_state_offset = (*end_subpattern == OP_KETRMAX ||3182*end_subpattern == OP_KETRMIN)?3183(int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;31843185/* If we have matched an empty string, add the next state at the3186current character pointer. This is important so that the duplicate3187checking kicks in, which is what breaks infinite loops that match an3188empty string. */31893190if (charcount == 0)3191{3192ADD_ACTIVE(next_state_offset, 0);3193}31943195/* Optimization: if there are no more active states, and there3196are no new states yet set up, then skip over the subject string3197right here, to save looping. Otherwise, set up the new state to swing3198into action when the end of the matched substring is reached. */31993200else if (i + 1 >= active_count && new_count == 0)3201{3202ptr += charcount;3203clen = 0;3204ADD_NEW(next_state_offset, 0);32053206/* If we are adding a repeat state at the new character position,3207we must fudge things so that it is the only current state.3208Otherwise, it might be a duplicate of one we processed before, and3209that would cause it to be skipped. 
*/32103211if (repeat_state_offset >= 0)3212{3213next_active_state = active_states;3214active_count = 0;3215i = -1;3216ADD_ACTIVE(repeat_state_offset, 0);3217}3218}3219else3220{3221#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 323222if (utf)3223{3224PCRE2_SPTR p = start_subject + local_offsets[0];3225PCRE2_SPTR pp = start_subject + local_offsets[1];3226while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;3227}3228#endif3229ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));3230if (repeat_state_offset >= 0)3231{ ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }3232}3233}3234else if (rc != PCRE2_ERROR_NOMATCH) return rc;3235}3236break;323732383239/* ========================================================================== */3240/* Handle callouts */32413242case OP_CALLOUT:3243case OP_CALLOUT_STR:3244{3245PCRE2_SIZE callout_length;3246rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,3247&callout_length);3248if (rrc < 0) return rrc; /* Abandon */3249if (rrc == 0)3250{ ADD_ACTIVE(state_offset + (int)callout_length, 0); }3251}3252break;325332543255/* ========================================================================== */3256default: /* Unsupported opcode */3257return PCRE2_ERROR_DFA_UITEM;3258}32593260NEXT_ACTIVE_STATE: continue;32613262} /* End of loop scanning active states */32633264/* We have finished the processing at the current subject character. If no3265new states have been set for the next character, we have found all the3266matches that we are going to find. If partial matching has been requested,3267check for appropriate conditions.32683269The "could_continue" variable is true if a state could have continued but3270for the fact that the end of the subject was reached. */32713272if (new_count <= 0)3273{3274if (could_continue && /* Some could go on, and */3275( /* either... */3276(mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */3277|| /* or... 
*/3278((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */3279match_count < 0) /* no matches */3280) && /* And... */3281(3282partial_newline || /* Either partial NL */3283( /* or ... */3284ptr >= end_subject && /* End of subject and */3285( /* either */3286ptr > mb->start_used_ptr || /* Inspected non-empty string */3287mb->allowemptypartial /* or pattern has lookbehind */3288) /* or could match empty */3289)3290))3291match_count = PCRE2_ERROR_PARTIAL;3292break; /* Exit from loop along the subject string */3293}32943295/* One or more states are active for the next character. */32963297ptr += clen; /* Advance to next subject character */3298} /* Loop to move along the subject string */32993300/* Control gets here from "break" a few lines above. If we have a match and3301PCRE2_ENDANCHORED is set, the match fails. */33023303if (match_count >= 0 &&3304((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&3305ptr < end_subject)3306match_count = PCRE2_ERROR_NOMATCH;33073308return match_count;3309}3310331133123313/*************************************************3314* Match a pattern using the DFA algorithm *3315*************************************************/33163317/* This function matches a compiled pattern to a subject string, using the3318alternate matching algorithm that finds all matches at once.33193320Arguments:3321code points to the compiled pattern3322subject subject string3323length length of subject string3324start_offset where to start matching in the subject3325options option bits3326match_data points to a match data structure3327mcontext points to a match context3328workspace pointer to workspace3329wscount size of workspace33303331Returns: > 0 => number of match offset pairs placed in offsets3332= 0 => offsets overflowed; longest matches are present3333-1 => failed to match3334< -1 => some kind of unexpected problem3335*/33363337PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION3338pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, 
PCRE2_SIZE length,3339PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,3340pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)3341{3342int rc;3343int was_zero_terminated = 0;33443345const pcre2_real_code *re = (const pcre2_real_code *)code;33463347PCRE2_SPTR start_match;3348PCRE2_SPTR end_subject;3349PCRE2_SPTR bumpalong_limit;3350PCRE2_SPTR req_cu_ptr;33513352BOOL utf, anchored, startline, firstline;3353BOOL has_first_cu = FALSE;3354BOOL has_req_cu = FALSE;33553356#if PCRE2_CODE_UNIT_WIDTH == 83357PCRE2_SPTR memchr_found_first_cu = NULL;3358PCRE2_SPTR memchr_found_first_cu2 = NULL;3359#endif33603361PCRE2_UCHAR first_cu = 0;3362PCRE2_UCHAR first_cu2 = 0;3363PCRE2_UCHAR req_cu = 0;3364PCRE2_UCHAR req_cu2 = 0;33653366const uint8_t *start_bits = NULL;33673368/* We need to have mb pointing to a match block, because the IS_NEWLINE macro3369is used below, and it expects NLBLOCK to be defined as a pointer. */33703371pcre2_callout_block cb;3372dfa_match_block actual_match_block;3373dfa_match_block *mb = &actual_match_block;33743375/* Set up a starting block of memory for use during recursive calls to3376internal_dfa_match(). By putting this on the stack, it minimizes resource use3377in the case when it is not needed. If this is too small, more memory is3378obtained from the heap. At the start of each block is an anchor structure.*/33793380int base_recursion_workspace[RWS_BASE_SIZE];3381RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;3382rws->next = NULL;3383rws->size = RWS_BASE_SIZE;3384rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;33853386/* Recognize NULL, length 0 as an empty string. 
*/33873388if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";33893390/* Plausibility checks */33913392if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;3393if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)3394return PCRE2_ERROR_NULL;33953396if (length == PCRE2_ZERO_TERMINATED)3397{3398length = PRIV(strlen)(subject);3399was_zero_terminated = 1;3400}34013402if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;3403if (start_offset > length) return PCRE2_ERROR_BADOFFSET;34043405/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same3406time. */34073408if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&3409((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)3410return PCRE2_ERROR_BADOPTION;34113412/* Invalid UTF support is not available for DFA matching. */34133414if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)3415return PCRE2_ERROR_DFA_UINVALID_UTF;34163417/* Check that the first field in the block is the magic number. If it is not,3418return with PCRE2_ERROR_BADMAGIC. */34193420if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;34213422/* Check the code unit width. */34233424if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)3425return PCRE2_ERROR_BADMODE;34263427/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the3428options variable for this function. Users of PCRE2 who are not calling the3429function directly would like to have a way of setting these flags, in the same3430way that they can set pcre2_compile() flags like PCRE2_NO_AUTO_POSSESS with3431constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and3432(*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be3433transferred to the options for this function. The bits are guaranteed to be3434adjacent, but do not have the same values. 
This bit of Boolean trickery assumes3435that the match-time bits are not more significant than the flag bits. If by3436accident this is not the case, a compile-time division by zero error will3437occur. */34383439#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)3440#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)3441options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));3442#undef FF3443#undef OO34443445/* If restarting after a partial match, do some sanity checks on the contents3446of the workspace. */34473448if ((options & PCRE2_DFA_RESTART) != 0)3449{3450if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||3451workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))3452return PCRE2_ERROR_DFA_BADRESTART;3453}34543455/* Set some local values */34563457utf = (re->overall_options & PCRE2_UTF) != 0;3458start_match = subject + start_offset;3459end_subject = subject + length;3460req_cu_ptr = start_match - 1;3461anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||3462(re->overall_options & PCRE2_ANCHORED) != 0;34633464/* The "must be at the start of a line" flags are used in a loop when finding3465where to start. */34663467startline = (re->flags & PCRE2_STARTLINE) != 0;3468firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;3469bumpalong_limit = end_subject;34703471/* Initialize and set up the fixed fields in the callout block, with a pointer3472in the match block. */34733474mb->cb = &cb;3475cb.version = 2;3476cb.subject = subject;3477cb.subject_length = (PCRE2_SIZE)(end_subject - subject);3478cb.callout_flags = 0;3479cb.capture_top = 1; /* No capture support */3480cb.capture_last = 0;3481cb.mark = NULL; /* No (*MARK) support */34823483/* Get data from the match context, if present, and fill in the remaining3484fields in the match block. It is an error to set an offset limit without3485setting the flag at compile time. 
*/34863487if (mcontext == NULL)3488{3489mb->callout = NULL;3490mb->memctl = re->memctl;3491mb->match_limit = PRIV(default_match_context).match_limit;3492mb->match_limit_depth = PRIV(default_match_context).depth_limit;3493mb->heap_limit = PRIV(default_match_context).heap_limit;3494}3495else3496{3497if (mcontext->offset_limit != PCRE2_UNSET)3498{3499if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)3500return PCRE2_ERROR_BADOFFSETLIMIT;3501bumpalong_limit = subject + mcontext->offset_limit;3502}3503mb->callout = mcontext->callout;3504mb->callout_data = mcontext->callout_data;3505mb->memctl = mcontext->memctl;3506mb->match_limit = mcontext->match_limit;3507mb->match_limit_depth = mcontext->depth_limit;3508mb->heap_limit = mcontext->heap_limit;3509}35103511if (mb->match_limit > re->limit_match)3512mb->match_limit = re->limit_match;35133514if (mb->match_limit_depth > re->limit_depth)3515mb->match_limit_depth = re->limit_depth;35163517if (mb->heap_limit > re->limit_heap)3518mb->heap_limit = re->limit_heap;35193520mb->start_code = (PCRE2_SPTR)((const uint8_t *)re + re->code_start);3521mb->tables = re->tables;3522mb->start_subject = subject;3523mb->end_subject = end_subject;3524mb->start_offset = start_offset;3525mb->allowemptypartial = (re->max_lookbehind > 0) ||3526(re->flags & PCRE2_MATCH_EMPTY) != 0;3527mb->moptions = options;3528mb->poptions = re->overall_options;3529mb->match_call_count = 0;3530mb->heap_used = 0;35313532/* Process the \R and newline settings. 
*/35333534mb->bsr_convention = re->bsr_convention;3535mb->nltype = NLTYPE_FIXED;3536switch(re->newline_convention)3537{3538case PCRE2_NEWLINE_CR:3539mb->nllen = 1;3540mb->nl[0] = CHAR_CR;3541break;35423543case PCRE2_NEWLINE_LF:3544mb->nllen = 1;3545mb->nl[0] = CHAR_NL;3546break;35473548case PCRE2_NEWLINE_NUL:3549mb->nllen = 1;3550mb->nl[0] = CHAR_NUL;3551break;35523553case PCRE2_NEWLINE_CRLF:3554mb->nllen = 2;3555mb->nl[0] = CHAR_CR;3556mb->nl[1] = CHAR_NL;3557break;35583559case PCRE2_NEWLINE_ANY:3560mb->nltype = NLTYPE_ANY;3561break;35623563case PCRE2_NEWLINE_ANYCRLF:3564mb->nltype = NLTYPE_ANYCRLF;3565break;35663567default:3568PCRE2_DEBUG_UNREACHABLE();3569return PCRE2_ERROR_INTERNAL;3570}35713572/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,3573we must also check that a starting offset does not point into the middle of a3574multiunit character. We check only the portion of the subject that is going to3575be inspected during matching - from the offset minus the maximum back reference3576to the given length. This saves time when a small part of a large subject is3577being matched by the use of a starting offset. Note that the maximum lookbehind3578is a number of characters, not code units. */35793580#ifdef SUPPORT_UNICODE3581if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)3582{3583PCRE2_SPTR check_subject = start_match; /* start_match includes offset */35843585if (start_offset > 0)3586{3587#if PCRE2_CODE_UNIT_WIDTH != 323588unsigned int i;3589if (start_match < end_subject && NOT_FIRSTCU(*start_match))3590return PCRE2_ERROR_BADUTFOFFSET;3591for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)3592{3593check_subject--;3594while (check_subject > subject &&3595#if PCRE2_CODE_UNIT_WIDTH == 83596(*check_subject & 0xc0) == 0x80)3597#else /* 16-bit */3598(*check_subject & 0xfc00) == 0xdc00)3599#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */3600check_subject--;3601}3602#else /* In the 32-bit library, one code unit equals one character. 
*/3603check_subject -= re->max_lookbehind;3604if (check_subject < subject) check_subject = subject;3605#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */3606}36073608/* Validate the relevant portion of the subject. After an error, adjust the3609offset to be an absolute offset in the whole string. */36103611match_data->rc = PRIV(valid_utf)(check_subject,3612length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));3613if (match_data->rc != 0)3614{3615match_data->startchar += (PCRE2_SIZE)(check_subject - subject);3616return match_data->rc;3617}3618}3619#endif /* SUPPORT_UNICODE */36203621/* Set up the first code unit to match, if available. If there's no first code3622unit there may be a bitmap of possible first characters. */36233624if ((re->flags & PCRE2_FIRSTSET) != 0)3625{3626has_first_cu = TRUE;3627first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);3628if ((re->flags & PCRE2_FIRSTCASELESS) != 0)3629{3630first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);3631#ifdef SUPPORT_UNICODE3632#if PCRE2_CODE_UNIT_WIDTH == 83633if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)3634first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);3635#else3636if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))3637first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);3638#endif3639#endif /* SUPPORT_UNICODE */3640}3641}3642else3643if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)3644start_bits = re->start_bitmap;36453646/* There may be a "last known required code unit" set. 
*/36473648if ((re->flags & PCRE2_LASTSET) != 0)3649{3650has_req_cu = TRUE;3651req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);3652if ((re->flags & PCRE2_LASTCASELESS) != 0)3653{3654req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);3655#ifdef SUPPORT_UNICODE3656#if PCRE2_CODE_UNIT_WIDTH == 83657if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)3658req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);3659#else3660if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))3661req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);3662#endif3663#endif /* SUPPORT_UNICODE */3664}3665}36663667/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,3668free the memory that was obtained. */36693670if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)3671{3672match_data->memctl.free((void *)match_data->subject,3673match_data->memctl.memory_data);3674match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;3675}36763677/* Fill in fields that are always returned in the match data. */36783679match_data->code = re;3680match_data->subject = NULL; /* Default for no match */3681match_data->mark = NULL;3682match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;36833684/* Call the main matching function, looping for a non-anchored regex after a3685failed match. If not restarting, perform certain optimizations at the start of3686a match. */36873688for (;;)3689{3690/* ----------------- Start of match optimizations ---------------- */36913692/* There are some optimizations that avoid running the match if a known3693starting point is not found, or if a known later code unit is not present.3694However, there is an option (settable at compile time) that disables3695these, for testing and for ensuring that all callouts do actually occur.3696The optimizations must also be avoided when restarting a DFA match. 
*/36973698if ((re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0 &&3699(options & PCRE2_DFA_RESTART) == 0)3700{3701/* If firstline is TRUE, the start of the match is constrained to the first3702line of a multiline string. That is, the match must be before or at the3703first newline following the start of matching. Temporarily adjust3704end_subject so that we stop the optimization scans for a first code unit3705immediately after the first character of a newline (the first code unit can3706legitimately be a newline). If the match fails at the newline, later code3707breaks this loop. */37083709if (firstline)3710{3711PCRE2_SPTR t = start_match;3712#ifdef SUPPORT_UNICODE3713if (utf)3714{3715while (t < end_subject && !IS_NEWLINE(t))3716{3717t++;3718ACROSSCHAR(t < end_subject, t, t++);3719}3720}3721else3722#endif3723while (t < end_subject && !IS_NEWLINE(t)) t++;3724end_subject = t;3725}37263727/* Anchored: check the first code unit if one is recorded. This may seem3728pointless but it can help in detecting a no match case without scanning for3729the required code unit. */37303731if (anchored)3732{3733if (has_first_cu || start_bits != NULL)3734{3735BOOL ok = start_match < end_subject;3736if (ok)3737{3738PCRE2_UCHAR c = UCHAR21TEST(start_match);3739ok = has_first_cu && (c == first_cu || c == first_cu2);3740if (!ok && start_bits != NULL)3741{3742#if PCRE2_CODE_UNIT_WIDTH != 83743if (c > 255) c = 255;3744#endif3745ok = (start_bits[c/8] & (1u << (c&7))) != 0;3746}3747}3748if (!ok) break;3749}3750}37513752/* Not anchored. Advance to a unique first code unit if there is one. */37533754else3755{3756if (has_first_cu)3757{3758if (first_cu != first_cu2) /* Caseless */3759{3760/* In 16-bit and 32_bit modes we have to do our own search, so can3761look for both cases at once. 
*/37623763#if PCRE2_CODE_UNIT_WIDTH != 83764PCRE2_UCHAR smc;3765while (start_match < end_subject &&3766(smc = UCHAR21TEST(start_match)) != first_cu &&3767smc != first_cu2)3768start_match++;3769#else3770/* In 8-bit mode, the use of memchr() gives a big speed up, even3771though we have to call it twice in order to find the earliest3772occurrence of the code unit in either of its cases. Caching is used3773to remember the positions of previously found code units. This can3774make a huge difference when the strings are very long and only one3775case is actually present. */37763777PCRE2_SPTR pp1 = NULL;3778PCRE2_SPTR pp2 = NULL;3779PCRE2_SIZE searchlength = end_subject - start_match;37803781/* If we haven't got a previously found position for first_cu, or if3782the current starting position is later, we need to do a search. If3783the code unit is not found, set it to the end. */37843785if (memchr_found_first_cu == NULL ||3786start_match > memchr_found_first_cu)3787{3788pp1 = memchr(start_match, first_cu, searchlength);3789memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;3790}37913792/* If the start is before a previously found position, use the3793previous position, or NULL if a previous search failed. */37943795else pp1 = (memchr_found_first_cu == end_subject)? NULL :3796memchr_found_first_cu;37973798/* Do the same thing for the other case. */37993800if (memchr_found_first_cu2 == NULL ||3801start_match > memchr_found_first_cu2)3802{3803pp2 = memchr(start_match, first_cu2, searchlength);3804memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;3805}38063807else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :3808memchr_found_first_cu2;38093810/* Set the start to the end of the subject if neither case was found.3811Otherwise, use the earlier found point. */38123813if (pp1 == NULL)3814start_match = (pp2 == NULL)? end_subject : pp2;3815else3816start_match = (pp2 == NULL || pp1 < pp2)? 
pp1 : pp2;38173818#endif /* 8-bit handling */3819}38203821/* The caseful case is much simpler. */38223823else3824{3825#if PCRE2_CODE_UNIT_WIDTH != 83826while (start_match < end_subject && UCHAR21TEST(start_match) !=3827first_cu)3828start_match++;3829#else /* 8-bit code units */3830start_match = memchr(start_match, first_cu, end_subject - start_match);3831if (start_match == NULL) start_match = end_subject;3832#endif3833}38343835/* If we can't find the required code unit, having reached the true end3836of the subject, break the bumpalong loop, to force a match failure,3837except when doing partial matching, when we let the next cycle run at3838the end of the subject. To see why, consider the pattern /(?<=abc)def/,3839which partially matches "abc", even though the string does not contain3840the starting character "d". If we have not reached the true end of the3841subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)3842we also let the cycle run, because the matching string is legitimately3843allowed to start with the first code unit of a newline. */38443845if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&3846start_match >= mb->end_subject)3847break;3848}38493850/* If there's no first code unit, advance to just after a linebreak for a3851multiline match if required. */38523853else if (startline)3854{3855if (start_match > mb->start_subject + start_offset)3856{3857#ifdef SUPPORT_UNICODE3858if (utf)3859{3860while (start_match < end_subject && !WAS_NEWLINE(start_match))3861{3862start_match++;3863ACROSSCHAR(start_match < end_subject, start_match, start_match++);3864}3865}3866else3867#endif3868while (start_match < end_subject && !WAS_NEWLINE(start_match))3869start_match++;38703871/* If we have just passed a CR and the newline option is ANY or3872ANYCRLF, and we are now at a LF, advance the match position by one3873more code unit. 
*/38743875if (start_match[-1] == CHAR_CR &&3876(mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&3877start_match < end_subject &&3878UCHAR21TEST(start_match) == CHAR_NL)3879start_match++;3880}3881}38823883/* If there's no first code unit or a requirement for a multiline line3884start, advance to a non-unique first code unit if any have been3885identified. The bitmap contains only 256 bits. When code units are 16 or388632 bits wide, all code units greater than 254 set the 255 bit. */38873888else if (start_bits != NULL)3889{3890while (start_match < end_subject)3891{3892uint32_t c = UCHAR21TEST(start_match);3893#if PCRE2_CODE_UNIT_WIDTH != 83894if (c > 255) c = 255;3895#endif3896if ((start_bits[c/8] & (1u << (c&7))) != 0) break;3897start_match++;3898}38993900/* See comment above in first_cu checking about the next line. */39013902if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&3903start_match >= mb->end_subject)3904break;3905}3906} /* End of first code unit handling */39073908/* Restore fudged end_subject */39093910end_subject = mb->end_subject;39113912/* The following two optimizations are disabled for partial matching. */39133914if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)3915{3916PCRE2_SPTR p;39173918/* The minimum matching length is a lower bound; no actual string of that3919length may actually match the pattern. Although the value is, strictly,3920in characters, we treat it as code units to avoid spending too much time3921in this optimization. */39223923if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;39243925/* If req_cu is set, we know that that code unit must appear in the3926subject for the match to succeed. If the first code unit is set, req_cu3927must be later in the subject; otherwise the test starts at the match3928point. 
This optimization can save a huge amount of backtracking in3929patterns with nested unlimited repeats that aren't going to match.3930Writing separate code for cased/caseless versions makes it go faster, as3931does using an autoincrement and backing off on a match. As in the case of3932the first code unit, using memchr() in the 8-bit library gives a big3933speed up. Unlike the first_cu check above, we do not need to call3934memchr() twice in the caseless case because we only need to check for the3935presence of the character in either case, not find the first occurrence.39363937The search can be skipped if the code unit was found later than the3938current starting point in a previous iteration of the bumpalong loop.39393940HOWEVER: when the subject string is very, very long, searching to its end3941can take a long time, and give bad performance on quite ordinary3942patterns. This showed up when somebody was matching something like3943/^\d+C/ on a 32-megabyte string... so we don't do this when the string is3944sufficiently long, but it's worth searching a lot more for unanchored3945patterns. */39463947p = start_match + (has_first_cu? 
1:0);3948if (has_req_cu && p > req_cu_ptr)3949{3950PCRE2_SIZE check_length = end_subject - start_match;39513952if (check_length < REQ_CU_MAX ||3953(!anchored && check_length < REQ_CU_MAX * 1000))3954{3955if (req_cu != req_cu2) /* Caseless */3956{3957#if PCRE2_CODE_UNIT_WIDTH != 83958while (p < end_subject)3959{3960uint32_t pp = UCHAR21INCTEST(p);3961if (pp == req_cu || pp == req_cu2) { p--; break; }3962}3963#else /* 8-bit code units */3964PCRE2_SPTR pp = p;3965p = memchr(pp, req_cu, end_subject - pp);3966if (p == NULL)3967{3968p = memchr(pp, req_cu2, end_subject - pp);3969if (p == NULL) p = end_subject;3970}3971#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */3972}39733974/* The caseful case */39753976else3977{3978#if PCRE2_CODE_UNIT_WIDTH != 83979while (p < end_subject)3980{3981if (UCHAR21INCTEST(p) == req_cu) { p--; break; }3982}39833984#else /* 8-bit code units */3985p = memchr(p, req_cu, end_subject - p);3986if (p == NULL) p = end_subject;3987#endif3988}39893990/* If we can't find the required code unit, break the matching loop,3991forcing a match failure. */39923993if (p >= end_subject) break;39943995/* If we have found the required code unit, save the point where we3996found it, so that we don't search again next time round the loop if3997the start hasn't passed this code unit yet. */39983999req_cu_ptr = p;4000}4001}4002}4003}40044005/* ------------ End of start of match optimizations ------------ */40064007/* Give no match if we have passed the bumpalong limit. 
*/40084009if (start_match > bumpalong_limit) break;40104011/* OK, now we can do the business */40124013mb->start_used_ptr = start_match;4014mb->last_used_ptr = start_match;4015mb->recursive = NULL;40164017rc = internal_dfa_match(4018mb, /* fixed match data */4019mb->start_code, /* this subexpression's code */4020start_match, /* where we currently are */4021start_offset, /* start offset in subject */4022match_data->ovector, /* offset vector */4023(uint32_t)match_data->oveccount * 2, /* actual size of same */4024workspace, /* workspace vector */4025(int)wscount, /* size of same */40260, /* function recurse level */4027base_recursion_workspace); /* initial workspace for recursion */40284029/* Anything other than "no match" means we are done, always; otherwise, carry4030on only if not anchored. */40314032if (rc != PCRE2_ERROR_NOMATCH || anchored)4033{4034if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)4035{4036match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);4037match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);4038}4039match_data->subject_length = length;4040match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);4041match_data->rightchar = (PCRE2_SIZE)(mb->last_used_ptr - subject);4042match_data->startchar = (PCRE2_SIZE)(start_match - subject);4043match_data->rc = rc;40444045if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)4046{4047length = CU2BYTES(length + was_zero_terminated);4048match_data->subject = match_data->memctl.malloc(length,4049match_data->memctl.memory_data);4050if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;4051memcpy((void *)match_data->subject, subject, length);4052match_data->flags |= PCRE2_MD_COPIED_SUBJECT;4053}4054else4055{4056if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;4057}4058goto EXIT;4059}40604061/* Advance to the next subject character unless we are at the end of a line4062and firstline is set. 
*/40634064if (firstline && IS_NEWLINE(start_match)) break;4065start_match++;4066#ifdef SUPPORT_UNICODE4067if (utf)4068{4069ACROSSCHAR(start_match < end_subject, start_match, start_match++);4070}4071#endif4072if (start_match > end_subject) break;40734074/* If we have just passed a CR and we are now at a LF, and the pattern does4075not contain any explicit matches for \r or \n, and the newline option is CRLF4076or ANY or ANYCRLF, advance the match position by one more character. */40774078if (UCHAR21TEST(start_match - 1) == CHAR_CR &&4079start_match < end_subject &&4080UCHAR21TEST(start_match) == CHAR_NL &&4081(re->flags & PCRE2_HASCRORLF) == 0 &&4082(mb->nltype == NLTYPE_ANY ||4083mb->nltype == NLTYPE_ANYCRLF ||4084mb->nllen == 2))4085start_match++;40864087} /* "Bumpalong" loop */40884089NOMATCH_EXIT:4090rc = PCRE2_ERROR_NOMATCH;40914092EXIT:4093while (rws->next != NULL)4094{4095RWS_anchor *next = rws->next;4096rws->next = next->next;4097mb->memctl.free(next, mb->memctl.memory_data);4098}40994100return rc;4101}41024103/* These #undefs are here to enable unity builds with CMake. */41044105#undef NLBLOCK /* Block containing newline information */4106#undef PSSTART /* Field containing processed string start */4107#undef PSEND /* Field containing processed string end */41084109/* End of pcre2_dfa_match.c */411041114112