/* Path: blob/master/thirdparty/pcre2/src/pcre2_substitute.c
9898 views */
/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8Original API code Copyright (c) 1997-2012 University of Cambridge9New API code Copyright (c) 2016-2024 University of Cambridge1011-----------------------------------------------------------------------------12Redistribution and use in source and binary forms, with or without13modification, are permitted provided that the following conditions are met:1415* Redistributions of source code must retain the above copyright notice,16this list of conditions and the following disclaimer.1718* Redistributions in binary form must reproduce the above copyright19notice, this list of conditions and the following disclaimer in the20documentation and/or other materials provided with the distribution.2122* Neither the name of the University of Cambridge nor the names of its23contributors may be used to endorse or promote products derived from24this software without specific prior written permission.2526THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE29ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE36POSSIBILITY OF SUCH DAMAGE.37-----------------------------------------------------------------------------38*/394041#ifdef HAVE_CONFIG_H42#include "config.h"43#endif4445#include "pcre2_internal.h"4647#define PTR_STACK_SIZE 204849#define SUBSTITUTE_OPTIONS \50(PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \51PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \52PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \53PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)54555657/*************************************************58* Find end of substitute text *59*************************************************/6061/* In extended mode, we recognize ${name:+set text:unset text} and similar62constructions. This requires the identification of unescaped : and }63characters. This function scans for such. It must deal with nested ${64constructions. 
The pointer to the text is updated, either to the required end65character, or to where an error was detected.6667Arguments:68code points to the compiled expression (for options)69ptrptr points to the pointer to the start of the text (updated)70ptrend end of the whole string71last TRUE if the last expected string (only } recognized)7273Returns: 0 on success74negative error code on failure75*/7677static int78find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,79BOOL last)80{81int rc = 0;82uint32_t nestlevel = 0;83BOOL literal = FALSE;84PCRE2_SPTR ptr = *ptrptr;8586for (; ptr < ptrend; ptr++)87{88if (literal)89{90if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)91{92literal = FALSE;93ptr += 1;94}95}9697else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)98{99if (nestlevel == 0) goto EXIT;100nestlevel--;101}102103else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;104105else if (*ptr == CHAR_DOLLAR_SIGN)106{107if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)108{109nestlevel++;110ptr += 1;111}112}113114else if (*ptr == CHAR_BACKSLASH)115{116int erc;117int errorcode;118uint32_t ch;119120if (ptr < ptrend - 1) switch (ptr[1])121{122case CHAR_L:123case CHAR_l:124case CHAR_U:125case CHAR_u:126ptr += 1;127continue;128}129130ptr += 1; /* Must point after \ */131erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,132code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL);133ptr -= 1; /* Back to last code unit of escape */134if (errorcode != 0)135{136/* errorcode from check_escape is positive, so must not be returned by137pcre2_substitute(). 
*/138rc = PCRE2_ERROR_BADREPESCAPE;139goto EXIT;140}141142switch(erc)143{144case 0: /* Data character */145case ESC_b: /* Data character */146case ESC_v: /* Data character */147case ESC_E: /* Isolated \E is ignored */148break;149150case ESC_Q:151literal = TRUE;152break;153154case ESC_g:155/* The \g<name> form (\g<number> already handled by check_escape)156157Don't worry about finding the matching ">". We are super, super lenient158about validating ${} replacements inside find_text_end(), so we certainly159don't need to worry about other syntax. Importantly, a \g<..> or $<...>160sequence can't contain a '}' character. */161break;162163default:164if (erc < 0)165break; /* capture group reference */166rc = PCRE2_ERROR_BADREPESCAPE;167goto EXIT;168}169}170}171172rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */173174EXIT:175*ptrptr = ptr;176return rc;177}178179180/*************************************************181* Validate group name *182*************************************************/183184/* This function scans for a capture group name, validating it185consists of legal characters, is not empty, and does not exceed186MAX_NAME_SIZE.187188Arguments:189ptrptr points to the pointer to the start of the text (updated)190ptrend end of the whole string191utf true if the input is UTF-encoded192ctypes pointer to the character types table193194Returns: TRUE if a name was read195FALSE otherwise196*/197198static BOOL199read_name_subst(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf,200const uint8_t* ctypes)201{202PCRE2_SPTR ptr = *ptrptr;203PCRE2_SPTR nameptr = ptr;204205if (ptr >= ptrend) /* No characters in name */206goto FAILED;207208/* We do not need to check whether the name starts with a non-digit.209We are simply referencing names here, not defining them. */210211/* See read_name in the pcre2_compile.c for the corresponding logic212restricting group names inside the pattern itself. 
*/213214#ifdef SUPPORT_UNICODE215if (utf)216{217uint32_t c, type;218219while (ptr < ptrend)220{221GETCHAR(c, ptr);222type = UCD_CHARTYPE(c);223if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&224c != CHAR_UNDERSCORE) break;225ptr++;226FORWARDCHARTEST(ptr, ptrend);227}228}229else230#else231(void)utf; /* Avoid compiler warning */232#endif /* SUPPORT_UNICODE */233234/* Handle group names in non-UTF modes. */235236{237while (ptr < ptrend && MAX_255(*ptr) && (ctypes[*ptr] & ctype_word) != 0)238{239ptr++;240}241}242243/* Check name length */244245if (ptr - nameptr > MAX_NAME_SIZE)246goto FAILED;247248/* Subpattern names must not be empty */249if (ptr == nameptr)250goto FAILED;251252*ptrptr = ptr;253return TRUE;254255FAILED:256*ptrptr = ptr;257return FALSE;258}259260261/*************************************************262* Case transformations *263*************************************************/264265#define PCRE2_SUBSTITUTE_CASE_NONE 0266// 1, 2, 3 are PCRE2_SUBSTITUTE_CASE_LOWER, UPPER, TITLE_FIRST.267#define PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST 4268269typedef struct {270int to_case; /* One of PCRE2_SUBSTITUTE_CASE_xyz */271BOOL single_char;272} case_state;273274/* Helper to guess how much a string is likely to increase in size when275case-transformed. Usually, strings don't change size at all, but some rare276characters do grow. Estimate +10%, plus another few characters.277278Performing this estimation is unfortunate, but inevitable, since we can't call279the callout if we ran out of buffer space to prepare its input.280281Because this estimate is inexact (and in pathological cases, underestimates the282required buffer size) we must document that when you have a283substitute_case_callout, and you are using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, you284may need more than two calls to determine the final buffer size. 
*/285286static PCRE2_SIZE287pessimistic_case_inflation(PCRE2_SIZE len)288{289return (len >> 3u) + 10;290}291292/* Case transformation behaviour if no callout is passed. */293294static PCRE2_SIZE295default_substitute_case_callout(296PCRE2_SPTR input, PCRE2_SIZE input_len,297PCRE2_UCHAR *output, PCRE2_SIZE output_cap,298case_state *state, const pcre2_code *code)299{300PCRE2_SPTR input_end = input + input_len;301#ifdef SUPPORT_UNICODE302BOOL utf;303BOOL ucp;304#endif305PCRE2_UCHAR temp[6];306BOOL next_to_upper;307BOOL rest_to_upper;308BOOL single_char;309BOOL overflow = FALSE;310PCRE2_SIZE written = 0;311312/* Helpful simplifying invariant: input and output are disjoint buffers.313I believe that this code is technically undefined behaviour, because the two314pointers input/output are "unrelated" pointers and hence not comparable. Casting315via char* bypasses some but not all of those technical rules. It is not included316in release builds, in any case. */317PCRE2_ASSERT((char *)(input + input_len) <= (char *)output ||318(char *)(output + output_cap) <= (char *)input);319320#ifdef SUPPORT_UNICODE321utf = (code->overall_options & PCRE2_UTF) != 0;322ucp = (code->overall_options & PCRE2_UCP) != 0;323#endif324325if (input_len == 0) return 0;326327switch (state->to_case)328{329default:330PCRE2_DEBUG_UNREACHABLE();331return 0;332333case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE334case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE335next_to_upper = rest_to_upper = (state->to_case == PCRE2_SUBSTITUTE_CASE_UPPER);336break;337338case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE339next_to_upper = TRUE;340rest_to_upper = FALSE;341state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER;342break;343344case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE345next_to_upper = FALSE;346rest_to_upper = TRUE;347state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER;348break;349}350351single_char = 
state->single_char;352if (single_char)353state->to_case = PCRE2_SUBSTITUTE_CASE_NONE;354355while (input < input_end)356{357uint32_t ch;358unsigned int chlen;359360GETCHARINCTEST(ch, input);361362#ifdef SUPPORT_UNICODE363if ((utf || ucp) && ch >= 128)364{365uint32_t type = UCD_CHARTYPE(ch);366if (PRIV(ucp_gentype)[type] == ucp_L &&367type != (next_to_upper? ucp_Lu : ucp_Ll))368ch = UCD_OTHERCASE(ch);369370/* TODO This is far from correct... it doesn't support the SpecialCasing.txt371mappings, but worse, it's not even correct for all the ordinary case372mappings. We should add support for those (at least), and then add the373SpecialCasing.txt mappings for Esszet and ligatures, and finally use the374Turkish casing flag on the match context. */375}376else377#endif378if (MAX_255(ch))379{380if (((code->tables + cbits_offset +381(next_to_upper? cbit_upper:cbit_lower)382)[ch/8] & (1u << (ch%8))) == 0)383ch = (code->tables + fcc_offset)[ch];384}385386#ifdef SUPPORT_UNICODE387if (utf) chlen = PRIV(ord2utf)(ch, temp); else388#endif389{390temp[0] = ch;391chlen = 1;392}393394if (!overflow && chlen <= output_cap)395{396memcpy(output, temp, CU2BYTES(chlen));397output += chlen;398output_cap -= chlen;399}400else401{402overflow = TRUE;403}404405if (chlen > ~(PCRE2_SIZE)0 - written) /* Integer overflow */406return ~(PCRE2_SIZE)0;407written += chlen;408409next_to_upper = rest_to_upper;410411/* memcpy the remainder, if only transforming a single character. */412413if (single_char)414{415PCRE2_SIZE rest_len = input_end - input;416417if (!overflow && rest_len <= output_cap)418memcpy(output, input, CU2BYTES(rest_len));419420if (rest_len > ~(PCRE2_SIZE)0 - written) /* Integer overflow */421return ~(PCRE2_SIZE)0;422written += rest_len;423424return written;425}426}427428return written;429}430431/* Helper to perform the call to the substitute_case_callout. We wrap the432user-provided callout because our internal arguments are slightly extended. 
We433don't want the user callout to handle the case of "\l" (first character only to434lowercase) or "\l\U" (first character to lowercase, rest to uppercase) because435those are not operations defined by Unicode. Instead the user callout simply436needs to provide the three Unicode primitives: lower, upper, titlecase. */437438static PCRE2_SIZE439do_case_copy(440PCRE2_UCHAR *input_output, PCRE2_SIZE input_len, PCRE2_SIZE output_cap,441case_state *state, BOOL utf,442PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,443PCRE2_SIZE, int, void *),444void *substitute_case_callout_data)445{446PCRE2_SPTR input = input_output;447PCRE2_UCHAR *output = input_output;448PCRE2_SIZE rc;449PCRE2_SIZE rc2;450int ch1_to_case;451int rest_to_case;452PCRE2_UCHAR ch1[6];453PCRE2_SIZE ch1_len;454PCRE2_SPTR rest;455PCRE2_SIZE rest_len;456BOOL ch1_overflow = FALSE;457BOOL rest_overflow = FALSE;458459#if PCRE2_CODE_UNIT_WIDTH == 32 || !defined(SUPPORT_UNICODE)460(void)utf; /* Avoid compiler warning. */461#endif462463PCRE2_ASSERT(input_len != 0);464465switch (state->to_case)466{467default:468PCRE2_DEBUG_UNREACHABLE();469return 0;470471case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE472case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE473case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE474475/* The easy case, where our internal casing operations align with those of476the callout. 
*/477478if (state->single_char == FALSE)479{480rc = substitute_case_callout(input, input_len, output, output_cap,481state->to_case, substitute_case_callout_data);482483if (state->to_case == PCRE2_SUBSTITUTE_CASE_TITLE_FIRST)484state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER;485486return rc;487}488489ch1_to_case = state->to_case;490rest_to_case = PCRE2_SUBSTITUTE_CASE_NONE;491break;492493case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE494ch1_to_case = PCRE2_SUBSTITUTE_CASE_LOWER;495rest_to_case = PCRE2_SUBSTITUTE_CASE_UPPER;496break;497}498499/* Identify the leading character. Take copy, because its storage overlaps with500`output`, and hence may be scrambled by the callout. */501502{503PCRE2_SPTR ch_end = input;504uint32_t ch;505506GETCHARINCTEST(ch, ch_end);507(void) ch;508PCRE2_ASSERT(ch_end <= input + input_len && ch_end - input <= 6);509ch1_len = ch_end - input;510memcpy(ch1, input, CU2BYTES(ch1_len));511}512513rest = input + ch1_len;514rest_len = input_len - ch1_len;515516/* Transform just ch1. The buffers are always in-place (input == output). With a517custom callout, we need a loop to discover its required buffer size. The loop518wouldn't be required if the callout were well-behaved, but it might be naughty519and return "5" the first time, then "10" the next time we call it using the520exact same input! */521522{523PCRE2_SIZE ch1_cap;524PCRE2_SIZE max_ch1_cap;525526ch1_cap = ch1_len; /* First attempt uses the space vacated by ch1. */527PCRE2_ASSERT(output_cap >= input_len && input_len >= rest_len);528max_ch1_cap = output_cap - rest_len;529530while (TRUE)531{532rc = substitute_case_callout(ch1, ch1_len, output, ch1_cap, ch1_to_case,533substitute_case_callout_data);534if (rc == ~(PCRE2_SIZE)0) return rc;535536if (rc <= ch1_cap) break;537538if (rc > max_ch1_cap)539{540ch1_overflow = TRUE;541break;542}543544/* Move the rest to the right, to make room for expanding ch1. 
*/545546memmove(input_output + rc, rest, CU2BYTES(rest_len));547rest = input + rc;548549ch1_cap = rc;550551/* Proof of loop termination: `ch1_cap` is growing on each iteration, but552the loop ends if `rc` reaches the (unchanging) upper bound of output_cap. */553}554}555556if (rest_to_case == PCRE2_SUBSTITUTE_CASE_NONE)557{558if (!ch1_overflow)559{560PCRE2_ASSERT(rest_len <= output_cap - rc);561memmove(output + rc, rest, CU2BYTES(rest_len));562}563rc2 = rest_len;564565state->to_case = PCRE2_SUBSTITUTE_CASE_NONE;566}567else568{569PCRE2_UCHAR dummy[1];570571rc2 = substitute_case_callout(rest, rest_len,572ch1_overflow? dummy : output + rc,573ch1_overflow? 0u : output_cap - rc,574rest_to_case, substitute_case_callout_data);575if (rc2 == ~(PCRE2_SIZE)0) return rc2;576577if (!ch1_overflow && rc2 > output_cap - rc) rest_overflow = TRUE;578579/* If ch1 grows so that `xform(ch1)+rest` can't fit in the buffer, but then580`rest` shrinks, it's actually possible for the total calculated length of581`xform(ch1)+xform(rest)` to come out at less than output_cap. But we can't582report that, because it would make it seem that the operation succeeded.583If either of xform(ch1) or xform(rest) won't fit in the buffer, our final584result must be > output_cap. */585if (ch1_overflow && rc2 < rest_len)586rc2 = rest_len;587588state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER;589}590591if (rc2 > ~(PCRE2_SIZE)0 - rc) /* Integer overflow */592return ~(PCRE2_SIZE)0;593594PCRE2_ASSERT(!(ch1_overflow || rest_overflow) || rc + rc2 > output_cap);595(void)rest_overflow;596597return rc + rc2;598}599600601/*************************************************602* Match and substitute *603*************************************************/604605/* This function applies a compiled re to a subject string and creates a new606string with substitutions. The first 7 arguments are the same as for607pcre2_match(). 
Either string length may be PCRE2_ZERO_TERMINATED.608609Arguments:610code points to the compiled expression611subject points to the subject string612length length of subject string (may contain binary zeros)613start_offset where to start in the subject string614options option bits615match_data points to a match_data block, or is NULL616context points a PCRE2 context617replacement points to the replacement string618rlength length of replacement string619buffer where to put the substituted string620blength points to length of buffer; updated to length of string621622Returns: >= 0 number of substitutions made623< 0 an error code624PCRE2_ERROR_BADREPLACEMENT means invalid use of $625*/626627/* This macro checks for space in the buffer before copying into it. On628overflow, either give an error immediately, or keep on, accumulating the629length. */630631#define CHECKMEMCPY(from, length_) \632do { \633PCRE2_SIZE chkmc_length = length_; \634if (overflowed) \635{ \636if (chkmc_length > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \637goto TOOLARGEREPLACE; \638extra_needed += chkmc_length; \639} \640else if (lengthleft < chkmc_length) \641{ \642if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \643overflowed = TRUE; \644extra_needed = chkmc_length - lengthleft; \645} \646else \647{ \648memcpy(buffer + buff_offset, from, CU2BYTES(chkmc_length)); \649buff_offset += chkmc_length; \650lengthleft -= chkmc_length; \651} \652} \653while (0)654655/* This macro checks for space and copies characters with casing modifications.656On overflow, it behaves as for CHECKMEMCPY().657658When substitute_case_callout is NULL, the source and destination buffers must659not overlap, because our default handler does not support this. 
*/660661#define CHECKCASECPY_BASE(length_, do_call) \662do { \663PCRE2_SIZE chkcc_length = (PCRE2_SIZE)(length_); \664PCRE2_SIZE chkcc_rc; \665do_call \666if (lengthleft < chkcc_rc) \667{ \668if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \669overflowed = TRUE; \670extra_needed = chkcc_rc - lengthleft; \671} \672else \673{ \674buff_offset += chkcc_rc; \675lengthleft -= chkcc_rc; \676} \677} \678while (0)679680#define CHECKCASECPY_DEFAULT(from, length_) \681CHECKCASECPY_BASE(length_, { \682chkcc_rc = default_substitute_case_callout(from, chkcc_length, \683buffer + buff_offset, \684overflowed? 0 : lengthleft, \685&forcecase, code); \686if (overflowed) \687{ \688if (chkcc_rc > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \689goto TOOLARGEREPLACE; \690extra_needed += chkcc_rc; \691break; \692} \693})694695#define CHECKCASECPY_CALLOUT(length_) \696CHECKCASECPY_BASE(length_, { \697chkcc_rc = do_case_copy(buffer + buff_offset, chkcc_length, \698lengthleft, &forcecase, utf, \699substitute_case_callout, \700substitute_case_callout_data); \701if (chkcc_rc == ~(PCRE2_SIZE)0) goto CASEERROR; \702})703704/* This macro does a delayed case transformation, for the situation when we have705a case-forcing callout. */706707#define DELAYEDFORCECASE() \708do { \709PCRE2_SIZE chars_outstanding = (buff_offset - casestart_offset) + \710(extra_needed - casestart_extra_needed); \711if (chars_outstanding > 0) \712{ \713if (overflowed) \714{ \715PCRE2_SIZE guess = pessimistic_case_inflation(chars_outstanding); \716if (guess > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \717goto TOOLARGEREPLACE; \718extra_needed += guess; \719} \720else \721{ \722/* Rewind the buffer */ \723lengthleft += (buff_offset - casestart_offset); \724buff_offset = casestart_offset; \725/* Care! 
In-place case transformation */ \726CHECKCASECPY_CALLOUT(chars_outstanding); \727} \728} \729} \730while (0)731732733/* Here's the function */734735PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION736pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,737PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,738pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,739PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)740{741int rc;742int subs;743uint32_t ovector_count;744uint32_t goptions = 0;745uint32_t suboptions;746pcre2_match_data *internal_match_data = NULL;747BOOL escaped_literal = FALSE;748BOOL overflowed = FALSE;749BOOL use_existing_match;750BOOL replacement_only;751BOOL utf = (code->overall_options & PCRE2_UTF) != 0;752PCRE2_UCHAR temp[6];753PCRE2_SPTR ptr;754PCRE2_SPTR repend = NULL;755PCRE2_SIZE extra_needed = 0;756PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;757PCRE2_SIZE *ovector;758PCRE2_SIZE ovecsave[3];759pcre2_substitute_callout_block scb;760PCRE2_SIZE sub_start_extra_needed;761PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,762PCRE2_SIZE, int, void *) = NULL;763void *substitute_case_callout_data = NULL;764765/* General initialization */766767buff_offset = 0;768lengthleft = buff_length = *blength;769*blength = PCRE2_UNSET;770ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;771772if (mcontext != NULL)773{774substitute_case_callout = mcontext->substitute_case_callout;775substitute_case_callout_data = mcontext->substitute_case_callout_data;776}777778/* Partial matching is not valid. This must come after setting *blength to779PCRE2_UNSET, so as not to imply an offset in the replacement. */780781if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)782return PCRE2_ERROR_BADOPTION;783784/* Validate length and find the end of the replacement. A NULL replacement of785zero length is interpreted as an empty string. 
*/786787if (replacement == NULL)788{789if (rlength != 0) return PCRE2_ERROR_NULL;790replacement = (PCRE2_SPTR)"";791}792793if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);794repend = replacement + rlength;795796/* Check for using a match that has already happened. Note that the subject797pointer in the match data may be NULL after a no-match. */798799use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);800replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);801802/* If starting from an existing match, there must be an externally provided803match data block. We create an internal match_data block in two cases: (a) an804external one is not supplied (and we are not starting from an existing match);805(b) an existing match is to be used for the first substitution. In the latter806case, we copy the existing match into the internal block, except for any cached807heap frame size and pointer. This ensures that no changes are made to the808external match data block. */809810/* WARNING: In both cases below a general context is constructed "by hand"811because calling pcre2_general_context_create() involves a memory allocation. If812the contents of a general context control block are ever changed there will813have to be changes below. 
*/814815if (match_data == NULL)816{817pcre2_general_context gcontext;818if (use_existing_match) return PCRE2_ERROR_NULL;819gcontext.memctl = (mcontext == NULL)?820((const pcre2_real_code *)code)->memctl :821((pcre2_real_match_context *)mcontext)->memctl;822match_data = internal_match_data =823pcre2_match_data_create_from_pattern(code, &gcontext);824if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;825}826827else if (use_existing_match)828{829int pairs;830pcre2_general_context gcontext;831gcontext.memctl = (mcontext == NULL)?832((const pcre2_real_code *)code)->memctl :833((pcre2_real_match_context *)mcontext)->memctl;834pairs = (code->top_bracket + 1 < match_data->oveccount)?835code->top_bracket + 1 : match_data->oveccount;836internal_match_data = pcre2_match_data_create(match_data->oveccount,837&gcontext);838if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;839memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)840+ 2*pairs*sizeof(PCRE2_SIZE));841internal_match_data->heapframes = NULL;842internal_match_data->heapframes_size = 0;843match_data = internal_match_data;844}845846/* Remember ovector details */847848ovector = pcre2_get_ovector_pointer(match_data);849ovector_count = pcre2_get_ovector_count(match_data);850851/* Fixed things in the callout block */852853scb.version = 0;854scb.input = subject;855scb.output = (PCRE2_SPTR)buffer;856scb.ovector = ovector;857858/* A NULL subject of zero length is treated as an empty string. */859860if (subject == NULL)861{862if (length != 0) return PCRE2_ERROR_NULL;863subject = (PCRE2_SPTR)"";864}865866/* Find length of zero-terminated subject */867868if (length == PCRE2_ZERO_TERMINATED)869length = subject? PRIV(strlen)(subject) : 0;870871/* Check UTF replacement string if necessary. 
*/872873#ifdef SUPPORT_UNICODE874if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)875{876rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));877if (rc != 0)878{879match_data->leftchar = 0;880goto EXIT;881}882}883#endif /* SUPPORT_UNICODE */884885/* Save the substitute options and remove them from the match options. */886887suboptions = options & SUBSTITUTE_OPTIONS;888options &= ~SUBSTITUTE_OPTIONS;889890/* Error if the start match offset is greater than the length of the subject. */891892if (start_offset > length)893{894match_data->leftchar = 0;895rc = PCRE2_ERROR_BADOFFSET;896goto EXIT;897}898899/* Copy up to the start offset, unless only the replacement is required. */900901if (!replacement_only) CHECKMEMCPY(subject, start_offset);902903/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first904match is taken from the match_data that was passed in. */905906subs = 0;907do908{909PCRE2_SPTR ptrstack[PTR_STACK_SIZE];910uint32_t ptrstackptr = 0;911case_state forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE };912PCRE2_SIZE casestart_offset = 0;913PCRE2_SIZE casestart_extra_needed = 0;914915if (use_existing_match)916{917rc = match_data->rc;918use_existing_match = FALSE;919}920else rc = pcre2_match(code, subject, length, start_offset, options|goptions,921match_data, mcontext);922923#ifdef SUPPORT_UNICODE924if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */925#endif926927/* Any error other than no match returns the error code. No match when not928doing the special after-empty-match global rematch, or when at the end of the929subject, breaks the global loop. Otherwise, advance the starting point by one930character, copying it to the output, and try again. */931932if (rc < 0)933{934PCRE2_SIZE save_start;935936if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;937if (goptions == 0 || start_offset >= length) break;938939/* Advance by one code point. 
Then, if CRLF is a valid newline sequence and940we have advanced into the middle of it, advance one more code point. In941other words, do not start in the middle of CRLF, even if CR and LF on their942own are valid newlines. */943944save_start = start_offset++;945if (subject[start_offset-1] == CHAR_CR &&946(code->newline_convention == PCRE2_NEWLINE_CRLF ||947code->newline_convention == PCRE2_NEWLINE_ANY ||948code->newline_convention == PCRE2_NEWLINE_ANYCRLF) &&949start_offset < length &&950subject[start_offset] == CHAR_LF)951start_offset++;952953/* Otherwise, in UTF mode, advance past any secondary code points. */954955else if ((code->overall_options & PCRE2_UTF) != 0)956{957#if PCRE2_CODE_UNIT_WIDTH == 8958while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)959start_offset++;960#elif PCRE2_CODE_UNIT_WIDTH == 16961while (start_offset < length &&962(subject[start_offset] & 0xfc00) == 0xdc00)963start_offset++;964#endif965}966967/* Copy what we have advanced past (unless not required), reset the special968global options, and continue to the next match. */969970fraglength = start_offset - save_start;971if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);972goptions = 0;973continue;974}975976/* Handle a successful match. Matches that use \K to end before they start977or start before the current point in the subject are not supported. */978979if (ovector[1] < ovector[0] || ovector[0] < start_offset)980{981rc = PCRE2_ERROR_BADSUBSPATTERN;982goto EXIT;983}984985/* Check for the same match as previous. This is legitimate after matching an986empty string that starts after the initial match offset. We have tried again987at the match point in case the pattern is one like /(?<=\G.)/ which can never988match at its starting point, so running the match achieves the bumpalong. If989we do get the same (null) match at the original match point, it isn't such a990pattern, so we now do the empty string magic. 
In all other cases, a repeat991match should never occur. */992993if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])994{995if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)996{997goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;998ovecsave[2] = start_offset;999continue; /* Back to the top of the loop */1000}1001rc = PCRE2_ERROR_INTERNAL_DUPMATCH;1002goto EXIT;1003}10041005/* Count substitutions with a paranoid check for integer overflow; surely no1006real call to this function would ever hit this! */10071008if (subs == INT_MAX)1009{1010rc = PCRE2_ERROR_TOOMANYREPLACE;1011goto EXIT;1012}1013subs++;10141015/* Copy the text leading up to the match (unless not required); remember1016where the insert begins and how many ovector pairs are set; and remember how1017much space we have requested in extra_needed. */10181019if (rc == 0) rc = ovector_count;1020fraglength = ovector[0] - start_offset;1021if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);1022scb.output_offsets[0] = buff_offset;1023scb.oveccount = rc;1024sub_start_extra_needed = extra_needed;10251026/* Process the replacement string. If the entire replacement is literal, just1027copy it with length check. */10281029ptr = replacement;1030if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)1031{1032CHECKMEMCPY(ptr, rlength);1033}10341035/* Within a non-literal replacement, which must be scanned character by1036character, local literal mode can be set by \Q, but only in extended mode1037when backslashes are being interpreted. In extended mode we must handle1038nested substrings that are to be reprocessed. */10391040else for (;;)1041{1042uint32_t ch;1043unsigned int chlen;1044int group;1045uint32_t special;1046PCRE2_SPTR text1_start = NULL;1047PCRE2_SPTR text1_end = NULL;1048PCRE2_SPTR text2_start = NULL;1049PCRE2_SPTR text2_end = NULL;1050PCRE2_UCHAR name[MAX_NAME_SIZE + 1];10511052/* If at the end of a nested substring, pop the stack. 
*/10531054if (ptr >= repend)1055{1056if (ptrstackptr == 0) break; /* End of replacement string */1057repend = ptrstack[--ptrstackptr];1058ptr = ptrstack[--ptrstackptr];1059continue;1060}10611062/* Handle the next character */10631064if (escaped_literal)1065{1066if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)1067{1068escaped_literal = FALSE;1069ptr += 2;1070continue;1071}1072goto LOADLITERAL;1073}10741075/* Not in literal mode. */10761077if (*ptr == CHAR_DOLLAR_SIGN)1078{1079BOOL inparens;1080BOOL inangle;1081BOOL star;1082PCRE2_SIZE sublength;1083PCRE2_UCHAR next;1084PCRE2_SPTR subptr, subptrend;10851086if (++ptr >= repend) goto BAD;1087if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;10881089special = 0;1090text1_start = NULL;1091text1_end = NULL;1092text2_start = NULL;1093text2_end = NULL;1094group = -1;1095inparens = FALSE;1096inangle = FALSE;1097star = FALSE;1098subptr = NULL;1099subptrend = NULL;11001101/* Special $ sequences, as supported by Perl, JavaScript, .NET and others. */1102if (next == CHAR_AMPERSAND)1103{1104++ptr;1105group = 0;1106goto GROUP_SUBSTITUTE;1107}1108if (next == CHAR_GRAVE_ACCENT || next == CHAR_APOSTROPHE)1109{1110++ptr;1111rc = pcre2_substring_length_bynumber(match_data, 0, &sublength);1112if (rc < 0) goto PTREXIT; /* (Sanity-check ovector before reading from it.) */11131114if (next == CHAR_GRAVE_ACCENT)1115{1116subptr = subject;1117subptrend = subject + ovector[0];1118}1119else1120{1121subptr = subject + ovector[1];1122subptrend = subject + length;1123}11241125goto SUBPTR_SUBSTITUTE;1126}1127if (next == CHAR_UNDERSCORE)1128{1129/* Java, .NET support $_ for "entire input string". */1130++ptr;1131subptr = subject;1132subptrend = subject + length;1133goto SUBPTR_SUBSTITUTE;1134}11351136if (next == CHAR_LEFT_CURLY_BRACKET)1137{1138if (++ptr >= repend) goto BAD;1139next = *ptr;1140inparens = TRUE;1141}1142else if (next == CHAR_LESS_THAN_SIGN)1143{1144/* JavaScript compatibility syntax, $<name>. 
Processes only named1145groups (not numbered) and does not support extensions such as star1146(you can do ${name} and ${*name}, but not $<*name>). */1147if (++ptr >= repend) goto BAD;1148next = *ptr;1149inangle = TRUE;1150}11511152if (!inangle && next == CHAR_ASTERISK)1153{1154if (++ptr >= repend) goto BAD;1155next = *ptr;1156star = TRUE;1157}11581159if (!star && !inangle && next >= CHAR_0 && next <= CHAR_9)1160{1161group = next - CHAR_0;1162while (++ptr < repend)1163{1164next = *ptr;1165if (next < CHAR_0 || next > CHAR_9) break;1166group = group * 10 + (next - CHAR_0);11671168/* A check for a number greater than the hightest captured group1169is sufficient here; no need for a separate overflow check. If unknown1170groups are to be treated as unset, just skip over any remaining1171digits and carry on. */11721173if (group > code->top_bracket)1174{1175if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)1176{1177while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);1178break;1179}1180else1181{1182rc = PCRE2_ERROR_NOSUBSTRING;1183goto PTREXIT;1184}1185}1186}1187}1188else1189{1190PCRE2_SIZE name_len;1191PCRE2_SPTR name_start = ptr;1192if (!read_name_subst(&ptr, repend, utf, code->tables + ctypes_offset))1193goto BAD;1194name_len = ptr - name_start;1195memcpy(name, name_start, CU2BYTES(name_len));1196name[name_len] = 0;1197}11981199next = 0; /* not used or updated after this point */1200(void)next;12011202/* In extended mode we recognize ${name:+set text:unset text} and1203${name:-default text}. 
*/12041205if (inparens)1206{1207if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&1208!star && ptr < repend - 2 && *ptr == CHAR_COLON)1209{1210special = *(++ptr);1211if (special != CHAR_PLUS && special != CHAR_MINUS)1212{1213rc = PCRE2_ERROR_BADSUBSTITUTION;1214goto PTREXIT;1215}12161217text1_start = ++ptr;1218rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);1219if (rc != 0) goto PTREXIT;1220text1_end = ptr;12211222if (special == CHAR_PLUS && *ptr == CHAR_COLON)1223{1224text2_start = ++ptr;1225rc = find_text_end(code, &ptr, repend, TRUE);1226if (rc != 0) goto PTREXIT;1227text2_end = ptr;1228}1229}12301231else1232{1233if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)1234{1235rc = PCRE2_ERROR_REPMISSINGBRACE;1236goto PTREXIT;1237}1238}12391240ptr++;1241}12421243if (inangle)1244{1245if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN)1246goto BAD;1247ptr++;1248}12491250/* Have found a syntactically correct group number or name, or *name.1251Only *MARK is currently recognized. */12521253if (star)1254{1255if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)1256{1257PCRE2_SPTR mark = pcre2_get_mark(match_data);1258if (mark != NULL)1259{1260/* Peek backwards one code unit to obtain the length of the mark.1261It can (theoretically) contain an embedded NUL. */1262fraglength = mark[-1];1263if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&1264substitute_case_callout == NULL)1265CHECKCASECPY_DEFAULT(mark, fraglength);1266else1267CHECKMEMCPY(mark, fraglength);1268}1269}1270else goto BAD;1271}12721273/* Substitute the contents of a group. We don't use substring_copy1274functions any more, in order to support case forcing. */12751276else1277{1278GROUP_SUBSTITUTE:1279/* Find a number for a named group. In case there are duplicate names,1280search for the first one that is set. If the name is not found when1281PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a1282non-existent group. 
*/12831284if (group < 0)1285{1286PCRE2_SPTR first, last, entry;1287rc = pcre2_substring_nametable_scan(code, name, &first, &last);1288if (rc == PCRE2_ERROR_NOSUBSTRING &&1289(suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)1290{1291group = code->top_bracket + 1;1292}1293else1294{1295if (rc < 0) goto PTREXIT;1296for (entry = first; entry <= last; entry += rc)1297{1298uint32_t ng = GET2(entry, 0);1299if (ng < ovector_count)1300{1301if (group < 0) group = ng; /* First in ovector */1302if (ovector[ng*2] != PCRE2_UNSET)1303{1304group = ng; /* First that is set */1305break;1306}1307}1308}13091310/* If group is still negative, it means we did not find a group1311that is in the ovector. Just set the first group. */13121313if (group < 0) group = GET2(first, 0);1314}1315}13161317/* We now have a group that is identified by number. Find the length of1318the captured string. If a group in a non-special substitution is unset1319when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */13201321rc = pcre2_substring_length_bynumber(match_data, group, &sublength);1322if (rc < 0)1323{1324if (rc == PCRE2_ERROR_NOSUBSTRING &&1325(suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)1326{1327rc = PCRE2_ERROR_UNSET;1328}1329if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */1330if (special == 0) /* Plain substitution */1331{1332if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;1333goto PTREXIT; /* Else error */1334}1335}13361337/* If special is '+' we have a 'set' and possibly an 'unset' text,1338both of which are reprocessed when used. If special is '-' we have a1339default text for when the group is unset; it must be reprocessed. 
*/13401341if (special != 0)1342{1343if (special == CHAR_MINUS)1344{1345if (rc == 0) goto LITERAL_SUBSTITUTE;1346text2_start = text1_start;1347text2_end = text1_end;1348}13491350if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;1351ptrstack[ptrstackptr++] = ptr;1352ptrstack[ptrstackptr++] = repend;13531354if (rc == 0)1355{1356ptr = text1_start;1357repend = text1_end;1358}1359else1360{1361ptr = text2_start;1362repend = text2_end;1363}1364continue;1365}13661367/* Otherwise we have a literal substitution of a group's contents. */13681369LITERAL_SUBSTITUTE:1370subptr = subject + ovector[group*2];1371subptrend = subject + ovector[group*2 + 1];13721373/* Substitute a literal string, possibly forcing alphabetic case. */13741375SUBPTR_SUBSTITUTE:1376if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&1377substitute_case_callout == NULL)1378CHECKCASECPY_DEFAULT(subptr, subptrend - subptr);1379else1380CHECKMEMCPY(subptr, subptrend - subptr);1381}1382} /* End of $ processing */13831384/* Handle an escape sequence in extended mode. We can use check_escape()1385to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but1386the case-forcing escapes are not supported in pcre2_compile() so must be1387recognized here. 
*/13881389else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&1390*ptr == CHAR_BACKSLASH)1391{1392int errorcode;1393case_state new_forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE };13941395if (ptr < repend - 1) switch (ptr[1])1396{1397case CHAR_L:1398new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_LOWER;1399new_forcecase.single_char = FALSE;1400ptr += 2;1401break;14021403case CHAR_l:1404new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_LOWER;1405new_forcecase.single_char = TRUE;1406ptr += 2;1407if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_U)1408{1409/* Perl reverse-title-casing feature for \l\U */1410new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST;1411new_forcecase.single_char = FALSE;1412ptr += 2;1413}1414break;14151416case CHAR_U:1417new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_UPPER;1418new_forcecase.single_char = FALSE;1419ptr += 2;1420break;14211422case CHAR_u:1423new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_TITLE_FIRST;1424new_forcecase.single_char = TRUE;1425ptr += 2;1426if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_L)1427{1428/* Perl title-casing feature for \u\L */1429new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_TITLE_FIRST;1430new_forcecase.single_char = FALSE;1431ptr += 2;1432}1433break;14341435default:1436break;1437}14381439if (new_forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)1440{1441SETFORCECASE:14421443/* If the substitute_case_callout is unset, our case-forcing is done1444immediately. If there is a callout however, then its action is delayed1445until all the characters have been collected.14461447Apply the callout now, before we set the new casing mode. 
*/14481449if (substitute_case_callout != NULL &&1450forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)1451DELAYEDFORCECASE();14521453forcecase = new_forcecase;1454casestart_offset = buff_offset;1455casestart_extra_needed = extra_needed;1456continue;1457}14581459ptr++; /* Point after \ */1460rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,1461code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL);1462if (errorcode != 0) goto BADESCAPE;14631464switch(rc)1465{1466case ESC_E:1467goto SETFORCECASE;14681469case ESC_Q:1470escaped_literal = TRUE;1471continue;14721473case 0: /* Data character */1474case ESC_b: /* \b is backspace in a substitution */1475case ESC_v: /* \v is vertical tab in a substitution */14761477if (rc == ESC_b) ch = CHAR_BS;1478if (rc == ESC_v) ch = CHAR_VT;14791480#ifdef SUPPORT_UNICODE1481if (utf) chlen = PRIV(ord2utf)(ch, temp); else1482#endif1483{1484temp[0] = ch;1485chlen = 1;1486}14871488if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&1489substitute_case_callout == NULL)1490CHECKCASECPY_DEFAULT(temp, chlen);1491else1492CHECKMEMCPY(temp, chlen);1493continue;14941495case ESC_g:1496{1497PCRE2_SIZE name_len;1498PCRE2_SPTR name_start;14991500/* Parse the \g<name> form (\g<number> already handled by check_escape) */1501if (ptr >= repend || *ptr != CHAR_LESS_THAN_SIGN)1502goto BADESCAPE;1503++ptr;15041505name_start = ptr;1506if (!read_name_subst(&ptr, repend, utf, code->tables + ctypes_offset))1507goto BADESCAPE;1508name_len = ptr - name_start;15091510if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN)1511goto BADESCAPE;1512++ptr;15131514special = 0;1515group = -1;1516memcpy(name, name_start, CU2BYTES(name_len));1517name[name_len] = 0;1518goto GROUP_SUBSTITUTE;1519}15201521default:1522if (rc < 0)1523{1524special = 0;1525group = -rc - 1;1526goto GROUP_SUBSTITUTE;1527}1528goto BADESCAPE;1529}1530} /* End of backslash processing */15311532/* Handle a literal code unit */15331534else1535{1536PCRE2_SPTR 
ch_start;15371538LOADLITERAL:1539ch_start = ptr;1540GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */1541(void) ch;15421543if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&1544substitute_case_callout == NULL)1545CHECKCASECPY_DEFAULT(ch_start, ptr - ch_start);1546else1547CHECKMEMCPY(ch_start, ptr - ch_start);1548} /* End handling a literal code unit */1549} /* End of loop for scanning the replacement. */15501551/* If the substitute_case_callout is unset, our case-forcing is done1552immediately. If there is a callout however, then its action is delayed1553until all the characters have been collected.15541555We now clean up any trailing section of the replacement for which we deferred1556the case-forcing. */15571558if (substitute_case_callout != NULL &&1559forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)1560DELAYEDFORCECASE();15611562/* The replacement has been copied to the output, or its size has been1563remembered. Handle the callout if there is one. */15641565if (mcontext != NULL && mcontext->substitute_callout != NULL)1566{1567/* If we an actual (non-simulated) replacement, do the callout. */15681569if (!overflowed)1570{1571scb.subscount = subs;1572scb.output_offsets[1] = buff_offset;1573rc = mcontext->substitute_callout(&scb,1574mcontext->substitute_callout_data);15751576/* A non-zero return means cancel this substitution. Instead, copy the1577matched string fragment. */15781579if (rc != 0)1580{1581PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];1582PCRE2_SIZE oldlength = ovector[1] - ovector[0];15831584buff_offset -= newlength;1585lengthleft += newlength;1586if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);15871588/* A negative return means do not do any more. */15891590if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);1591}1592}15931594/* In this interesting case, we cannot do the callout, so it's hard to1595estimate the required buffer size. 
What callers want is to be able to make1596two calls to pcre2_substitute(), once with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH1597to discover the buffer size, and then a second and final call. Older1598versions of PCRE2 violated this assumption, by proceding as if the callout1599had returned zero - but on the second call to pcre2_substitute() it could1600return non-zero and then overflow the buffer again. Callers probably don't1601want to keep on looping to incrementally discover the buffer size. */16021603else1604{1605PCRE2_SIZE newlength_buf = buff_offset - scb.output_offsets[0];1606PCRE2_SIZE newlength_extra = extra_needed - sub_start_extra_needed;1607PCRE2_SIZE newlength =1608(newlength_extra > ~(PCRE2_SIZE)0 - newlength_buf)? /* Integer overflow */1609~(PCRE2_SIZE)0 : newlength_buf + newlength_extra; /* Cap the addition */1610PCRE2_SIZE oldlength = ovector[1] - ovector[0];16111612/* Be pessimistic: request whichever buffer size is larger out of1613accepting or rejecting the substitution. */16141615if (oldlength > newlength)1616{1617PCRE2_SIZE additional = oldlength - newlength;1618if (additional > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */1619goto TOOLARGEREPLACE;1620extra_needed += additional;1621}16221623/* Proceed as if the callout did not return a negative. A negative1624effectively rejects all future substitutions, but we want to examine them1625pessimistically. */1626}1627}16281629/* Save the details of this match. See above for how this data is used. If we1630matched an empty string, do the magic for global matches. Update the start1631offset to point to the rest of the subject string. If we re-used an existing1632match for the first match, switch to the internal match data block. */16331634ovecsave[0] = ovector[0];1635ovecsave[1] = ovector[1];1636ovecsave[2] = start_offset;16371638goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 
0 :1639PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;1640start_offset = ovector[1];1641} while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */16421643/* Copy the rest of the subject unless not required, and terminate the output1644with a binary zero. */16451646if (!replacement_only)1647{1648fraglength = length - start_offset;1649CHECKMEMCPY(subject + start_offset, fraglength);1650}16511652temp[0] = 0;1653CHECKMEMCPY(temp, 1);16541655/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,1656and matching has carried on after a full buffer, in order to compute the length1657needed. Otherwise, an overflow generates an immediate error return. */16581659if (overflowed)1660{1661rc = PCRE2_ERROR_NOMEMORY;16621663if (extra_needed > ~(PCRE2_SIZE)0 - buff_length) /* Integer overflow */1664goto TOOLARGEREPLACE;1665*blength = buff_length + extra_needed;1666}16671668/* After a successful execution, return the number of substitutions and set the1669length of buffer used, excluding the trailing zero. */16701671else1672{1673rc = subs;1674*blength = buff_offset - 1;1675}16761677EXIT:1678if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);1679else match_data->rc = rc;1680return rc;16811682NOROOM:1683rc = PCRE2_ERROR_NOMEMORY;1684goto EXIT;16851686CASEERROR:1687rc = PCRE2_ERROR_REPLACECASE;1688goto EXIT;16891690TOOLARGEREPLACE:1691rc = PCRE2_ERROR_TOOLARGEREPLACE;1692goto EXIT;16931694BAD:1695rc = PCRE2_ERROR_BADREPLACEMENT;1696goto PTREXIT;16971698BADESCAPE:1699rc = PCRE2_ERROR_BADREPESCAPE;17001701PTREXIT:1702*blength = (PCRE2_SIZE)(ptr - replacement);1703goto EXIT;1704}17051706/* End of pcre2_substitute.c */170717081709