Path: blob/master/thirdparty/pcre2/src/pcre2_substitute.c
21745 views
/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8Original API code Copyright (c) 1997-2012 University of Cambridge9New API code Copyright (c) 2016-2024 University of Cambridge1011-----------------------------------------------------------------------------12Redistribution and use in source and binary forms, with or without13modification, are permitted provided that the following conditions are met:1415* Redistributions of source code must retain the above copyright notice,16this list of conditions and the following disclaimer.1718* Redistributions in binary form must reproduce the above copyright19notice, this list of conditions and the following disclaimer in the20documentation and/or other materials provided with the distribution.2122* Neither the name of the University of Cambridge nor the names of its23contributors may be used to endorse or promote products derived from24this software without specific prior written permission.2526THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE36POSSIBILITY OF SUCH DAMAGE.37-----------------------------------------------------------------------------38*/394041#include "pcre2_internal.h"42434445#define PTR_STACK_SIZE 204647#define SUBSTITUTE_OPTIONS \48(PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \49PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \50PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \51PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)52535455/*************************************************56* Find end of substitute text *57*************************************************/5859/* In extended mode, we recognize ${name:+set text:unset text} and similar60constructions. This requires the identification of unescaped : and }61characters. This function scans for such. It must deal with nested ${62constructions. The pointer to the text is updated, either to the required end63character, or to where an error was detected.6465Arguments:66code points to the compiled expression (for options)67ptrptr points to the pointer to the start of the text (updated)68ptrend end of the whole string69last TRUE if the last expected string (only } recognized)7071Returns: 0 on success72negative error code on failure73*/7475static int76find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,77BOOL last)78{79int rc = 0;80uint32_t nestlevel = 0;81BOOL literal = FALSE;82PCRE2_SPTR ptr = *ptrptr;8384for (; ptr < ptrend; ptr++)85{86if (literal)87{88if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)89{90literal = FALSE;91ptr += 1;92}93}9495else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)96{97if (nestlevel == 0) goto EXIT;98nestlevel--;99}100101else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;102103else if (*ptr == CHAR_DOLLAR_SIGN)104{105if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)106{107nestlevel++;108ptr += 1;109}110}111112else if (*ptr == CHAR_BACKSLASH)113{114int erc;115int errorcode;116uint32_t ch;117118if (ptr < ptrend - 1) switch (ptr[1])119{120case CHAR_L:121case CHAR_l:122case CHAR_U:123case CHAR_u:124ptr += 1;125continue;126}127128ptr += 1; /* Must point after \ */129erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,130code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL);131if (errorcode != 0)132{133/* errorcode from check_escape is positive, so must not be returned by134pcre2_substitute(). */135rc = PCRE2_ERROR_BADREPESCAPE;136goto EXIT;137}138139switch(erc)140{141case 0: /* Data character */142case ESC_b: /* Data character */143case ESC_v: /* Data character */144case ESC_E: /* Isolated \E is ignored */145break;146147case ESC_Q:148literal = TRUE;149break;150151case ESC_g:152/* The \g<name> form (\g<number> already handled by check_escape)153154Don't worry about finding the matching ">". We are super, super lenient155about validating ${} replacements inside find_text_end(), so we certainly156don't need to worry about other syntax. Importantly, a \g<..> or $<...>157sequence can't contain a '}' character. */158break;159160default:161if (erc < 0)162break; /* capture group reference */163rc = PCRE2_ERROR_BADREPESCAPE;164goto EXIT;165}166}167}168169rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */170171EXIT:172*ptrptr = ptr;173return rc;174}175176177/*************************************************178* Validate group name *179*************************************************/180181/* This function scans for a capture group name, validating it182consists of legal characters, is not empty, and does not exceed183MAX_NAME_SIZE.184185Arguments:186ptrptr points to the pointer to the start of the text (updated)187ptrend end of the whole string188utf true if the input is UTF-encoded189ctypes pointer to the character types table190191Returns: TRUE if a name was read192FALSE otherwise193*/194195static BOOL196read_name_subst(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf,197const uint8_t* ctypes)198{199PCRE2_SPTR ptr = *ptrptr;200PCRE2_SPTR nameptr = ptr;201202if (ptr >= ptrend) /* No characters in name */203goto FAILED;204205/* We do not need to check whether the name starts with a non-digit.206We are simply referencing names here, not defining them. */207208/* See read_name in the pcre2_compile.c for the corresponding logic209restricting group names inside the pattern itself. */210211#ifdef SUPPORT_UNICODE212if (utf)213{214uint32_t c, type;215216while (ptr < ptrend)217{218GETCHAR(c, ptr);219type = UCD_CHARTYPE(c);220if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&221c != CHAR_UNDERSCORE) break;222ptr++;223FORWARDCHARTEST(ptr, ptrend);224}225}226else227#else228(void)utf; /* Avoid compiler warning */229#endif /* SUPPORT_UNICODE */230231/* Handle group names in non-UTF modes. */232233{234while (ptr < ptrend && MAX_255(*ptr) && (ctypes[*ptr] & ctype_word) != 0)235{236ptr++;237}238}239240/* Check name length */241242if (ptr - nameptr > MAX_NAME_SIZE)243goto FAILED;244245/* Subpattern names must not be empty */246if (ptr == nameptr)247goto FAILED;248249*ptrptr = ptr;250return TRUE;251252FAILED:253*ptrptr = ptr;254return FALSE;255}256257258/*************************************************259* Case transformations *260*************************************************/261262#define PCRE2_SUBSTITUTE_CASE_NONE 0263// 1, 2, 3 are PCRE2_SUBSTITUTE_CASE_LOWER, UPPER, TITLE_FIRST.264#define PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST 4265266typedef struct {267int to_case; /* One of PCRE2_SUBSTITUTE_CASE_xyz */268BOOL single_char;269} case_state;270271/* Helper to guess how much a string is likely to increase in size when272case-transformed. Usually, strings don't change size at all, but some rare273characters do grow. Estimate +10%, plus another few characters.274275Performing this estimation is unfortunate, but inevitable, since we can't call276the callout if we ran out of buffer space to prepare its input.277278Because this estimate is inexact (and in pathological cases, underestimates the279required buffer size) we must document that when you have a280substitute_case_callout, and you are using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, you281may need more than two calls to determine the final buffer size. */282283static PCRE2_SIZE284pessimistic_case_inflation(PCRE2_SIZE len)285{286return (len >> 3u) + 10;287}288289/* Case transformation behaviour if no callout is passed. */290291static PCRE2_SIZE292default_substitute_case_callout(293PCRE2_SPTR input, PCRE2_SIZE input_len,294PCRE2_UCHAR *output, PCRE2_SIZE output_cap,295case_state *state, const pcre2_code *code)296{297PCRE2_SPTR input_end = input + input_len;298#ifdef SUPPORT_UNICODE299BOOL utf;300BOOL ucp;301#endif302PCRE2_UCHAR temp[6];303BOOL next_to_upper;304BOOL rest_to_upper;305BOOL single_char;306BOOL overflow = FALSE;307PCRE2_SIZE written = 0;308309/* Helpful simplifying invariant: input and output are disjoint buffers.310I believe that this code is technically undefined behaviour, because the two311pointers input/output are "unrelated" pointers and hence not comparable. Casting312via char* bypasses some but not all of those technical rules. It is not included313in release builds, in any case. */314PCRE2_ASSERT((char *)(input + input_len) <= (char *)output ||315(char *)(output + output_cap) <= (char *)input);316317#ifdef SUPPORT_UNICODE318utf = (code->overall_options & PCRE2_UTF) != 0;319ucp = (code->overall_options & PCRE2_UCP) != 0;320#endif321322if (input_len == 0) return 0;323324switch (state->to_case)325{326/* LCOV_EXCL_START */327default:328PCRE2_DEBUG_UNREACHABLE();329return 0;330/* LCOV_EXCL_STOP */331332case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE333case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE334next_to_upper = rest_to_upper = (state->to_case == PCRE2_SUBSTITUTE_CASE_UPPER);335break;336337case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE338next_to_upper = TRUE;339rest_to_upper = FALSE;340state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER;341break;342343case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE344next_to_upper = FALSE;345rest_to_upper = TRUE;346state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER;347break;348}349350single_char = state->single_char;351if (single_char)352state->to_case = PCRE2_SUBSTITUTE_CASE_NONE;353354while (input < input_end)355{356uint32_t ch;357unsigned int chlen;358359GETCHARINCTEST(ch, input);360361#ifdef SUPPORT_UNICODE362if ((utf || ucp) && ch >= 128)363{364uint32_t type = UCD_CHARTYPE(ch);365if (PRIV(ucp_gentype)[type] == ucp_L &&366type != (next_to_upper? ucp_Lu : ucp_Ll))367ch = UCD_OTHERCASE(ch);368369/* TODO This is far from correct... it doesn't support the SpecialCasing.txt370mappings, but worse, it's not even correct for all the ordinary case371mappings. We should add support for those (at least), and then add the372SpecialCasing.txt mappings for Esszet and ligatures, and finally use the373Turkish casing flag on the match context. */374}375else376#endif377if (MAX_255(ch))378{379if (((code->tables + cbits_offset +380(next_to_upper? cbit_upper:cbit_lower)381)[ch/8] & (1u << (ch%8))) == 0)382ch = (code->tables + fcc_offset)[ch];383}384385#ifdef SUPPORT_UNICODE386if (utf) chlen = PRIV(ord2utf)(ch, temp); else387#endif388{389temp[0] = ch;390chlen = 1;391}392393if (!overflow && chlen <= output_cap)394{395memcpy(output, temp, CU2BYTES(chlen));396output += chlen;397output_cap -= chlen;398}399else400{401overflow = TRUE;402}403404if (chlen > ~(PCRE2_SIZE)0 - written) /* Integer overflow */405return ~(PCRE2_SIZE)0;406written += chlen;407408next_to_upper = rest_to_upper;409410/* memcpy the remainder, if only transforming a single character. */411412if (single_char)413{414PCRE2_SIZE rest_len = input_end - input;415416if (!overflow && rest_len <= output_cap)417memcpy(output, input, CU2BYTES(rest_len));418419if (rest_len > ~(PCRE2_SIZE)0 - written) /* Integer overflow */420return ~(PCRE2_SIZE)0;421written += rest_len;422423return written;424}425}426427return written;428}429430/* Helper to perform the call to the substitute_case_callout. We wrap the431user-provided callout because our internal arguments are slightly extended. We432don't want the user callout to handle the case of "\l" (first character only to433lowercase) or "\l\U" (first character to lowercase, rest to uppercase) because434those are not operations defined by Unicode. Instead the user callout simply435needs to provide the three Unicode primitives: lower, upper, titlecase. */436437static PCRE2_SIZE438do_case_copy(439PCRE2_UCHAR *input_output, PCRE2_SIZE input_len, PCRE2_SIZE output_cap,440case_state *state, BOOL utf,441PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,442PCRE2_SIZE, int, void *),443void *substitute_case_callout_data)444{445PCRE2_SPTR input = input_output;446PCRE2_UCHAR *output = input_output;447PCRE2_SIZE rc;448PCRE2_SIZE rc2;449int ch1_to_case;450int rest_to_case;451PCRE2_UCHAR ch1[6];452PCRE2_SIZE ch1_len;453PCRE2_SPTR rest;454PCRE2_SIZE rest_len;455BOOL ch1_overflow = FALSE;456BOOL rest_overflow = FALSE;457458#if PCRE2_CODE_UNIT_WIDTH == 32 || !defined(SUPPORT_UNICODE)459(void)utf; /* Avoid compiler warning. */460#endif461462PCRE2_ASSERT(input_len != 0);463464switch (state->to_case)465{466/* LCOV_EXCL_START */467default:468PCRE2_DEBUG_UNREACHABLE();469return 0;470/* LCOV_EXCL_STOP */471472case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE473case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE474case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE475476/* The easy case, where our internal casing operations align with those of477the callout. */478479if (state->single_char == FALSE)480{481rc = substitute_case_callout(input, input_len, output, output_cap,482state->to_case, substitute_case_callout_data);483484if (state->to_case == PCRE2_SUBSTITUTE_CASE_TITLE_FIRST)485state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER;486487return rc;488}489490ch1_to_case = state->to_case;491rest_to_case = PCRE2_SUBSTITUTE_CASE_NONE;492break;493494case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE495ch1_to_case = PCRE2_SUBSTITUTE_CASE_LOWER;496rest_to_case = PCRE2_SUBSTITUTE_CASE_UPPER;497break;498}499500/* Identify the leading character. Take copy, because its storage overlaps with501`output`, and hence may be scrambled by the callout. */502503{504PCRE2_SPTR ch_end = input;505uint32_t ch;506507GETCHARINCTEST(ch, ch_end);508(void) ch;509PCRE2_ASSERT(ch_end <= input + input_len && ch_end - input <= 6);510ch1_len = ch_end - input;511memcpy(ch1, input, CU2BYTES(ch1_len));512}513514rest = input + ch1_len;515rest_len = input_len - ch1_len;516517/* Transform just ch1. The buffers are always in-place (input == output). With a518custom callout, we need a loop to discover its required buffer size. The loop519wouldn't be required if the callout were well-behaved, but it might be naughty520and return "5" the first time, then "10" the next time we call it using the521exact same input! */522523{524PCRE2_SIZE ch1_cap;525PCRE2_SIZE max_ch1_cap;526527ch1_cap = ch1_len; /* First attempt uses the space vacated by ch1. */528PCRE2_ASSERT(output_cap >= input_len && input_len >= rest_len);529max_ch1_cap = output_cap - rest_len;530531while (TRUE)532{533rc = substitute_case_callout(ch1, ch1_len, output, ch1_cap, ch1_to_case,534substitute_case_callout_data);535if (rc == ~(PCRE2_SIZE)0) return rc;536537if (rc <= ch1_cap) break;538539if (rc > max_ch1_cap)540{541ch1_overflow = TRUE;542break;543}544545/* Move the rest to the right, to make room for expanding ch1. */546547memmove(input_output + rc, rest, CU2BYTES(rest_len));548rest = input + rc;549550ch1_cap = rc;551552/* Proof of loop termination: `ch1_cap` is growing on each iteration, but553the loop ends if `rc` reaches the (unchanging) upper bound of output_cap. */554}555}556557if (rest_to_case == PCRE2_SUBSTITUTE_CASE_NONE)558{559if (!ch1_overflow)560{561PCRE2_ASSERT(rest_len <= output_cap - rc);562memmove(output + rc, rest, CU2BYTES(rest_len));563}564rc2 = rest_len;565566state->to_case = PCRE2_SUBSTITUTE_CASE_NONE;567}568else569{570PCRE2_UCHAR dummy[1];571572rc2 = substitute_case_callout(rest, rest_len,573ch1_overflow? dummy : output + rc,574ch1_overflow? 0u : output_cap - rc,575rest_to_case, substitute_case_callout_data);576if (rc2 == ~(PCRE2_SIZE)0) return rc2;577578if (!ch1_overflow && rc2 > output_cap - rc) rest_overflow = TRUE;579580/* If ch1 grows so that `xform(ch1)+rest` can't fit in the buffer, but then581`rest` shrinks, it's actually possible for the total calculated length of582`xform(ch1)+xform(rest)` to come out at less than output_cap. But we can't583report that, because it would make it seem that the operation succeeded.584If either of xform(ch1) or xform(rest) won't fit in the buffer, our final585result must be > output_cap. */586if (ch1_overflow && rc2 < rest_len)587rc2 = rest_len;588589state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER;590}591592if (rc2 > ~(PCRE2_SIZE)0 - rc) /* Integer overflow */593return ~(PCRE2_SIZE)0;594595PCRE2_ASSERT(!(ch1_overflow || rest_overflow) || rc + rc2 > output_cap);596(void)rest_overflow;597598return rc + rc2;599}600601602/*************************************************603* Match and substitute *604*************************************************/605606/* This function applies a compiled re to a subject string and creates a new607string with substitutions. The first 7 arguments are the same as for608pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.609610Arguments:611code points to the compiled expression612subject points to the subject string613length length of subject string (may contain binary zeros)614start_offset where to start in the subject string615options option bits616match_data points to a match_data block, or is NULL617context points a PCRE2 context618replacement points to the replacement string619rlength length of replacement string620buffer where to put the substituted string621blength points to length of buffer; updated to length of string622623Returns: >= 0 number of substitutions made624< 0 an error code625PCRE2_ERROR_BADREPLACEMENT means invalid use of $626*/627628/* This macro checks for space in the buffer before copying into it. On629overflow, either give an error immediately, or keep on, accumulating the630length. */631632#define CHECKMEMCPY(from, length_) \633do { \634PCRE2_SIZE chkmc_length = length_; \635if (overflowed) \636{ \637if (chkmc_length > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \638goto TOOLARGEREPLACE; \639extra_needed += chkmc_length; \640} \641else if (lengthleft < chkmc_length) \642{ \643if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \644overflowed = TRUE; \645extra_needed = chkmc_length - lengthleft; \646} \647else \648{ \649memcpy(buffer + buff_offset, from, CU2BYTES(chkmc_length)); \650buff_offset += chkmc_length; \651lengthleft -= chkmc_length; \652} \653} \654while (0)655656/* This macro checks for space and copies characters with casing modifications.657On overflow, it behaves as for CHECKMEMCPY().658659When substitute_case_callout is NULL, the source and destination buffers must660not overlap, because our default handler does not support this. */661662#define CHECKCASECPY_BASE(length_, do_call) \663do { \664PCRE2_SIZE chkcc_length = (PCRE2_SIZE)(length_); \665PCRE2_SIZE chkcc_rc; \666do_call \667if (lengthleft < chkcc_rc) \668{ \669if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \670overflowed = TRUE; \671extra_needed = chkcc_rc - lengthleft; \672} \673else \674{ \675buff_offset += chkcc_rc; \676lengthleft -= chkcc_rc; \677} \678} \679while (0)680681#define CHECKCASECPY_DEFAULT(from, length_) \682CHECKCASECPY_BASE(length_, { \683chkcc_rc = default_substitute_case_callout(from, chkcc_length, \684buffer + buff_offset, \685overflowed? 0 : lengthleft, \686&forcecase, code); \687if (overflowed) \688{ \689if (chkcc_rc > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \690goto TOOLARGEREPLACE; \691extra_needed += chkcc_rc; \692break; \693} \694})695696#define CHECKCASECPY_CALLOUT(length_) \697CHECKCASECPY_BASE(length_, { \698chkcc_rc = do_case_copy(buffer + buff_offset, chkcc_length, \699lengthleft, &forcecase, utf, \700substitute_case_callout, \701substitute_case_callout_data); \702if (chkcc_rc == ~(PCRE2_SIZE)0) goto CASEERROR; \703})704705/* This macro does a delayed case transformation, for the situation when we have706a case-forcing callout. */707708#define DELAYEDFORCECASE() \709do { \710PCRE2_SIZE chars_outstanding = (buff_offset - casestart_offset) + \711(extra_needed - casestart_extra_needed); \712if (chars_outstanding > 0) \713{ \714if (overflowed) \715{ \716PCRE2_SIZE guess = pessimistic_case_inflation(chars_outstanding); \717if (guess > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \718goto TOOLARGEREPLACE; \719extra_needed += guess; \720} \721else \722{ \723/* Rewind the buffer */ \724lengthleft += (buff_offset - casestart_offset); \725buff_offset = casestart_offset; \726/* Care! In-place case transformation */ \727CHECKCASECPY_CALLOUT(chars_outstanding); \728} \729} \730} \731while (0)732733734/* Here's the function */735736PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION737pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,738PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,739pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,740PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)741{742int rc;743int subs;744uint32_t ovector_count;745uint32_t goptions = 0;746uint32_t suboptions;747pcre2_match_data *internal_match_data = NULL;748BOOL escaped_literal = FALSE;749BOOL overflowed = FALSE;750BOOL use_existing_match;751BOOL replacement_only;752BOOL utf = (code->overall_options & PCRE2_UTF) != 0;753PCRE2_UCHAR temp[6];754PCRE2_UCHAR null_str[1] = { 0xcd };755PCRE2_SPTR original_subject = subject;756PCRE2_SPTR ptr;757PCRE2_SPTR repend = NULL;758PCRE2_SIZE extra_needed = 0;759PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;760PCRE2_SIZE *ovector;761PCRE2_SIZE ovecsave[2] = { 0, 0 };762pcre2_substitute_callout_block scb;763PCRE2_SIZE sub_start_extra_needed;764PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,765PCRE2_SIZE, int, void *) = NULL;766void *substitute_case_callout_data = NULL;767768/* General initialization */769770buff_offset = 0;771lengthleft = buff_length = *blength;772*blength = PCRE2_UNSET;773774if (mcontext != NULL)775{776substitute_case_callout = mcontext->substitute_case_callout;777substitute_case_callout_data = mcontext->substitute_case_callout_data;778}779780/* Partial matching is not valid. This must come after setting *blength to781PCRE2_UNSET, so as not to imply an offset in the replacement. */782783if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)784return PCRE2_ERROR_BADOPTION;785786/* Validate length and find the end of the replacement. A NULL replacement of787zero length is interpreted as an empty string. */788789if (replacement == NULL)790{791if (rlength != 0) return PCRE2_ERROR_NULL;792replacement = null_str;793}794795if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);796repend = replacement + rlength;797798/* A NULL subject of zero length is treated as an empty string. */799800if (subject == NULL)801{802if (length != 0) return PCRE2_ERROR_NULL;803subject = null_str;804}805806if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);807808/* Check for using a match that has already happened. Note that the subject809pointer in the match data may be NULL after a no-match. */810811use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);812replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);813814if (use_existing_match && match_data == NULL) return PCRE2_ERROR_NULL;815816/* If an existing match is being passed in, we should check that it matches817the passed-in subject pointer, length, and match options. We don't currently818have a use-case for someone to match on one subject, then try and use that819match data on a different subject. In a UTF-encoded string, a simple change820like replacing one character for another won't preserve the code unit offsets,821so it's hard to see, in the general case, how it would be safe or useful to822support swapping or mutating the subject string.823824Similarly, using different match options between the first (external) and825subsequent (internal, global) matches is hard to justify. */826827if (use_existing_match)828{829/* Return early, as the rest of the match_data may not have been830initialised. This duplicates and must be in sync with the check below that831aborts substitution on any result other than success or no-match. */832if (match_data->rc < 0 && match_data->rc != PCRE2_ERROR_NOMATCH)833return match_data->rc;834835/* Not supported if the passed-in match was from the DFA interpreter. */836if (match_data->matchedby == PCRE2_MATCHEDBY_DFA_INTERPRETER)837return PCRE2_ERROR_DFA_UFUNC;838839if (code != match_data->code)840return PCRE2_ERROR_DIFFSUBSPATTERN;841842/* We want the passed-in subject strings to match. This implies the effective843length must match, and either: the pointers are equal (with strict matching844of NULL against NULL); or, the special case of PCRE2_COPY_MATCHED_SUBJECT845where we cannot compare pointers but we can verify the contents. */846if (length != match_data->subject_length ||847!(original_subject == match_data->subject ||848((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0 &&849(length == 0 ||850memcmp(subject, match_data->subject, CU2BYTES(length)) == 0))))851return PCRE2_ERROR_DIFFSUBSSUBJECT;852853if (start_offset != match_data->start_offset)854return PCRE2_ERROR_DIFFSUBSOFFSET;855856if ((options & ~SUBSTITUTE_OPTIONS) != match_data->options)857return PCRE2_ERROR_DIFFSUBSOPTIONS;858}859860/* If starting from an existing match, there must be an externally provided861match data block. We create an internal match_data block in two cases: (a) an862external one is not supplied (and we are not starting from an existing match);863(b) an existing match is to be used for the first substitution. In the latter864case, we copy the existing match into the internal block, except for any cached865heap frame size and pointer. This ensures that no changes are made to the866external match data block. */867868/* WARNING: In both cases below a general context is constructed "by hand"869because calling pcre2_general_context_create() involves a memory allocation. If870the contents of a general context control block are ever changed there will871have to be changes below. */872873if (match_data == NULL)874{875pcre2_general_context gcontext;876gcontext.memctl = (mcontext == NULL)?877((pcre2_real_code *)code)->memctl :878((pcre2_real_match_context *)mcontext)->memctl;879match_data = internal_match_data =880pcre2_match_data_create_from_pattern(code, &gcontext);881if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;882}883884else if (use_existing_match)885{886int pairs;887pcre2_general_context gcontext;888gcontext.memctl = (mcontext == NULL)?889((pcre2_real_code *)code)->memctl :890((pcre2_real_match_context *)mcontext)->memctl;891pairs = (code->top_bracket + 1 < match_data->oveccount)?892code->top_bracket + 1 : match_data->oveccount;893internal_match_data = pcre2_match_data_create(match_data->oveccount,894&gcontext);895if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;896memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)897+ 2*pairs*sizeof(PCRE2_SIZE));898internal_match_data->heapframes = NULL;899internal_match_data->heapframes_size = 0;900/* Ensure that the subject is not freed when internal_match_data is */901internal_match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;902match_data = internal_match_data;903}904905/* If using an internal match data, there's no need to copy the subject. */906907if (internal_match_data != NULL) options &= ~PCRE2_COPY_MATCHED_SUBJECT;908909/* Remember ovector details */910911ovector = pcre2_get_ovector_pointer(match_data);912ovector_count = pcre2_get_ovector_count(match_data);913914/* Fixed things in the callout block */915916scb.version = 0;917scb.input = subject;918scb.output = (PCRE2_SPTR)buffer;919scb.ovector = ovector;920921/* Check UTF replacement string if necessary. */922923#ifdef SUPPORT_UNICODE924if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)925{926rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));927if (rc != 0)928{929match_data->leftchar = 0;930goto EXIT;931}932}933#endif /* SUPPORT_UNICODE */934935/* Save the substitute options and remove them from the match options. */936937suboptions = options & SUBSTITUTE_OPTIONS;938options &= ~SUBSTITUTE_OPTIONS;939940/* Error if the start match offset is greater than the length of the subject. */941942if (start_offset > length)943{944match_data->leftchar = 0;945rc = PCRE2_ERROR_BADOFFSET;946goto EXIT;947}948949/* Copy up to the start offset, unless only the replacement is required. */950951if (!replacement_only) CHECKMEMCPY(subject, start_offset);952953/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first954match is taken from the match_data that was passed in. */955956subs = 0;957for (;;)958{959PCRE2_SPTR ptrstack[PTR_STACK_SIZE];960uint32_t ptrstackptr = 0;961case_state forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE };962PCRE2_SIZE casestart_offset = 0;963PCRE2_SIZE casestart_extra_needed = 0;964965if (use_existing_match)966{967rc = match_data->rc;968use_existing_match = FALSE;969}970else rc = pcre2_match(code, subject, length, start_offset, options|goptions,971match_data, mcontext);972973#ifdef SUPPORT_UNICODE974if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */975#endif976977/* Any error other than no match returns the error code. No match breaks the978global loop. */979980if (rc == PCRE2_ERROR_NOMATCH) break;981982if (rc < 0) goto EXIT;983984/* Handle a successful match. Matches that use \K to end before they start985or start before the current point in the subject are not supported. */986987if (ovector[1] < ovector[0] || ovector[0] < start_offset)988{989rc = PCRE2_ERROR_BADSUBSPATTERN;990goto EXIT;991}992993/* Assert that our replacement loop is making progress, checked even in994release builds. This should be impossible to hit, however, an infinite loop995would be fairly catastrophic.996997"Progress" is measured as ovector[1] strictly advancing, or, an empty match998after a non-empty match. */9991000/* LCOV_EXCL_START */1001if (subs > 0 &&1002!(ovector[1] > ovecsave[1] ||1003(ovector[1] == ovector[0] && ovecsave[1] > ovecsave[0] &&1004ovector[1] == ovecsave[1])))1005{1006PCRE2_DEBUG_UNREACHABLE();1007rc = PCRE2_ERROR_INTERNAL_DUPMATCH;1008goto EXIT;1009}1010/* LCOV_EXCL_STOP */10111012ovecsave[0] = ovector[0];1013ovecsave[1] = ovector[1];10141015/* Count substitutions with a paranoid check for integer overflow; surely no1016real call to this function would ever hit this! */10171018if (subs == INT_MAX)1019{1020rc = PCRE2_ERROR_TOOMANYREPLACE;1021goto EXIT;1022}1023subs++;10241025/* Copy the text leading up to the match (unless not required); remember1026where the insert begins and how many ovector pairs are set; and remember how1027much space we have requested in extra_needed. */10281029if (rc == 0) rc = ovector_count;1030fraglength = ovector[0] - start_offset;1031if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);1032scb.output_offsets[0] = buff_offset;1033scb.oveccount = rc;1034sub_start_extra_needed = extra_needed;10351036/* Process the replacement string. If the entire replacement is literal, just1037copy it with length check. */10381039ptr = replacement;1040if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)1041{1042CHECKMEMCPY(ptr, rlength);1043}10441045/* Within a non-literal replacement, which must be scanned character by1046character, local literal mode can be set by \Q, but only in extended mode1047when backslashes are being interpreted. In extended mode we must handle1048nested substrings that are to be reprocessed. */10491050else for (;;)1051{1052uint32_t ch;1053unsigned int chlen;1054int group;1055uint32_t special;1056PCRE2_SPTR text1_start = NULL;1057PCRE2_SPTR text1_end = NULL;1058PCRE2_SPTR text2_start = NULL;1059PCRE2_SPTR text2_end = NULL;1060PCRE2_UCHAR name[MAX_NAME_SIZE + 1];10611062/* If at the end of a nested substring, pop the stack. */10631064if (ptr >= repend)1065{1066if (ptrstackptr == 0) break; /* End of replacement string */1067repend = ptrstack[--ptrstackptr];1068ptr = ptrstack[--ptrstackptr];1069continue;1070}10711072/* Handle the next character */10731074if (escaped_literal)1075{1076if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)1077{1078escaped_literal = FALSE;1079ptr += 2;1080continue;1081}1082goto LOADLITERAL;1083}10841085/* Not in literal mode. */10861087if (*ptr == CHAR_DOLLAR_SIGN)1088{1089BOOL inparens;1090BOOL inangle;1091BOOL star;1092PCRE2_SIZE sublength;1093PCRE2_UCHAR next;1094PCRE2_SPTR subptr, subptrend;10951096if (++ptr >= repend) goto BAD;1097if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;10981099special = 0;1100text1_start = NULL;1101text1_end = NULL;1102text2_start = NULL;1103text2_end = NULL;1104group = -1;1105inparens = FALSE;1106inangle = FALSE;1107star = FALSE;1108subptr = NULL;1109subptrend = NULL;11101111/* Special $ sequences, as supported by Perl, JavaScript, .NET and others. */1112if (next == CHAR_AMPERSAND)1113{1114++ptr;1115group = 0;1116goto GROUP_SUBSTITUTE;1117}1118if (next == CHAR_GRAVE_ACCENT || next == CHAR_APOSTROPHE)1119{1120++ptr;1121rc = pcre2_substring_length_bynumber(match_data, 0, &sublength);1122if (rc < 0) goto PTREXIT; /* (Sanity-check ovector before reading from it.) */11231124if (next == CHAR_GRAVE_ACCENT)1125{1126subptr = subject;1127subptrend = subject + ovector[0];1128}1129else1130{1131subptr = subject + ovector[1];1132subptrend = subject + length;1133}11341135goto SUBPTR_SUBSTITUTE;1136}1137if (next == CHAR_UNDERSCORE)1138{1139/* Java, .NET support $_ for "entire input string". */1140++ptr;1141subptr = subject;1142subptrend = subject + length;1143goto SUBPTR_SUBSTITUTE;1144}1145else if (next == CHAR_PLUS &&1146!(ptr+1 < repend && ptr[1] == CHAR_LEFT_CURLY_BRACKET))1147{1148/* Perl supports $+ for "highest captured group" (not the same as $^N1149which is mainly only useful inside Perl's match callbacks). We also1150don't accept "$+{..." since that's Perl syntax for our ${name}. */1151++ptr;1152if (code->top_bracket == 0)1153{1154/* Treat either as "no such group" or "all groups unset" based on the1155PCRE2_SUBSTITUTE_UNKNOWN_UNSET option. */1156if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) == 0)1157{1158rc = PCRE2_ERROR_NOSUBSTRING;1159goto PTREXIT;1160}1161group = 0;1162}1163else1164{1165/* If we have any capture groups, then the ovector needs to be large1166enough for all of them, or the result won't be accurate. */1167if (match_data->oveccount < code->top_bracket + 1)1168{1169rc = PCRE2_ERROR_UNAVAILABLE;1170goto PTREXIT;1171}1172for (group = code->top_bracket; group > 0; group--)1173if (ovector[2*group] != PCRE2_UNSET) break;1174}1175if (group == 0)1176{1177if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;1178rc = PCRE2_ERROR_UNSET;1179goto PTREXIT;1180}1181goto GROUP_SUBSTITUTE;1182}11831184if (next == CHAR_LEFT_CURLY_BRACKET)1185{1186if (++ptr >= repend) goto BAD;1187next = *ptr;1188inparens = TRUE;1189}1190else if (next == CHAR_LESS_THAN_SIGN)1191{1192/* JavaScript compatibility syntax, $<name>. Processes only named1193groups (not numbered) and does not support extensions such as star1194(you can do ${name} and ${*name}, but not $<*name>). */1195if (++ptr >= repend) goto BAD;1196next = *ptr;1197inangle = TRUE;1198}11991200if (!inangle && next == CHAR_ASTERISK)1201{1202if (++ptr >= repend) goto BAD;1203next = *ptr;1204star = TRUE;1205}12061207if (!star && !inangle && next >= CHAR_0 && next <= CHAR_9)1208{1209group = next - CHAR_0;1210while (++ptr < repend)1211{1212next = *ptr;1213if (next < CHAR_0 || next > CHAR_9) break;1214group = group * 10 + (next - CHAR_0);12151216/* A check for a number greater than the hightest captured group1217is sufficient here; no need for a separate overflow check. If unknown1218groups are to be treated as unset, just skip over any remaining1219digits and carry on. */12201221if (group > code->top_bracket)1222{1223if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)1224{1225while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);1226break;1227}1228else1229{1230rc = PCRE2_ERROR_NOSUBSTRING;1231goto PTREXIT;1232}1233}1234}1235}1236else1237{1238PCRE2_SIZE name_len;1239PCRE2_SPTR name_start = ptr;1240if (!read_name_subst(&ptr, repend, utf, code->tables + ctypes_offset))1241goto BAD;1242name_len = ptr - name_start;1243memcpy(name, name_start, CU2BYTES(name_len));1244name[name_len] = 0;1245}12461247next = 0; /* not used or updated after this point */1248(void)next;12491250/* In extended mode we recognize ${name:+set text:unset text} and1251${name:-default text}. */12521253if (inparens)1254{1255if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&1256!star && ptr < repend - 2 && *ptr == CHAR_COLON)1257{1258special = *(++ptr);1259if (special != CHAR_PLUS && special != CHAR_MINUS)1260{1261rc = PCRE2_ERROR_BADSUBSTITUTION;1262goto PTREXIT;1263}12641265text1_start = ++ptr;1266rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);1267if (rc != 0) goto PTREXIT;1268text1_end = ptr;12691270if (special == CHAR_PLUS && *ptr == CHAR_COLON)1271{1272text2_start = ++ptr;1273rc = find_text_end(code, &ptr, repend, TRUE);1274if (rc != 0) goto PTREXIT;1275text2_end = ptr;1276}1277}12781279else1280{1281if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)1282{1283rc = PCRE2_ERROR_REPMISSINGBRACE;1284goto PTREXIT;1285}1286}12871288ptr++;1289}12901291if (inangle)1292{1293if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN)1294goto BAD;1295ptr++;1296}12971298/* Have found a syntactically correct group number or name, or *name.1299Only *MARK is currently recognized. */13001301if (star)1302{1303if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)1304{1305PCRE2_SPTR mark = pcre2_get_mark(match_data);1306if (mark != NULL)1307{1308/* Peek backwards one code unit to obtain the length of the mark.1309It can (theoretically) contain an embedded NUL. */1310fraglength = mark[-1];1311if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&1312substitute_case_callout == NULL)1313CHECKCASECPY_DEFAULT(mark, fraglength);1314else1315CHECKMEMCPY(mark, fraglength);1316}1317}1318else goto BAD;1319}13201321/* Substitute the contents of a group. We don't use substring_copy1322functions any more, in order to support case forcing. */13231324else1325{1326GROUP_SUBSTITUTE:1327/* Find a number for a named group. In case there are duplicate names,1328search for the first one that is set. If the name is not found when1329PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a1330non-existent group. */13311332if (group < 0)1333{1334PCRE2_SPTR first, last, entry;1335rc = pcre2_substring_nametable_scan(code, name, &first, &last);1336if (rc == PCRE2_ERROR_NOSUBSTRING &&1337(suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)1338{1339group = code->top_bracket + 1;1340}1341else1342{1343if (rc < 0) goto PTREXIT;1344for (entry = first; entry <= last; entry += rc)1345{1346uint32_t ng = GET2(entry, 0);1347if (ng < ovector_count)1348{1349if (group < 0) group = ng; /* First in ovector */1350if (ovector[ng*2] != PCRE2_UNSET)1351{1352group = ng; /* First that is set */1353break;1354}1355}1356}13571358/* If group is still negative, it means we did not find a group1359that is in the ovector. Just set the first group. */13601361if (group < 0) group = GET2(first, 0);1362}1363}13641365/* We now have a group that is identified by number. Find the length of1366the captured string. If a group in a non-special substitution is unset1367when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */13681369rc = pcre2_substring_length_bynumber(match_data, group, &sublength);1370if (rc < 0)1371{1372if (rc == PCRE2_ERROR_NOSUBSTRING &&1373(suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)1374{1375rc = PCRE2_ERROR_UNSET;1376}1377if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */1378if (special == 0) /* Plain substitution */1379{1380if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;1381goto PTREXIT; /* Else error */1382}1383}13841385/* If special is '+' we have a 'set' and possibly an 'unset' text,1386both of which are reprocessed when used. If special is '-' we have a1387default text for when the group is unset; it must be reprocessed. */13881389if (special != 0)1390{1391if (special == CHAR_MINUS)1392{1393if (rc == 0) goto LITERAL_SUBSTITUTE;1394text2_start = text1_start;1395text2_end = text1_end;1396}13971398if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;1399ptrstack[ptrstackptr++] = ptr;1400ptrstack[ptrstackptr++] = repend;14011402if (rc == 0)1403{1404ptr = text1_start;1405repend = text1_end;1406}1407else1408{1409ptr = text2_start;1410repend = text2_end;1411}1412continue;1413}14141415/* Otherwise we have a literal substitution of a group's contents. */14161417LITERAL_SUBSTITUTE:1418subptr = subject + ovector[group*2];1419subptrend = subject + ovector[group*2 + 1];14201421/* Substitute a literal string, possibly forcing alphabetic case. */14221423SUBPTR_SUBSTITUTE:1424if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&1425substitute_case_callout == NULL)1426CHECKCASECPY_DEFAULT(subptr, subptrend - subptr);1427else1428CHECKMEMCPY(subptr, subptrend - subptr);1429}1430} /* End of $ processing */14311432/* Handle an escape sequence in extended mode. We can use check_escape()1433to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but1434the case-forcing escapes are not supported in pcre2_compile() so must be1435recognized here. */14361437else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&1438*ptr == CHAR_BACKSLASH)1439{1440int errorcode;1441case_state new_forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE };14421443if (ptr < repend - 1) switch (ptr[1])1444{1445case CHAR_L:1446new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_LOWER;1447new_forcecase.single_char = FALSE;1448ptr += 2;1449break;14501451case CHAR_l:1452new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_LOWER;1453new_forcecase.single_char = TRUE;1454ptr += 2;1455if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_U)1456{1457/* Perl reverse-title-casing feature for \l\U */1458new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST;1459new_forcecase.single_char = FALSE;1460ptr += 2;1461}1462break;14631464case CHAR_U:1465new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_UPPER;1466new_forcecase.single_char = FALSE;1467ptr += 2;1468break;14691470case CHAR_u:1471new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_TITLE_FIRST;1472new_forcecase.single_char = TRUE;1473ptr += 2;1474if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_L)1475{1476/* Perl title-casing feature for \u\L */1477new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_TITLE_FIRST;1478new_forcecase.single_char = FALSE;1479ptr += 2;1480}1481break;14821483default:1484break;1485}14861487if (new_forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)1488{1489SETFORCECASE:14901491/* If the substitute_case_callout is unset, our case-forcing is done1492immediately. If there is a callout however, then its action is delayed1493until all the characters have been collected.14941495Apply the callout now, before we set the new casing mode. */14961497if (substitute_case_callout != NULL &&1498forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)1499DELAYEDFORCECASE();15001501forcecase = new_forcecase;1502casestart_offset = buff_offset;1503casestart_extra_needed = extra_needed;1504continue;1505}15061507ptr++; /* Point after \ */1508rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,1509code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL);1510if (errorcode != 0) goto BADESCAPE;15111512switch(rc)1513{1514case ESC_E:1515goto SETFORCECASE;15161517case ESC_Q:1518escaped_literal = TRUE;1519continue;15201521case 0: /* Data character */1522case ESC_b: /* \b is backspace in a substitution */1523case ESC_v: /* \v is vertical tab in a substitution */15241525if (rc == ESC_b) ch = CHAR_BS;1526if (rc == ESC_v) ch = CHAR_VT;15271528#ifdef SUPPORT_UNICODE1529if (utf) chlen = PRIV(ord2utf)(ch, temp); else1530#endif1531{1532temp[0] = ch;1533chlen = 1;1534}15351536if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&1537substitute_case_callout == NULL)1538CHECKCASECPY_DEFAULT(temp, chlen);1539else1540CHECKMEMCPY(temp, chlen);1541continue;15421543case ESC_g:1544{1545PCRE2_SIZE name_len;1546PCRE2_SPTR name_start;15471548/* Parse the \g<name> form (\g<number> already handled by check_escape) */1549if (ptr >= repend || *ptr != CHAR_LESS_THAN_SIGN)1550goto BADESCAPE;1551++ptr;15521553name_start = ptr;1554if (!read_name_subst(&ptr, repend, utf, code->tables + ctypes_offset))1555goto BADESCAPE;1556name_len = ptr - name_start;15571558if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN)1559goto BADESCAPE;1560++ptr;15611562special = 0;1563group = -1;1564memcpy(name, name_start, CU2BYTES(name_len));1565name[name_len] = 0;1566goto GROUP_SUBSTITUTE;1567}15681569default:1570if (rc < 0)1571{1572special = 0;1573group = -rc - 1;1574goto GROUP_SUBSTITUTE;1575}1576goto BADESCAPE;1577}1578} /* End of backslash processing */15791580/* Handle a literal code unit */15811582else1583{1584PCRE2_SPTR ch_start;15851586LOADLITERAL:1587ch_start = ptr;1588GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */1589(void) ch;15901591if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&1592substitute_case_callout == NULL)1593CHECKCASECPY_DEFAULT(ch_start, ptr - ch_start);1594else1595CHECKMEMCPY(ch_start, ptr - ch_start);1596} /* End handling a literal code unit */1597} /* End of loop for scanning the replacement. */15981599/* If the substitute_case_callout is unset, our case-forcing is done1600immediately. If there is a callout however, then its action is delayed1601until all the characters have been collected.16021603We now clean up any trailing section of the replacement for which we deferred1604the case-forcing. */16051606if (substitute_case_callout != NULL &&1607forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)1608DELAYEDFORCECASE();16091610/* The replacement has been copied to the output, or its size has been1611remembered. Handle the callout if there is one. */16121613if (mcontext != NULL && mcontext->substitute_callout != NULL)1614{1615/* If we an actual (non-simulated) replacement, do the callout. */16161617if (!overflowed)1618{1619scb.subscount = subs;1620scb.output_offsets[1] = buff_offset;1621rc = mcontext->substitute_callout(&scb,1622mcontext->substitute_callout_data);16231624/* A non-zero return means cancel this substitution. Instead, copy the1625matched string fragment. */16261627if (rc != 0)1628{1629PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];1630PCRE2_SIZE oldlength = ovector[1] - ovector[0];16311632buff_offset -= newlength;1633lengthleft += newlength;1634if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);16351636/* A negative return means do not do any more. */16371638if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);1639}1640}16411642/* In this interesting case, we cannot do the callout, so it's hard to1643estimate the required buffer size. What callers want is to be able to make1644two calls to pcre2_substitute(), once with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH1645to discover the buffer size, and then a second and final call. Older1646versions of PCRE2 violated this assumption, by proceding as if the callout1647had returned zero - but on the second call to pcre2_substitute() it could1648return non-zero and then overflow the buffer again. Callers probably don't1649want to keep on looping to incrementally discover the buffer size. */16501651else1652{1653PCRE2_SIZE newlength_buf = buff_offset - scb.output_offsets[0];1654PCRE2_SIZE newlength_extra = extra_needed - sub_start_extra_needed;1655PCRE2_SIZE newlength =1656(newlength_extra > ~(PCRE2_SIZE)0 - newlength_buf)? /* Integer overflow */1657~(PCRE2_SIZE)0 : newlength_buf + newlength_extra; /* Cap the addition */1658PCRE2_SIZE oldlength = ovector[1] - ovector[0];16591660/* Be pessimistic: request whichever buffer size is larger out of1661accepting or rejecting the substitution. */16621663if (oldlength > newlength)1664{1665PCRE2_SIZE additional = oldlength - newlength;1666if (additional > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */1667goto TOOLARGEREPLACE;1668extra_needed += additional;1669}16701671/* Proceed as if the callout did not return a negative. A negative1672effectively rejects all future substitutions, but we want to examine them1673pessimistically. */1674}1675}16761677/* Exit the global loop if we are not in global mode, or if pcre2_next_match()1678indicates we have reached the end of the subject. */16791680if ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) == 0 ||1681!pcre2_next_match(match_data, &start_offset, &goptions))1682{1683start_offset = ovector[1];1684break;1685}16861687/* Verify that pcre2_next_match() has not done a bumpalong (because we have1688already returned PCRE2_ERROR_BADSUBSPATTERN for \K in lookarounds).16891690We would otherwise have to memcpy the fragment spanning from ovector[1] to the1691new start_offset.*/16921693PCRE2_ASSERT(start_offset == ovector[1]);16941695} /* End of global loop */16961697/* Copy the rest of the subject unless not required, and terminate the output1698with a binary zero. */16991700if (!replacement_only)1701{1702fraglength = length - start_offset;1703CHECKMEMCPY(subject + start_offset, fraglength);1704}17051706temp[0] = 0;1707CHECKMEMCPY(temp, 1);17081709/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,1710and matching has carried on after a full buffer, in order to compute the length1711needed. Otherwise, an overflow generates an immediate error return. */17121713if (overflowed)1714{1715rc = PCRE2_ERROR_NOMEMORY;17161717if (extra_needed > ~(PCRE2_SIZE)0 - buff_length) /* Integer overflow */1718goto TOOLARGEREPLACE;1719*blength = buff_length + extra_needed;1720}17211722/* After a successful execution, return the number of substitutions and set the1723length of buffer used, excluding the trailing zero. */17241725else1726{1727rc = subs;1728*blength = buff_offset - 1;1729}17301731EXIT:1732if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);1733else match_data->rc = rc;1734return rc;17351736NOROOM:1737rc = PCRE2_ERROR_NOMEMORY;1738goto EXIT;17391740CASEERROR:1741rc = PCRE2_ERROR_REPLACECASE;1742goto EXIT;17431744TOOLARGEREPLACE:1745rc = PCRE2_ERROR_TOOLARGEREPLACE;1746goto EXIT;17471748BAD:1749rc = PCRE2_ERROR_BADREPLACEMENT;1750goto PTREXIT;17511752BADESCAPE:1753rc = PCRE2_ERROR_BADREPESCAPE;17541755PTREXIT:1756*blength = (PCRE2_SIZE)(ptr - replacement);1757goto EXIT;1758}17591760/* End of pcre2_substitute.c */176117621763