/* Origin: GitHub repository godotengine/godot, branch master,
   path thirdparty/pcre2/src/pcre2_substitute.c */
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4

5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7

8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11

12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15

16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18

19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22

23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26

27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40

41

42
#ifdef HAVE_CONFIG_H
43
#include "config.h"
44
#endif
45

46
#include "pcre2_internal.h"
47

48
/* Number of entries in the ptrstack[] array that pcre2_substitute() declares
for each substitution. */

#define PTR_STACK_SIZE 20

/* Every option bit that belongs to pcre2_substitute() itself. The function
saves these bits separately and strips them from the option word before that
word is handed to pcre2_match(). */

#define SUBSTITUTE_OPTIONS \
  (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \
   PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \
   PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \
   PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)
55

56

57

58
/*************************************************
59
*           Find end of substitute text          *
60
*************************************************/
61

62
/* In extended mode, we recognize ${name:+set text:unset text} and similar
63
constructions. This requires the identification of unescaped : and }
64
characters. This function scans for such. It must deal with nested ${
65
constructions. The pointer to the text is updated, either to the required end
66
character, or to where an error was detected.
67

68
Arguments:
69
  code      points to the compiled expression (for options)
70
  ptrptr    points to the pointer to the start of the text (updated)
71
  ptrend    end of the whole string
72
  last      TRUE if the last expected string (only } recognized)
73

74
Returns:    0 on success
75
            negative error code on failure
76
*/
77

78
static int
79
find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
80
  BOOL last)
81
{
82
int rc = 0;
83
uint32_t nestlevel = 0;
84
BOOL literal = FALSE;
85
PCRE2_SPTR ptr = *ptrptr;
86

87
for (; ptr < ptrend; ptr++)
88
  {
89
  if (literal)
90
    {
91
    if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
92
      {
93
      literal = FALSE;
94
      ptr += 1;
95
      }
96
    }
97

98
  else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
99
    {
100
    if (nestlevel == 0) goto EXIT;
101
    nestlevel--;
102
    }
103

104
  else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
105

106
  else if (*ptr == CHAR_DOLLAR_SIGN)
107
    {
108
    if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
109
      {
110
      nestlevel++;
111
      ptr += 1;
112
      }
113
    }
114

115
  else if (*ptr == CHAR_BACKSLASH)
116
    {
117
    int erc;
118
    int errorcode;
119
    uint32_t ch;
120

121
    if (ptr < ptrend - 1) switch (ptr[1])
122
      {
123
      case CHAR_L:
124
      case CHAR_l:
125
      case CHAR_U:
126
      case CHAR_u:
127
      ptr += 1;
128
      continue;
129
      }
130

131
    ptr += 1;  /* Must point after \ */
132
    erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
133
      code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL);
134
    ptr -= 1;  /* Back to last code unit of escape */
135
    if (errorcode != 0)
136
      {
137
      /* errorcode from check_escape is positive, so must not be returned by
138
      pcre2_substitute(). */
139
      rc = PCRE2_ERROR_BADREPESCAPE;
140
      goto EXIT;
141
      }
142

143
    switch(erc)
144
      {
145
      case 0:      /* Data character */
146
      case ESC_b:  /* Data character */
147
      case ESC_v:  /* Data character */
148
      case ESC_E:  /* Isolated \E is ignored */
149
      break;
150

151
      case ESC_Q:
152
      literal = TRUE;
153
      break;
154

155
      case ESC_g:
156
      /* The \g<name> form (\g<number> already handled by check_escape)
157

158
      Don't worry about finding the matching ">". We are super, super lenient
159
      about validating ${} replacements inside find_text_end(), so we certainly
160
      don't need to worry about other syntax. Importantly, a \g<..> or $<...>
161
      sequence can't contain a '}' character. */
162
      break;
163

164
      default:
165
      if (erc < 0)
166
          break;  /* capture group reference */
167
      rc = PCRE2_ERROR_BADREPESCAPE;
168
      goto EXIT;
169
      }
170
    }
171
  }
172

173
rc = PCRE2_ERROR_REPMISSINGBRACE;   /* Terminator not found */
174

175
EXIT:
176
*ptrptr = ptr;
177
return rc;
178
}
179

180

181
/*************************************************
182
*           Validate group name                  *
183
*************************************************/
184

185
/* This function scans for a capture group name, validating it
186
consists of legal characters, is not empty, and does not exceed
187
MAX_NAME_SIZE.
188

189
Arguments:
190
  ptrptr    points to the pointer to the start of the text (updated)
191
  ptrend    end of the whole string
192
  utf       true if the input is UTF-encoded
193
  ctypes    pointer to the character types table
194

195
Returns:    TRUE if a name was read
196
            FALSE otherwise
197
*/
198

199
static BOOL
200
read_name_subst(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf,
201
    const uint8_t* ctypes)
202
{
203
PCRE2_SPTR ptr = *ptrptr;
204
PCRE2_SPTR nameptr = ptr;
205

206
if (ptr >= ptrend)                 /* No characters in name */
207
  goto FAILED;
208

209
/* We do not need to check whether the name starts with a non-digit.
210
We are simply referencing names here, not defining them. */
211

212
/* See read_name in the pcre2_compile.c for the corresponding logic
213
restricting group names inside the pattern itself. */
214

215
#ifdef SUPPORT_UNICODE
216
if (utf)
217
  {
218
  uint32_t c, type;
219

220
  while (ptr < ptrend)
221
    {
222
    GETCHAR(c, ptr);
223
    type = UCD_CHARTYPE(c);
224
    if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
225
        c != CHAR_UNDERSCORE) break;
226
    ptr++;
227
    FORWARDCHARTEST(ptr, ptrend);
228
    }
229
  }
230
else
231
#else
232
(void)utf;  /* Avoid compiler warning */
233
#endif      /* SUPPORT_UNICODE */
234

235
/* Handle group names in non-UTF modes. */
236

237
  {
238
  while (ptr < ptrend && MAX_255(*ptr) && (ctypes[*ptr] & ctype_word) != 0)
239
    {
240
    ptr++;
241
    }
242
  }
243

244
/* Check name length */
245

246
if (ptr - nameptr > MAX_NAME_SIZE)
247
  goto FAILED;
248

249
/* Subpattern names must not be empty */
250
if (ptr == nameptr)
251
  goto FAILED;
252

253
*ptrptr = ptr;
254
return TRUE;
255

256
FAILED:
257
*ptrptr = ptr;
258
return FALSE;
259
}
260

261

262
/*************************************************
263
*              Case transformations              *
264
*************************************************/
265

266
#define PCRE2_SUBSTITUTE_CASE_NONE                 0
267
// 1, 2, 3 are PCRE2_SUBSTITUTE_CASE_LOWER, UPPER, TITLE_FIRST.
268
#define PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST  4
269

270
typedef struct {
271
  int to_case; /* One of PCRE2_SUBSTITUTE_CASE_xyz */
272
  BOOL single_char;
273
} case_state;
274

275
/* Helper to guess how much a string is likely to increase in size when
276
case-transformed. Usually, strings don't change size at all, but some rare
277
characters do grow. Estimate +10%, plus another few characters.
278

279
Performing this estimation is unfortunate, but inevitable, since we can't call
280
the callout if we ran out of buffer space to prepare its input.
281

282
Because this estimate is inexact (and in pathological cases, underestimates the
283
required buffer size) we must document that when you have a
284
substitute_case_callout, and you are using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, you
285
may need more than two calls to determine the final buffer size. */
286

287
static PCRE2_SIZE
288
pessimistic_case_inflation(PCRE2_SIZE len)
289
{
290
return (len >> 3u) + 10;
291
}
292

293
/* Case transformation behaviour if no callout is passed. */
294

295
static PCRE2_SIZE
296
default_substitute_case_callout(
297
  PCRE2_SPTR input, PCRE2_SIZE input_len,
298
  PCRE2_UCHAR *output, PCRE2_SIZE output_cap,
299
  case_state *state, const pcre2_code *code)
300
{
301
PCRE2_SPTR input_end = input + input_len;
302
#ifdef SUPPORT_UNICODE
303
BOOL utf;
304
BOOL ucp;
305
#endif
306
PCRE2_UCHAR temp[6];
307
BOOL next_to_upper;
308
BOOL rest_to_upper;
309
BOOL single_char;
310
BOOL overflow = FALSE;
311
PCRE2_SIZE written = 0;
312

313
/* Helpful simplifying invariant: input and output are disjoint buffers.
314
I believe that this code is technically undefined behaviour, because the two
315
pointers input/output are "unrelated" pointers and hence not comparable. Casting
316
via char* bypasses some but not all of those technical rules. It is not included
317
in release builds, in any case. */
318
PCRE2_ASSERT((char *)(input + input_len) <= (char *)output ||
319
             (char *)(output + output_cap) <= (char *)input);
320

321
#ifdef SUPPORT_UNICODE
322
utf = (code->overall_options & PCRE2_UTF) != 0;
323
ucp = (code->overall_options & PCRE2_UCP) != 0;
324
#endif
325

326
if (input_len == 0) return 0;
327

328
switch (state->to_case)
329
  {
330
  default:
331
  PCRE2_DEBUG_UNREACHABLE();
332
  return 0;
333

334
  case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE
335
  case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE
336
  next_to_upper = rest_to_upper = (state->to_case == PCRE2_SUBSTITUTE_CASE_UPPER);
337
  break;
338

339
  case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE
340
  next_to_upper = TRUE;
341
  rest_to_upper = FALSE;
342
  state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
343
  break;
344

345
  case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE
346
  next_to_upper = FALSE;
347
  rest_to_upper = TRUE;
348
  state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
349
  break;
350
  }
351

352
single_char = state->single_char;
353
if (single_char)
354
  state->to_case = PCRE2_SUBSTITUTE_CASE_NONE;
355

356
while (input < input_end)
357
  {
358
  uint32_t ch;
359
  unsigned int chlen;
360

361
  GETCHARINCTEST(ch, input);
362

363
#ifdef SUPPORT_UNICODE
364
  if ((utf || ucp) && ch >= 128)
365
    {
366
    uint32_t type = UCD_CHARTYPE(ch);
367
    if (PRIV(ucp_gentype)[type] == ucp_L &&
368
        type != (next_to_upper? ucp_Lu : ucp_Ll))
369
      ch = UCD_OTHERCASE(ch);
370

371
    /* TODO This is far from correct... it doesn't support the SpecialCasing.txt
372
    mappings, but worse, it's not even correct for all the ordinary case
373
    mappings. We should add support for those (at least), and then add the
374
    SpecialCasing.txt mappings for Esszet and ligatures, and finally use the
375
    Turkish casing flag on the match context. */
376
    }
377
  else
378
#endif
379
  if (MAX_255(ch))
380
    {
381
    if (((code->tables + cbits_offset +
382
        (next_to_upper? cbit_upper:cbit_lower)
383
        )[ch/8] & (1u << (ch%8))) == 0)
384
      ch = (code->tables + fcc_offset)[ch];
385
    }
386

387
#ifdef SUPPORT_UNICODE
388
  if (utf) chlen = PRIV(ord2utf)(ch, temp); else
389
#endif
390
    {
391
    temp[0] = ch;
392
    chlen = 1;
393
    }
394

395
  if (!overflow && chlen <= output_cap)
396
    {
397
    memcpy(output, temp, CU2BYTES(chlen));
398
    output += chlen;
399
    output_cap -= chlen;
400
    }
401
  else
402
    {
403
    overflow = TRUE;
404
    }
405

406
  if (chlen > ~(PCRE2_SIZE)0 - written)  /* Integer overflow */
407
    return ~(PCRE2_SIZE)0;
408
  written += chlen;
409

410
  next_to_upper = rest_to_upper;
411

412
  /* memcpy the remainder, if only transforming a single character. */
413

414
  if (single_char)
415
    {
416
    PCRE2_SIZE rest_len = input_end - input;
417

418
    if (!overflow && rest_len <= output_cap)
419
      memcpy(output, input, CU2BYTES(rest_len));
420

421
    if (rest_len > ~(PCRE2_SIZE)0 - written)  /* Integer overflow */
422
      return ~(PCRE2_SIZE)0;
423
    written += rest_len;
424

425
    return written;
426
    }
427
  }
428

429
return written;
430
}
431

432
/* Helper to perform the call to the substitute_case_callout. We wrap the
433
user-provided callout because our internal arguments are slightly extended. We
434
don't want the user callout to handle the case of "\l" (first character only to
435
lowercase) or "\l\U" (first character to lowercase, rest to uppercase) because
436
those are not operations defined by Unicode. Instead the user callout simply
437
needs to provide the three Unicode primitives: lower, upper, titlecase. */
438

439
static PCRE2_SIZE
440
do_case_copy(
441
  PCRE2_UCHAR *input_output, PCRE2_SIZE input_len, PCRE2_SIZE output_cap,
442
  case_state *state, BOOL utf,
443
  PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,
444
                                        PCRE2_SIZE, int, void *),
445
  void *substitute_case_callout_data)
446
{
447
PCRE2_SPTR input = input_output;
448
PCRE2_UCHAR *output = input_output;
449
PCRE2_SIZE rc;
450
PCRE2_SIZE rc2;
451
int ch1_to_case;
452
int rest_to_case;
453
PCRE2_UCHAR ch1[6];
454
PCRE2_SIZE ch1_len;
455
PCRE2_SPTR rest;
456
PCRE2_SIZE rest_len;
457
BOOL ch1_overflow = FALSE;
458
BOOL rest_overflow = FALSE;
459

460
#if PCRE2_CODE_UNIT_WIDTH == 32 || !defined(SUPPORT_UNICODE)
461
(void)utf; /* Avoid compiler warning. */
462
#endif
463

464
PCRE2_ASSERT(input_len != 0);
465

466
switch (state->to_case)
467
  {
468
  default:
469
  PCRE2_DEBUG_UNREACHABLE();
470
  return 0;
471

472
  case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE
473
  case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE
474
  case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE
475

476
  /* The easy case, where our internal casing operations align with those of
477
  the callout. */
478

479
  if (state->single_char == FALSE)
480
    {
481
    rc = substitute_case_callout(input, input_len, output, output_cap,
482
                                 state->to_case, substitute_case_callout_data);
483

484
    if (state->to_case == PCRE2_SUBSTITUTE_CASE_TITLE_FIRST)
485
      state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
486

487
    return rc;
488
    }
489

490
  ch1_to_case = state->to_case;
491
  rest_to_case = PCRE2_SUBSTITUTE_CASE_NONE;
492
  break;
493

494
  case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE
495
  ch1_to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
496
  rest_to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
497
  break;
498
  }
499

500
/* Identify the leading character. Take copy, because its storage overlaps with
501
`output`, and hence may be scrambled by the callout. */
502

503
  {
504
  PCRE2_SPTR ch_end = input;
505
  uint32_t ch;
506

507
  GETCHARINCTEST(ch, ch_end);
508
  (void) ch;
509
  PCRE2_ASSERT(ch_end <= input + input_len && ch_end - input <= 6);
510
  ch1_len = ch_end - input;
511
  memcpy(ch1, input, CU2BYTES(ch1_len));
512
  }
513

514
rest = input + ch1_len;
515
rest_len = input_len - ch1_len;
516

517
/* Transform just ch1. The buffers are always in-place (input == output). With a
518
custom callout, we need a loop to discover its required buffer size. The loop
519
wouldn't be required if the callout were well-behaved, but it might be naughty
520
and return "5" the first time, then "10" the next time we call it using the
521
exact same input! */
522

523
  {
524
  PCRE2_SIZE ch1_cap;
525
  PCRE2_SIZE max_ch1_cap;
526

527
  ch1_cap = ch1_len;  /* First attempt uses the space vacated by ch1. */
528
  PCRE2_ASSERT(output_cap >= input_len && input_len >= rest_len);
529
  max_ch1_cap = output_cap - rest_len;
530

531
  while (TRUE)
532
    {
533
    rc = substitute_case_callout(ch1, ch1_len, output, ch1_cap, ch1_to_case,
534
                                 substitute_case_callout_data);
535
    if (rc == ~(PCRE2_SIZE)0) return rc;
536

537
    if (rc <= ch1_cap) break;
538

539
    if (rc > max_ch1_cap)
540
      {
541
      ch1_overflow = TRUE;
542
      break;
543
      }
544

545
    /* Move the rest to the right, to make room for expanding ch1. */
546

547
    memmove(input_output + rc, rest, CU2BYTES(rest_len));
548
    rest = input + rc;
549

550
    ch1_cap = rc;
551

552
    /* Proof of loop termination: `ch1_cap` is growing on each iteration, but
553
    the loop ends if `rc` reaches the (unchanging) upper bound of output_cap. */
554
    }
555
  }
556

557
if (rest_to_case == PCRE2_SUBSTITUTE_CASE_NONE)
558
  {
559
  if (!ch1_overflow)
560
    {
561
    PCRE2_ASSERT(rest_len <= output_cap - rc);
562
    memmove(output + rc, rest, CU2BYTES(rest_len));
563
    }
564
  rc2 = rest_len;
565

566
  state->to_case = PCRE2_SUBSTITUTE_CASE_NONE;
567
  }
568
else
569
  {
570
  PCRE2_UCHAR dummy[1];
571

572
  rc2 = substitute_case_callout(rest, rest_len,
573
                                ch1_overflow? dummy : output + rc,
574
                                ch1_overflow? 0u : output_cap - rc,
575
                                rest_to_case, substitute_case_callout_data);
576
  if (rc2 == ~(PCRE2_SIZE)0) return rc2;
577

578
  if (!ch1_overflow && rc2 > output_cap - rc) rest_overflow = TRUE;
579

580
  /* If ch1 grows so that `xform(ch1)+rest` can't fit in the buffer, but then
581
  `rest` shrinks, it's actually possible for the total calculated length of
582
  `xform(ch1)+xform(rest)` to come out at less than output_cap. But we can't
583
  report that, because it would make it seem that the operation succeeded.
584
  If either of xform(ch1) or xform(rest) won't fit in the buffer, our final
585
  result must be > output_cap. */
586
  if (ch1_overflow && rc2 < rest_len)
587
    rc2 = rest_len;
588

589
  state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
590
  }
591

592
if (rc2 > ~(PCRE2_SIZE)0 - rc)  /* Integer overflow */
593
  return ~(PCRE2_SIZE)0;
594

595
PCRE2_ASSERT(!(ch1_overflow || rest_overflow) || rc + rc2 > output_cap);
596
(void)rest_overflow;
597

598
return rc + rc2;
599
}
600

601

602
/*************************************************
603
*              Match and substitute              *
604
*************************************************/
605

606
/* This function applies a compiled re to a subject string and creates a new
607
string with substitutions. The first 7 arguments are the same as for
608
pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
609

610
Arguments:
611
  code            points to the compiled expression
612
  subject         points to the subject string
613
  length          length of subject string (may contain binary zeros)
614
  start_offset    where to start in the subject string
615
  options         option bits
616
  match_data      points to a match_data block, or is NULL
617
  mcontext        points to a PCRE2 match context, or is NULL
618
  replacement     points to the replacement string
619
  rlength         length of replacement string
620
  buffer          where to put the substituted string
621
  blength         points to length of buffer; updated to length of string
622

623
Returns:          >= 0 number of substitutions made
624
                  < 0 an error code
625
                  PCRE2_ERROR_BADREPLACEMENT means invalid use of $
626
*/
627

628
/* Append length_ code units from "from" to the output buffer if there is
room. If there is not, either fail at once (goto NOROOM) or, when
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, switch to overflow mode, where nothing
more is copied and extra_needed accumulates the missing space. The macro uses
these variables of the enclosing function: buffer, buff_offset, lengthleft,
overflowed, extra_needed and suboptions. */

#define CHECKMEMCPY(from, length_) \
  do { \
     PCRE2_SIZE chkmc_need_ = length_; \
     if (!overflowed && chkmc_need_ <= lengthleft) \
       { \
       memcpy(buffer + buff_offset, from, CU2BYTES(chkmc_need_)); \
       buff_offset += chkmc_need_; \
       lengthleft -= chkmc_need_; \
       } \
     else if (!overflowed) \
       { \
       if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
       overflowed = TRUE; \
       extra_needed = chkmc_need_ - lengthleft; \
       } \
     else \
       { \
       if (chkmc_need_ > ~(PCRE2_SIZE)0 - extra_needed)  /* Integer overflow */ \
         goto TOOLARGEREPLACE; \
       extra_needed += chkmc_need_; \
       } \
     } \
  while (0)
655

656
/* These macros append text to the output buffer with case forcing applied.
On overflow they behave as CHECKMEMCPY() does.

CHECKCASECPY_BASE holds the shared bookkeeping. Its do_call argument is a
block of code that must set chkcc_rc to the number of code units the
transformed text needs; it may read chkcc_length, the length of the
untransformed text. It may also use "break" to leave the macro early and skip
the bookkeeping.

When substitute_case_callout is NULL, the source and destination buffers must
not overlap, because the default handler does not support that. */

#define CHECKCASECPY_BASE(length_, do_call) \
  do { \
     PCRE2_SIZE chkcc_length = (PCRE2_SIZE)(length_); \
     PCRE2_SIZE chkcc_rc; \
     do_call \
     if (chkcc_rc <= lengthleft) \
       { \
       buff_offset += chkcc_rc; \
       lengthleft -= chkcc_rc; \
       } \
     else \
       { \
       if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
       overflowed = TRUE; \
       extra_needed = chkcc_rc - lengthleft; \
       } \
     } \
  while (0)

/* Transform with the built-in handler, reading from a separate buffer. Once
overflow mode is active the handler is given zero capacity, so it only counts,
and the count is added to extra_needed before leaving early. */

#define CHECKCASECPY_DEFAULT(from, length_) \
  CHECKCASECPY_BASE(length_, { \
    chkcc_rc = default_substitute_case_callout(from, chkcc_length, \
                                               buffer + buff_offset, \
                                               overflowed? 0 : lengthleft, \
                                               &forcecase, code); \
    if (overflowed) \
      { \
      if (chkcc_rc > ~(PCRE2_SIZE)0 - extra_needed)  /* Integer overflow */ \
        goto TOOLARGEREPLACE; \
      extra_needed += chkcc_rc; \
      break; \
      } \
  })

/* Transform with the user's callout. The text is already in the output
buffer at buff_offset and is converted in place. */

#define CHECKCASECPY_CALLOUT(length_) \
  CHECKCASECPY_BASE(length_, { \
    chkcc_rc = do_case_copy(buffer + buff_offset, chkcc_length, \
                            lengthleft, &forcecase, utf, \
                            substitute_case_callout, \
                            substitute_case_callout_data); \
    if (chkcc_rc == ~(PCRE2_SIZE)0) goto CASEERROR; \
  })
704

705
/* Apply a delayed case transformation, for use when a case-forcing callout
is set. The text concerned is whatever has been added since the point recorded
in casestart_offset and casestart_extra_needed. If the buffer has not
overflowed, the output position is moved back to that point and the text is
transformed in place. If it has overflowed, the text is not available, so an
estimate of its growth is added to extra_needed instead. */

#define DELAYEDFORCECASE() \
  do { \
     PCRE2_SIZE dfc_pending_ = (buff_offset - casestart_offset) + \
            (extra_needed - casestart_extra_needed); \
     if (dfc_pending_ != 0) \
       { \
       if (!overflowed) \
         { \
         /* Rewind the buffer */ \
         lengthleft += (buff_offset - casestart_offset); \
         buff_offset = casestart_offset; \
         /* Care! In-place case transformation */ \
         CHECKCASECPY_CALLOUT(dfc_pending_); \
         } \
       else \
         { \
         PCRE2_SIZE dfc_estimate_ = pessimistic_case_inflation(dfc_pending_); \
         if (dfc_estimate_ > ~(PCRE2_SIZE)0 - extra_needed)  /* Integer overflow */ \
           goto TOOLARGEREPLACE; \
         extra_needed += dfc_estimate_; \
         } \
       } \
     } \
  while (0)
732

733

734
/* Here's the function */
735

736
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
737
pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
738
  PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
739
  pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
740
  PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
741
{
742
int rc;
743
int subs;
744
uint32_t ovector_count;
745
uint32_t goptions = 0;
746
uint32_t suboptions;
747
pcre2_match_data *internal_match_data = NULL;
748
BOOL escaped_literal = FALSE;
749
BOOL overflowed = FALSE;
750
BOOL use_existing_match;
751
BOOL replacement_only;
752
BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
753
PCRE2_UCHAR temp[6];
754
PCRE2_SPTR ptr;
755
PCRE2_SPTR repend = NULL;
756
PCRE2_SIZE extra_needed = 0;
757
PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
758
PCRE2_SIZE *ovector;
759
PCRE2_SIZE ovecsave[3];
760
pcre2_substitute_callout_block scb;
761
PCRE2_SIZE sub_start_extra_needed;
762
PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,
763
                                      PCRE2_SIZE, int, void *) = NULL;
764
void *substitute_case_callout_data = NULL;
765

766
/* General initialization */
767

768
buff_offset = 0;
769
lengthleft = buff_length = *blength;
770
*blength = PCRE2_UNSET;
771
ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
772

773
if (mcontext != NULL)
774
  {
775
  substitute_case_callout = mcontext->substitute_case_callout;
776
  substitute_case_callout_data = mcontext->substitute_case_callout_data;
777
  }
778

779
/* Partial matching is not valid. This must come after setting *blength to
780
PCRE2_UNSET, so as not to imply an offset in the replacement. */
781

782
if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
783
  return PCRE2_ERROR_BADOPTION;
784

785
/* Validate length and find the end of the replacement. A NULL replacement of
786
zero length is interpreted as an empty string. */
787

788
if (replacement == NULL)
789
  {
790
  if (rlength != 0) return PCRE2_ERROR_NULL;
791
  replacement = (PCRE2_SPTR)"";
792
  }
793

794
if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
795
repend = replacement + rlength;
796

797
/* Check for using a match that has already happened. Note that the subject
798
pointer in the match data may be NULL after a no-match. */
799

800
use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
801
replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
802

803
/* If starting from an existing match, there must be an externally provided
804
match data block. We create an internal match_data block in two cases: (a) an
805
external one is not supplied (and we are not starting from an existing match);
806
(b) an existing match is to be used for the first substitution. In the latter
807
case, we copy the existing match into the internal block, except for any cached
808
heap frame size and pointer. This ensures that no changes are made to the
809
external match data block. */
810

811
/* WARNING: In both cases below a general context is constructed "by hand"
812
because calling pcre2_general_context_create() involves a memory allocation. If
813
the contents of a general context control block are ever changed there will
814
have to be changes below. */
815

816
if (match_data == NULL)
817
  {
818
  pcre2_general_context gcontext;
819
  if (use_existing_match) return PCRE2_ERROR_NULL;
820
  gcontext.memctl = (mcontext == NULL)?
821
    ((const pcre2_real_code *)code)->memctl :
822
    ((pcre2_real_match_context *)mcontext)->memctl;
823
  match_data = internal_match_data =
824
    pcre2_match_data_create_from_pattern(code, &gcontext);
825
  if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
826
  }
827

828
else if (use_existing_match)
829
  {
830
  int pairs;
831
  pcre2_general_context gcontext;
832
  gcontext.memctl = (mcontext == NULL)?
833
    ((const pcre2_real_code *)code)->memctl :
834
    ((pcre2_real_match_context *)mcontext)->memctl;
835
  pairs = (code->top_bracket + 1 < match_data->oveccount)?
836
    code->top_bracket + 1 : match_data->oveccount;
837
  internal_match_data = pcre2_match_data_create(match_data->oveccount,
838
    &gcontext);
839
  if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
840
  memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
841
    + 2*pairs*sizeof(PCRE2_SIZE));
842
  internal_match_data->heapframes = NULL;
843
  internal_match_data->heapframes_size = 0;
844
  match_data = internal_match_data;
845
  }
846

847
/* Remember ovector details */
848

849
ovector = pcre2_get_ovector_pointer(match_data);
850
ovector_count = pcre2_get_ovector_count(match_data);
851

852
/* Fixed things in the callout block */
853

854
scb.version = 0;
855
scb.input = subject;
856
scb.output = (PCRE2_SPTR)buffer;
857
scb.ovector = ovector;
858

859
/* A NULL subject of zero length is treated as an empty string. */
860

861
if (subject == NULL)
862
  {
863
  if (length != 0) return PCRE2_ERROR_NULL;
864
  subject = (PCRE2_SPTR)"";
865
  }
866

867
/* Find length of zero-terminated subject */
868

869
if (length == PCRE2_ZERO_TERMINATED)
870
  length = subject? PRIV(strlen)(subject) : 0;
871

872
/* Check UTF replacement string if necessary. */
873

874
#ifdef SUPPORT_UNICODE
875
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
876
  {
877
  rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
878
  if (rc != 0)
879
    {
880
    match_data->leftchar = 0;
881
    goto EXIT;
882
    }
883
  }
884
#endif  /* SUPPORT_UNICODE */
885

886
/* Save the substitute options and remove them from the match options. */
887

888
suboptions = options & SUBSTITUTE_OPTIONS;
889
options &= ~SUBSTITUTE_OPTIONS;
890

891
/* Error if the start match offset is greater than the length of the subject. */
892

893
if (start_offset > length)
894
  {
895
  match_data->leftchar = 0;
896
  rc = PCRE2_ERROR_BADOFFSET;
897
  goto EXIT;
898
  }
899

900
/* Copy up to the start offset, unless only the replacement is required. */
901

902
if (!replacement_only) CHECKMEMCPY(subject, start_offset);
903

904
/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
905
match is taken from the match_data that was passed in. */
906

907
subs = 0;
908
do
909
  {
910
  PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
911
  uint32_t ptrstackptr = 0;
912
  case_state forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE };
913
  PCRE2_SIZE casestart_offset = 0;
914
  PCRE2_SIZE casestart_extra_needed = 0;
915

916
  if (use_existing_match)
917
    {
918
    rc = match_data->rc;
919
    use_existing_match = FALSE;
920
    }
921
  else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
922
    match_data, mcontext);
923

924
#ifdef SUPPORT_UNICODE
925
  if (utf) options |= PCRE2_NO_UTF_CHECK;  /* Only need to check once */
926
#endif
927

928
  /* Any error other than no match returns the error code. No match when not
929
  doing the special after-empty-match global rematch, or when at the end of the
930
  subject, breaks the global loop. Otherwise, advance the starting point by one
931
  character, copying it to the output, and try again. */
932

933
  if (rc < 0)
934
    {
935
    PCRE2_SIZE save_start;
936

937
    if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
938
    if (goptions == 0 || start_offset >= length) break;
939

940
    /* Advance by one code point. Then, if CRLF is a valid newline sequence and
941
    we have advanced into the middle of it, advance one more code point. In
942
    other words, do not start in the middle of CRLF, even if CR and LF on their
943
    own are valid newlines. */
944

945
    save_start = start_offset++;
946
    if (subject[start_offset-1] == CHAR_CR &&
947
        (code->newline_convention == PCRE2_NEWLINE_CRLF ||
948
         code->newline_convention == PCRE2_NEWLINE_ANY ||
949
         code->newline_convention == PCRE2_NEWLINE_ANYCRLF) &&
950
        start_offset < length &&
951
        subject[start_offset] == CHAR_LF)
952
      start_offset++;
953

954
    /* Otherwise, in UTF mode, advance past any secondary code points. */
955

956
    else if ((code->overall_options & PCRE2_UTF) != 0)
957
      {
958
#if PCRE2_CODE_UNIT_WIDTH == 8
959
      while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
960
        start_offset++;
961
#elif PCRE2_CODE_UNIT_WIDTH == 16
962
      while (start_offset < length &&
963
            (subject[start_offset] & 0xfc00) == 0xdc00)
964
        start_offset++;
965
#endif
966
      }
967

968
    /* Copy what we have advanced past (unless not required), reset the special
969
    global options, and continue to the next match. */
970

971
    fraglength = start_offset - save_start;
972
    if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
973
    goptions = 0;
974
    continue;
975
    }
976

977
  /* Handle a successful match. Matches that use \K to end before they start
978
  or start before the current point in the subject are not supported. */
979

980
  if (ovector[1] < ovector[0] || ovector[0] < start_offset)
981
    {
982
    rc = PCRE2_ERROR_BADSUBSPATTERN;
983
    goto EXIT;
984
    }
985

986
  /* Check for the same match as previous. This is legitimate after matching an
987
  empty string that starts after the initial match offset. We have tried again
988
  at the match point in case the pattern is one like /(?<=\G.)/ which can never
989
  match at its starting point, so running the match achieves the bumpalong. If
990
  we do get the same (null) match at the original match point, it isn't such a
991
  pattern, so we now do the empty string magic. In all other cases, a repeat
992
  match should never occur. */
993

994
  if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
995
    {
996
    if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
997
      {
998
      goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
999
      ovecsave[2] = start_offset;
1000
      continue;    /* Back to the top of the loop */
1001
      }
1002
    rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
1003
    goto EXIT;
1004
    }
1005

1006
  /* Count substitutions with a paranoid check for integer overflow; surely no
1007
  real call to this function would ever hit this! */
1008

1009
  if (subs == INT_MAX)
1010
    {
1011
    rc = PCRE2_ERROR_TOOMANYREPLACE;
1012
    goto EXIT;
1013
    }
1014
  subs++;
1015

1016
  /* Copy the text leading up to the match (unless not required); remember
1017
  where the insert begins and how many ovector pairs are set; and remember how
1018
  much space we have requested in extra_needed. */
1019

1020
  if (rc == 0) rc = ovector_count;
1021
  fraglength = ovector[0] - start_offset;
1022
  if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
1023
  scb.output_offsets[0] = buff_offset;
1024
  scb.oveccount = rc;
1025
  sub_start_extra_needed = extra_needed;
1026

1027
  /* Process the replacement string. If the entire replacement is literal, just
1028
  copy it with length check. */
1029

1030
  ptr = replacement;
1031
  if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
1032
    {
1033
    CHECKMEMCPY(ptr, rlength);
1034
    }
1035

1036
  /* Within a non-literal replacement, which must be scanned character by
1037
  character, local literal mode can be set by \Q, but only in extended mode
1038
  when backslashes are being interpreted. In extended mode we must handle
1039
  nested substrings that are to be reprocessed. */
1040

1041
  else for (;;)
1042
    {
1043
    uint32_t ch;
1044
    unsigned int chlen;
1045
    int group;
1046
    uint32_t special;
1047
    PCRE2_SPTR text1_start = NULL;
1048
    PCRE2_SPTR text1_end = NULL;
1049
    PCRE2_SPTR text2_start = NULL;
1050
    PCRE2_SPTR text2_end = NULL;
1051
    PCRE2_UCHAR name[MAX_NAME_SIZE + 1];
1052

1053
    /* If at the end of a nested substring, pop the stack. */
1054

1055
    if (ptr >= repend)
1056
      {
1057
      if (ptrstackptr == 0) break;       /* End of replacement string */
1058
      repend = ptrstack[--ptrstackptr];
1059
      ptr = ptrstack[--ptrstackptr];
1060
      continue;
1061
      }
1062

1063
    /* Handle the next character */
1064

1065
    if (escaped_literal)
1066
      {
1067
      if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
1068
        {
1069
        escaped_literal = FALSE;
1070
        ptr += 2;
1071
        continue;
1072
        }
1073
      goto LOADLITERAL;
1074
      }
1075

1076
    /* Not in literal mode. */
1077

1078
    if (*ptr == CHAR_DOLLAR_SIGN)
1079
      {
1080
      BOOL inparens;
1081
      BOOL inangle;
1082
      BOOL star;
1083
      PCRE2_SIZE sublength;
1084
      PCRE2_UCHAR next;
1085
      PCRE2_SPTR subptr, subptrend;
1086

1087
      if (++ptr >= repend) goto BAD;
1088
      if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
1089

1090
      special = 0;
1091
      text1_start = NULL;
1092
      text1_end = NULL;
1093
      text2_start = NULL;
1094
      text2_end = NULL;
1095
      group = -1;
1096
      inparens = FALSE;
1097
      inangle = FALSE;
1098
      star = FALSE;
1099
      subptr = NULL;
1100
      subptrend = NULL;
1101

1102
      /* Special $ sequences, as supported by Perl, JavaScript, .NET and others. */
1103
      if (next == CHAR_AMPERSAND)
1104
        {
1105
        ++ptr;
1106
        group = 0;
1107
        goto GROUP_SUBSTITUTE;
1108
        }
1109
      if (next == CHAR_GRAVE_ACCENT || next == CHAR_APOSTROPHE)
1110
        {
1111
        ++ptr;
1112
        rc = pcre2_substring_length_bynumber(match_data, 0, &sublength);
1113
        if (rc < 0) goto PTREXIT; /* (Sanity-check ovector before reading from it.) */
1114

1115
        if (next == CHAR_GRAVE_ACCENT)
1116
          {
1117
          subptr = subject;
1118
          subptrend = subject + ovector[0];
1119
          }
1120
        else
1121
          {
1122
          subptr = subject + ovector[1];
1123
          subptrend = subject + length;
1124
          }
1125

1126
        goto SUBPTR_SUBSTITUTE;
1127
        }
1128
      if (next == CHAR_UNDERSCORE)
1129
        {
1130
        /* Java, .NET support $_ for "entire input string". */
1131
        ++ptr;
1132
        subptr = subject;
1133
        subptrend = subject + length;
1134
        goto SUBPTR_SUBSTITUTE;
1135
        }
1136

1137
      if (next == CHAR_LEFT_CURLY_BRACKET)
1138
        {
1139
        if (++ptr >= repend) goto BAD;
1140
        next = *ptr;
1141
        inparens = TRUE;
1142
        }
1143
      else if (next == CHAR_LESS_THAN_SIGN)
1144
        {
1145
        /* JavaScript compatibility syntax, $<name>. Processes only named
1146
        groups (not numbered) and does not support extensions such as star
1147
        (you can do ${name} and ${*name}, but not $<*name>). */
1148
        if (++ptr >= repend) goto BAD;
1149
        next = *ptr;
1150
        inangle = TRUE;
1151
        }
1152

1153
      if (!inangle && next == CHAR_ASTERISK)
1154
        {
1155
        if (++ptr >= repend) goto BAD;
1156
        next = *ptr;
1157
        star = TRUE;
1158
        }
1159

1160
      if (!star && !inangle && next >= CHAR_0 && next <= CHAR_9)
1161
        {
1162
        group = next - CHAR_0;
1163
        while (++ptr < repend)
1164
          {
1165
          next = *ptr;
1166
          if (next < CHAR_0 || next > CHAR_9) break;
1167
          group = group * 10 + (next - CHAR_0);
1168

1169
          /* A check for a number greater than the highest captured group
1170
          is sufficient here; no need for a separate overflow check. If unknown
1171
          groups are to be treated as unset, just skip over any remaining
1172
          digits and carry on. */
1173

1174
          if (group > code->top_bracket)
1175
            {
1176
            if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
1177
              {
1178
              while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
1179
              break;
1180
              }
1181
            else
1182
              {
1183
              rc = PCRE2_ERROR_NOSUBSTRING;
1184
              goto PTREXIT;
1185
              }
1186
            }
1187
          }
1188
        }
1189
      else
1190
        {
1191
        PCRE2_SIZE name_len;
1192
        PCRE2_SPTR name_start = ptr;
1193
        if (!read_name_subst(&ptr, repend, utf, code->tables + ctypes_offset))
1194
          goto BAD;
1195
        name_len = ptr - name_start;
1196
        memcpy(name, name_start, CU2BYTES(name_len));
1197
        name[name_len] = 0;
1198
        }
1199

1200
      next = 0; /* not used or updated after this point */
1201
      (void)next;
1202

1203
      /* In extended mode we recognize ${name:+set text:unset text} and
1204
      ${name:-default text}. */
1205

1206
      if (inparens)
1207
        {
1208
        if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
1209
             !star && ptr < repend - 2 && *ptr == CHAR_COLON)
1210
          {
1211
          special = *(++ptr);
1212
          if (special != CHAR_PLUS && special != CHAR_MINUS)
1213
            {
1214
            rc = PCRE2_ERROR_BADSUBSTITUTION;
1215
            goto PTREXIT;
1216
            }
1217

1218
          text1_start = ++ptr;
1219
          rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
1220
          if (rc != 0) goto PTREXIT;
1221
          text1_end = ptr;
1222

1223
          if (special == CHAR_PLUS && *ptr == CHAR_COLON)
1224
            {
1225
            text2_start = ++ptr;
1226
            rc = find_text_end(code, &ptr, repend, TRUE);
1227
            if (rc != 0) goto PTREXIT;
1228
            text2_end = ptr;
1229
            }
1230
          }
1231

1232
        else
1233
          {
1234
          if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
1235
            {
1236
            rc = PCRE2_ERROR_REPMISSINGBRACE;
1237
            goto PTREXIT;
1238
            }
1239
          }
1240

1241
        ptr++;
1242
        }
1243

1244
      if (inangle)
1245
        {
1246
        if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN)
1247
          goto BAD;
1248
        ptr++;
1249
        }
1250

1251
      /* Have found a syntactically correct group number or name, or *name.
1252
      Only *MARK is currently recognized. */
1253

1254
      if (star)
1255
        {
1256
        if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
1257
          {
1258
          PCRE2_SPTR mark = pcre2_get_mark(match_data);
1259
          if (mark != NULL)
1260
            {
1261
            /* Peek backwards one code unit to obtain the length of the mark.
1262
            It can (theoretically) contain an embedded NUL. */
1263
            fraglength = mark[-1];
1264
            if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1265
                substitute_case_callout == NULL)
1266
              CHECKCASECPY_DEFAULT(mark, fraglength);
1267
            else
1268
              CHECKMEMCPY(mark, fraglength);
1269
            }
1270
          }
1271
        else goto BAD;
1272
        }
1273

1274
      /* Substitute the contents of a group. We don't use substring_copy
1275
      functions any more, in order to support case forcing. */
1276

1277
      else
1278
        {
1279
        GROUP_SUBSTITUTE:
1280
        /* Find a number for a named group. In case there are duplicate names,
1281
        search for the first one that is set. If the name is not found when
1282
        PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set, set the group number to a
1283
        non-existent group. */
1284

1285
        if (group < 0)
1286
          {
1287
          PCRE2_SPTR first, last, entry;
1288
          rc = pcre2_substring_nametable_scan(code, name, &first, &last);
1289
          if (rc == PCRE2_ERROR_NOSUBSTRING &&
1290
              (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
1291
            {
1292
            group = code->top_bracket + 1;
1293
            }
1294
          else
1295
            {
1296
            if (rc < 0) goto PTREXIT;
1297
            for (entry = first; entry <= last; entry += rc)
1298
              {
1299
              uint32_t ng = GET2(entry, 0);
1300
              if (ng < ovector_count)
1301
                {
1302
                if (group < 0) group = ng;          /* First in ovector */
1303
                if (ovector[ng*2] != PCRE2_UNSET)
1304
                  {
1305
                  group = ng;                       /* First that is set */
1306
                  break;
1307
                  }
1308
                }
1309
              }
1310

1311
            /* If group is still negative, it means we did not find a group
1312
            that is in the ovector. Just set the first group. */
1313

1314
            if (group < 0) group = GET2(first, 0);
1315
            }
1316
          }
1317

1318
        /* We now have a group that is identified by number. Find the length of
1319
        the captured string. If a group in a non-special substitution is unset
1320
        when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
1321

1322
        rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
1323
        if (rc < 0)
1324
          {
1325
          if (rc == PCRE2_ERROR_NOSUBSTRING &&
1326
              (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
1327
            {
1328
            rc = PCRE2_ERROR_UNSET;
1329
            }
1330
          if (rc != PCRE2_ERROR_UNSET) goto PTREXIT;  /* Non-unset errors */
1331
          if (special == 0)                           /* Plain substitution */
1332
            {
1333
            if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
1334
            goto PTREXIT;                             /* Else error */
1335
            }
1336
          }
1337

1338
        /* If special is '+' we have a 'set' and possibly an 'unset' text,
1339
        both of which are reprocessed when used. If special is '-' we have a
1340
        default text for when the group is unset; it must be reprocessed. */
1341

1342
        if (special != 0)
1343
          {
1344
          if (special == CHAR_MINUS)
1345
            {
1346
            if (rc == 0) goto LITERAL_SUBSTITUTE;
1347
            text2_start = text1_start;
1348
            text2_end = text1_end;
1349
            }
1350

1351
          if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
1352
          ptrstack[ptrstackptr++] = ptr;
1353
          ptrstack[ptrstackptr++] = repend;
1354

1355
          if (rc == 0)
1356
            {
1357
            ptr = text1_start;
1358
            repend = text1_end;
1359
            }
1360
          else
1361
            {
1362
            ptr = text2_start;
1363
            repend = text2_end;
1364
            }
1365
          continue;
1366
          }
1367

1368
        /* Otherwise we have a literal substitution of a group's contents. */
1369

1370
        LITERAL_SUBSTITUTE:
1371
        subptr = subject + ovector[group*2];
1372
        subptrend = subject + ovector[group*2 + 1];
1373

1374
        /* Substitute a literal string, possibly forcing alphabetic case. */
1375

1376
        SUBPTR_SUBSTITUTE:
1377
        if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1378
            substitute_case_callout == NULL)
1379
          CHECKCASECPY_DEFAULT(subptr, subptrend - subptr);
1380
        else
1381
          CHECKMEMCPY(subptr, subptrend - subptr);
1382
        }
1383
      }   /* End of $ processing */
1384

1385
    /* Handle an escape sequence in extended mode. We can use check_escape()
1386
    to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
1387
    the case-forcing escapes are not supported in pcre2_compile() so must be
1388
    recognized here. */
1389

1390
    else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
1391
              *ptr == CHAR_BACKSLASH)
1392
      {
1393
      int errorcode;
1394
      case_state new_forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE };
1395

1396
      if (ptr < repend - 1) switch (ptr[1])
1397
        {
1398
        case CHAR_L:
1399
        new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
1400
        new_forcecase.single_char = FALSE;
1401
        ptr += 2;
1402
        break;
1403

1404
        case CHAR_l:
1405
        new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
1406
        new_forcecase.single_char = TRUE;
1407
        ptr += 2;
1408
        if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_U)
1409
          {
1410
          /* Perl reverse-title-casing feature for \l\U */
1411
          new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST;
1412
          new_forcecase.single_char = FALSE;
1413
          ptr += 2;
1414
          }
1415
        break;
1416

1417
        case CHAR_U:
1418
        new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
1419
        new_forcecase.single_char = FALSE;
1420
        ptr += 2;
1421
        break;
1422

1423
        case CHAR_u:
1424
        new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_TITLE_FIRST;
1425
        new_forcecase.single_char = TRUE;
1426
        ptr += 2;
1427
        if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_L)
1428
          {
1429
          /* Perl title-casing feature for \u\L */
1430
          new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_TITLE_FIRST;
1431
          new_forcecase.single_char = FALSE;
1432
          ptr += 2;
1433
          }
1434
        break;
1435

1436
        default:
1437
        break;
1438
        }
1439

1440
      if (new_forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)
1441
        {
1442
        SETFORCECASE:
1443

1444
        /* If the substitute_case_callout is unset, our case-forcing is done
1445
        immediately. If there is a callout however, then its action is delayed
1446
        until all the characters have been collected.
1447

1448
        Apply the callout now, before we set the new casing mode. */
1449

1450
        if (substitute_case_callout != NULL &&
1451
            forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)
1452
          DELAYEDFORCECASE();
1453

1454
        forcecase = new_forcecase;
1455
        casestart_offset = buff_offset;
1456
        casestart_extra_needed = extra_needed;
1457
        continue;
1458
        }
1459

1460
      ptr++;  /* Point after \ */
1461
      rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
1462
        code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL);
1463
      if (errorcode != 0) goto BADESCAPE;
1464

1465
      switch(rc)
1466
        {
1467
        case ESC_E:
1468
        goto SETFORCECASE;
1469

1470
        case ESC_Q:
1471
        escaped_literal = TRUE;
1472
        continue;
1473

1474
        case 0:      /* Data character */
1475
        case ESC_b:  /* \b is backspace in a substitution */
1476
        case ESC_v:  /* \v is vertical tab in a substitution */
1477

1478
        if (rc == ESC_b) ch = CHAR_BS;
1479
        if (rc == ESC_v) ch = CHAR_VT;
1480

1481
#ifdef SUPPORT_UNICODE
1482
        if (utf) chlen = PRIV(ord2utf)(ch, temp); else
1483
#endif
1484
          {
1485
          temp[0] = ch;
1486
          chlen = 1;
1487
          }
1488

1489
        if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1490
            substitute_case_callout == NULL)
1491
          CHECKCASECPY_DEFAULT(temp, chlen);
1492
        else
1493
          CHECKMEMCPY(temp, chlen);
1494
        continue;
1495

1496
        case ESC_g:
1497
          {
1498
          PCRE2_SIZE name_len;
1499
          PCRE2_SPTR name_start;
1500

1501
          /* Parse the \g<name> form (\g<number> already handled by check_escape) */
1502
          if (ptr >= repend || *ptr != CHAR_LESS_THAN_SIGN)
1503
            goto BADESCAPE;
1504
          ++ptr;
1505

1506
          name_start = ptr;
1507
          if (!read_name_subst(&ptr, repend, utf, code->tables + ctypes_offset))
1508
            goto BADESCAPE;
1509
          name_len = ptr - name_start;
1510

1511
          if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN)
1512
            goto BADESCAPE;
1513
          ++ptr;
1514

1515
          special = 0;
1516
          group = -1;
1517
          memcpy(name, name_start, CU2BYTES(name_len));
1518
          name[name_len] = 0;
1519
          goto GROUP_SUBSTITUTE;
1520
          }
1521

1522
        default:
1523
        if (rc < 0)
1524
          {
1525
          special = 0;
1526
          group = -rc - 1;
1527
          goto GROUP_SUBSTITUTE;
1528
          }
1529
        goto BADESCAPE;
1530
        }
1531
      }   /* End of backslash processing */
1532

1533
    /* Handle a literal code unit */
1534

1535
    else
1536
      {
1537
      PCRE2_SPTR ch_start;
1538

1539
      LOADLITERAL:
1540
      ch_start = ptr;
1541
      GETCHARINCTEST(ch, ptr);    /* Get character value, increment pointer */
1542
      (void) ch;
1543

1544
      if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1545
          substitute_case_callout == NULL)
1546
        CHECKCASECPY_DEFAULT(ch_start, ptr - ch_start);
1547
      else
1548
        CHECKMEMCPY(ch_start, ptr - ch_start);
1549
      } /* End handling a literal code unit */
1550
    }   /* End of loop for scanning the replacement. */
1551

1552
  /* If the substitute_case_callout is unset, our case-forcing is done
1553
  immediately. If there is a callout however, then its action is delayed
1554
  until all the characters have been collected.
1555

1556
  We now clean up any trailing section of the replacement for which we deferred
1557
  the case-forcing. */
1558

1559
  if (substitute_case_callout != NULL &&
1560
      forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)
1561
    DELAYEDFORCECASE();
1562

1563
  /* The replacement has been copied to the output, or its size has been
1564
  remembered. Handle the callout if there is one. */
1565

1566
  if (mcontext != NULL && mcontext->substitute_callout != NULL)
1567
    {
1568
    /* If we have an actual (non-simulated) replacement, do the callout. */
1569

1570
    if (!overflowed)
1571
      {
1572
      scb.subscount = subs;
1573
      scb.output_offsets[1] = buff_offset;
1574
      rc = mcontext->substitute_callout(&scb,
1575
                                        mcontext->substitute_callout_data);
1576

1577
      /* A non-zero return means cancel this substitution. Instead, copy the
1578
      matched string fragment. */
1579

1580
      if (rc != 0)
1581
        {
1582
        PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
1583
        PCRE2_SIZE oldlength = ovector[1] - ovector[0];
1584

1585
        buff_offset -= newlength;
1586
        lengthleft += newlength;
1587
        if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
1588

1589
        /* A negative return means do not do any more. */
1590

1591
        if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
1592
        }
1593
      }
1594

1595
    /* In this interesting case, we cannot do the callout, so it's hard to
1596
    estimate the required buffer size. What callers want is to be able to make
1597
    two calls to pcre2_substitute(), once with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
1598
    to discover the buffer size, and then a second and final call. Older
1599
    versions of PCRE2 violated this assumption, by proceeding as if the callout
1600
    had returned zero - but on the second call to pcre2_substitute() it could
1601
    return non-zero and then overflow the buffer again. Callers probably don't
1602
    want to keep on looping to incrementally discover the buffer size. */
1603

1604
    else
1605
      {
1606
      PCRE2_SIZE newlength_buf = buff_offset - scb.output_offsets[0];
1607
      PCRE2_SIZE newlength_extra = extra_needed - sub_start_extra_needed;
1608
      PCRE2_SIZE newlength =
1609
        (newlength_extra > ~(PCRE2_SIZE)0 - newlength_buf)?  /* Integer overflow */
1610
        ~(PCRE2_SIZE)0 : newlength_buf + newlength_extra;    /* Cap the addition */
1611
      PCRE2_SIZE oldlength = ovector[1] - ovector[0];
1612

1613
      /* Be pessimistic: request whichever buffer size is larger out of
1614
      accepting or rejecting the substitution. */
1615

1616
      if (oldlength > newlength)
1617
        {
1618
        PCRE2_SIZE additional = oldlength - newlength;
1619
        if (additional > ~(PCRE2_SIZE)0 - extra_needed)  /* Integer overflow */
1620
          goto TOOLARGEREPLACE;
1621
        extra_needed += additional;
1622
        }
1623

1624
      /* Proceed as if the callout did not return a negative. A negative
1625
      effectively rejects all future substitutions, but we want to examine them
1626
      pessimistically. */
1627
      }
1628
    }
1629

1630
  /* Save the details of this match. See above for how this data is used. If we
1631
  matched an empty string, do the magic for global matches. Update the start
1632
  offset to point to the rest of the subject string. If we re-used an existing
1633
  match for the first match, switch to the internal match data block. */
1634

1635
  ovecsave[0] = ovector[0];
1636
  ovecsave[1] = ovector[1];
1637
  ovecsave[2] = start_offset;
1638

1639
  goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
1640
    PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
1641
  start_offset = ovector[1];
1642
  } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0);  /* Repeat "do" loop */
1643

1644
/* Copy the rest of the subject unless not required, and terminate the output
1645
with a binary zero. */
1646

1647
if (!replacement_only)
1648
  {
1649
  fraglength = length - start_offset;
1650
  CHECKMEMCPY(subject + start_offset, fraglength);
1651
  }
1652

1653
temp[0] = 0;
1654
CHECKMEMCPY(temp, 1);
1655

1656
/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
1657
and matching has carried on after a full buffer, in order to compute the length
1658
needed. Otherwise, an overflow generates an immediate error return. */
1659

1660
if (overflowed)
1661
  {
1662
  rc = PCRE2_ERROR_NOMEMORY;
1663

1664
  if (extra_needed > ~(PCRE2_SIZE)0 - buff_length)  /* Integer overflow */
1665
    goto TOOLARGEREPLACE;
1666
  *blength = buff_length + extra_needed;
1667
  }
1668

1669
/* After a successful execution, return the number of substitutions and set the
1670
length of buffer used, excluding the trailing zero. */
1671

1672
else
1673
  {
1674
  rc = subs;
1675
  *blength = buff_offset - 1;
1676
  }
1677

1678
EXIT:
1679
if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
1680
  else match_data->rc = rc;
1681
return rc;
1682

1683
NOROOM:
1684
rc = PCRE2_ERROR_NOMEMORY;
1685
goto EXIT;
1686

1687
CASEERROR:
1688
rc = PCRE2_ERROR_REPLACECASE;
1689
goto EXIT;
1690

1691
TOOLARGEREPLACE:
1692
rc = PCRE2_ERROR_TOOLARGEREPLACE;
1693
goto EXIT;
1694

1695
BAD:
1696
rc = PCRE2_ERROR_BADREPLACEMENT;
1697
goto PTREXIT;
1698

1699
BADESCAPE:
1700
rc = PCRE2_ERROR_BADREPESCAPE;
1701

1702
PTREXIT:
1703
*blength = (PCRE2_SIZE)(ptr - replacement);
1704
goto EXIT;
1705
}
1706

1707
/* End of pcre2_substitute.c */
1708

1709
Product

Resources

Company