Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_substitute.c
21745 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#include "pcre2_internal.h"
43
44
45
46
/* Number of entries in the ptrstack[] array that pcre2_substitute() declares
at the top of each iteration of its substitution loop. */
#define PTR_STACK_SIZE 20
47
48
/* Mask of every option bit that is specific to pcre2_substitute().
pcre2_substitute() uses it to split the caller's option word in two: the bits
inside the mask are kept as the substitute options, and the bits outside it are
what remains in the option word handed to pcre2_match(). */
#define SUBSTITUTE_OPTIONS \
49
(PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \
50
PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \
51
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \
52
PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)
53
54
55
56
/*************************************************
57
* Find end of substitute text *
58
*************************************************/
59
60
/* In extended mode, we recognize ${name:+set text:unset text} and similar
61
constructions. This requires the identification of unescaped : and }
62
characters. This function scans for such. It must deal with nested ${
63
constructions. The pointer to the text is updated, either to the required end
64
character, or to where an error was detected.
65
66
Arguments:
67
code points to the compiled expression (for options)
68
ptrptr points to the pointer to the start of the text (updated)
69
ptrend end of the whole string
70
last TRUE if the last expected string (only } recognized)
71
72
Returns: 0 on success
73
negative error code on failure
74
*/
75
76
static int
77
find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
78
BOOL last)
79
{
80
int rc = 0;
81
uint32_t nestlevel = 0;
82
BOOL literal = FALSE;
83
PCRE2_SPTR ptr = *ptrptr;
84
85
for (; ptr < ptrend; ptr++)
86
{
87
if (literal)
88
{
89
if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
90
{
91
literal = FALSE;
92
ptr += 1;
93
}
94
}
95
96
else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
97
{
98
if (nestlevel == 0) goto EXIT;
99
nestlevel--;
100
}
101
102
else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
103
104
else if (*ptr == CHAR_DOLLAR_SIGN)
105
{
106
if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
107
{
108
nestlevel++;
109
ptr += 1;
110
}
111
}
112
113
else if (*ptr == CHAR_BACKSLASH)
114
{
115
int erc;
116
int errorcode;
117
uint32_t ch;
118
119
if (ptr < ptrend - 1) switch (ptr[1])
120
{
121
case CHAR_L:
122
case CHAR_l:
123
case CHAR_U:
124
case CHAR_u:
125
ptr += 1;
126
continue;
127
}
128
129
ptr += 1; /* Must point after \ */
130
erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
131
code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL);
132
if (errorcode != 0)
133
{
134
/* errorcode from check_escape is positive, so must not be returned by
135
pcre2_substitute(). */
136
rc = PCRE2_ERROR_BADREPESCAPE;
137
goto EXIT;
138
}
139
140
switch(erc)
141
{
142
case 0: /* Data character */
143
case ESC_b: /* Data character */
144
case ESC_v: /* Data character */
145
case ESC_E: /* Isolated \E is ignored */
146
break;
147
148
case ESC_Q:
149
literal = TRUE;
150
break;
151
152
case ESC_g:
153
/* The \g<name> form (\g<number> already handled by check_escape)
154
155
Don't worry about finding the matching ">". We are super, super lenient
156
about validating ${} replacements inside find_text_end(), so we certainly
157
don't need to worry about other syntax. Importantly, a \g<..> or $<...>
158
sequence can't contain a '}' character. */
159
break;
160
161
default:
162
if (erc < 0)
163
break; /* capture group reference */
164
rc = PCRE2_ERROR_BADREPESCAPE;
165
goto EXIT;
166
}
167
}
168
}
169
170
rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */
171
172
EXIT:
173
*ptrptr = ptr;
174
return rc;
175
}
176
177
178
/*************************************************
179
* Validate group name *
180
*************************************************/
181
182
/* This function scans for a capture group name, validating it
183
consists of legal characters, is not empty, and does not exceed
184
MAX_NAME_SIZE.
185
186
Arguments:
187
ptrptr points to the pointer to the start of the text (updated)
188
ptrend end of the whole string
189
utf true if the input is UTF-encoded
190
ctypes pointer to the character types table
191
192
Returns: TRUE if a name was read
193
FALSE otherwise
194
*/
195
196
static BOOL
197
read_name_subst(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf,
198
const uint8_t* ctypes)
199
{
200
PCRE2_SPTR ptr = *ptrptr;
201
PCRE2_SPTR nameptr = ptr;
202
203
if (ptr >= ptrend) /* No characters in name */
204
goto FAILED;
205
206
/* We do not need to check whether the name starts with a non-digit.
207
We are simply referencing names here, not defining them. */
208
209
/* See read_name in the pcre2_compile.c for the corresponding logic
210
restricting group names inside the pattern itself. */
211
212
#ifdef SUPPORT_UNICODE
213
if (utf)
214
{
215
uint32_t c, type;
216
217
while (ptr < ptrend)
218
{
219
GETCHAR(c, ptr);
220
type = UCD_CHARTYPE(c);
221
if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
222
c != CHAR_UNDERSCORE) break;
223
ptr++;
224
FORWARDCHARTEST(ptr, ptrend);
225
}
226
}
227
else
228
#else
229
(void)utf; /* Avoid compiler warning */
230
#endif /* SUPPORT_UNICODE */
231
232
/* Handle group names in non-UTF modes. */
233
234
{
235
while (ptr < ptrend && MAX_255(*ptr) && (ctypes[*ptr] & ctype_word) != 0)
236
{
237
ptr++;
238
}
239
}
240
241
/* Check name length */
242
243
if (ptr - nameptr > MAX_NAME_SIZE)
244
goto FAILED;
245
246
/* Subpattern names must not be empty */
247
if (ptr == nameptr)
248
goto FAILED;
249
250
*ptrptr = ptr;
251
return TRUE;
252
253
FAILED:
254
*ptrptr = ptr;
255
return FALSE;
256
}
257
258
259
/*************************************************
260
* Case transformations *
261
*************************************************/
262
263
/* Internal additions to the PCRE2_SUBSTITUTE_CASE_xyz values. NONE means that
no case forcing is in effect; it is the initial state and the state left behind
after a single-character transformation. */
#define PCRE2_SUBSTITUTE_CASE_NONE 0
264
// 1, 2, 3 are PCRE2_SUBSTITUTE_CASE_LOWER, UPPER, TITLE_FIRST.
265
/* REVERSE_TITLE_FIRST: first character to lower case, the rest to upper case.
This value is internal only; do_case_copy() splits it into a LOWER call and an
UPPER call rather than passing it to a user callout. */
#define PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST 4
266
267
/* The case-forcing state carried between successive copies into the output. */
typedef struct {
268
int to_case; /* One of PCRE2_SUBSTITUTE_CASE_xyz */
269
/* When TRUE, only the first character of the next text is transformed and the
remainder is copied unchanged. */
BOOL single_char;
270
} case_state;
271
272
/* Helper to guess how much a string is likely to increase in size when
273
case-transformed. Usually, strings don't change size at all, but some rare
274
characters do grow. Estimate one eighth (+12.5%), plus another ten characters.
275
276
Performing this estimation is unfortunate, but inevitable, since we can't call
277
the callout if we ran out of buffer space to prepare its input.
278
279
Because this estimate is inexact (and in pathological cases, underestimates the
280
required buffer size) we must document that when you have a
281
substitute_case_callout, and you are using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, you
282
may need more than two calls to determine the final buffer size. */
283
284
static PCRE2_SIZE
285
pessimistic_case_inflation(PCRE2_SIZE len)
286
{
287
return (len >> 3u) + 10;
288
}
289
290
/* Case transformation behaviour if no callout is passed. */
291
292
static PCRE2_SIZE
293
default_substitute_case_callout(
294
PCRE2_SPTR input, PCRE2_SIZE input_len,
295
PCRE2_UCHAR *output, PCRE2_SIZE output_cap,
296
case_state *state, const pcre2_code *code)
297
{
298
PCRE2_SPTR input_end = input + input_len;
299
#ifdef SUPPORT_UNICODE
300
BOOL utf;
301
BOOL ucp;
302
#endif
303
PCRE2_UCHAR temp[6];
304
BOOL next_to_upper;
305
BOOL rest_to_upper;
306
BOOL single_char;
307
BOOL overflow = FALSE;
308
PCRE2_SIZE written = 0;
309
310
/* Helpful simplifying invariant: input and output are disjoint buffers.
311
I believe that this code is technically undefined behaviour, because the two
312
pointers input/output are "unrelated" pointers and hence not comparable. Casting
313
via char* bypasses some but not all of those technical rules. It is not included
314
in release builds, in any case. */
315
PCRE2_ASSERT((char *)(input + input_len) <= (char *)output ||
316
(char *)(output + output_cap) <= (char *)input);
317
318
#ifdef SUPPORT_UNICODE
319
utf = (code->overall_options & PCRE2_UTF) != 0;
320
ucp = (code->overall_options & PCRE2_UCP) != 0;
321
#endif
322
323
if (input_len == 0) return 0;
324
325
switch (state->to_case)
326
{
327
/* LCOV_EXCL_START */
328
default:
329
PCRE2_DEBUG_UNREACHABLE();
330
return 0;
331
/* LCOV_EXCL_STOP */
332
333
case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE
334
case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE
335
next_to_upper = rest_to_upper = (state->to_case == PCRE2_SUBSTITUTE_CASE_UPPER);
336
break;
337
338
case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE
339
next_to_upper = TRUE;
340
rest_to_upper = FALSE;
341
state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
342
break;
343
344
case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE
345
next_to_upper = FALSE;
346
rest_to_upper = TRUE;
347
state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
348
break;
349
}
350
351
single_char = state->single_char;
352
if (single_char)
353
state->to_case = PCRE2_SUBSTITUTE_CASE_NONE;
354
355
while (input < input_end)
356
{
357
uint32_t ch;
358
unsigned int chlen;
359
360
GETCHARINCTEST(ch, input);
361
362
#ifdef SUPPORT_UNICODE
363
if ((utf || ucp) && ch >= 128)
364
{
365
uint32_t type = UCD_CHARTYPE(ch);
366
if (PRIV(ucp_gentype)[type] == ucp_L &&
367
type != (next_to_upper? ucp_Lu : ucp_Ll))
368
ch = UCD_OTHERCASE(ch);
369
370
/* TODO This is far from correct... it doesn't support the SpecialCasing.txt
371
mappings, but worse, it's not even correct for all the ordinary case
372
mappings. We should add support for those (at least), and then add the
373
SpecialCasing.txt mappings for Esszet and ligatures, and finally use the
374
Turkish casing flag on the match context. */
375
}
376
else
377
#endif
378
if (MAX_255(ch))
379
{
380
if (((code->tables + cbits_offset +
381
(next_to_upper? cbit_upper:cbit_lower)
382
)[ch/8] & (1u << (ch%8))) == 0)
383
ch = (code->tables + fcc_offset)[ch];
384
}
385
386
#ifdef SUPPORT_UNICODE
387
if (utf) chlen = PRIV(ord2utf)(ch, temp); else
388
#endif
389
{
390
temp[0] = ch;
391
chlen = 1;
392
}
393
394
if (!overflow && chlen <= output_cap)
395
{
396
memcpy(output, temp, CU2BYTES(chlen));
397
output += chlen;
398
output_cap -= chlen;
399
}
400
else
401
{
402
overflow = TRUE;
403
}
404
405
if (chlen > ~(PCRE2_SIZE)0 - written) /* Integer overflow */
406
return ~(PCRE2_SIZE)0;
407
written += chlen;
408
409
next_to_upper = rest_to_upper;
410
411
/* memcpy the remainder, if only transforming a single character. */
412
413
if (single_char)
414
{
415
PCRE2_SIZE rest_len = input_end - input;
416
417
if (!overflow && rest_len <= output_cap)
418
memcpy(output, input, CU2BYTES(rest_len));
419
420
if (rest_len > ~(PCRE2_SIZE)0 - written) /* Integer overflow */
421
return ~(PCRE2_SIZE)0;
422
written += rest_len;
423
424
return written;
425
}
426
}
427
428
return written;
429
}
430
431
/* Helper to perform the call to the substitute_case_callout. We wrap the
432
user-provided callout because our internal arguments are slightly extended. We
433
don't want the user callout to handle the case of "\l" (first character only to
434
lowercase) or "\l\U" (first character to lowercase, rest to uppercase) because
435
those are not operations defined by Unicode. Instead the user callout simply
436
needs to provide the three Unicode primitives: lower, upper, titlecase. */
437
438
/* Case-transform, in place, the input_len code units at input_output by means
of the supplied callout.

Arguments:
  input_output                  the text; it is both read and overwritten
  input_len                     its length in code units (must not be zero)
  output_cap                    capacity available at input_output
  state                         requested transformation; to_case is updated
                                  to the state that applies to later text
  utf                           explicitly unused in 32-bit or non-Unicode
                                  builds
  substitute_case_callout       the callout to invoke
  substitute_case_callout_data  opaque pointer passed through to the callout

Returns:  the total length needed for the transformed text; a value greater
            than output_cap means that it did not fit
          ~(PCRE2_SIZE)0 if the callout returned that value or the total
            overflowed
*/
static PCRE2_SIZE
439
do_case_copy(
440
PCRE2_UCHAR *input_output, PCRE2_SIZE input_len, PCRE2_SIZE output_cap,
441
case_state *state, BOOL utf,
442
PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,
443
PCRE2_SIZE, int, void *),
444
void *substitute_case_callout_data)
445
{
446
/* input and output are two views of the same buffer. */
PCRE2_SPTR input = input_output;
447
PCRE2_UCHAR *output = input_output;
448
PCRE2_SIZE rc;
449
PCRE2_SIZE rc2;
450
int ch1_to_case;
451
int rest_to_case;
452
PCRE2_UCHAR ch1[6];
453
PCRE2_SIZE ch1_len;
454
PCRE2_SPTR rest;
455
PCRE2_SIZE rest_len;
456
BOOL ch1_overflow = FALSE;
457
BOOL rest_overflow = FALSE;
458
459
#if PCRE2_CODE_UNIT_WIDTH == 32 || !defined(SUPPORT_UNICODE)
460
(void)utf; /* Avoid compiler warning. */
461
#endif
462
463
PCRE2_ASSERT(input_len != 0);
464
465
/* Either finish with a single callout call (whole-text LOWER/UPPER/TITLE), or
choose the two operations to apply to the first character and to the rest. */
switch (state->to_case)
466
{
467
/* LCOV_EXCL_START */
468
default:
469
PCRE2_DEBUG_UNREACHABLE();
470
return 0;
471
/* LCOV_EXCL_STOP */
472
473
case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE
474
case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE
475
case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE
476
477
/* The easy case, where our internal casing operations align with those of
478
the callout. */
479
480
if (state->single_char == FALSE)
481
{
482
rc = substitute_case_callout(input, input_len, output, output_cap,
483
state->to_case, substitute_case_callout_data);
484
485
/* After a title-cased piece, any following text is lower-cased. */
if (state->to_case == PCRE2_SUBSTITUTE_CASE_TITLE_FIRST)
486
state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
487
488
return rc;
489
}
490
491
/* Single-character request: transform the first character only. */
ch1_to_case = state->to_case;
492
rest_to_case = PCRE2_SUBSTITUTE_CASE_NONE;
493
break;
494
495
case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE
496
ch1_to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
497
rest_to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
498
break;
499
}
500
501
/* Identify the leading character. Take copy, because its storage overlaps with
502
`output`, and hence may be scrambled by the callout. */
503
504
{
505
PCRE2_SPTR ch_end = input;
506
uint32_t ch;
507
508
GETCHARINCTEST(ch, ch_end);
509
/* Only the distance advanced is needed, not the character value. */
(void) ch;
510
PCRE2_ASSERT(ch_end <= input + input_len && ch_end - input <= 6);
511
ch1_len = ch_end - input;
512
memcpy(ch1, input, CU2BYTES(ch1_len));
513
}
514
515
rest = input + ch1_len;
516
rest_len = input_len - ch1_len;
517
518
/* Transform just ch1. The buffers are always in-place (input == output). With a
519
custom callout, we need a loop to discover its required buffer size. The loop
520
wouldn't be required if the callout were well-behaved, but it might be naughty
521
and return "5" the first time, then "10" the next time we call it using the
522
exact same input! */
523
524
{
525
PCRE2_SIZE ch1_cap;
526
PCRE2_SIZE max_ch1_cap;
527
528
ch1_cap = ch1_len; /* First attempt uses the space vacated by ch1. */
529
PCRE2_ASSERT(output_cap >= input_len && input_len >= rest_len);
530
/* The most that ch1 may grow to while `rest` still fits after it. */
max_ch1_cap = output_cap - rest_len;
531
532
while (TRUE)
533
{
534
rc = substitute_case_callout(ch1, ch1_len, output, ch1_cap, ch1_to_case,
535
substitute_case_callout_data);
536
if (rc == ~(PCRE2_SIZE)0) return rc;
537
538
/* It fitted in the space offered: done. */
if (rc <= ch1_cap) break;
539
540
if (rc > max_ch1_cap)
541
{
542
ch1_overflow = TRUE;
543
break;
544
}
545
546
/* Move the rest to the right, to make room for expanding ch1. */
547
548
memmove(input_output + rc, rest, CU2BYTES(rest_len));
549
rest = input + rc;
550
551
ch1_cap = rc;
552
553
/* Proof of loop termination: `ch1_cap` is growing on each iteration, but
554
the loop ends if `rc` reaches the (unchanging) upper bound of output_cap. */
555
}
556
}
557
558
if (rest_to_case == PCRE2_SUBSTITUTE_CASE_NONE)
559
{
560
/* The rest is unchanged text: just place it directly after the transformed
ch1 (it may have to move left if ch1 shrank). */
if (!ch1_overflow)
561
{
562
PCRE2_ASSERT(rest_len <= output_cap - rc);
563
memmove(output + rc, rest, CU2BYTES(rest_len));
564
}
565
rc2 = rest_len;
566
567
state->to_case = PCRE2_SUBSTITUTE_CASE_NONE;
568
}
569
else
570
{
571
/* Zero-capacity destination handed to the callout when ch1 has already
overflowed, so that it reports a length without writing anything. */
PCRE2_UCHAR dummy[1];
572
573
rc2 = substitute_case_callout(rest, rest_len,
574
ch1_overflow? dummy : output + rc,
575
ch1_overflow? 0u : output_cap - rc,
576
rest_to_case, substitute_case_callout_data);
577
if (rc2 == ~(PCRE2_SIZE)0) return rc2;
578
579
if (!ch1_overflow && rc2 > output_cap - rc) rest_overflow = TRUE;
580
581
/* If ch1 grows so that `xform(ch1)+rest` can't fit in the buffer, but then
582
`rest` shrinks, it's actually possible for the total calculated length of
583
`xform(ch1)+xform(rest)` to come out at less than output_cap. But we can't
584
report that, because it would make it seem that the operation succeeded.
585
If either of xform(ch1) or xform(rest) won't fit in the buffer, our final
586
result must be > output_cap. */
587
if (ch1_overflow && rc2 < rest_len)
588
rc2 = rest_len;
589
590
state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
591
}
592
593
if (rc2 > ~(PCRE2_SIZE)0 - rc) /* Integer overflow */
594
return ~(PCRE2_SIZE)0;
595
596
PCRE2_ASSERT(!(ch1_overflow || rest_overflow) || rc + rc2 > output_cap);
597
/* rest_overflow is otherwise read only by the assertion above. */
(void)rest_overflow;
598
599
return rc + rc2;
600
}
601
602
603
/*************************************************
604
* Match and substitute *
605
*************************************************/
606
607
/* This function applies a compiled re to a subject string and creates a new
608
string with substitutions. The first 7 arguments are the same as for
609
pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
610
611
Arguments:
612
code points to the compiled expression
613
subject points to the subject string
614
length length of subject string (may contain binary zeros)
615
start_offset where to start in the subject string
616
options option bits
617
match_data points to a match_data block, or is NULL
618
context points a PCRE2 context
619
replacement points to the replacement string
620
rlength length of replacement string
621
buffer where to put the substituted string
622
blength points to length of buffer; updated to length of string
623
624
Returns: >= 0 number of substitutions made
625
< 0 an error code
626
PCRE2_ERROR_BADREPLACEMENT means invalid use of $
627
*/
628
629
/* This macro checks for space in the buffer before copying into it. On
630
overflow, either give an error immediately, or keep on, accumulating the
631
length. */
632
633
/* CHECKMEMCPY(from, length_): append length_ code units from `from` to the
output. It operates on the expanding function's locals buffer, buff_offset,
lengthleft, overflowed, extra_needed and suboptions, and jumps to that
function's TOOLARGEREPLACE or NOROOM labels. Three cases:
  - already overflowed: only add the length to extra_needed;
  - does not fit: goto NOROOM unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
    otherwise start overflow accounting with the shortfall;
  - fits: copy, then advance buff_offset and reduce lengthleft. */
#define CHECKMEMCPY(from, length_) \
634
do { \
635
PCRE2_SIZE chkmc_length = length_; \
636
if (overflowed) \
637
{ \
638
if (chkmc_length > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \
639
goto TOOLARGEREPLACE; \
640
extra_needed += chkmc_length; \
641
} \
642
else if (lengthleft < chkmc_length) \
643
{ \
644
if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
645
overflowed = TRUE; \
646
extra_needed = chkmc_length - lengthleft; \
647
} \
648
else \
649
{ \
650
memcpy(buffer + buff_offset, from, CU2BYTES(chkmc_length)); \
651
buff_offset += chkmc_length; \
652
lengthleft -= chkmc_length; \
653
} \
654
} \
655
while (0)
656
657
/* This macro checks for space and copies characters with casing modifications.
658
On overflow, it behaves as for CHECKMEMCPY().
659
660
When substitute_case_callout is NULL, the source and destination buffers must
661
not overlap, because our default handler does not support this. */
662
663
/* CHECKCASECPY_BASE(length_, do_call): shared skeleton of the two case-copy
macros. do_call is a statement block that must set chkcc_rc to the length
produced for chkcc_length input code units. Afterwards, if chkcc_rc does not
fit in lengthleft, either goto NOROOM or (with
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) begin overflow accounting; otherwise advance
buff_offset and reduce lengthleft. Because do_call is expanded inside the
do { } while (0), a `break` within it leaves the macro and skips that
bookkeeping. */
#define CHECKCASECPY_BASE(length_, do_call) \
664
do { \
665
PCRE2_SIZE chkcc_length = (PCRE2_SIZE)(length_); \
666
PCRE2_SIZE chkcc_rc; \
667
do_call \
668
if (lengthleft < chkcc_rc) \
669
{ \
670
if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
671
overflowed = TRUE; \
672
extra_needed = chkcc_rc - lengthleft; \
673
} \
674
else \
675
{ \
676
buff_offset += chkcc_rc; \
677
lengthleft -= chkcc_rc; \
678
} \
679
} \
680
while (0)
681
682
/* CHECKCASECPY_DEFAULT(from, length_): case-copy length_ code units from
`from` to the output using default_substitute_case_callout() and the
expanding function's forcecase state. When already overflowed, the callout is
given zero capacity, its result is only added to extra_needed, and the `break`
leaves CHECKCASECPY_BASE before its fits/does-not-fit bookkeeping. */
#define CHECKCASECPY_DEFAULT(from, length_) \
683
CHECKCASECPY_BASE(length_, { \
684
chkcc_rc = default_substitute_case_callout(from, chkcc_length, \
685
buffer + buff_offset, \
686
overflowed? 0 : lengthleft, \
687
&forcecase, code); \
688
if (overflowed) \
689
{ \
690
if (chkcc_rc > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \
691
goto TOOLARGEREPLACE; \
692
extra_needed += chkcc_rc; \
693
break; \
694
} \
695
})
696
697
/* CHECKCASECPY_CALLOUT(length_): case-transform, in place, the length_ code
units that already sit at buffer + buff_offset, by calling do_case_copy() with
the user's callout. A result of ~(PCRE2_SIZE)0 from do_case_copy() jumps to the
expanding function's CASEERROR label; otherwise CHECKCASECPY_BASE does the
usual fits/does-not-fit bookkeeping. */
#define CHECKCASECPY_CALLOUT(length_) \
698
CHECKCASECPY_BASE(length_, { \
699
chkcc_rc = do_case_copy(buffer + buff_offset, chkcc_length, \
700
lengthleft, &forcecase, utf, \
701
substitute_case_callout, \
702
substitute_case_callout_data); \
703
if (chkcc_rc == ~(PCRE2_SIZE)0) goto CASEERROR; \
704
})
705
706
/* This macro does a delayed case transformation, for the situation when we have
707
a case-forcing callout. */
708
709
/* DELAYEDFORCECASE(): apply the pending case transformation to everything
emitted since casestart_offset / casestart_extra_needed. The outstanding
amount is what was written to the buffer plus what was only counted in
extra_needed. If the output has overflowed, the text is not available to
transform, so a guess from pessimistic_case_inflation() is added to
extra_needed instead. Otherwise the buffer is rewound to casestart_offset and
the outstanding text is transformed in place via CHECKCASECPY_CALLOUT. */
#define DELAYEDFORCECASE() \
710
do { \
711
PCRE2_SIZE chars_outstanding = (buff_offset - casestart_offset) + \
712
(extra_needed - casestart_extra_needed); \
713
if (chars_outstanding > 0) \
714
{ \
715
if (overflowed) \
716
{ \
717
PCRE2_SIZE guess = pessimistic_case_inflation(chars_outstanding); \
718
if (guess > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \
719
goto TOOLARGEREPLACE; \
720
extra_needed += guess; \
721
} \
722
else \
723
{ \
724
/* Rewind the buffer */ \
725
lengthleft += (buff_offset - casestart_offset); \
726
buff_offset = casestart_offset; \
727
/* Care! In-place case transformation */ \
728
CHECKCASECPY_CALLOUT(chars_outstanding); \
729
} \
730
} \
731
} \
732
while (0)
733
734
735
/* Here's the function */
736
737
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
738
pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
739
PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
740
pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
741
PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
742
{
743
int rc;
744
int subs;
745
uint32_t ovector_count;
746
uint32_t goptions = 0;
747
uint32_t suboptions;
748
pcre2_match_data *internal_match_data = NULL;
749
BOOL escaped_literal = FALSE;
750
BOOL overflowed = FALSE;
751
BOOL use_existing_match;
752
BOOL replacement_only;
753
BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
754
PCRE2_UCHAR temp[6];
755
PCRE2_UCHAR null_str[1] = { 0xcd };
756
PCRE2_SPTR original_subject = subject;
757
PCRE2_SPTR ptr;
758
PCRE2_SPTR repend = NULL;
759
PCRE2_SIZE extra_needed = 0;
760
PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
761
PCRE2_SIZE *ovector;
762
PCRE2_SIZE ovecsave[2] = { 0, 0 };
763
pcre2_substitute_callout_block scb;
764
PCRE2_SIZE sub_start_extra_needed;
765
PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,
766
PCRE2_SIZE, int, void *) = NULL;
767
void *substitute_case_callout_data = NULL;
768
769
/* General initialization */
770
771
buff_offset = 0;
772
lengthleft = buff_length = *blength;
773
*blength = PCRE2_UNSET;
774
775
if (mcontext != NULL)
776
{
777
substitute_case_callout = mcontext->substitute_case_callout;
778
substitute_case_callout_data = mcontext->substitute_case_callout_data;
779
}
780
781
/* Partial matching is not valid. This must come after setting *blength to
782
PCRE2_UNSET, so as not to imply an offset in the replacement. */
783
784
if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
785
return PCRE2_ERROR_BADOPTION;
786
787
/* Validate length and find the end of the replacement. A NULL replacement of
788
zero length is interpreted as an empty string. */
789
790
if (replacement == NULL)
791
{
792
if (rlength != 0) return PCRE2_ERROR_NULL;
793
replacement = null_str;
794
}
795
796
if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
797
repend = replacement + rlength;
798
799
/* A NULL subject of zero length is treated as an empty string. */
800
801
if (subject == NULL)
802
{
803
if (length != 0) return PCRE2_ERROR_NULL;
804
subject = null_str;
805
}
806
807
if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
808
809
/* Check for using a match that has already happened. Note that the subject
810
pointer in the match data may be NULL after a no-match. */
811
812
use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
813
replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
814
815
if (use_existing_match && match_data == NULL) return PCRE2_ERROR_NULL;
816
817
/* If an existing match is being passed in, we should check that it matches
818
the passed-in subject pointer, length, and match options. We don't currently
819
have a use-case for someone to match on one subject, then try and use that
820
match data on a different subject. In a UTF-encoded string, a simple change
821
like replacing one character for another won't preserve the code unit offsets,
822
so it's hard to see, in the general case, how it would be safe or useful to
823
support swapping or mutating the subject string.
824
825
Similarly, using different match options between the first (external) and
826
subsequent (internal, global) matches is hard to justify. */
827
828
if (use_existing_match)
829
{
830
/* Return early, as the rest of the match_data may not have been
831
initialised. This duplicates and must be in sync with the check below that
832
aborts substitution on any result other than success or no-match. */
833
if (match_data->rc < 0 && match_data->rc != PCRE2_ERROR_NOMATCH)
834
return match_data->rc;
835
836
/* Not supported if the passed-in match was from the DFA interpreter. */
837
if (match_data->matchedby == PCRE2_MATCHEDBY_DFA_INTERPRETER)
838
return PCRE2_ERROR_DFA_UFUNC;
839
840
if (code != match_data->code)
841
return PCRE2_ERROR_DIFFSUBSPATTERN;
842
843
/* We want the passed-in subject strings to match. This implies the effective
844
length must match, and either: the pointers are equal (with strict matching
845
of NULL against NULL); or, the special case of PCRE2_COPY_MATCHED_SUBJECT
846
where we cannot compare pointers but we can verify the contents. */
847
if (length != match_data->subject_length ||
848
!(original_subject == match_data->subject ||
849
((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0 &&
850
(length == 0 ||
851
memcmp(subject, match_data->subject, CU2BYTES(length)) == 0))))
852
return PCRE2_ERROR_DIFFSUBSSUBJECT;
853
854
if (start_offset != match_data->start_offset)
855
return PCRE2_ERROR_DIFFSUBSOFFSET;
856
857
if ((options & ~SUBSTITUTE_OPTIONS) != match_data->options)
858
return PCRE2_ERROR_DIFFSUBSOPTIONS;
859
}
860
861
/* If starting from an existing match, there must be an externally provided
862
match data block. We create an internal match_data block in two cases: (a) an
863
external one is not supplied (and we are not starting from an existing match);
864
(b) an existing match is to be used for the first substitution. In the latter
865
case, we copy the existing match into the internal block, except for any cached
866
heap frame size and pointer. This ensures that no changes are made to the
867
external match data block. */
868
869
/* WARNING: In both cases below a general context is constructed "by hand"
870
because calling pcre2_general_context_create() involves a memory allocation. If
871
the contents of a general context control block are ever changed there will
872
have to be changes below. */
873
874
if (match_data == NULL)
875
{
876
pcre2_general_context gcontext;
877
gcontext.memctl = (mcontext == NULL)?
878
((pcre2_real_code *)code)->memctl :
879
((pcre2_real_match_context *)mcontext)->memctl;
880
match_data = internal_match_data =
881
pcre2_match_data_create_from_pattern(code, &gcontext);
882
if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
883
}
884
885
else if (use_existing_match)
886
{
887
int pairs;
888
pcre2_general_context gcontext;
889
gcontext.memctl = (mcontext == NULL)?
890
((pcre2_real_code *)code)->memctl :
891
((pcre2_real_match_context *)mcontext)->memctl;
892
pairs = (code->top_bracket + 1 < match_data->oveccount)?
893
code->top_bracket + 1 : match_data->oveccount;
894
internal_match_data = pcre2_match_data_create(match_data->oveccount,
895
&gcontext);
896
if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
897
memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
898
+ 2*pairs*sizeof(PCRE2_SIZE));
899
internal_match_data->heapframes = NULL;
900
internal_match_data->heapframes_size = 0;
901
/* Ensure that the subject is not freed when internal_match_data is */
902
internal_match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
903
match_data = internal_match_data;
904
}
905
906
/* If using an internal match data, there's no need to copy the subject. */
907
908
if (internal_match_data != NULL) options &= ~PCRE2_COPY_MATCHED_SUBJECT;
909
910
/* Remember ovector details */
911
912
ovector = pcre2_get_ovector_pointer(match_data);
913
ovector_count = pcre2_get_ovector_count(match_data);
914
915
/* Fixed things in the callout block */
916
917
scb.version = 0;
918
scb.input = subject;
919
scb.output = (PCRE2_SPTR)buffer;
920
scb.ovector = ovector;
921
922
/* Check UTF replacement string if necessary. */
923
924
#ifdef SUPPORT_UNICODE
925
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
926
{
927
rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
928
if (rc != 0)
929
{
930
match_data->leftchar = 0;
931
goto EXIT;
932
}
933
}
934
#endif /* SUPPORT_UNICODE */
935
936
/* Save the substitute options and remove them from the match options. */
937
938
suboptions = options & SUBSTITUTE_OPTIONS;
939
options &= ~SUBSTITUTE_OPTIONS;
940
941
/* Error if the start match offset is greater than the length of the subject. */
942
943
if (start_offset > length)
944
{
945
match_data->leftchar = 0;
946
rc = PCRE2_ERROR_BADOFFSET;
947
goto EXIT;
948
}
949
950
/* Copy up to the start offset, unless only the replacement is required. */
951
952
if (!replacement_only) CHECKMEMCPY(subject, start_offset);
953
954
/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
955
match is taken from the match_data that was passed in. */
956
957
subs = 0;
958
for (;;)
959
{
960
PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
961
uint32_t ptrstackptr = 0;
962
case_state forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE };
963
PCRE2_SIZE casestart_offset = 0;
964
PCRE2_SIZE casestart_extra_needed = 0;
965
966
if (use_existing_match)
967
{
968
rc = match_data->rc;
969
use_existing_match = FALSE;
970
}
971
else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
972
match_data, mcontext);
973
974
#ifdef SUPPORT_UNICODE
975
if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */
976
#endif
977
978
/* Any error other than no match returns the error code. No match breaks the
979
global loop. */
980
981
if (rc == PCRE2_ERROR_NOMATCH) break;
982
983
if (rc < 0) goto EXIT;
984
985
/* Handle a successful match. Matches that use \K to end before they start
986
or start before the current point in the subject are not supported. */
987
988
if (ovector[1] < ovector[0] || ovector[0] < start_offset)
989
{
990
rc = PCRE2_ERROR_BADSUBSPATTERN;
991
goto EXIT;
992
}
993
994
/* Assert that our replacement loop is making progress, checked even in
995
release builds. This should be impossible to hit, however, an infinite loop
996
would be fairly catastrophic.
997
998
"Progress" is measured as ovector[1] strictly advancing, or, an empty match
999
after a non-empty match. */
1000
1001
/* LCOV_EXCL_START */
1002
if (subs > 0 &&
1003
!(ovector[1] > ovecsave[1] ||
1004
(ovector[1] == ovector[0] && ovecsave[1] > ovecsave[0] &&
1005
ovector[1] == ovecsave[1])))
1006
{
1007
PCRE2_DEBUG_UNREACHABLE();
1008
rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
1009
goto EXIT;
1010
}
1011
/* LCOV_EXCL_STOP */
1012
1013
ovecsave[0] = ovector[0];
1014
ovecsave[1] = ovector[1];
1015
1016
/* Count substitutions with a paranoid check for integer overflow; surely no
1017
real call to this function would ever hit this! */
1018
1019
if (subs == INT_MAX)
1020
{
1021
rc = PCRE2_ERROR_TOOMANYREPLACE;
1022
goto EXIT;
1023
}
1024
subs++;
1025
1026
/* Copy the text leading up to the match (unless not required); remember
1027
where the insert begins and how many ovector pairs are set; and remember how
1028
much space we have requested in extra_needed. */
1029
1030
if (rc == 0) rc = ovector_count;
1031
fraglength = ovector[0] - start_offset;
1032
if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
1033
scb.output_offsets[0] = buff_offset;
1034
scb.oveccount = rc;
1035
sub_start_extra_needed = extra_needed;
1036
1037
/* Process the replacement string. If the entire replacement is literal, just
1038
copy it with length check. */
1039
1040
ptr = replacement;
1041
if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
1042
{
1043
CHECKMEMCPY(ptr, rlength);
1044
}
1045
1046
/* Within a non-literal replacement, which must be scanned character by
1047
character, local literal mode can be set by \Q, but only in extended mode
1048
when backslashes are being interpreted. In extended mode we must handle
1049
nested substrings that are to be reprocessed. */
1050
1051
else for (;;)
1052
{
1053
uint32_t ch;
1054
unsigned int chlen;
1055
int group;
1056
uint32_t special;
1057
PCRE2_SPTR text1_start = NULL;
1058
PCRE2_SPTR text1_end = NULL;
1059
PCRE2_SPTR text2_start = NULL;
1060
PCRE2_SPTR text2_end = NULL;
1061
PCRE2_UCHAR name[MAX_NAME_SIZE + 1];
1062
1063
/* If at the end of a nested substring, pop the stack. */
1064
1065
if (ptr >= repend)
1066
{
1067
if (ptrstackptr == 0) break; /* End of replacement string */
1068
repend = ptrstack[--ptrstackptr];
1069
ptr = ptrstack[--ptrstackptr];
1070
continue;
1071
}
1072
1073
/* Handle the next character */
1074
1075
if (escaped_literal)
1076
{
1077
if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
1078
{
1079
escaped_literal = FALSE;
1080
ptr += 2;
1081
continue;
1082
}
1083
goto LOADLITERAL;
1084
}
1085
1086
/* Not in literal mode. */
1087
1088
if (*ptr == CHAR_DOLLAR_SIGN)
1089
{
1090
BOOL inparens;
1091
BOOL inangle;
1092
BOOL star;
1093
PCRE2_SIZE sublength;
1094
PCRE2_UCHAR next;
1095
PCRE2_SPTR subptr, subptrend;
1096
1097
if (++ptr >= repend) goto BAD;
1098
if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
1099
1100
special = 0;
1101
text1_start = NULL;
1102
text1_end = NULL;
1103
text2_start = NULL;
1104
text2_end = NULL;
1105
group = -1;
1106
inparens = FALSE;
1107
inangle = FALSE;
1108
star = FALSE;
1109
subptr = NULL;
1110
subptrend = NULL;
1111
1112
/* Special $ sequences, as supported by Perl, JavaScript, .NET and others. */
1113
if (next == CHAR_AMPERSAND)
1114
{
1115
++ptr;
1116
group = 0;
1117
goto GROUP_SUBSTITUTE;
1118
}
1119
if (next == CHAR_GRAVE_ACCENT || next == CHAR_APOSTROPHE)
1120
{
1121
++ptr;
1122
rc = pcre2_substring_length_bynumber(match_data, 0, &sublength);
1123
if (rc < 0) goto PTREXIT; /* (Sanity-check ovector before reading from it.) */
1124
1125
if (next == CHAR_GRAVE_ACCENT)
1126
{
1127
subptr = subject;
1128
subptrend = subject + ovector[0];
1129
}
1130
else
1131
{
1132
subptr = subject + ovector[1];
1133
subptrend = subject + length;
1134
}
1135
1136
goto SUBPTR_SUBSTITUTE;
1137
}
1138
if (next == CHAR_UNDERSCORE)
1139
{
1140
/* Java, .NET support $_ for "entire input string". */
1141
++ptr;
1142
subptr = subject;
1143
subptrend = subject + length;
1144
goto SUBPTR_SUBSTITUTE;
1145
}
1146
else if (next == CHAR_PLUS &&
1147
!(ptr+1 < repend && ptr[1] == CHAR_LEFT_CURLY_BRACKET))
1148
{
1149
/* Perl supports $+ for "highest captured group" (not the same as $^N
1150
which is mainly only useful inside Perl's match callbacks). We also
1151
don't accept "$+{..." since that's Perl syntax for our ${name}. */
1152
++ptr;
1153
if (code->top_bracket == 0)
1154
{
1155
/* Treat either as "no such group" or "all groups unset" based on the
1156
PCRE2_SUBSTITUTE_UNKNOWN_UNSET option. */
1157
if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) == 0)
1158
{
1159
rc = PCRE2_ERROR_NOSUBSTRING;
1160
goto PTREXIT;
1161
}
1162
group = 0;
1163
}
1164
else
1165
{
1166
/* If we have any capture groups, then the ovector needs to be large
1167
enough for all of them, or the result won't be accurate. */
1168
if (match_data->oveccount < code->top_bracket + 1)
1169
{
1170
rc = PCRE2_ERROR_UNAVAILABLE;
1171
goto PTREXIT;
1172
}
1173
for (group = code->top_bracket; group > 0; group--)
1174
if (ovector[2*group] != PCRE2_UNSET) break;
1175
}
1176
if (group == 0)
1177
{
1178
if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
1179
rc = PCRE2_ERROR_UNSET;
1180
goto PTREXIT;
1181
}
1182
goto GROUP_SUBSTITUTE;
1183
}
1184
1185
if (next == CHAR_LEFT_CURLY_BRACKET)
1186
{
1187
if (++ptr >= repend) goto BAD;
1188
next = *ptr;
1189
inparens = TRUE;
1190
}
1191
else if (next == CHAR_LESS_THAN_SIGN)
1192
{
1193
/* JavaScript compatibility syntax, $<name>. Processes only named
1194
groups (not numbered) and does not support extensions such as star
1195
(you can do ${name} and ${*name}, but not $<*name>). */
1196
if (++ptr >= repend) goto BAD;
1197
next = *ptr;
1198
inangle = TRUE;
1199
}
1200
1201
if (!inangle && next == CHAR_ASTERISK)
1202
{
1203
if (++ptr >= repend) goto BAD;
1204
next = *ptr;
1205
star = TRUE;
1206
}
1207
1208
if (!star && !inangle && next >= CHAR_0 && next <= CHAR_9)
1209
{
1210
group = next - CHAR_0;
1211
while (++ptr < repend)
1212
{
1213
next = *ptr;
1214
if (next < CHAR_0 || next > CHAR_9) break;
1215
group = group * 10 + (next - CHAR_0);
1216
1217
/* A check for a number greater than the highest captured group
1218
is sufficient here; no need for a separate overflow check. If unknown
1219
groups are to be treated as unset, just skip over any remaining
1220
digits and carry on. */
1221
1222
if (group > code->top_bracket)
1223
{
1224
if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
1225
{
1226
while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
1227
break;
1228
}
1229
else
1230
{
1231
rc = PCRE2_ERROR_NOSUBSTRING;
1232
goto PTREXIT;
1233
}
1234
}
1235
}
1236
}
1237
else
1238
{
1239
PCRE2_SIZE name_len;
1240
PCRE2_SPTR name_start = ptr;
1241
if (!read_name_subst(&ptr, repend, utf, code->tables + ctypes_offset))
1242
goto BAD;
1243
name_len = ptr - name_start;
1244
memcpy(name, name_start, CU2BYTES(name_len));
1245
name[name_len] = 0;
1246
}
1247
1248
next = 0; /* not used or updated after this point */
1249
(void)next;
1250
1251
/* In extended mode we recognize ${name:+set text:unset text} and
1252
${name:-default text}. */
1253
1254
if (inparens)
1255
{
1256
if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
1257
!star && ptr < repend - 2 && *ptr == CHAR_COLON)
1258
{
1259
special = *(++ptr);
1260
if (special != CHAR_PLUS && special != CHAR_MINUS)
1261
{
1262
rc = PCRE2_ERROR_BADSUBSTITUTION;
1263
goto PTREXIT;
1264
}
1265
1266
text1_start = ++ptr;
1267
rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
1268
if (rc != 0) goto PTREXIT;
1269
text1_end = ptr;
1270
1271
if (special == CHAR_PLUS && *ptr == CHAR_COLON)
1272
{
1273
text2_start = ++ptr;
1274
rc = find_text_end(code, &ptr, repend, TRUE);
1275
if (rc != 0) goto PTREXIT;
1276
text2_end = ptr;
1277
}
1278
}
1279
1280
else
1281
{
1282
if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
1283
{
1284
rc = PCRE2_ERROR_REPMISSINGBRACE;
1285
goto PTREXIT;
1286
}
1287
}
1288
1289
ptr++;
1290
}
1291
1292
if (inangle)
1293
{
1294
if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN)
1295
goto BAD;
1296
ptr++;
1297
}
1298
1299
/* Have found a syntactically correct group number or name, or *name.
1300
Only *MARK is currently recognized. */
1301
1302
if (star)
1303
{
1304
if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
1305
{
1306
PCRE2_SPTR mark = pcre2_get_mark(match_data);
1307
if (mark != NULL)
1308
{
1309
/* Peek backwards one code unit to obtain the length of the mark.
1310
It can (theoretically) contain an embedded NUL. */
1311
fraglength = mark[-1];
1312
if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1313
substitute_case_callout == NULL)
1314
CHECKCASECPY_DEFAULT(mark, fraglength);
1315
else
1316
CHECKMEMCPY(mark, fraglength);
1317
}
1318
}
1319
else goto BAD;
1320
}
1321
1322
/* Substitute the contents of a group. We don't use substring_copy
1323
functions any more, in order to support case forcing. */
1324
1325
else
1326
{
1327
GROUP_SUBSTITUTE:
1328
/* Find a number for a named group. In case there are duplicate names,
1329
search for the first one that is set. If the name is not found when
1330
PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set, set the group number to a
1331
non-existent group. */
1332
1333
if (group < 0)
1334
{
1335
PCRE2_SPTR first, last, entry;
1336
rc = pcre2_substring_nametable_scan(code, name, &first, &last);
1337
if (rc == PCRE2_ERROR_NOSUBSTRING &&
1338
(suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
1339
{
1340
group = code->top_bracket + 1;
1341
}
1342
else
1343
{
1344
if (rc < 0) goto PTREXIT;
1345
for (entry = first; entry <= last; entry += rc)
1346
{
1347
uint32_t ng = GET2(entry, 0);
1348
if (ng < ovector_count)
1349
{
1350
if (group < 0) group = ng; /* First in ovector */
1351
if (ovector[ng*2] != PCRE2_UNSET)
1352
{
1353
group = ng; /* First that is set */
1354
break;
1355
}
1356
}
1357
}
1358
1359
/* If group is still negative, it means we did not find a group
1360
that is in the ovector. Just set the first group. */
1361
1362
if (group < 0) group = GET2(first, 0);
1363
}
1364
}
1365
1366
/* We now have a group that is identified by number. Find the length of
1367
the captured string. If a group in a non-special substitution is unset
1368
when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
1369
1370
rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
1371
if (rc < 0)
1372
{
1373
if (rc == PCRE2_ERROR_NOSUBSTRING &&
1374
(suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
1375
{
1376
rc = PCRE2_ERROR_UNSET;
1377
}
1378
if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */
1379
if (special == 0) /* Plain substitution */
1380
{
1381
if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
1382
goto PTREXIT; /* Else error */
1383
}
1384
}
1385
1386
/* If special is '+' we have a 'set' and possibly an 'unset' text,
1387
both of which are reprocessed when used. If special is '-' we have a
1388
default text for when the group is unset; it must be reprocessed. */
1389
1390
if (special != 0)
1391
{
1392
if (special == CHAR_MINUS)
1393
{
1394
if (rc == 0) goto LITERAL_SUBSTITUTE;
1395
text2_start = text1_start;
1396
text2_end = text1_end;
1397
}
1398
1399
if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
1400
ptrstack[ptrstackptr++] = ptr;
1401
ptrstack[ptrstackptr++] = repend;
1402
1403
if (rc == 0)
1404
{
1405
ptr = text1_start;
1406
repend = text1_end;
1407
}
1408
else
1409
{
1410
ptr = text2_start;
1411
repend = text2_end;
1412
}
1413
continue;
1414
}
1415
1416
/* Otherwise we have a literal substitution of a group's contents. */
1417
1418
LITERAL_SUBSTITUTE:
1419
subptr = subject + ovector[group*2];
1420
subptrend = subject + ovector[group*2 + 1];
1421
1422
/* Substitute a literal string, possibly forcing alphabetic case. */
1423
1424
SUBPTR_SUBSTITUTE:
1425
if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1426
substitute_case_callout == NULL)
1427
CHECKCASECPY_DEFAULT(subptr, subptrend - subptr);
1428
else
1429
CHECKMEMCPY(subptr, subptrend - subptr);
1430
}
1431
} /* End of $ processing */
1432
1433
/* Handle an escape sequence in extended mode. We can use check_escape()
1434
to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
1435
the case-forcing escapes are not supported in pcre2_compile() so must be
1436
recognized here. */
1437
1438
else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
1439
*ptr == CHAR_BACKSLASH)
1440
{
1441
int errorcode;
1442
case_state new_forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE };
1443
1444
if (ptr < repend - 1) switch (ptr[1])
1445
{
1446
case CHAR_L:
1447
new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
1448
new_forcecase.single_char = FALSE;
1449
ptr += 2;
1450
break;
1451
1452
case CHAR_l:
1453
new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
1454
new_forcecase.single_char = TRUE;
1455
ptr += 2;
1456
if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_U)
1457
{
1458
/* Perl reverse-title-casing feature for \l\U */
1459
new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST;
1460
new_forcecase.single_char = FALSE;
1461
ptr += 2;
1462
}
1463
break;
1464
1465
case CHAR_U:
1466
new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
1467
new_forcecase.single_char = FALSE;
1468
ptr += 2;
1469
break;
1470
1471
case CHAR_u:
1472
new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_TITLE_FIRST;
1473
new_forcecase.single_char = TRUE;
1474
ptr += 2;
1475
if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_L)
1476
{
1477
/* Perl title-casing feature for \u\L */
1478
new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_TITLE_FIRST;
1479
new_forcecase.single_char = FALSE;
1480
ptr += 2;
1481
}
1482
break;
1483
1484
default:
1485
break;
1486
}
1487
1488
if (new_forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)
1489
{
1490
SETFORCECASE:
1491
1492
/* If the substitute_case_callout is unset, our case-forcing is done
1493
immediately. If there is a callout however, then its action is delayed
1494
until all the characters have been collected.
1495
1496
Apply the callout now, before we set the new casing mode. */
1497
1498
if (substitute_case_callout != NULL &&
1499
forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)
1500
DELAYEDFORCECASE();
1501
1502
forcecase = new_forcecase;
1503
casestart_offset = buff_offset;
1504
casestart_extra_needed = extra_needed;
1505
continue;
1506
}
1507
1508
ptr++; /* Point after \ */
1509
rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
1510
code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL);
1511
if (errorcode != 0) goto BADESCAPE;
1512
1513
switch(rc)
1514
{
1515
case ESC_E:
1516
goto SETFORCECASE;
1517
1518
case ESC_Q:
1519
escaped_literal = TRUE;
1520
continue;
1521
1522
case 0: /* Data character */
1523
case ESC_b: /* \b is backspace in a substitution */
1524
case ESC_v: /* \v is vertical tab in a substitution */
1525
1526
if (rc == ESC_b) ch = CHAR_BS;
1527
if (rc == ESC_v) ch = CHAR_VT;
1528
1529
#ifdef SUPPORT_UNICODE
1530
if (utf) chlen = PRIV(ord2utf)(ch, temp); else
1531
#endif
1532
{
1533
temp[0] = ch;
1534
chlen = 1;
1535
}
1536
1537
if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1538
substitute_case_callout == NULL)
1539
CHECKCASECPY_DEFAULT(temp, chlen);
1540
else
1541
CHECKMEMCPY(temp, chlen);
1542
continue;
1543
1544
case ESC_g:
1545
{
1546
PCRE2_SIZE name_len;
1547
PCRE2_SPTR name_start;
1548
1549
/* Parse the \g<name> form (\g<number> already handled by check_escape) */
1550
if (ptr >= repend || *ptr != CHAR_LESS_THAN_SIGN)
1551
goto BADESCAPE;
1552
++ptr;
1553
1554
name_start = ptr;
1555
if (!read_name_subst(&ptr, repend, utf, code->tables + ctypes_offset))
1556
goto BADESCAPE;
1557
name_len = ptr - name_start;
1558
1559
if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN)
1560
goto BADESCAPE;
1561
++ptr;
1562
1563
special = 0;
1564
group = -1;
1565
memcpy(name, name_start, CU2BYTES(name_len));
1566
name[name_len] = 0;
1567
goto GROUP_SUBSTITUTE;
1568
}
1569
1570
default:
1571
if (rc < 0)
1572
{
1573
special = 0;
1574
group = -rc - 1;
1575
goto GROUP_SUBSTITUTE;
1576
}
1577
goto BADESCAPE;
1578
}
1579
} /* End of backslash processing */
1580
1581
/* Handle a literal code unit */
1582
1583
else
1584
{
1585
PCRE2_SPTR ch_start;
1586
1587
LOADLITERAL:
1588
ch_start = ptr;
1589
GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */
1590
(void) ch;
1591
1592
if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1593
substitute_case_callout == NULL)
1594
CHECKCASECPY_DEFAULT(ch_start, ptr - ch_start);
1595
else
1596
CHECKMEMCPY(ch_start, ptr - ch_start);
1597
} /* End handling a literal code unit */
1598
} /* End of loop for scanning the replacement. */
1599
1600
/* If the substitute_case_callout is unset, our case-forcing is done
1601
immediately. If there is a callout however, then its action is delayed
1602
until all the characters have been collected.
1603
1604
We now clean up any trailing section of the replacement for which we deferred
1605
the case-forcing. */
1606
1607
if (substitute_case_callout != NULL &&
1608
forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)
1609
DELAYEDFORCECASE();
1610
1611
/* The replacement has been copied to the output, or its size has been
1612
remembered. Handle the callout if there is one. */
1613
1614
if (mcontext != NULL && mcontext->substitute_callout != NULL)
1615
{
1616
/* If we have an actual (non-simulated) replacement, do the callout. */
1617
1618
if (!overflowed)
1619
{
1620
scb.subscount = subs;
1621
scb.output_offsets[1] = buff_offset;
1622
rc = mcontext->substitute_callout(&scb,
1623
mcontext->substitute_callout_data);
1624
1625
/* A non-zero return means cancel this substitution. Instead, copy the
1626
matched string fragment. */
1627
1628
if (rc != 0)
1629
{
1630
PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
1631
PCRE2_SIZE oldlength = ovector[1] - ovector[0];
1632
1633
buff_offset -= newlength;
1634
lengthleft += newlength;
1635
if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
1636
1637
/* A negative return means do not do any more. */
1638
1639
if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
1640
}
1641
}
1642
1643
/* In this interesting case, we cannot do the callout, so it's hard to
1644
estimate the required buffer size. What callers want is to be able to make
1645
two calls to pcre2_substitute(), once with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
1646
to discover the buffer size, and then a second and final call. Older
1647
versions of PCRE2 violated this assumption, by proceeding as if the callout
1648
had returned zero - but on the second call to pcre2_substitute() it could
1649
return non-zero and then overflow the buffer again. Callers probably don't
1650
want to keep on looping to incrementally discover the buffer size. */
1651
1652
else
1653
{
1654
PCRE2_SIZE newlength_buf = buff_offset - scb.output_offsets[0];
1655
PCRE2_SIZE newlength_extra = extra_needed - sub_start_extra_needed;
1656
PCRE2_SIZE newlength =
1657
(newlength_extra > ~(PCRE2_SIZE)0 - newlength_buf)? /* Integer overflow */
1658
~(PCRE2_SIZE)0 : newlength_buf + newlength_extra; /* Cap the addition */
1659
PCRE2_SIZE oldlength = ovector[1] - ovector[0];
1660
1661
/* Be pessimistic: request whichever buffer size is larger out of
1662
accepting or rejecting the substitution. */
1663
1664
if (oldlength > newlength)
1665
{
1666
PCRE2_SIZE additional = oldlength - newlength;
1667
if (additional > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */
1668
goto TOOLARGEREPLACE;
1669
extra_needed += additional;
1670
}
1671
1672
/* Proceed as if the callout did not return a negative. A negative
1673
effectively rejects all future substitutions, but we want to examine them
1674
pessimistically. */
1675
}
1676
}
1677
1678
/* Exit the global loop if we are not in global mode, or if pcre2_next_match()
1679
indicates we have reached the end of the subject. */
1680
1681
if ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) == 0 ||
1682
!pcre2_next_match(match_data, &start_offset, &goptions))
1683
{
1684
start_offset = ovector[1];
1685
break;
1686
}
1687
1688
/* Verify that pcre2_next_match() has not done a bumpalong (because we have
1689
already returned PCRE2_ERROR_BADSUBSPATTERN for \K in lookarounds).
1690
1691
We would otherwise have to memcpy the fragment spanning from ovector[1] to the
1692
new start_offset.*/
1693
1694
PCRE2_ASSERT(start_offset == ovector[1]);
1695
1696
} /* End of global loop */
1697
1698
/* Copy the rest of the subject unless not required, and terminate the output
1699
with a binary zero. */
1700
1701
if (!replacement_only)
1702
{
1703
fraglength = length - start_offset;
1704
CHECKMEMCPY(subject + start_offset, fraglength);
1705
}
1706
1707
temp[0] = 0;
1708
CHECKMEMCPY(temp, 1);
1709
1710
/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
1711
and matching has carried on after a full buffer, in order to compute the length
1712
needed. Otherwise, an overflow generates an immediate error return. */
1713
1714
if (overflowed)
1715
{
1716
rc = PCRE2_ERROR_NOMEMORY;
1717
1718
if (extra_needed > ~(PCRE2_SIZE)0 - buff_length) /* Integer overflow */
1719
goto TOOLARGEREPLACE;
1720
*blength = buff_length + extra_needed;
1721
}
1722
1723
/* After a successful execution, return the number of substitutions and set the
1724
length of buffer used, excluding the trailing zero. */
1725
1726
else
1727
{
1728
rc = subs;
1729
*blength = buff_offset - 1;
1730
}
1731
1732
EXIT:
1733
if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
1734
else match_data->rc = rc;
1735
return rc;
1736
1737
NOROOM:
1738
rc = PCRE2_ERROR_NOMEMORY;
1739
goto EXIT;
1740
1741
CASEERROR:
1742
rc = PCRE2_ERROR_REPLACECASE;
1743
goto EXIT;
1744
1745
TOOLARGEREPLACE:
1746
rc = PCRE2_ERROR_TOOLARGEREPLACE;
1747
goto EXIT;
1748
1749
BAD:
1750
rc = PCRE2_ERROR_BADREPLACEMENT;
1751
goto PTREXIT;
1752
1753
BADESCAPE:
1754
rc = PCRE2_ERROR_BADREPESCAPE;
1755
1756
PTREXIT:
1757
*blength = (PCRE2_SIZE)(ptr - replacement);
1758
goto EXIT;
1759
}
1760
1761
/* End of pcre2_substitute.c */
1762
1763