Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_substitute.c
9898 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#ifdef HAVE_CONFIG_H
43
#include "config.h"
44
#endif
45
46
#include "pcre2_internal.h"
47
48
/* Number of entries in the ptrstack[] array that pcre2_substitute() declares
for each substitution. */

#define PTR_STACK_SIZE 20

/* Every option bit that is specific to pcre2_substitute(). These bits are
saved separately by pcre2_substitute() and then removed from the option word
before it is passed on to pcre2_match(). */

#define SUBSTITUTE_OPTIONS \
  (PCRE2_SUBSTITUTE_EXTENDED         | \
   PCRE2_SUBSTITUTE_GLOBAL           | \
   PCRE2_SUBSTITUTE_LITERAL          | \
   PCRE2_SUBSTITUTE_MATCHED          | \
   PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  | \
   PCRE2_SUBSTITUTE_REPLACEMENT_ONLY | \
   PCRE2_SUBSTITUTE_UNKNOWN_UNSET    | \
   PCRE2_SUBSTITUTE_UNSET_EMPTY)
55
56
57
58
/*************************************************
59
* Find end of substitute text *
60
*************************************************/
61
62
/* In extended mode, we recognize ${name:+set text:unset text} and similar
63
constructions. This requires the identification of unescaped : and }
64
characters. This function scans for such. It must deal with nested ${
65
constructions. The pointer to the text is updated, either to the required end
66
character, or to where an error was detected.
67
68
Arguments:
69
code points to the compiled expression (for options)
70
ptrptr points to the pointer to the start of the text (updated)
71
ptrend end of the whole string
72
last TRUE if the last expected string (only } recognized)
73
74
Returns: 0 on success
75
negative error code on failure
76
*/
77
78
static int
79
find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
80
BOOL last)
81
{
82
int rc = 0;
83
uint32_t nestlevel = 0;
84
BOOL literal = FALSE;
85
PCRE2_SPTR ptr = *ptrptr;
86
87
for (; ptr < ptrend; ptr++)
88
{
89
if (literal)
90
{
91
if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
92
{
93
literal = FALSE;
94
ptr += 1;
95
}
96
}
97
98
else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
99
{
100
if (nestlevel == 0) goto EXIT;
101
nestlevel--;
102
}
103
104
else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
105
106
else if (*ptr == CHAR_DOLLAR_SIGN)
107
{
108
if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
109
{
110
nestlevel++;
111
ptr += 1;
112
}
113
}
114
115
else if (*ptr == CHAR_BACKSLASH)
116
{
117
int erc;
118
int errorcode;
119
uint32_t ch;
120
121
if (ptr < ptrend - 1) switch (ptr[1])
122
{
123
case CHAR_L:
124
case CHAR_l:
125
case CHAR_U:
126
case CHAR_u:
127
ptr += 1;
128
continue;
129
}
130
131
ptr += 1; /* Must point after \ */
132
erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
133
code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL);
134
ptr -= 1; /* Back to last code unit of escape */
135
if (errorcode != 0)
136
{
137
/* errorcode from check_escape is positive, so must not be returned by
138
pcre2_substitute(). */
139
rc = PCRE2_ERROR_BADREPESCAPE;
140
goto EXIT;
141
}
142
143
switch(erc)
144
{
145
case 0: /* Data character */
146
case ESC_b: /* Data character */
147
case ESC_v: /* Data character */
148
case ESC_E: /* Isolated \E is ignored */
149
break;
150
151
case ESC_Q:
152
literal = TRUE;
153
break;
154
155
case ESC_g:
156
/* The \g<name> form (\g<number> already handled by check_escape)
157
158
Don't worry about finding the matching ">". We are super, super lenient
159
about validating ${} replacements inside find_text_end(), so we certainly
160
don't need to worry about other syntax. Importantly, a \g<..> or $<...>
161
sequence can't contain a '}' character. */
162
break;
163
164
default:
165
if (erc < 0)
166
break; /* capture group reference */
167
rc = PCRE2_ERROR_BADREPESCAPE;
168
goto EXIT;
169
}
170
}
171
}
172
173
rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */
174
175
EXIT:
176
*ptrptr = ptr;
177
return rc;
178
}
179
180
181
/*************************************************
182
* Validate group name *
183
*************************************************/
184
185
/* This function scans for a capture group name, validating it
186
consists of legal characters, is not empty, and does not exceed
187
MAX_NAME_SIZE.
188
189
Arguments:
190
ptrptr points to the pointer to the start of the text (updated)
191
ptrend end of the whole string
192
utf true if the input is UTF-encoded
193
ctypes pointer to the character types table
194
195
Returns: TRUE if a name was read
196
FALSE otherwise
197
*/
198
199
static BOOL
200
read_name_subst(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf,
201
const uint8_t* ctypes)
202
{
203
PCRE2_SPTR ptr = *ptrptr;
204
PCRE2_SPTR nameptr = ptr;
205
206
if (ptr >= ptrend) /* No characters in name */
207
goto FAILED;
208
209
/* We do not need to check whether the name starts with a non-digit.
210
We are simply referencing names here, not defining them. */
211
212
/* See read_name in the pcre2_compile.c for the corresponding logic
213
restricting group names inside the pattern itself. */
214
215
#ifdef SUPPORT_UNICODE
216
if (utf)
217
{
218
uint32_t c, type;
219
220
while (ptr < ptrend)
221
{
222
GETCHAR(c, ptr);
223
type = UCD_CHARTYPE(c);
224
if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
225
c != CHAR_UNDERSCORE) break;
226
ptr++;
227
FORWARDCHARTEST(ptr, ptrend);
228
}
229
}
230
else
231
#else
232
(void)utf; /* Avoid compiler warning */
233
#endif /* SUPPORT_UNICODE */
234
235
/* Handle group names in non-UTF modes. */
236
237
{
238
while (ptr < ptrend && MAX_255(*ptr) && (ctypes[*ptr] & ctype_word) != 0)
239
{
240
ptr++;
241
}
242
}
243
244
/* Check name length */
245
246
if (ptr - nameptr > MAX_NAME_SIZE)
247
goto FAILED;
248
249
/* Subpattern names must not be empty */
250
if (ptr == nameptr)
251
goto FAILED;
252
253
*ptrptr = ptr;
254
return TRUE;
255
256
FAILED:
257
*ptrptr = ptr;
258
return FALSE;
259
}
260
261
262
/*************************************************
263
* Case transformations *
264
*************************************************/
265
266
#define PCRE2_SUBSTITUTE_CASE_NONE 0
267
// 1, 2, 3 are PCRE2_SUBSTITUTE_CASE_LOWER, UPPER, TITLE_FIRST.
268
#define PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST 4
269
270
typedef struct {
271
int to_case; /* One of PCRE2_SUBSTITUTE_CASE_xyz */
272
BOOL single_char;
273
} case_state;
274
275
/* Helper to guess how much a string is likely to increase in size when
276
case-transformed. Usually, strings don't change size at all, but some rare
277
characters do grow. Estimate +10%, plus another few characters.
278
279
Performing this estimation is unfortunate, but inevitable, since we can't call
280
the callout if we ran out of buffer space to prepare its input.
281
282
Because this estimate is inexact (and in pathological cases, underestimates the
283
required buffer size) we must document that when you have a
284
substitute_case_callout, and you are using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, you
285
may need more than two calls to determine the final buffer size. */
286
287
static PCRE2_SIZE
288
pessimistic_case_inflation(PCRE2_SIZE len)
289
{
290
return (len >> 3u) + 10;
291
}
292
293
/* Case transformation behaviour if no callout is passed. */
294
295
static PCRE2_SIZE
296
default_substitute_case_callout(
297
PCRE2_SPTR input, PCRE2_SIZE input_len,
298
PCRE2_UCHAR *output, PCRE2_SIZE output_cap,
299
case_state *state, const pcre2_code *code)
300
{
301
PCRE2_SPTR input_end = input + input_len;
302
#ifdef SUPPORT_UNICODE
303
BOOL utf;
304
BOOL ucp;
305
#endif
306
PCRE2_UCHAR temp[6];
307
BOOL next_to_upper;
308
BOOL rest_to_upper;
309
BOOL single_char;
310
BOOL overflow = FALSE;
311
PCRE2_SIZE written = 0;
312
313
/* Helpful simplifying invariant: input and output are disjoint buffers.
314
I believe that this code is technically undefined behaviour, because the two
315
pointers input/output are "unrelated" pointers and hence not comparable. Casting
316
via char* bypasses some but not all of those technical rules. It is not included
317
in release builds, in any case. */
318
PCRE2_ASSERT((char *)(input + input_len) <= (char *)output ||
319
(char *)(output + output_cap) <= (char *)input);
320
321
#ifdef SUPPORT_UNICODE
322
utf = (code->overall_options & PCRE2_UTF) != 0;
323
ucp = (code->overall_options & PCRE2_UCP) != 0;
324
#endif
325
326
if (input_len == 0) return 0;
327
328
switch (state->to_case)
329
{
330
default:
331
PCRE2_DEBUG_UNREACHABLE();
332
return 0;
333
334
case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE
335
case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE
336
next_to_upper = rest_to_upper = (state->to_case == PCRE2_SUBSTITUTE_CASE_UPPER);
337
break;
338
339
case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE
340
next_to_upper = TRUE;
341
rest_to_upper = FALSE;
342
state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
343
break;
344
345
case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE
346
next_to_upper = FALSE;
347
rest_to_upper = TRUE;
348
state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
349
break;
350
}
351
352
single_char = state->single_char;
353
if (single_char)
354
state->to_case = PCRE2_SUBSTITUTE_CASE_NONE;
355
356
while (input < input_end)
357
{
358
uint32_t ch;
359
unsigned int chlen;
360
361
GETCHARINCTEST(ch, input);
362
363
#ifdef SUPPORT_UNICODE
364
if ((utf || ucp) && ch >= 128)
365
{
366
uint32_t type = UCD_CHARTYPE(ch);
367
if (PRIV(ucp_gentype)[type] == ucp_L &&
368
type != (next_to_upper? ucp_Lu : ucp_Ll))
369
ch = UCD_OTHERCASE(ch);
370
371
/* TODO This is far from correct... it doesn't support the SpecialCasing.txt
372
mappings, but worse, it's not even correct for all the ordinary case
373
mappings. We should add support for those (at least), and then add the
374
SpecialCasing.txt mappings for Esszet and ligatures, and finally use the
375
Turkish casing flag on the match context. */
376
}
377
else
378
#endif
379
if (MAX_255(ch))
380
{
381
if (((code->tables + cbits_offset +
382
(next_to_upper? cbit_upper:cbit_lower)
383
)[ch/8] & (1u << (ch%8))) == 0)
384
ch = (code->tables + fcc_offset)[ch];
385
}
386
387
#ifdef SUPPORT_UNICODE
388
if (utf) chlen = PRIV(ord2utf)(ch, temp); else
389
#endif
390
{
391
temp[0] = ch;
392
chlen = 1;
393
}
394
395
if (!overflow && chlen <= output_cap)
396
{
397
memcpy(output, temp, CU2BYTES(chlen));
398
output += chlen;
399
output_cap -= chlen;
400
}
401
else
402
{
403
overflow = TRUE;
404
}
405
406
if (chlen > ~(PCRE2_SIZE)0 - written) /* Integer overflow */
407
return ~(PCRE2_SIZE)0;
408
written += chlen;
409
410
next_to_upper = rest_to_upper;
411
412
/* memcpy the remainder, if only transforming a single character. */
413
414
if (single_char)
415
{
416
PCRE2_SIZE rest_len = input_end - input;
417
418
if (!overflow && rest_len <= output_cap)
419
memcpy(output, input, CU2BYTES(rest_len));
420
421
if (rest_len > ~(PCRE2_SIZE)0 - written) /* Integer overflow */
422
return ~(PCRE2_SIZE)0;
423
written += rest_len;
424
425
return written;
426
}
427
}
428
429
return written;
430
}
431
432
/* Helper to perform the call to the substitute_case_callout. We wrap the
433
user-provided callout because our internal arguments are slightly extended. We
434
don't want the user callout to handle the case of "\l" (first character only to
435
lowercase) or "\l\U" (first character to lowercase, rest to uppercase) because
436
those are not operations defined by Unicode. Instead the user callout simply
437
needs to provide the three Unicode primitives: lower, upper, titlecase. */
438
439
static PCRE2_SIZE
440
do_case_copy(
441
PCRE2_UCHAR *input_output, PCRE2_SIZE input_len, PCRE2_SIZE output_cap,
442
case_state *state, BOOL utf,
443
PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,
444
PCRE2_SIZE, int, void *),
445
void *substitute_case_callout_data)
446
{
447
PCRE2_SPTR input = input_output;
448
PCRE2_UCHAR *output = input_output;
449
PCRE2_SIZE rc;
450
PCRE2_SIZE rc2;
451
int ch1_to_case;
452
int rest_to_case;
453
PCRE2_UCHAR ch1[6];
454
PCRE2_SIZE ch1_len;
455
PCRE2_SPTR rest;
456
PCRE2_SIZE rest_len;
457
BOOL ch1_overflow = FALSE;
458
BOOL rest_overflow = FALSE;
459
460
#if PCRE2_CODE_UNIT_WIDTH == 32 || !defined(SUPPORT_UNICODE)
461
(void)utf; /* Avoid compiler warning. */
462
#endif
463
464
PCRE2_ASSERT(input_len != 0);
465
466
switch (state->to_case)
467
{
468
default:
469
PCRE2_DEBUG_UNREACHABLE();
470
return 0;
471
472
case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE
473
case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE
474
case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE
475
476
/* The easy case, where our internal casing operations align with those of
477
the callout. */
478
479
if (state->single_char == FALSE)
480
{
481
rc = substitute_case_callout(input, input_len, output, output_cap,
482
state->to_case, substitute_case_callout_data);
483
484
if (state->to_case == PCRE2_SUBSTITUTE_CASE_TITLE_FIRST)
485
state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
486
487
return rc;
488
}
489
490
ch1_to_case = state->to_case;
491
rest_to_case = PCRE2_SUBSTITUTE_CASE_NONE;
492
break;
493
494
case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE
495
ch1_to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
496
rest_to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
497
break;
498
}
499
500
/* Identify the leading character. Take copy, because its storage overlaps with
501
`output`, and hence may be scrambled by the callout. */
502
503
{
504
PCRE2_SPTR ch_end = input;
505
uint32_t ch;
506
507
GETCHARINCTEST(ch, ch_end);
508
(void) ch;
509
PCRE2_ASSERT(ch_end <= input + input_len && ch_end - input <= 6);
510
ch1_len = ch_end - input;
511
memcpy(ch1, input, CU2BYTES(ch1_len));
512
}
513
514
rest = input + ch1_len;
515
rest_len = input_len - ch1_len;
516
517
/* Transform just ch1. The buffers are always in-place (input == output). With a
518
custom callout, we need a loop to discover its required buffer size. The loop
519
wouldn't be required if the callout were well-behaved, but it might be naughty
520
and return "5" the first time, then "10" the next time we call it using the
521
exact same input! */
522
523
{
524
PCRE2_SIZE ch1_cap;
525
PCRE2_SIZE max_ch1_cap;
526
527
ch1_cap = ch1_len; /* First attempt uses the space vacated by ch1. */
528
PCRE2_ASSERT(output_cap >= input_len && input_len >= rest_len);
529
max_ch1_cap = output_cap - rest_len;
530
531
while (TRUE)
532
{
533
rc = substitute_case_callout(ch1, ch1_len, output, ch1_cap, ch1_to_case,
534
substitute_case_callout_data);
535
if (rc == ~(PCRE2_SIZE)0) return rc;
536
537
if (rc <= ch1_cap) break;
538
539
if (rc > max_ch1_cap)
540
{
541
ch1_overflow = TRUE;
542
break;
543
}
544
545
/* Move the rest to the right, to make room for expanding ch1. */
546
547
memmove(input_output + rc, rest, CU2BYTES(rest_len));
548
rest = input + rc;
549
550
ch1_cap = rc;
551
552
/* Proof of loop termination: `ch1_cap` is growing on each iteration, but
553
the loop ends if `rc` reaches the (unchanging) upper bound of output_cap. */
554
}
555
}
556
557
if (rest_to_case == PCRE2_SUBSTITUTE_CASE_NONE)
558
{
559
if (!ch1_overflow)
560
{
561
PCRE2_ASSERT(rest_len <= output_cap - rc);
562
memmove(output + rc, rest, CU2BYTES(rest_len));
563
}
564
rc2 = rest_len;
565
566
state->to_case = PCRE2_SUBSTITUTE_CASE_NONE;
567
}
568
else
569
{
570
PCRE2_UCHAR dummy[1];
571
572
rc2 = substitute_case_callout(rest, rest_len,
573
ch1_overflow? dummy : output + rc,
574
ch1_overflow? 0u : output_cap - rc,
575
rest_to_case, substitute_case_callout_data);
576
if (rc2 == ~(PCRE2_SIZE)0) return rc2;
577
578
if (!ch1_overflow && rc2 > output_cap - rc) rest_overflow = TRUE;
579
580
/* If ch1 grows so that `xform(ch1)+rest` can't fit in the buffer, but then
581
`rest` shrinks, it's actually possible for the total calculated length of
582
`xform(ch1)+xform(rest)` to come out at less than output_cap. But we can't
583
report that, because it would make it seem that the operation succeeded.
584
If either of xform(ch1) or xform(rest) won't fit in the buffer, our final
585
result must be > output_cap. */
586
if (ch1_overflow && rc2 < rest_len)
587
rc2 = rest_len;
588
589
state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
590
}
591
592
if (rc2 > ~(PCRE2_SIZE)0 - rc) /* Integer overflow */
593
return ~(PCRE2_SIZE)0;
594
595
PCRE2_ASSERT(!(ch1_overflow || rest_overflow) || rc + rc2 > output_cap);
596
(void)rest_overflow;
597
598
return rc + rc2;
599
}
600
601
602
/*************************************************
603
* Match and substitute *
604
*************************************************/
605
606
/* This function applies a compiled re to a subject string and creates a new
607
string with substitutions. The first 7 arguments are the same as for
608
pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
609
610
Arguments:
611
code points to the compiled expression
612
subject points to the subject string
613
length length of subject string (may contain binary zeros)
614
start_offset where to start in the subject string
615
options option bits
616
match_data points to a match_data block, or is NULL
617
context points a PCRE2 context
618
replacement points to the replacement string
619
rlength length of replacement string
620
buffer where to put the substituted string
621
blength points to length of buffer; updated to length of string
622
623
Returns: >= 0 number of substitutions made
624
< 0 an error code
625
PCRE2_ERROR_BADREPLACEMENT means invalid use of $
626
*/
627
628
/* This macro copies code units into the output buffer if there is room for
them. If there is not, it either gives an error immediately (by jumping to
NOROOM) or, when PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, switches to overflow
mode. In overflow mode nothing further is copied, and the lengths are just
accumulated in extra_needed.

The macro uses these variables of the function in which it is expanded:
buffer, buff_offset, lengthleft, overflowed, extra_needed and suboptions, as
well as the labels NOROOM and TOOLARGEREPLACE. */

#define CHECKMEMCPY(from, length_) \
  do { \
     PCRE2_SIZE chkmc_length = (length_); \
     if (!overflowed && lengthleft >= chkmc_length) \
       { \
       memcpy(buffer + buff_offset, from, CU2BYTES(chkmc_length)); \
       buff_offset += chkmc_length; \
       lengthleft -= chkmc_length; \
       } \
     else if (!overflowed) \
       { \
       if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
       overflowed = TRUE; \
       extra_needed = chkmc_length - lengthleft; \
       } \
     else \
       { \
       if (chkmc_length > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \
         goto TOOLARGEREPLACE; \
       extra_needed += chkmc_length; \
       } \
     } \
  while (0)
655
656
/* These macros check for space and copy characters with casing modifications.
On overflow, they behave as for CHECKMEMCPY().

CHECKCASECPY_BASE contains the common logic. Its do_call argument is a block
of code that must set chkcc_rc to the length of the transformed text, and may
read chkcc_length. The block is placed directly inside the macro's do/while,
so it can use "break" to skip the space accounting that follows it.

When substitute_case_callout is NULL, the source and destination buffers must
not overlap, because our default handler does not support this. */

#define CHECKCASECPY_BASE(length_, do_call) \
  do { \
     PCRE2_SIZE chkcc_length = (PCRE2_SIZE)(length_); \
     PCRE2_SIZE chkcc_rc; \
     do_call \
     if (chkcc_rc <= lengthleft) \
       { \
       buff_offset += chkcc_rc; \
       lengthleft -= chkcc_rc; \
       } \
     else \
       { \
       if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
       overflowed = TRUE; \
       extra_needed = chkcc_rc - lengthleft; \
       } \
     } \
  while (0)

/* Case-transform text from a separate source using the default handler. If
the buffer has already overflowed, the handler is given no space, its result
is just added to extra_needed, and the common accounting is skipped. */

#define CHECKCASECPY_DEFAULT(from, length_) \
  CHECKCASECPY_BASE(length_, { \
    chkcc_rc = default_substitute_case_callout(from, chkcc_length,            \
                                               buffer + buff_offset,          \
                                               overflowed? 0 : lengthleft,    \
                                               &forcecase, code);             \
    if (overflowed) \
      { \
      if (chkcc_rc > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \
        goto TOOLARGEREPLACE; \
      extra_needed += chkcc_rc; \
      break; \
      } \
  })

/* Case-transform, in place, text that is already in the output buffer at
buff_offset, using the user's callout. A result of ~(PCRE2_SIZE)0 is an
error. */

#define CHECKCASECPY_CALLOUT(length_) \
  CHECKCASECPY_BASE(length_, { \
    chkcc_rc = do_case_copy(buffer + buff_offset, chkcc_length,               \
                            lengthleft, &forcecase, utf,                      \
                            substitute_case_callout,                          \
                            substitute_case_callout_data);                    \
    if (chkcc_rc == ~(PCRE2_SIZE)0) goto CASEERROR; \
  })
704
705
/* This macro does a delayed case transformation, for the situation when we
have a case-forcing callout. The text that is waiting to be transformed is
whatever has been added, to the buffer and to extra_needed, since the point
recorded in casestart_offset and casestart_extra_needed.

If the buffer has not overflowed, the waiting text is all in the buffer, so the
buffer is rewound to where the text starts, and the text is transformed in
place. If the buffer has overflowed, the text is not available, so all that can
be done is to add a guess at its growth to extra_needed. */

#define DELAYEDFORCECASE() \
  do { \
     PCRE2_SIZE chars_outstanding = (buff_offset - casestart_offset) + \
                                    (extra_needed - casestart_extra_needed); \
     if (chars_outstanding > 0) \
       { \
       if (!overflowed) \
         { \
         /* Rewind the buffer */ \
         lengthleft += (buff_offset - casestart_offset); \
         buff_offset = casestart_offset; \
         /* Care! In-place case transformation */ \
         CHECKCASECPY_CALLOUT(chars_outstanding); \
         } \
       else \
         { \
         PCRE2_SIZE guess = pessimistic_case_inflation(chars_outstanding); \
         if (guess > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \
           goto TOOLARGEREPLACE; \
         extra_needed += guess; \
         } \
       } \
     } \
  while (0)
732
733
734
/* Here's the function */
735
736
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
737
pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
738
PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
739
pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
740
PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
741
{
742
int rc;
743
int subs;
744
uint32_t ovector_count;
745
uint32_t goptions = 0;
746
uint32_t suboptions;
747
pcre2_match_data *internal_match_data = NULL;
748
BOOL escaped_literal = FALSE;
749
BOOL overflowed = FALSE;
750
BOOL use_existing_match;
751
BOOL replacement_only;
752
BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
753
PCRE2_UCHAR temp[6];
754
PCRE2_SPTR ptr;
755
PCRE2_SPTR repend = NULL;
756
PCRE2_SIZE extra_needed = 0;
757
PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
758
PCRE2_SIZE *ovector;
759
PCRE2_SIZE ovecsave[3];
760
pcre2_substitute_callout_block scb;
761
PCRE2_SIZE sub_start_extra_needed;
762
PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,
763
PCRE2_SIZE, int, void *) = NULL;
764
void *substitute_case_callout_data = NULL;
765
766
/* General initialization */
767
768
buff_offset = 0;
769
lengthleft = buff_length = *blength;
770
*blength = PCRE2_UNSET;
771
ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
772
773
if (mcontext != NULL)
774
{
775
substitute_case_callout = mcontext->substitute_case_callout;
776
substitute_case_callout_data = mcontext->substitute_case_callout_data;
777
}
778
779
/* Partial matching is not valid. This must come after setting *blength to
780
PCRE2_UNSET, so as not to imply an offset in the replacement. */
781
782
if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
783
return PCRE2_ERROR_BADOPTION;
784
785
/* Validate length and find the end of the replacement. A NULL replacement of
786
zero length is interpreted as an empty string. */
787
788
if (replacement == NULL)
789
{
790
if (rlength != 0) return PCRE2_ERROR_NULL;
791
replacement = (PCRE2_SPTR)"";
792
}
793
794
if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
795
repend = replacement + rlength;
796
797
/* Check for using a match that has already happened. Note that the subject
798
pointer in the match data may be NULL after a no-match. */
799
800
use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
801
replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
802
803
/* If starting from an existing match, there must be an externally provided
804
match data block. We create an internal match_data block in two cases: (a) an
805
external one is not supplied (and we are not starting from an existing match);
806
(b) an existing match is to be used for the first substitution. In the latter
807
case, we copy the existing match into the internal block, except for any cached
808
heap frame size and pointer. This ensures that no changes are made to the
809
external match data block. */
810
811
/* WARNING: In both cases below a general context is constructed "by hand"
812
because calling pcre2_general_context_create() involves a memory allocation. If
813
the contents of a general context control block are ever changed there will
814
have to be changes below. */
815
816
if (match_data == NULL)
817
{
818
pcre2_general_context gcontext;
819
if (use_existing_match) return PCRE2_ERROR_NULL;
820
gcontext.memctl = (mcontext == NULL)?
821
((const pcre2_real_code *)code)->memctl :
822
((pcre2_real_match_context *)mcontext)->memctl;
823
match_data = internal_match_data =
824
pcre2_match_data_create_from_pattern(code, &gcontext);
825
if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
826
}
827
828
else if (use_existing_match)
829
{
830
int pairs;
831
pcre2_general_context gcontext;
832
gcontext.memctl = (mcontext == NULL)?
833
((const pcre2_real_code *)code)->memctl :
834
((pcre2_real_match_context *)mcontext)->memctl;
835
pairs = (code->top_bracket + 1 < match_data->oveccount)?
836
code->top_bracket + 1 : match_data->oveccount;
837
internal_match_data = pcre2_match_data_create(match_data->oveccount,
838
&gcontext);
839
if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
840
memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
841
+ 2*pairs*sizeof(PCRE2_SIZE));
842
internal_match_data->heapframes = NULL;
843
internal_match_data->heapframes_size = 0;
844
match_data = internal_match_data;
845
}
846
847
/* Remember ovector details */
848
849
ovector = pcre2_get_ovector_pointer(match_data);
850
ovector_count = pcre2_get_ovector_count(match_data);
851
852
/* Fixed things in the callout block */
853
854
scb.version = 0;
855
scb.input = subject;
856
scb.output = (PCRE2_SPTR)buffer;
857
scb.ovector = ovector;
858
859
/* A NULL subject of zero length is treated as an empty string. */
860
861
if (subject == NULL)
862
{
863
if (length != 0) return PCRE2_ERROR_NULL;
864
subject = (PCRE2_SPTR)"";
865
}
866
867
/* Find length of zero-terminated subject */
868
869
if (length == PCRE2_ZERO_TERMINATED)
870
length = subject? PRIV(strlen)(subject) : 0;
871
872
/* Check UTF replacement string if necessary. */
873
874
#ifdef SUPPORT_UNICODE
875
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
876
{
877
rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
878
if (rc != 0)
879
{
880
match_data->leftchar = 0;
881
goto EXIT;
882
}
883
}
884
#endif /* SUPPORT_UNICODE */
885
886
/* Save the substitute options and remove them from the match options. */
887
888
suboptions = options & SUBSTITUTE_OPTIONS;
889
options &= ~SUBSTITUTE_OPTIONS;
890
891
/* Error if the start match offset is greater than the length of the subject. */
892
893
if (start_offset > length)
894
{
895
match_data->leftchar = 0;
896
rc = PCRE2_ERROR_BADOFFSET;
897
goto EXIT;
898
}
899
900
/* Copy up to the start offset, unless only the replacement is required. */
901
902
if (!replacement_only) CHECKMEMCPY(subject, start_offset);
903
904
/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
905
match is taken from the match_data that was passed in. */
906
907
subs = 0;
908
do
909
{
910
PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
911
uint32_t ptrstackptr = 0;
912
case_state forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE };
913
PCRE2_SIZE casestart_offset = 0;
914
PCRE2_SIZE casestart_extra_needed = 0;
915
916
if (use_existing_match)
917
{
918
rc = match_data->rc;
919
use_existing_match = FALSE;
920
}
921
else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
922
match_data, mcontext);
923
924
#ifdef SUPPORT_UNICODE
925
if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */
926
#endif
927
928
/* Any error other than no match returns the error code. No match when not
929
doing the special after-empty-match global rematch, or when at the end of the
930
subject, breaks the global loop. Otherwise, advance the starting point by one
931
character, copying it to the output, and try again. */
932
933
if (rc < 0)
934
{
935
PCRE2_SIZE save_start;
936
937
if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
938
if (goptions == 0 || start_offset >= length) break;
939
940
/* Advance by one code point. Then, if CRLF is a valid newline sequence and
941
we have advanced into the middle of it, advance one more code point. In
942
other words, do not start in the middle of CRLF, even if CR and LF on their
943
own are valid newlines. */
944
945
save_start = start_offset++;
946
if (subject[start_offset-1] == CHAR_CR &&
947
(code->newline_convention == PCRE2_NEWLINE_CRLF ||
948
code->newline_convention == PCRE2_NEWLINE_ANY ||
949
code->newline_convention == PCRE2_NEWLINE_ANYCRLF) &&
950
start_offset < length &&
951
subject[start_offset] == CHAR_LF)
952
start_offset++;
953
954
/* Otherwise, in UTF mode, advance past any secondary code points. */
955
956
else if ((code->overall_options & PCRE2_UTF) != 0)
957
{
958
#if PCRE2_CODE_UNIT_WIDTH == 8
959
while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
960
start_offset++;
961
#elif PCRE2_CODE_UNIT_WIDTH == 16
962
while (start_offset < length &&
963
(subject[start_offset] & 0xfc00) == 0xdc00)
964
start_offset++;
965
#endif
966
}
967
968
/* Copy what we have advanced past (unless not required), reset the special
969
global options, and continue to the next match. */
970
971
fraglength = start_offset - save_start;
972
if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
973
goptions = 0;
974
continue;
975
}
976
977
/* Handle a successful match. Matches that use \K to end before they start
978
or start before the current point in the subject are not supported. */
979
980
if (ovector[1] < ovector[0] || ovector[0] < start_offset)
981
{
982
rc = PCRE2_ERROR_BADSUBSPATTERN;
983
goto EXIT;
984
}
985
986
/* Check for the same match as previous. This is legitimate after matching an
987
empty string that starts after the initial match offset. We have tried again
988
at the match point in case the pattern is one like /(?<=\G.)/ which can never
989
match at its starting point, so running the match achieves the bumpalong. If
990
we do get the same (null) match at the original match point, it isn't such a
991
pattern, so we now do the empty string magic. In all other cases, a repeat
992
match should never occur. */
993
994
if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
995
{
996
if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
997
{
998
goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
999
ovecsave[2] = start_offset;
1000
continue; /* Back to the top of the loop */
1001
}
1002
rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
1003
goto EXIT;
1004
}
1005
1006
/* Count substitutions with a paranoid check for integer overflow; surely no
1007
real call to this function would ever hit this! */
1008
1009
if (subs == INT_MAX)
1010
{
1011
rc = PCRE2_ERROR_TOOMANYREPLACE;
1012
goto EXIT;
1013
}
1014
subs++;
1015
1016
/* Copy the text leading up to the match (unless not required); remember
1017
where the insert begins and how many ovector pairs are set; and remember how
1018
much space we have requested in extra_needed. */
1019
1020
if (rc == 0) rc = ovector_count;
1021
fraglength = ovector[0] - start_offset;
1022
if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
1023
scb.output_offsets[0] = buff_offset;
1024
scb.oveccount = rc;
1025
sub_start_extra_needed = extra_needed;
1026
1027
/* Process the replacement string. If the entire replacement is literal, just
1028
copy it with length check. */
1029
1030
ptr = replacement;
1031
if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
1032
{
1033
CHECKMEMCPY(ptr, rlength);
1034
}
1035
1036
/* Within a non-literal replacement, which must be scanned character by
1037
character, local literal mode can be set by \Q, but only in extended mode
1038
when backslashes are being interpreted. In extended mode we must handle
1039
nested substrings that are to be reprocessed. */
1040
1041
else for (;;)
1042
{
1043
uint32_t ch;
1044
unsigned int chlen;
1045
int group;
1046
uint32_t special;
1047
PCRE2_SPTR text1_start = NULL;
1048
PCRE2_SPTR text1_end = NULL;
1049
PCRE2_SPTR text2_start = NULL;
1050
PCRE2_SPTR text2_end = NULL;
1051
PCRE2_UCHAR name[MAX_NAME_SIZE + 1];
1052
1053
/* If at the end of a nested substring, pop the stack. */
1054
1055
if (ptr >= repend)
1056
{
1057
if (ptrstackptr == 0) break; /* End of replacement string */
1058
repend = ptrstack[--ptrstackptr];
1059
ptr = ptrstack[--ptrstackptr];
1060
continue;
1061
}
1062
1063
/* Handle the next character */
1064
1065
if (escaped_literal)
1066
{
1067
if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
1068
{
1069
escaped_literal = FALSE;
1070
ptr += 2;
1071
continue;
1072
}
1073
goto LOADLITERAL;
1074
}
1075
1076
/* Not in literal mode. */
1077
1078
if (*ptr == CHAR_DOLLAR_SIGN)
1079
{
1080
BOOL inparens;
1081
BOOL inangle;
1082
BOOL star;
1083
PCRE2_SIZE sublength;
1084
PCRE2_UCHAR next;
1085
PCRE2_SPTR subptr, subptrend;
1086
1087
if (++ptr >= repend) goto BAD;
1088
if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
1089
1090
special = 0;
1091
text1_start = NULL;
1092
text1_end = NULL;
1093
text2_start = NULL;
1094
text2_end = NULL;
1095
group = -1;
1096
inparens = FALSE;
1097
inangle = FALSE;
1098
star = FALSE;
1099
subptr = NULL;
1100
subptrend = NULL;
1101
1102
/* Special $ sequences, as supported by Perl, JavaScript, .NET and others. */
1103
if (next == CHAR_AMPERSAND)
1104
{
1105
++ptr;
1106
group = 0;
1107
goto GROUP_SUBSTITUTE;
1108
}
1109
if (next == CHAR_GRAVE_ACCENT || next == CHAR_APOSTROPHE)
1110
{
1111
++ptr;
1112
rc = pcre2_substring_length_bynumber(match_data, 0, &sublength);
1113
if (rc < 0) goto PTREXIT; /* (Sanity-check ovector before reading from it.) */
1114
1115
if (next == CHAR_GRAVE_ACCENT)
1116
{
1117
subptr = subject;
1118
subptrend = subject + ovector[0];
1119
}
1120
else
1121
{
1122
subptr = subject + ovector[1];
1123
subptrend = subject + length;
1124
}
1125
1126
goto SUBPTR_SUBSTITUTE;
1127
}
1128
if (next == CHAR_UNDERSCORE)
1129
{
1130
/* Java, .NET support $_ for "entire input string". */
1131
++ptr;
1132
subptr = subject;
1133
subptrend = subject + length;
1134
goto SUBPTR_SUBSTITUTE;
1135
}
1136
1137
if (next == CHAR_LEFT_CURLY_BRACKET)
1138
{
1139
if (++ptr >= repend) goto BAD;
1140
next = *ptr;
1141
inparens = TRUE;
1142
}
1143
else if (next == CHAR_LESS_THAN_SIGN)
1144
{
1145
/* JavaScript compatibility syntax, $<name>. Processes only named
1146
groups (not numbered) and does not support extensions such as star
1147
(you can do ${name} and ${*name}, but not $<*name>). */
1148
if (++ptr >= repend) goto BAD;
1149
next = *ptr;
1150
inangle = TRUE;
1151
}
1152
1153
if (!inangle && next == CHAR_ASTERISK)
1154
{
1155
if (++ptr >= repend) goto BAD;
1156
next = *ptr;
1157
star = TRUE;
1158
}
1159
1160
if (!star && !inangle && next >= CHAR_0 && next <= CHAR_9)
1161
{
1162
group = next - CHAR_0;
1163
while (++ptr < repend)
1164
{
1165
next = *ptr;
1166
if (next < CHAR_0 || next > CHAR_9) break;
1167
group = group * 10 + (next - CHAR_0);
1168
1169
/* A check for a number greater than the highest captured group
1170
is sufficient here; no need for a separate overflow check. If unknown
1171
groups are to be treated as unset, just skip over any remaining
1172
digits and carry on. */
1173
1174
if (group > code->top_bracket)
1175
{
1176
if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
1177
{
1178
while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
1179
break;
1180
}
1181
else
1182
{
1183
rc = PCRE2_ERROR_NOSUBSTRING;
1184
goto PTREXIT;
1185
}
1186
}
1187
}
1188
}
1189
else
1190
{
1191
PCRE2_SIZE name_len;
1192
PCRE2_SPTR name_start = ptr;
1193
if (!read_name_subst(&ptr, repend, utf, code->tables + ctypes_offset))
1194
goto BAD;
1195
name_len = ptr - name_start;
1196
memcpy(name, name_start, CU2BYTES(name_len));
1197
name[name_len] = 0;
1198
}
1199
1200
next = 0; /* not used or updated after this point */
1201
(void)next;
1202
1203
/* In extended mode we recognize ${name:+set text:unset text} and
1204
${name:-default text}. */
1205
1206
if (inparens)
1207
{
1208
if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
1209
!star && ptr < repend - 2 && *ptr == CHAR_COLON)
1210
{
1211
special = *(++ptr);
1212
if (special != CHAR_PLUS && special != CHAR_MINUS)
1213
{
1214
rc = PCRE2_ERROR_BADSUBSTITUTION;
1215
goto PTREXIT;
1216
}
1217
1218
text1_start = ++ptr;
1219
rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
1220
if (rc != 0) goto PTREXIT;
1221
text1_end = ptr;
1222
1223
if (special == CHAR_PLUS && *ptr == CHAR_COLON)
1224
{
1225
text2_start = ++ptr;
1226
rc = find_text_end(code, &ptr, repend, TRUE);
1227
if (rc != 0) goto PTREXIT;
1228
text2_end = ptr;
1229
}
1230
}
1231
1232
else
1233
{
1234
if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
1235
{
1236
rc = PCRE2_ERROR_REPMISSINGBRACE;
1237
goto PTREXIT;
1238
}
1239
}
1240
1241
ptr++;
1242
}
1243
1244
if (inangle)
1245
{
1246
if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN)
1247
goto BAD;
1248
ptr++;
1249
}
1250
1251
/* Have found a syntactically correct group number or name, or *name.
1252
Only *MARK is currently recognized. */
1253
1254
if (star)
1255
{
1256
if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
1257
{
1258
PCRE2_SPTR mark = pcre2_get_mark(match_data);
1259
if (mark != NULL)
1260
{
1261
/* Peek backwards one code unit to obtain the length of the mark.
1262
It can (theoretically) contain an embedded NUL. */
1263
fraglength = mark[-1];
1264
if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1265
substitute_case_callout == NULL)
1266
CHECKCASECPY_DEFAULT(mark, fraglength);
1267
else
1268
CHECKMEMCPY(mark, fraglength);
1269
}
1270
}
1271
else goto BAD;
1272
}
1273
1274
/* Substitute the contents of a group. We don't use substring_copy
1275
functions any more, in order to support case forcing. */
1276
1277
else
1278
{
1279
GROUP_SUBSTITUTE:
1280
/* Find a number for a named group. In case there are duplicate names,
1281
search for the first one that is set. If the name is not found when
1282
PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set, set the group number to a
1283
non-existent group. */
1284
1285
if (group < 0)
1286
{
1287
PCRE2_SPTR first, last, entry;
1288
rc = pcre2_substring_nametable_scan(code, name, &first, &last);
1289
if (rc == PCRE2_ERROR_NOSUBSTRING &&
1290
(suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
1291
{
1292
group = code->top_bracket + 1;
1293
}
1294
else
1295
{
1296
if (rc < 0) goto PTREXIT;
1297
for (entry = first; entry <= last; entry += rc)
1298
{
1299
uint32_t ng = GET2(entry, 0);
1300
if (ng < ovector_count)
1301
{
1302
if (group < 0) group = ng; /* First in ovector */
1303
if (ovector[ng*2] != PCRE2_UNSET)
1304
{
1305
group = ng; /* First that is set */
1306
break;
1307
}
1308
}
1309
}
1310
1311
/* If group is still negative, it means we did not find a group
1312
that is in the ovector. Just set the first group. */
1313
1314
if (group < 0) group = GET2(first, 0);
1315
}
1316
}
1317
1318
/* We now have a group that is identified by number. Find the length of
1319
the captured string. If a group in a non-special substitution is unset
1320
when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
1321
1322
rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
1323
if (rc < 0)
1324
{
1325
if (rc == PCRE2_ERROR_NOSUBSTRING &&
1326
(suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
1327
{
1328
rc = PCRE2_ERROR_UNSET;
1329
}
1330
if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */
1331
if (special == 0) /* Plain substitution */
1332
{
1333
if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
1334
goto PTREXIT; /* Else error */
1335
}
1336
}
1337
1338
/* If special is '+' we have a 'set' and possibly an 'unset' text,
1339
both of which are reprocessed when used. If special is '-' we have a
1340
default text for when the group is unset; it must be reprocessed. */
1341
1342
if (special != 0)
1343
{
1344
if (special == CHAR_MINUS)
1345
{
1346
if (rc == 0) goto LITERAL_SUBSTITUTE;
1347
text2_start = text1_start;
1348
text2_end = text1_end;
1349
}
1350
1351
if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
1352
ptrstack[ptrstackptr++] = ptr;
1353
ptrstack[ptrstackptr++] = repend;
1354
1355
if (rc == 0)
1356
{
1357
ptr = text1_start;
1358
repend = text1_end;
1359
}
1360
else
1361
{
1362
ptr = text2_start;
1363
repend = text2_end;
1364
}
1365
continue;
1366
}
1367
1368
/* Otherwise we have a literal substitution of a group's contents. */
1369
1370
LITERAL_SUBSTITUTE:
1371
subptr = subject + ovector[group*2];
1372
subptrend = subject + ovector[group*2 + 1];
1373
1374
/* Substitute a literal string, possibly forcing alphabetic case. */
1375
1376
SUBPTR_SUBSTITUTE:
1377
if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1378
substitute_case_callout == NULL)
1379
CHECKCASECPY_DEFAULT(subptr, subptrend - subptr);
1380
else
1381
CHECKMEMCPY(subptr, subptrend - subptr);
1382
}
1383
} /* End of $ processing */
1384
1385
/* Handle an escape sequence in extended mode. We can use check_escape()
1386
to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
1387
the case-forcing escapes are not supported in pcre2_compile() so must be
1388
recognized here. */
1389
1390
else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
1391
*ptr == CHAR_BACKSLASH)
1392
{
1393
int errorcode;
1394
case_state new_forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE };
1395
1396
if (ptr < repend - 1) switch (ptr[1])
1397
{
1398
case CHAR_L:
1399
new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
1400
new_forcecase.single_char = FALSE;
1401
ptr += 2;
1402
break;
1403
1404
case CHAR_l:
1405
new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
1406
new_forcecase.single_char = TRUE;
1407
ptr += 2;
1408
if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_U)
1409
{
1410
/* Perl reverse-title-casing feature for \l\U */
1411
new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST;
1412
new_forcecase.single_char = FALSE;
1413
ptr += 2;
1414
}
1415
break;
1416
1417
case CHAR_U:
1418
new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
1419
new_forcecase.single_char = FALSE;
1420
ptr += 2;
1421
break;
1422
1423
case CHAR_u:
1424
new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_TITLE_FIRST;
1425
new_forcecase.single_char = TRUE;
1426
ptr += 2;
1427
if (ptr + 2 < repend && ptr[0] == CHAR_BACKSLASH && ptr[1] == CHAR_L)
1428
{
1429
/* Perl title-casing feature for \u\L */
1430
new_forcecase.to_case = PCRE2_SUBSTITUTE_CASE_TITLE_FIRST;
1431
new_forcecase.single_char = FALSE;
1432
ptr += 2;
1433
}
1434
break;
1435
1436
default:
1437
break;
1438
}
1439
1440
if (new_forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)
1441
{
1442
SETFORCECASE:
1443
1444
/* If the substitute_case_callout is unset, our case-forcing is done
1445
immediately. If there is a callout however, then its action is delayed
1446
until all the characters have been collected.
1447
1448
Apply the callout now, before we set the new casing mode. */
1449
1450
if (substitute_case_callout != NULL &&
1451
forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)
1452
DELAYEDFORCECASE();
1453
1454
forcecase = new_forcecase;
1455
casestart_offset = buff_offset;
1456
casestart_extra_needed = extra_needed;
1457
continue;
1458
}
1459
1460
ptr++; /* Point after \ */
1461
rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
1462
code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL);
1463
if (errorcode != 0) goto BADESCAPE;
1464
1465
switch(rc)
1466
{
1467
case ESC_E:
1468
goto SETFORCECASE;
1469
1470
case ESC_Q:
1471
escaped_literal = TRUE;
1472
continue;
1473
1474
case 0: /* Data character */
1475
case ESC_b: /* \b is backspace in a substitution */
1476
case ESC_v: /* \v is vertical tab in a substitution */
1477
1478
if (rc == ESC_b) ch = CHAR_BS;
1479
if (rc == ESC_v) ch = CHAR_VT;
1480
1481
#ifdef SUPPORT_UNICODE
1482
if (utf) chlen = PRIV(ord2utf)(ch, temp); else
1483
#endif
1484
{
1485
temp[0] = ch;
1486
chlen = 1;
1487
}
1488
1489
if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1490
substitute_case_callout == NULL)
1491
CHECKCASECPY_DEFAULT(temp, chlen);
1492
else
1493
CHECKMEMCPY(temp, chlen);
1494
continue;
1495
1496
case ESC_g:
1497
{
1498
PCRE2_SIZE name_len;
1499
PCRE2_SPTR name_start;
1500
1501
/* Parse the \g<name> form (\g<number> already handled by check_escape) */
1502
if (ptr >= repend || *ptr != CHAR_LESS_THAN_SIGN)
1503
goto BADESCAPE;
1504
++ptr;
1505
1506
name_start = ptr;
1507
if (!read_name_subst(&ptr, repend, utf, code->tables + ctypes_offset))
1508
goto BADESCAPE;
1509
name_len = ptr - name_start;
1510
1511
if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN)
1512
goto BADESCAPE;
1513
++ptr;
1514
1515
special = 0;
1516
group = -1;
1517
memcpy(name, name_start, CU2BYTES(name_len));
1518
name[name_len] = 0;
1519
goto GROUP_SUBSTITUTE;
1520
}
1521
1522
default:
1523
if (rc < 0)
1524
{
1525
special = 0;
1526
group = -rc - 1;
1527
goto GROUP_SUBSTITUTE;
1528
}
1529
goto BADESCAPE;
1530
}
1531
} /* End of backslash processing */
1532
1533
/* Handle a literal code unit */
1534
1535
else
1536
{
1537
PCRE2_SPTR ch_start;
1538
1539
LOADLITERAL:
1540
ch_start = ptr;
1541
GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */
1542
(void) ch;
1543
1544
if (forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE &&
1545
substitute_case_callout == NULL)
1546
CHECKCASECPY_DEFAULT(ch_start, ptr - ch_start);
1547
else
1548
CHECKMEMCPY(ch_start, ptr - ch_start);
1549
} /* End handling a literal code unit */
1550
} /* End of loop for scanning the replacement. */
1551
1552
/* If the substitute_case_callout is unset, our case-forcing is done
1553
immediately. If there is a callout however, then its action is delayed
1554
until all the characters have been collected.
1555
1556
We now clean up any trailing section of the replacement for which we deferred
1557
the case-forcing. */
1558
1559
if (substitute_case_callout != NULL &&
1560
forcecase.to_case != PCRE2_SUBSTITUTE_CASE_NONE)
1561
DELAYEDFORCECASE();
1562
1563
/* The replacement has been copied to the output, or its size has been
1564
remembered. Handle the callout if there is one. */
1565
1566
if (mcontext != NULL && mcontext->substitute_callout != NULL)
1567
{
1568
/* If this is an actual (non-simulated) replacement, do the callout. */
1569
1570
if (!overflowed)
1571
{
1572
scb.subscount = subs;
1573
scb.output_offsets[1] = buff_offset;
1574
rc = mcontext->substitute_callout(&scb,
1575
mcontext->substitute_callout_data);
1576
1577
/* A non-zero return means cancel this substitution. Instead, copy the
1578
matched string fragment. */
1579
1580
if (rc != 0)
1581
{
1582
PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
1583
PCRE2_SIZE oldlength = ovector[1] - ovector[0];
1584
1585
buff_offset -= newlength;
1586
lengthleft += newlength;
1587
if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
1588
1589
/* A negative return means do not do any more. */
1590
1591
if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
1592
}
1593
}
1594
1595
/* In this interesting case, we cannot do the callout, so it's hard to
1596
estimate the required buffer size. What callers want is to be able to make
1597
two calls to pcre2_substitute(), once with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
1598
to discover the buffer size, and then a second and final call. Older
1599
versions of PCRE2 violated this assumption, by proceeding as if the callout
1600
had returned zero - but on the second call to pcre2_substitute() it could
1601
return non-zero and then overflow the buffer again. Callers probably don't
1602
want to keep on looping to incrementally discover the buffer size. */
1603
1604
else
1605
{
1606
PCRE2_SIZE newlength_buf = buff_offset - scb.output_offsets[0];
1607
PCRE2_SIZE newlength_extra = extra_needed - sub_start_extra_needed;
1608
PCRE2_SIZE newlength =
1609
(newlength_extra > ~(PCRE2_SIZE)0 - newlength_buf)? /* Integer overflow */
1610
~(PCRE2_SIZE)0 : newlength_buf + newlength_extra; /* Cap the addition */
1611
PCRE2_SIZE oldlength = ovector[1] - ovector[0];
1612
1613
/* Be pessimistic: request whichever buffer size is larger out of
1614
accepting or rejecting the substitution. */
1615
1616
if (oldlength > newlength)
1617
{
1618
PCRE2_SIZE additional = oldlength - newlength;
1619
if (additional > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */
1620
goto TOOLARGEREPLACE;
1621
extra_needed += additional;
1622
}
1623
1624
/* Proceed as if the callout did not return a negative. A negative
1625
effectively rejects all future substitutions, but we want to examine them
1626
pessimistically. */
1627
}
1628
}
1629
1630
/* Save the details of this match. See above for how this data is used. If we
1631
matched an empty string, do the magic for global matches. Update the start
1632
offset to point to the rest of the subject string. If we re-used an existing
1633
match for the first match, switch to the internal match data block. */
1634
1635
ovecsave[0] = ovector[0];
1636
ovecsave[1] = ovector[1];
1637
ovecsave[2] = start_offset;
1638
1639
goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
1640
PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
1641
start_offset = ovector[1];
1642
} while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */
1643
1644
/* Copy the rest of the subject unless not required, and terminate the output
1645
with a binary zero. */
1646
1647
if (!replacement_only)
1648
{
1649
fraglength = length - start_offset;
1650
CHECKMEMCPY(subject + start_offset, fraglength);
1651
}
1652
1653
temp[0] = 0;
1654
CHECKMEMCPY(temp, 1);
1655
1656
/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
1657
and matching has carried on after a full buffer, in order to compute the length
1658
needed. Otherwise, an overflow generates an immediate error return. */
1659
1660
if (overflowed)
1661
{
1662
rc = PCRE2_ERROR_NOMEMORY;
1663
1664
if (extra_needed > ~(PCRE2_SIZE)0 - buff_length) /* Integer overflow */
1665
goto TOOLARGEREPLACE;
1666
*blength = buff_length + extra_needed;
1667
}
1668
1669
/* After a successful execution, return the number of substitutions and set the
1670
length of buffer used, excluding the trailing zero. */
1671
1672
else
1673
{
1674
rc = subs;
1675
*blength = buff_offset - 1;
1676
}
1677
1678
EXIT:
1679
if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
1680
else match_data->rc = rc;
1681
return rc;
1682
1683
NOROOM:
1684
rc = PCRE2_ERROR_NOMEMORY;
1685
goto EXIT;
1686
1687
CASEERROR:
1688
rc = PCRE2_ERROR_REPLACECASE;
1689
goto EXIT;
1690
1691
TOOLARGEREPLACE:
1692
rc = PCRE2_ERROR_TOOLARGEREPLACE;
1693
goto EXIT;
1694
1695
BAD:
1696
rc = PCRE2_ERROR_BADREPLACEMENT;
1697
goto PTREXIT;
1698
1699
BADESCAPE:
1700
rc = PCRE2_ERROR_BADREPESCAPE;
1701
1702
PTREXIT:
1703
*blength = (PCRE2_SIZE)(ptr - replacement);
1704
goto EXIT;
1705
}
1706
1707
/* End of pcre2_substitute.c */
1708
1709