Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_match.c
21798 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2015-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#include "pcre2_internal.h"
43
44
45
46
/* These defines enable debugging code */
47
48
/* #define DEBUG_FRAMES_DISPLAY */
49
/* #define DEBUG_SHOW_OPS */
50
/* #define DEBUG_SHOW_RMATCH */
51
52
#ifdef DEBUG_FRAMES_DISPLAY
53
#include <stdarg.h>
54
#endif
55
56
#ifdef DEBUG_SHOW_OPS
57
static const char *OP_names[] = { OP_NAME_LIST };
58
#endif
59
60
/* These defines identify the name of the block containing "static"
61
information, and fields within it. */
62
63
#define NLBLOCK mb /* Block containing newline information */
64
#define PSSTART start_subject /* Field containing processed string start */
65
#define PSEND end_subject /* Field containing processed string end */
66
67
#define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */
68
69
/* Masks for identifying the public options that are permitted at match time. */
70
71
#define PUBLIC_MATCH_OPTIONS \
72
(PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
73
PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
74
PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT| \
75
PCRE2_DISABLE_RECURSELOOP_CHECK)
76
77
#define PUBLIC_JIT_MATCH_OPTIONS \
78
(PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\
79
PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD|\
80
PCRE2_COPY_MATCHED_SUBJECT)
81
82
/* Non-error returns from and within the match() function. Error returns are
83
externally defined PCRE2_ERROR_xxx codes, which are all negative. */
84
85
#define MATCH_MATCH 1
86
#define MATCH_NOMATCH 0
87
88
/* Special internal returns used in the match() function. Make them
89
sufficiently negative to avoid the external error codes. */
90
91
#define MATCH_ACCEPT (-999)
92
#define MATCH_KETRPOS (-998)
93
/* The next 5 must be kept together and in sequence so that a test that checks
94
for any one of them can use a range. */
95
#define MATCH_COMMIT (-997)
96
#define MATCH_PRUNE (-996)
97
#define MATCH_SKIP (-995)
98
#define MATCH_SKIP_ARG (-994)
99
#define MATCH_THEN (-993)
100
#define MATCH_BACKTRACK_MAX MATCH_THEN
101
#define MATCH_BACKTRACK_MIN MATCH_COMMIT
102
103
/* Group frame type values. Zero means the frame is not a group frame. The
104
lower 16 bits are used for data (e.g. the capture number). Group frames are
105
used for most groups so that information about the start is easily available at
106
the end without having to scan back through intermediate frames (backtrack
107
points). */
108
109
#define GF_CAPTURE 0x00010000u
110
#define GF_NOCAPTURE 0x00020000u
111
#define GF_CONDASSERT 0x00030000u
112
#define GF_RECURSE 0x00040000u
113
114
/* Masks for the identity and data parts of the group frame type. */
115
116
#define GF_IDMASK(a) ((a) & 0xffff0000u)
117
#define GF_DATAMASK(a) ((a) & 0x0000ffffu)
118
119
/* Repetition types: minimizing, maximizing (greedy), and possessive. */

enum {
  REPTYPE_MIN = 0,
  REPTYPE_MAX = 1,
  REPTYPE_POS = 2
};

/* Minimum and maximum counts for the common repeats. Each table has one entry
per repeat opcode, in the order shown in the per-entry comments. A maximum of
UINT32_MAX stands for "no upper limit". The OP_CR[MIN]RANGE slots are dummy
placefillers. */

static const uint32_t rep_min[] = {
  0,            /* *   */
  0,            /* *?  */
  1,            /* +   */
  1,            /* +?  */
  0,            /* ?   */
  0,            /* ??  */
  0,            /* dummy placefiller for OP_CRRANGE */
  0,            /* dummy placefiller for OP_CRMINRANGE */
  0,            /* OP_CRPOSSTAR */
  1,            /* OP_CRPOSPLUS */
  0             /* OP_CRPOSQUERY */
};

static const uint32_t rep_max[] = {
  UINT32_MAX,   /* *   */
  UINT32_MAX,   /* *?  */
  UINT32_MAX,   /* +   */
  UINT32_MAX,   /* +?  */
  1,            /* ?   */
  1,            /* ??  */
  0,            /* dummy placefiller for OP_CRRANGE */
  0,            /* dummy placefiller for OP_CRMINRANGE */
  UINT32_MAX,   /* OP_CRPOSSTAR */
  UINT32_MAX,   /* OP_CRPOSPLUS */
  1             /* OP_CRPOSQUERY */
};

/* Repetition type for each repeat opcode. Unlike the two tables above, this
one must also include an entry for OP_CRPOSRANGE. */

static const uint32_t rep_typ[] = {
  REPTYPE_MAX,  /* *   */
  REPTYPE_MIN,  /* *?  */
  REPTYPE_MAX,  /* +   */
  REPTYPE_MIN,  /* +?  */
  REPTYPE_MAX,  /* ?   */
  REPTYPE_MIN,  /* ??  */
  REPTYPE_MAX,  /* OP_CRRANGE */
  REPTYPE_MIN,  /* OP_CRMINRANGE */
  REPTYPE_POS,  /* OP_CRPOSSTAR */
  REPTYPE_POS,  /* OP_CRPOSPLUS */
  REPTYPE_POS,  /* OP_CRPOSQUERY */
  REPTYPE_POS   /* OP_CRPOSRANGE */
};
149
150
/* Numbers for RMATCH calls at backtracking points. When these lists are
151
changed, the code at RETURN_SWITCH below must be updated in sync. */
152
153
enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
154
RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
155
RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
156
RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39 };
157
158
#ifdef SUPPORT_WIDE_CHARS
159
enum { RM100=100, RM101, RM102, RM103 };
160
#endif
161
162
#ifdef SUPPORT_UNICODE
163
enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207,
164
RM208, RM209, RM210, RM211, RM212, RM213, RM214, RM215,
165
RM216, RM217, RM218, RM219, RM220, RM221, RM222, RM223,
166
RM224 };
167
#endif
168
169
/* Define short names for general fields in the current backtrack frame, which
170
is always pointed to by the F variable. Occasional references to fields in
171
other frames are written out explicitly. There are also some fields in the
172
current frame whose names start with "temp" that are used for short-term,
173
localised backtracking memory. These are #defined with Lxxx names at the point
174
of use and undefined afterwards. */
175
176
#define Fback_frame F->back_frame
177
#define Fcapture_last F->capture_last
178
#define Fcurrent_recurse F->current_recurse
179
#define Fecode F->ecode
180
#define Feptr F->eptr
181
#define Fgroup_frame_type F->group_frame_type
182
#define Flast_group_offset F->last_group_offset
183
#define Flength F->length
184
#define Fmark F->mark
185
#define Frdepth F->rdepth
186
#define Fstart_match F->start_match
187
#define Foffset_top F->offset_top
188
#define Foccu F->occu
189
#define Fop F->op
190
#define Fovector F->ovector
191
#define Freturn_id F->return_id
192
193
194
#ifdef DEBUG_FRAMES_DISPLAY
195
/*************************************************
196
* Display current frames and contents *
197
*************************************************/
198
199
/* This debugging function displays the current set of frames and their
200
contents. It is not called automatically from anywhere, the intention being
201
that calls can be inserted where necessary when debugging frame-related
202
problems.
203
204
Arguments:
205
f the file to write to
206
F the current top frame
207
P a previous frame of interest
208
frame_size the frame size
209
mb points to the match block
210
match_data points to the match data block
211
s identification text
212
213
Returns: nothing
214
*/
215
216
static void
217
display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,
218
match_block *mb, pcre2_match_data *match_data, const char *s, ...)
219
{
220
uint32_t i;
221
heapframe *Q;
222
va_list ap;
223
va_start(ap, s);
224
225
fprintf(f, "FRAMES ");
226
vfprintf(f, s, ap);
227
va_end(ap);
228
229
if (P != NULL) fprintf(f, " P=%lu",
230
((char *)P - (char *)(match_data->heapframes))/frame_size);
231
fprintf(f, "\n");
232
233
for (i = 0, Q = match_data->heapframes;
234
Q <= F;
235
i++, Q = (heapframe *)((char *)Q + frame_size))
236
{
237
fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d",
238
i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode),
239
Q->back_frame, Q->return_id);
240
241
if (Q->last_group_offset == PCRE2_UNSET)
242
fprintf(f, " lgoffset=unset\n");
243
else
244
fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size);
245
}
246
}
247
248
#endif
249
250
251
252
/*************************************************
253
* Process a callout *
254
*************************************************/
255
256
/* This function is called for all callouts, whether "standalone" or at the
257
start of a conditional group. Feptr will be pointing to either OP_CALLOUT or
258
OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized
259
with fixed values.
260
261
Arguments:
262
F points to the current backtracking frame
263
mb points to the match block
264
lengthptr where to return the length of the callout item
265
266
Returns: the return from the callout
267
or 0 if no callout function exists
268
*/
269
270
static int
271
do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr)
272
{
273
int rc;
274
PCRE2_SIZE save0, save1;
275
PCRE2_SIZE *callout_ovector;
276
pcre2_callout_block *cb;
277
278
*lengthptr = (*Fecode == OP_CALLOUT)?
279
PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);
280
281
if (mb->callout == NULL) return 0; /* No callout function provided */
282
283
/* The original matching code (pre 10.30) worked directly with the ovector
284
passed by the user, and this was passed to callouts. Now that the working
285
ovector is in the backtracking frame, it no longer needs to reserve space for
286
the overall match offsets (which would waste space in the frame). For backward
287
compatibility, however, we pass capture_top and offset_vector to the callout as
288
if for the extended ovector, and we ensure that the first two slots are unset
289
by preserving and restoring their current contents. Picky compilers complain if
290
references such as Fovector[-2] are use directly, so we set up a separate
291
pointer. */
292
293
callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;
294
295
/* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields
296
are set externally. The first 3 never change; the last is updated for each
297
bumpalong. */
298
299
cb = mb->cb;
300
cb->capture_top = (uint32_t)Foffset_top/2 + 1;
301
cb->capture_last = Fcapture_last;
302
cb->offset_vector = callout_ovector;
303
cb->mark = mb->nomatch_mark;
304
cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject);
305
cb->pattern_position = GET(Fecode, 1);
306
cb->next_item_length = GET(Fecode, 1 + LINK_SIZE);
307
308
if (*Fecode == OP_CALLOUT) /* Numerical callout */
309
{
310
cb->callout_number = Fecode[1 + 2*LINK_SIZE];
311
cb->callout_string_offset = 0;
312
cb->callout_string = NULL;
313
cb->callout_string_length = 0;
314
}
315
else /* String callout */
316
{
317
cb->callout_number = 0;
318
cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE);
319
cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1;
320
cb->callout_string_length =
321
*lengthptr - (1 + 4*LINK_SIZE) - 2;
322
}
323
324
save0 = callout_ovector[0];
325
save1 = callout_ovector[1];
326
callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET;
327
rc = mb->callout(cb, mb->callout_data);
328
callout_ovector[0] = save0;
329
callout_ovector[1] = save1;
330
cb->callout_flags = 0;
331
return rc;
332
}
333
334
335
336
/*************************************************
337
* Match a back-reference *
338
*************************************************/
339
340
/* This function is called only when it is known that the offset lies within
341
the offsets that have so far been used in the match. Note that in caseless
342
UTF-8 mode, the number of subject bytes matched may be different to the number
343
of reference bytes. (In theory this could also happen in UTF-16 mode, but it
344
seems unlikely.)
345
346
Arguments:
347
offset index into the offset vector
348
caseless TRUE if caseless
349
caseopts bitmask of REFI_FLAG_XYZ values
350
F the current backtracking frame pointer
351
mb points to match block
352
lengthptr pointer for returning the length matched
353
354
Returns: = 0 sucessful match; number of code units matched is set
355
< 0 no match
356
> 0 partial match
357
*/
358
359
static int
360
match_ref(PCRE2_SIZE offset, BOOL caseless, int caseopts, heapframe *F,
361
match_block *mb, PCRE2_SIZE *lengthptr)
362
{
363
PCRE2_SPTR p;
364
PCRE2_SIZE length;
365
PCRE2_SPTR eptr;
366
PCRE2_SPTR eptr_start;
367
368
#ifndef SUPPORT_UNICODE
369
(void)caseopts; /* Avoid compiler warning. */
370
#endif
371
372
/* Deal with an unset group. The default is no match, but there is an option to
373
match an empty string. */
374
375
if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET)
376
{
377
if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
378
{
379
*lengthptr = 0;
380
return 0; /* Match */
381
}
382
else return -1; /* No match */
383
}
384
385
/* Separate the caseless and UTF cases for speed. */
386
387
eptr = eptr_start = Feptr;
388
p = mb->start_subject + Fovector[offset];
389
length = Fovector[offset+1] - Fovector[offset];
390
PCRE2_ASSERT(eptr <= mb->end_subject);
391
392
if (caseless)
393
{
394
#if defined SUPPORT_UNICODE
395
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
396
BOOL caseless_restrict = (caseopts & REFI_FLAG_CASELESS_RESTRICT) != 0;
397
BOOL turkish_casing = !caseless_restrict && (caseopts & REFI_FLAG_TURKISH_CASING) != 0;
398
399
if (utf || (mb->poptions & PCRE2_UCP) != 0)
400
{
401
PCRE2_SPTR endptr = p + length;
402
403
/* Match characters up to the end of the reference. NOTE: the number of
404
code units matched may differ, because in UTF-8 there are some characters
405
whose upper and lower case codes have different numbers of bytes. For
406
example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3
407
bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
408
sequence of two of the latter. It is important, therefore, to check the
409
length along the reference, not along the subject (earlier code did this
410
wrong). UCP uses Unicode properties but without UTF encoding. */
411
412
while (p < endptr)
413
{
414
uint32_t c, d;
415
const ucd_record *ur;
416
if (eptr >= mb->end_subject) return 1; /* Partial match */
417
418
if (utf)
419
{
420
GETCHARINC(c, eptr);
421
GETCHARINC(d, p);
422
}
423
else
424
{
425
c = *eptr++;
426
d = *p++;
427
}
428
429
if (turkish_casing && UCD_ANY_I(d))
430
{
431
c = UCD_FOLD_I_TURKISH(c);
432
d = UCD_FOLD_I_TURKISH(d);
433
if (c != d) return -1; /* No match */
434
}
435
else if (c != d && c != (uint32_t)((int)d + (ur = GET_UCD(d))->other_case))
436
{
437
const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
438
439
/* When PCRE2_EXTRA_CASELESS_RESTRICT is set, ignore any caseless sets
440
that start with an ASCII character. */
441
if (caseless_restrict && *pp < 128) return -1; /* No match */
442
443
for (;;)
444
{
445
if (c < *pp) return -1; /* No match */
446
if (c == *pp++) break;
447
}
448
}
449
}
450
}
451
else
452
#endif
453
454
/* Not in UTF or UCP mode */
455
{
456
for (; length > 0; length--)
457
{
458
uint32_t cc, cp;
459
if (eptr >= mb->end_subject) return 1; /* Partial match */
460
cc = UCHAR21TEST(eptr);
461
cp = UCHAR21TEST(p);
462
if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc))
463
return -1; /* No match */
464
p++;
465
eptr++;
466
}
467
}
468
}
469
470
/* In the caseful case, we can just compare the code units, whether or not we
471
are in UTF and/or UCP mode. When partial matching, we have to do this unit by
472
unit. */
473
474
else
475
{
476
if (mb->partial != 0)
477
{
478
for (; length > 0; length--)
479
{
480
if (eptr >= mb->end_subject) return 1; /* Partial match */
481
if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */
482
}
483
}
484
485
/* Not partial matching */
486
487
else
488
{
489
if ((PCRE2_SIZE)(mb->end_subject - eptr) < length ||
490
memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */
491
eptr += length;
492
}
493
}
494
495
*lengthptr = eptr - eptr_start;
496
return 0; /* Match */
497
}
498
499
500
501
/*************************************************
502
* Restore offsets after a recurse *
503
*************************************************/
504
505
/* This function restores the ovector values when
506
a recursive block reaches its end, and the triggering
507
recurse has and argument list.
508
509
Arguments:
510
F the current backtracking frame pointer
511
P the previous backtracking frame pointer
512
*/
513
514
static void
515
recurse_update_offsets(heapframe *F, heapframe *P)
516
{
517
PCRE2_SIZE *dst = F->ovector;
518
PCRE2_SIZE *src = P->ovector;
519
/* The first bracket has offset 2, because
520
offset 0 is reserved for the full match. */
521
PCRE2_SIZE offset = 2;
522
PCRE2_SIZE offset_top = Foffset_top + 2;
523
PCRE2_SIZE diff;
524
PCRE2_SPTR ecode = Fecode;
525
526
do
527
{
528
diff = (GET2(ecode, 1) << 1) - offset;
529
ecode += 1 + IMM2_SIZE;
530
531
if (offset + diff >= offset_top)
532
{
533
/* Some OP_CREF opcodes are not
534
processed, they must be skipped. */
535
while (*ecode == OP_CREF) ecode += 1 + IMM2_SIZE;
536
break;
537
}
538
539
if (diff == 2)
540
{
541
dst[0] = src[0];
542
dst[1] = src[1];
543
}
544
else if (diff >= 4)
545
memcpy(dst, src, diff * sizeof(PCRE2_SIZE));
546
547
/* Skip the unmodified entry. */
548
diff += 2;
549
offset += diff;
550
dst += diff;
551
src += diff;
552
}
553
while (*ecode == OP_CREF);
554
555
diff = offset_top - offset;
556
if (diff == 2)
557
{
558
dst[0] = src[0];
559
dst[1] = src[1];
560
}
561
else if (diff >= 4)
562
memcpy(dst, src, diff * sizeof(PCRE2_SIZE));
563
564
Fecode = ecode;
565
Foffset_top = (offset <= P->offset_top) ? P->offset_top : (offset - 2);
566
}
567
568
569
570
/******************************************************************************
571
*******************************************************************************
572
"Recursion" in the match() function
573
574
The original match() function was highly recursive, but this proved to be the
575
source of a number of problems over the years, mostly because of the relatively
576
small system stacks that are commonly found. As new features were added to
577
patterns, various kludges were invented to reduce the amount of stack used,
578
making the code hard to understand in places.
579
580
A version did exist that used individual frames on the heap instead of calling
581
match() recursively, but this ran substantially slower. The current version is
582
a refactoring that uses a vector of frames to remember backtracking points.
583
This runs no slower, and possibly even a bit faster than the original recursive
584
implementation.
585
586
At first, an initial vector of size START_FRAMES_SIZE (enough for maybe 50
587
frames) was allocated on the system stack. If this was not big enough, the heap
588
was used for a larger vector. However, it turns out that there are environments
589
where taking as little as 20KiB from the system stack is an embarrassment.
590
After another refactoring, the heap is used exclusively, but a pointer to the
591
frames vector and its size are cached in the match_data block, so that there is
592
no new memory allocation if the same match_data block is used for multiple
593
matches (unless the frames vector has to be extended).
594
*******************************************************************************
595
******************************************************************************/
596
597
598
599
600
/*************************************************
601
* Macros for the match() function *
602
*************************************************/
603
604
/* These macros pack up tests that are used for partial matching several times
605
in the code. The second one is used when we already know we are past the end of
606
the subject. We set the "hit end" flag if the pointer is at the end of the
607
subject and either (a) the pointer is past the earliest inspected character
608
(i.e. something has been matched, even if not part of the actual matched
609
string), or (b) the pattern contains a lookbehind. These are the conditions for
610
which adding more characters may allow the current match to continue.
611
612
For hard partial matching, we immediately return a partial match. Otherwise,
613
carrying on means that a complete match on the current subject will be sought.
614
A partial match is returned only if no complete match can be found. */
615
616
/* SCHECK_PARTIAL() is for use when the subject pointer is already known to be
at or past the end of the subject. It sets mb->hitend when partial matching is
enabled and either Feptr is beyond mb->start_used_ptr or mb->allowemptypartial
is set; if mb->partial is greater than 1 it then returns PCRE2_ERROR_PARTIAL
from the enclosing function. */

#define SCHECK_PARTIAL() \
  do \
    { \
    if (mb->partial != 0) \
      { \
      if (Feptr > mb->start_used_ptr || mb->allowemptypartial) \
        { \
        mb->hitend = TRUE; \
        if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
        } \
      } \
    } \
  while (0)

/* CHECK_PARTIAL() first tests whether Feptr has reached the end of the
subject, and only then applies SCHECK_PARTIAL(). */

#define CHECK_PARTIAL() \
  do \
    { \
    if (Feptr >= mb->end_subject) SCHECK_PARTIAL(); \
    } \
  while (0)
635
636
637
/* These macros are used to implement backtracking. They simulate a recursive
638
call to the match() function by means of a local vector of frames which
639
remember the backtracking points. */
640
641
/* RMATCH(code, id): record the code pointer to start from and the return
identifier in the current frame, then jump to MATCH_RECURSE. The label L_<id>
is planted immediately after the jump. */

#define RMATCH(code, id) \
  do \
    { \
    start_ecode = (code); \
    Freturn_id = (id); \
    goto MATCH_RECURSE; \
    L_##id:; \
    } \
  while (0)

/* RRETURN(value): store the value in rrc and jump to RETURN_SWITCH. */

#define RRETURN(value) \
  do \
    { \
    rrc = (value); \
    goto RETURN_SWITCH; \
    } \
  while (0)
656
657
658
659
/*************************************************
660
* Match from current position *
661
*************************************************/
662
663
/* This function is called to run one match attempt at a single starting point
664
in the subject.
665
666
Performance note: It might be tempting to extract commonly used fields from the
667
mb structure (e.g. end_subject) into individual variables to improve
668
performance. Tests using gcc on a SPARC disproved this; in the first case, it
669
made performance worse.
670
671
Arguments:
672
start_eptr starting character in subject
673
start_ecode starting position in compiled code
674
top_bracket number of capturing parentheses in the pattern
675
frame_size size of each backtracking frame
676
match_data pointer to the match_data block
677
mb pointer to "static" variables block
678
679
Returns: MATCH_MATCH if matched ) these values are >= 0
680
MATCH_NOMATCH if failed to match )
681
negative MATCH_xxx value for PRUNE, SKIP, etc
682
negative PCRE2_ERROR_xxx value if aborted by an error condition
683
(e.g. stopped by repeated call or depth limit)
684
*/
685
686
static int
687
match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket,
688
PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
689
{
690
/* Frame-handling variables */
691
692
heapframe *F; /* Current frame pointer */
693
heapframe *N = NULL; /* Temporary frame pointers */
694
heapframe *P = NULL;
695
696
heapframe *frames_top; /* End of frames vector */
697
heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */
698
PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
699
700
/* Local variables that do not need to be preserved over calls to RMATCH(). */
701
702
PCRE2_SPTR branch_end = NULL;
703
PCRE2_SPTR branch_start;
704
PCRE2_SPTR bracode; /* Temp pointer to start of group */
705
PCRE2_SIZE offset; /* Used for group offsets */
706
PCRE2_SIZE length; /* Used for various length calculations */
707
708
int rrc; /* Return from functions & backtracking "recursions" */
709
#ifdef SUPPORT_UNICODE
710
int proptype; /* Type of character property */
711
#endif
712
713
uint32_t i; /* Used for local loops */
714
uint32_t fc; /* Character values */
715
uint32_t number; /* Used for group and other numbers */
716
uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */
717
uint32_t group_frame_type; /* Specifies type for new group frames */
718
719
BOOL condition; /* Used in conditional groups */
720
BOOL cur_is_word; /* Used in "word" tests */
721
BOOL prev_is_word; /* Used in "word" tests */
722
723
/* UTF and UCP flags */
724
725
#ifdef SUPPORT_UNICODE
726
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
727
BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
728
#else
729
BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
730
#endif
731
732
/* This is the length of the last part of a backtracking frame that must be
733
copied when a new frame is created. */
734
735
frame_copy_size = frame_size - offsetof(heapframe, eptr);
736
737
/* Set up the first frame and the end of the frames vector. */
738
739
F = match_data->heapframes;
740
frames_top = (heapframe *)((char *)F + match_data->heapframes_size);
741
742
Frdepth = 0; /* "Recursion" depth */
743
Fcapture_last = 0; /* Number of most recent capture */
744
Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */
745
Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */
746
Fmark = NULL; /* Most recent mark */
747
Foffset_top = 0; /* End of captures within the frame */
748
Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */
749
group_frame_type = 0; /* Not a start of group frame */
750
goto NEW_FRAME; /* Start processing with this frame */
751
752
/* Come back here when we want to create a new frame for remembering a
753
backtracking point. */
754
755
MATCH_RECURSE:
756
757
/* Set up a new backtracking frame. If the vector is full, get a new one,
758
doubling the size, but constrained by the heap limit (which is in KiB). */
759
760
N = (heapframe *)((char *)F + frame_size);
761
if ((heapframe *)((char *)N + frame_size) >= frames_top)
762
{
763
heapframe *new;
764
PCRE2_SIZE newsize;
765
PCRE2_SIZE usedsize = (char *)N - (char *)(match_data->heapframes);
766
767
if (match_data->heapframes_size >= PCRE2_SIZE_MAX / 2)
768
{
769
if (match_data->heapframes_size == PCRE2_SIZE_MAX - 1)
770
return PCRE2_ERROR_NOMEMORY;
771
newsize = PCRE2_SIZE_MAX - 1;
772
}
773
else
774
newsize = match_data->heapframes_size * 2;
775
776
if (newsize / 1024 >= mb->heap_limit)
777
{
778
PCRE2_SIZE old_size = match_data->heapframes_size / 1024;
779
if (mb->heap_limit <= old_size)
780
return PCRE2_ERROR_HEAPLIMIT;
781
else
782
{
783
PCRE2_SIZE max_delta = 1024 * (mb->heap_limit - old_size);
784
int over_bytes = match_data->heapframes_size % 1024;
785
if (over_bytes) max_delta -= (1024 - over_bytes);
786
newsize = match_data->heapframes_size + max_delta;
787
}
788
}
789
790
/* With a heap limit set, the permitted additional size may not be enough for
791
another frame, so do a final check. */
792
793
if (newsize - usedsize < frame_size) return PCRE2_ERROR_HEAPLIMIT;
794
new = match_data->memctl.malloc(newsize, match_data->memctl.memory_data);
795
if (new == NULL) return PCRE2_ERROR_NOMEMORY;
796
memcpy(new, match_data->heapframes, usedsize);
797
798
N = (heapframe *)((char *)new + usedsize);
799
F = (heapframe *)((char *)N - frame_size);
800
801
match_data->memctl.free(match_data->heapframes, match_data->memctl.memory_data);
802
match_data->heapframes = new;
803
match_data->heapframes_size = newsize;
804
frames_top = (heapframe *)((char *)new + newsize);
805
}
806
807
#ifdef DEBUG_SHOW_RMATCH
808
fprintf(stderr, "++ RMATCH %d frame=%d", Freturn_id, Frdepth + 1);
809
if (group_frame_type != 0)
810
{
811
fprintf(stderr, " type=%x ", group_frame_type);
812
switch (GF_IDMASK(group_frame_type))
813
{
814
case GF_CAPTURE:
815
fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type));
816
break;
817
818
case GF_NOCAPTURE:
819
fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type));
820
break;
821
822
case GF_CONDASSERT:
823
fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type));
824
break;
825
826
case GF_RECURSE:
827
fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type));
828
break;
829
830
default:
831
fprintf(stderr, "*** unknown ***");
832
break;
833
}
834
}
835
fprintf(stderr, "\n");
836
#endif
837
838
/* Copy those fields that must be copied into the new frame, increase the
839
"recursion" depth (i.e. the new frame's index) and then make the new frame
840
current. */
841
842
memcpy((char *)N + offsetof(heapframe, eptr),
843
(char *)F + offsetof(heapframe, eptr),
844
frame_copy_size);
845
846
N->rdepth = Frdepth + 1;
847
F = N;
848
849
/* Carry on processing with a new frame. */
850
851
NEW_FRAME:
852
Fgroup_frame_type = group_frame_type;
853
Fecode = start_ecode; /* Starting code pointer */
854
Fback_frame = frame_size; /* Default is go back one frame */
855
856
/* If this is a special type of group frame, remember its offset for quick
857
access at the end of the group. If this is a recursion, set a new current
858
recursion value. */
859
860
if (group_frame_type != 0)
861
{
862
Flast_group_offset = (char *)F - (char *)match_data->heapframes;
863
if (GF_IDMASK(group_frame_type) == GF_RECURSE)
864
Fcurrent_recurse = GF_DATAMASK(group_frame_type);
865
group_frame_type = 0;
866
}
867
868
869
/* ========================================================================= */
870
/* This is the main processing loop. First check that we haven't recorded too
871
many backtracks (search tree is too large), or that we haven't exceeded the
872
recursive depth limit (used too many backtracking frames). If not, process the
873
opcodes. */
874
875
if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
876
if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
877
878
#ifdef DEBUG_SHOW_OPS
879
fprintf(stderr, "\n++ New frame: type=0x%x subject offset %ld\n",
880
GF_IDMASK(Fgroup_frame_type), Feptr - mb->start_subject);
881
#endif
882
883
for (;;)
884
{
885
#ifdef DEBUG_SHOW_OPS
886
fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
887
OP_names[*Fecode]);
888
#endif
889
890
Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */
891
switch(Fop)
892
{
893
/* ===================================================================== */
894
/* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close
895
any currently open capturing brackets. Unlike reaching the end of a group,
896
where we know the starting frame is at the top of the chained frames, in
897
this case we have to search back for the relevant frame in case other types
898
of group that use chained frames have intervened. Multiple OP_CLOSEs always
899
come innermost first, which matches the chain order. We can ignore this in
900
a recursion, because captures are not passed out of recursions. */
901
902
case OP_CLOSE:
903
if (Fcurrent_recurse == RECURSE_UNSET)
904
{
905
number = GET2(Fecode, 1);
906
offset = Flast_group_offset;
907
for(;;)
908
{
909
/* Corrupted heapframes? Trigger an assert and return an error. */
910
PCRE2_ASSERT(offset != PCRE2_UNSET);
911
if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
912
913
N = (heapframe *)((char *)match_data->heapframes + offset);
914
P = (heapframe *)((char *)N - frame_size);
915
if (N->group_frame_type == (GF_CAPTURE | number)) break;
916
offset = P->last_group_offset;
917
}
918
offset = (number << 1) - 2;
919
Fcapture_last = number;
920
Fovector[offset] = P->eptr - mb->start_subject;
921
Fovector[offset+1] = Feptr - mb->start_subject;
922
if (offset >= Foffset_top) Foffset_top = offset + 2;
923
}
924
Fecode += PRIV(OP_lengths)[*Fecode];
925
break;
926
927
928
/* ===================================================================== */
929
/* Real or forced end of the pattern, assertion, or recursion. In an
930
assertion ACCEPT, update the last used pointer and remember the current
931
frame so that the captures and mark can be fished out of it. */
932
933
case OP_ASSERT_ACCEPT:
934
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
935
assert_accept_frame = F;
936
RRETURN(MATCH_ACCEPT);
937
938
/* For ACCEPT within a recursion, we have to find the most recent
939
recursion. If not in a recursion, fall through to code that is common with
940
OP_END. */
941
942
case OP_ACCEPT:
943
if (Fcurrent_recurse != RECURSE_UNSET)
944
{
945
#ifdef DEBUG_SHOW_OPS
946
fprintf(stderr, "++ Accept within recursion\n");
947
#endif
948
offset = Flast_group_offset;
949
for(;;)
950
{
951
/* Corrupted heapframes? Trigger an assert and return an error. */
952
PCRE2_ASSERT(offset != PCRE2_UNSET);
953
if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
954
955
N = (heapframe *)((char *)match_data->heapframes + offset);
956
P = (heapframe *)((char *)N - frame_size);
957
if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;
958
offset = P->last_group_offset;
959
}
960
961
/* N is now the frame of the recursion; the previous frame is at the
962
OP_RECURSE position. Go back there, copying the current subject position
963
and mark, and the start_match position (\K might have changed it), and
964
then move on past the OP_RECURSE. */
965
966
P->eptr = Feptr;
967
P->mark = Fmark;
968
P->start_match = Fstart_match;
969
F = P;
970
Fecode += 1 + LINK_SIZE;
971
continue;
972
}
973
PCRE2_FALLTHROUGH /* Fall through */
974
975
/* OP_END itself can never be reached within a recursion because that is
976
picked up when the OP_KET that always precedes OP_END is reached. */
977
978
case OP_END:
979
980
/* Fail for an empty string match if either PCRE2_NOTEMPTY is set, or if
981
PCRE2_NOTEMPTY_ATSTART is set and we have matched at the start of the
982
subject. In both cases, backtracking will then try other alternatives, if
983
any. */
984
985
if (Feptr == Fstart_match &&
986
((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
987
((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&
988
Fstart_match == mb->start_subject + mb->start_offset)))
989
{
990
#ifdef DEBUG_SHOW_OPS
991
fprintf(stderr, "++ Backtrack because empty string\n");
992
#endif
993
RRETURN(MATCH_NOMATCH);
994
}
995
996
/* Fail if PCRE2_ENDANCHORED is set and the end of the match is not
997
the end of the subject. After (*ACCEPT) we fail the entire match (at this
998
position) but backtrack if we've reached the end of the pattern. This
999
applies whether or not we are in a recursion. */
1000
1001
if (Feptr < mb->end_subject &&
1002
((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)
1003
{
1004
if (Fop == OP_END)
1005
{
1006
#ifdef DEBUG_SHOW_OPS
1007
fprintf(stderr, "++ Backtrack because not at end (endanchored set)\n");
1008
#endif
1009
RRETURN(MATCH_NOMATCH);
1010
}
1011
1012
#ifdef DEBUG_SHOW_OPS
1013
fprintf(stderr, "++ Failed ACCEPT not at end (endanchored set)\n");
1014
#endif
1015
return MATCH_NOMATCH; /* (*ACCEPT) */
1016
}
1017
1018
/* Fail if we detect that the start position was moved to be either after
1019
the end position (\K in lookahead) or before the start offset (\K in
1020
lookbehind). If this occurs, the pattern must have used \K in a somewhat
1021
sneaky way (e.g. by pattern recursion), because if the \K is actually
1022
syntactically inside the lookaround, it's blocked at compile-time. */
1023
1024
if (Fstart_match < mb->start_subject + mb->start_offset ||
1025
Fstart_match > Feptr)
1026
{
1027
/* The \K expression is fairly rare. We assert it was used so that we
1028
catch any unexpected invalid data in start_match. */
1029
PCRE2_ASSERT(mb->hasbsk);
1030
1031
if (!mb->allowlookaroundbsk)
1032
return PCRE2_ERROR_BAD_BACKSLASH_K;
1033
}
1034
1035
/* We have a successful match of the whole pattern. Record the result and
1036
then do a direct return from the function. If there is space in the offset
1037
vector, set any pairs that follow the highest-numbered captured string but
1038
are less than the number of capturing groups in the pattern to PCRE2_UNSET.
1039
It is documented that this happens. "Gaps" are set to PCRE2_UNSET
1040
dynamically. It is only those at the end that need setting here. */
1041
1042
mb->end_match_ptr = Feptr; /* Record where we ended */
1043
mb->end_offset_top = Foffset_top; /* and how many extracts were taken */
1044
mb->mark = Fmark; /* and the last success mark */
1045
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
1046
1047
match_data->ovector[0] = Fstart_match - mb->start_subject;
1048
match_data->ovector[1] = Feptr - mb->start_subject;
1049
1050
/* Set i to the smaller of the sizes of the external and frame ovectors. */
1051
1052
i = 2 * ((top_bracket + 1 > match_data->oveccount)?
1053
match_data->oveccount : top_bracket + 1);
1054
memcpy(match_data->ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));
1055
while (--i >= Foffset_top + 2) match_data->ovector[i] = PCRE2_UNSET;
1056
return MATCH_MATCH; /* Note: NOT RRETURN */
1057
1058
1059
/*===================================================================== */
1060
/* Match any single character type except newline; have to take care with
1061
CRLF newlines and partial matching. */
1062
1063
case OP_ANY:
1064
if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
1065
if (mb->partial != 0 &&
1066
Feptr == mb->end_subject - 1 &&
1067
NLBLOCK->nltype == NLTYPE_FIXED &&
1068
NLBLOCK->nllen == 2 &&
1069
UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
1070
{
1071
mb->hitend = TRUE;
1072
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
1073
}
1074
PCRE2_FALLTHROUGH /* Fall through */
1075
1076
/* Match any single character whatsoever. */
1077
1078
case OP_ALLANY:
1079
if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
1080
{ /* not be updated before SCHECK_PARTIAL. */
1081
SCHECK_PARTIAL();
1082
RRETURN(MATCH_NOMATCH);
1083
}
1084
Feptr++;
1085
#ifdef SUPPORT_UNICODE
1086
if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
1087
#endif
1088
Fecode++;
1089
break;
1090
1091
1092
/* ===================================================================== */
1093
/* Match a single code unit, even in UTF mode. This opcode really does
1094
match any code unit, even newline. (It really should be called ANYCODEUNIT,
1095
of course - the byte name is from pre-16 bit days.) */
1096
1097
case OP_ANYBYTE:
1098
if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
1099
{ /* not be updated before SCHECK_PARTIAL. */
1100
SCHECK_PARTIAL();
1101
RRETURN(MATCH_NOMATCH);
1102
}
1103
Feptr++;
1104
Fecode++;
1105
break;
1106
1107
1108
/* ===================================================================== */
1109
/* Match a single character, casefully */
1110
1111
case OP_CHAR:
1112
#ifdef SUPPORT_UNICODE
1113
if (utf)
1114
{
1115
Flength = 1;
1116
Fecode++;
1117
GETCHARLEN(fc, Fecode, Flength);
1118
if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr))
1119
{
1120
CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
1121
RRETURN(MATCH_NOMATCH);
1122
}
1123
for (; Flength > 0; Flength--)
1124
{
1125
if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH);
1126
}
1127
}
1128
else
1129
#endif
1130
1131
/* Not UTF mode */
1132
{
1133
if (mb->end_subject - Feptr < 1)
1134
{
1135
SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
1136
RRETURN(MATCH_NOMATCH);
1137
}
1138
if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH);
1139
Fecode += 2;
1140
}
1141
break;
1142
1143
1144
/* ===================================================================== */
1145
/* Match a single character, caselessly. If we are at the end of the
1146
subject, give up immediately. We get here only when the pattern character
1147
has at most one other case. Characters with more than two cases are coded
1148
as OP_PROP with the pseudo-property PT_CLIST. */
1149
1150
case OP_CHARI:
1151
if (Feptr >= mb->end_subject)
1152
{
1153
SCHECK_PARTIAL();
1154
RRETURN(MATCH_NOMATCH);
1155
}
1156
1157
#ifdef SUPPORT_UNICODE
1158
if (utf)
1159
{
1160
Flength = 1;
1161
Fecode++;
1162
GETCHARLEN(fc, Fecode, Flength);
1163
1164
/* If the pattern character's value is < 128, we know that its other case
1165
(if any) is also < 128 (and therefore only one code unit long in all
1166
code-unit widths), so we can use the fast lookup table. We checked above
1167
that there is at least one character left in the subject. */
1168
1169
if (fc < 128)
1170
{
1171
uint32_t cc = UCHAR21(Feptr);
1172
if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
1173
Fecode++;
1174
Feptr++;
1175
}
1176
1177
/* Otherwise we must pick up the subject character and use Unicode
1178
property support to test its other case. Note that we cannot use the
1179
value of "Flength" to check for sufficient bytes left, because the other
1180
case of the character may have more or fewer code units. */
1181
1182
else
1183
{
1184
uint32_t dc;
1185
GETCHARINC(dc, Feptr);
1186
Fecode += Flength;
1187
if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
1188
}
1189
}
1190
1191
/* If UCP is set without UTF we must do the same as above, but with one
1192
character per code unit. */
1193
1194
else if (ucp)
1195
{
1196
uint32_t cc = UCHAR21(Feptr);
1197
fc = Fecode[1];
1198
if (fc < 128)
1199
{
1200
if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
1201
}
1202
else
1203
{
1204
if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
1205
}
1206
Feptr++;
1207
Fecode += 2;
1208
}
1209
1210
else
1211
#endif /* SUPPORT_UNICODE */
1212
1213
/* Not UTF or UCP mode; use the table for characters < 256. */
1214
{
1215
if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
1216
!= TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
1217
Feptr++;
1218
Fecode += 2;
1219
}
1220
break;
1221
1222
1223
/* ===================================================================== */
1224
/* Match a single character that is not the pattern character (for OP_NOTI, also not its other case). */
1225
1226
case OP_NOT:
1227
case OP_NOTI:
1228
if (Feptr >= mb->end_subject)
1229
{
1230
SCHECK_PARTIAL();
1231
RRETURN(MATCH_NOMATCH);
1232
}
1233
1234
#ifdef SUPPORT_UNICODE
1235
if (utf)
1236
{
1237
uint32_t ch;
1238
Fecode++;
1239
GETCHARINC(ch, Fecode);
1240
GETCHARINC(fc, Feptr);
1241
if (ch == fc)
1242
{
1243
RRETURN(MATCH_NOMATCH); /* Caseful match */
1244
}
1245
else if (Fop == OP_NOTI) /* If caseless */
1246
{
1247
if (ch > 127)
1248
ch = UCD_OTHERCASE(ch);
1249
else
1250
ch = (mb->fcc)[ch];
1251
if (ch == fc) RRETURN(MATCH_NOMATCH);
1252
}
1253
}
1254
1255
/* UCP without UTF is as above, but with one character per code unit. */
1256
1257
else if (ucp)
1258
{
1259
uint32_t ch;
1260
fc = UCHAR21INC(Feptr);
1261
ch = Fecode[1];
1262
Fecode += 2;
1263
1264
if (ch == fc)
1265
{
1266
RRETURN(MATCH_NOMATCH); /* Caseful match */
1267
}
1268
else if (Fop == OP_NOTI) /* If caseless */
1269
{
1270
if (ch > 127)
1271
ch = UCD_OTHERCASE(ch);
1272
else
1273
ch = (mb->fcc)[ch];
1274
if (ch == fc) RRETURN(MATCH_NOMATCH);
1275
}
1276
}
1277
1278
else
1279
#endif /* SUPPORT_UNICODE */
1280
1281
/* Neither UTF nor UCP is set */
1282
1283
{
1284
uint32_t ch = Fecode[1];
1285
fc = UCHAR21INC(Feptr);
1286
if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
1287
RRETURN(MATCH_NOMATCH);
1288
Fecode += 2;
1289
}
1290
break;
1291
1292
1293
/* ===================================================================== */
1294
/* Match a single character repeatedly. */
1295
1296
#define Loclength F->temp_size
1297
#define Lstart_eptr F->temp_sptr[0]
1298
#define Lcharptr F->temp_sptr[1]
1299
#define Lmin F->temp_32[0]
1300
#define Lmax F->temp_32[1]
1301
#define Lc F->temp_32[2]
1302
#define Loc F->temp_32[3]
1303
1304
case OP_EXACT:
1305
case OP_EXACTI:
1306
Lmin = Lmax = GET2(Fecode, 1);
1307
Fecode += 1 + IMM2_SIZE;
1308
goto REPEATCHAR;
1309
1310
case OP_POSUPTO:
1311
case OP_POSUPTOI:
1312
reptype = REPTYPE_POS;
1313
Lmin = 0;
1314
Lmax = GET2(Fecode, 1);
1315
Fecode += 1 + IMM2_SIZE;
1316
goto REPEATCHAR;
1317
1318
case OP_UPTO:
1319
case OP_UPTOI:
1320
reptype = REPTYPE_MAX;
1321
Lmin = 0;
1322
Lmax = GET2(Fecode, 1);
1323
Fecode += 1 + IMM2_SIZE;
1324
goto REPEATCHAR;
1325
1326
case OP_MINUPTO:
1327
case OP_MINUPTOI:
1328
reptype = REPTYPE_MIN;
1329
Lmin = 0;
1330
Lmax = GET2(Fecode, 1);
1331
Fecode += 1 + IMM2_SIZE;
1332
goto REPEATCHAR;
1333
1334
case OP_POSSTAR:
1335
case OP_POSSTARI:
1336
reptype = REPTYPE_POS;
1337
Lmin = 0;
1338
Lmax = UINT32_MAX;
1339
Fecode++;
1340
goto REPEATCHAR;
1341
1342
case OP_POSPLUS:
1343
case OP_POSPLUSI:
1344
reptype = REPTYPE_POS;
1345
Lmin = 1;
1346
Lmax = UINT32_MAX;
1347
Fecode++;
1348
goto REPEATCHAR;
1349
1350
case OP_POSQUERY:
1351
case OP_POSQUERYI:
1352
reptype = REPTYPE_POS;
1353
Lmin = 0;
1354
Lmax = 1;
1355
Fecode++;
1356
goto REPEATCHAR;
1357
1358
case OP_STAR:
1359
case OP_STARI:
1360
case OP_MINSTAR:
1361
case OP_MINSTARI:
1362
case OP_PLUS:
1363
case OP_PLUSI:
1364
case OP_MINPLUS:
1365
case OP_MINPLUSI:
1366
case OP_QUERY:
1367
case OP_QUERYI:
1368
case OP_MINQUERY:
1369
case OP_MINQUERYI:
1370
fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI);
1371
Lmin = rep_min[fc];
1372
Lmax = rep_max[fc];
1373
reptype = rep_typ[fc];
1374
1375
/* Common code for all repeated single-character matches. We first check
1376
for the minimum number of characters. If the minimum equals the maximum, we
1377
are done. Otherwise, if minimizing, check the rest of the pattern for a
1378
match; if there isn't one, advance up to the maximum, one character at a
1379
time.
1380
1381
If maximizing, advance up to the maximum number of matching characters,
1382
until Feptr is past the end of the maximum run. If possessive, we are
1383
then done (no backing up). Otherwise, match at this position; anything
1384
other than no match is immediately returned. For nomatch, back up one
1385
character, unless we are matching \R and the last thing matched was
1386
\r\n, in which case, back up two code units until we reach the first
1387
optional character position.
1388
1389
The various UTF/non-UTF and caseful/caseless cases are handled separately,
1390
for speed. */
1391
1392
REPEATCHAR:
1393
#ifdef SUPPORT_UNICODE
1394
if (utf)
1395
{
1396
Flength = 1;
1397
Lcharptr = Fecode;
1398
GETCHARLEN(fc, Fecode, Flength);
1399
Fecode += Flength;
1400
1401
/* Handle multi-code-unit character matching, caseful and caseless. */
1402
1403
if (Flength > 1)
1404
{
1405
uint32_t othercase;
1406
1407
if (Fop >= OP_STARI && /* Caseless */
1408
(othercase = UCD_OTHERCASE(fc)) != fc)
1409
Loclength = PRIV(ord2utf)(othercase, Foccu);
1410
else Loclength = 0;
1411
1412
for (i = 1; i <= Lmin; i++)
1413
{
1414
if (Feptr <= mb->end_subject - Flength &&
1415
memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
1416
else if (Loclength > 0 &&
1417
Feptr <= mb->end_subject - Loclength &&
1418
memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1419
Feptr += Loclength;
1420
else
1421
{
1422
CHECK_PARTIAL();
1423
RRETURN(MATCH_NOMATCH);
1424
}
1425
}
1426
1427
if (Lmin == Lmax) continue;
1428
1429
if (reptype == REPTYPE_MIN)
1430
{
1431
for (;;)
1432
{
1433
RMATCH(Fecode, RM202);
1434
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1435
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1436
if (Feptr <= mb->end_subject - Flength &&
1437
memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
1438
else if (Loclength > 0 &&
1439
Feptr <= mb->end_subject - Loclength &&
1440
memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1441
Feptr += Loclength;
1442
else
1443
{
1444
CHECK_PARTIAL();
1445
RRETURN(MATCH_NOMATCH);
1446
}
1447
}
1448
PCRE2_UNREACHABLE(); /* Control never reaches here */
1449
}
1450
1451
else /* Maximize */
1452
{
1453
Lstart_eptr = Feptr;
1454
for (i = Lmin; i < Lmax; i++)
1455
{
1456
if (Feptr <= mb->end_subject - Flength &&
1457
memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0)
1458
Feptr += Flength;
1459
else if (Loclength > 0 &&
1460
Feptr <= mb->end_subject - Loclength &&
1461
memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1462
Feptr += Loclength;
1463
else
1464
{
1465
CHECK_PARTIAL();
1466
break;
1467
}
1468
}
1469
1470
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
1471
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1472
go too far. */
1473
1474
if (reptype != REPTYPE_POS) for(;;)
1475
{
1476
if (Feptr <= Lstart_eptr) break;
1477
RMATCH(Fecode, RM203);
1478
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1479
Feptr--;
1480
BACKCHAR(Feptr);
1481
}
1482
}
1483
break; /* End of repeated wide character handling */
1484
}
1485
1486
/* Length of UTF character is 1. Put it into the preserved variable and
1487
fall through to the non-UTF code. */
1488
1489
Lc = fc;
1490
}
1491
else
1492
#endif /* SUPPORT_UNICODE */
1493
1494
/* When not in UTF mode, load a single-code-unit character. Then proceed as
1495
above, using Unicode casing if either UTF or UCP is set. */
1496
1497
Lc = *Fecode++;
1498
1499
/* Caseless comparison */
1500
1501
if (Fop >= OP_STARI)
1502
{
1503
#if PCRE2_CODE_UNIT_WIDTH == 8
1504
#ifdef SUPPORT_UNICODE
1505
if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
1506
else
1507
#endif /* SUPPORT_UNICODE */
1508
/* Lc will be < 128 in UTF-8 mode. */
1509
Loc = mb->fcc[Lc];
1510
#else /* 16-bit & 32-bit */
1511
#ifdef SUPPORT_UNICODE
1512
if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
1513
else
1514
#endif /* SUPPORT_UNICODE */
1515
Loc = TABLE_GET(Lc, mb->fcc, Lc);
1516
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
1517
1518
for (i = 1; i <= Lmin; i++)
1519
{
1520
uint32_t cc; /* Faster than PCRE2_UCHAR */
1521
if (Feptr >= mb->end_subject)
1522
{
1523
SCHECK_PARTIAL();
1524
RRETURN(MATCH_NOMATCH);
1525
}
1526
cc = UCHAR21TEST(Feptr);
1527
if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
1528
Feptr++;
1529
}
1530
if (Lmin == Lmax) continue;
1531
1532
if (reptype == REPTYPE_MIN)
1533
{
1534
for (;;)
1535
{
1536
uint32_t cc; /* Faster than PCRE2_UCHAR */
1537
RMATCH(Fecode, RM25);
1538
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1539
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1540
if (Feptr >= mb->end_subject)
1541
{
1542
SCHECK_PARTIAL();
1543
RRETURN(MATCH_NOMATCH);
1544
}
1545
cc = UCHAR21TEST(Feptr);
1546
if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
1547
Feptr++;
1548
}
1549
PCRE2_UNREACHABLE(); /* Control never reaches here */
1550
}
1551
1552
else /* Maximize */
1553
{
1554
Lstart_eptr = Feptr;
1555
for (i = Lmin; i < Lmax; i++)
1556
{
1557
uint32_t cc; /* Faster than PCRE2_UCHAR */
1558
if (Feptr >= mb->end_subject)
1559
{
1560
SCHECK_PARTIAL();
1561
break;
1562
}
1563
cc = UCHAR21TEST(Feptr);
1564
if (Lc != cc && Loc != cc) break;
1565
Feptr++;
1566
}
1567
if (reptype != REPTYPE_POS) for (;;)
1568
{
1569
if (Feptr == Lstart_eptr) break;
1570
RMATCH(Fecode, RM26);
1571
Feptr--;
1572
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1573
}
1574
}
1575
}
1576
1577
/* Caseful comparisons (includes all multi-byte characters) */
1578
1579
else
1580
{
1581
for (i = 1; i <= Lmin; i++)
1582
{
1583
if (Feptr >= mb->end_subject)
1584
{
1585
SCHECK_PARTIAL();
1586
RRETURN(MATCH_NOMATCH);
1587
}
1588
if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
1589
}
1590
1591
if (Lmin == Lmax) continue;
1592
1593
if (reptype == REPTYPE_MIN)
1594
{
1595
for (;;)
1596
{
1597
RMATCH(Fecode, RM27);
1598
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1599
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1600
if (Feptr >= mb->end_subject)
1601
{
1602
SCHECK_PARTIAL();
1603
RRETURN(MATCH_NOMATCH);
1604
}
1605
if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
1606
}
1607
PCRE2_UNREACHABLE(); /* Control never reaches here */
1608
}
1609
else /* Maximize */
1610
{
1611
Lstart_eptr = Feptr;
1612
for (i = Lmin; i < Lmax; i++)
1613
{
1614
if (Feptr >= mb->end_subject)
1615
{
1616
SCHECK_PARTIAL();
1617
break;
1618
}
1619
1620
if (Lc != UCHAR21TEST(Feptr)) break;
1621
Feptr++;
1622
}
1623
1624
if (reptype != REPTYPE_POS) for (;;)
1625
{
1626
if (Feptr <= Lstart_eptr) break;
1627
RMATCH(Fecode, RM28);
1628
Feptr--;
1629
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1630
}
1631
}
1632
}
1633
break;
1634
1635
#undef Loclength
1636
#undef Lstart_eptr
1637
#undef Lcharptr
1638
#undef Lmin
1639
#undef Lmax
1640
#undef Lc
1641
#undef Loc
1642
1643
1644
/* ===================================================================== */
1645
/* Match a negated single one-byte character repeatedly. This is almost a
1646
repeat of the code for a repeated single character, but I haven't found a
1647
nice way of commoning these up that doesn't require a test of the
1648
positive/negative option for each character match. Maybe that wouldn't add
1649
very much to the time taken, but character matching *is* what this is all
1650
about... */
1651
1652
#define Lstart_eptr F->temp_sptr[0]
1653
#define Lmin F->temp_32[0]
1654
#define Lmax F->temp_32[1]
1655
#define Lc F->temp_32[2]
1656
#define Loc F->temp_32[3]
1657
1658
case OP_NOTEXACT:
1659
case OP_NOTEXACTI:
1660
Lmin = Lmax = GET2(Fecode, 1);
1661
Fecode += 1 + IMM2_SIZE;
1662
goto REPEATNOTCHAR;
1663
1664
case OP_NOTUPTO:
1665
case OP_NOTUPTOI:
1666
Lmin = 0;
1667
Lmax = GET2(Fecode, 1);
1668
reptype = REPTYPE_MAX;
1669
Fecode += 1 + IMM2_SIZE;
1670
goto REPEATNOTCHAR;
1671
1672
case OP_NOTMINUPTO:
1673
case OP_NOTMINUPTOI:
1674
Lmin = 0;
1675
Lmax = GET2(Fecode, 1);
1676
reptype = REPTYPE_MIN;
1677
Fecode += 1 + IMM2_SIZE;
1678
goto REPEATNOTCHAR;
1679
1680
case OP_NOTPOSSTAR:
1681
case OP_NOTPOSSTARI:
1682
reptype = REPTYPE_POS;
1683
Lmin = 0;
1684
Lmax = UINT32_MAX;
1685
Fecode++;
1686
goto REPEATNOTCHAR;
1687
1688
case OP_NOTPOSPLUS:
1689
case OP_NOTPOSPLUSI:
1690
reptype = REPTYPE_POS;
1691
Lmin = 1;
1692
Lmax = UINT32_MAX;
1693
Fecode++;
1694
goto REPEATNOTCHAR;
1695
1696
case OP_NOTPOSQUERY:
1697
case OP_NOTPOSQUERYI:
1698
reptype = REPTYPE_POS;
1699
Lmin = 0;
1700
Lmax = 1;
1701
Fecode++;
1702
goto REPEATNOTCHAR;
1703
1704
case OP_NOTPOSUPTO:
1705
case OP_NOTPOSUPTOI:
1706
reptype = REPTYPE_POS;
1707
Lmin = 0;
1708
Lmax = GET2(Fecode, 1);
1709
Fecode += 1 + IMM2_SIZE;
1710
goto REPEATNOTCHAR;
1711
1712
case OP_NOTSTAR:
1713
case OP_NOTSTARI:
1714
case OP_NOTMINSTAR:
1715
case OP_NOTMINSTARI:
1716
case OP_NOTPLUS:
1717
case OP_NOTPLUSI:
1718
case OP_NOTMINPLUS:
1719
case OP_NOTMINPLUSI:
1720
case OP_NOTQUERY:
1721
case OP_NOTQUERYI:
1722
case OP_NOTMINQUERY:
1723
case OP_NOTMINQUERYI:
1724
fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
1725
Lmin = rep_min[fc];
1726
Lmax = rep_max[fc];
1727
reptype = rep_typ[fc];
1728
1729
/* Common code for all repeated single-character non-matches. */
1730
1731
REPEATNOTCHAR:
1732
GETCHARINCTEST(Lc, Fecode);
1733
1734
/* The code is duplicated for the caseless and caseful cases, for speed,
1735
since matching characters is likely to be quite common. First, ensure the
1736
minimum number of matches are present. If Lmin = Lmax, we are done.
1737
Otherwise, if minimizing, keep trying the rest of the expression and
1738
advancing one matching character if failing, up to the maximum.
1739
Alternatively, if maximizing, find the maximum number of characters and
1740
work backwards. */
1741
1742
if (Fop >= OP_NOTSTARI) /* Caseless */
1743
{
1744
#ifdef SUPPORT_UNICODE
1745
if ((utf || ucp) && Lc > 127)
1746
Loc = UCD_OTHERCASE(Lc);
1747
else
1748
#endif /* SUPPORT_UNICODE */
1749
1750
Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */
1751
1752
#ifdef SUPPORT_UNICODE
1753
if (utf)
1754
{
1755
uint32_t d;
1756
for (i = 1; i <= Lmin; i++)
1757
{
1758
if (Feptr >= mb->end_subject)
1759
{
1760
SCHECK_PARTIAL();
1761
RRETURN(MATCH_NOMATCH);
1762
}
1763
GETCHARINC(d, Feptr);
1764
if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
1765
}
1766
}
1767
else
1768
#endif /* SUPPORT_UNICODE */
1769
1770
/* Not UTF mode */
1771
{
1772
for (i = 1; i <= Lmin; i++)
1773
{
1774
if (Feptr >= mb->end_subject)
1775
{
1776
SCHECK_PARTIAL();
1777
RRETURN(MATCH_NOMATCH);
1778
}
1779
if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
1780
Feptr++;
1781
}
1782
}
1783
1784
if (Lmin == Lmax) continue; /* Finished for exact count */
1785
1786
if (reptype == REPTYPE_MIN)
1787
{
1788
#ifdef SUPPORT_UNICODE
1789
if (utf)
1790
{
1791
uint32_t d;
1792
for (;;)
1793
{
1794
RMATCH(Fecode, RM204);
1795
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1796
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1797
if (Feptr >= mb->end_subject)
1798
{
1799
SCHECK_PARTIAL();
1800
RRETURN(MATCH_NOMATCH);
1801
}
1802
GETCHARINC(d, Feptr);
1803
if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
1804
}
1805
}
1806
else
1807
#endif /*SUPPORT_UNICODE */
1808
1809
/* Not UTF mode */
1810
{
1811
for (;;)
1812
{
1813
RMATCH(Fecode, RM29);
1814
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1815
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1816
if (Feptr >= mb->end_subject)
1817
{
1818
SCHECK_PARTIAL();
1819
RRETURN(MATCH_NOMATCH);
1820
}
1821
if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
1822
Feptr++;
1823
}
1824
}
1825
PCRE2_UNREACHABLE(); /* Control never reaches here */
1826
}
1827
1828
/* Maximize case */
1829
1830
else
1831
{
1832
Lstart_eptr = Feptr;
1833
1834
#ifdef SUPPORT_UNICODE
1835
if (utf)
1836
{
1837
uint32_t d;
1838
for (i = Lmin; i < Lmax; i++)
1839
{
1840
int len = 1;
1841
if (Feptr >= mb->end_subject)
1842
{
1843
SCHECK_PARTIAL();
1844
break;
1845
}
1846
GETCHARLEN(d, Feptr, len);
1847
if (Lc == d || Loc == d) break;
1848
Feptr += len;
1849
}
1850
1851
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
1852
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1853
go too far. */
1854
1855
if (reptype != REPTYPE_POS) for(;;)
1856
{
1857
if (Feptr <= Lstart_eptr) break;
1858
RMATCH(Fecode, RM205);
1859
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1860
Feptr--;
1861
BACKCHAR(Feptr);
1862
}
1863
}
1864
else
1865
#endif /* SUPPORT_UNICODE */
1866
1867
/* Not UTF mode */
1868
{
1869
for (i = Lmin; i < Lmax; i++)
1870
{
1871
if (Feptr >= mb->end_subject)
1872
{
1873
SCHECK_PARTIAL();
1874
break;
1875
}
1876
if (Lc == *Feptr || Loc == *Feptr) break;
1877
Feptr++;
1878
}
1879
if (reptype != REPTYPE_POS) for (;;)
1880
{
1881
if (Feptr == Lstart_eptr) break;
1882
RMATCH(Fecode, RM30);
1883
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1884
Feptr--;
1885
}
1886
}
1887
}
1888
}
1889
1890
/* Caseful comparisons */
1891
1892
else
1893
{
1894
#ifdef SUPPORT_UNICODE
1895
if (utf)
1896
{
1897
uint32_t d;
1898
for (i = 1; i <= Lmin; i++)
1899
{
1900
if (Feptr >= mb->end_subject)
1901
{
1902
SCHECK_PARTIAL();
1903
RRETURN(MATCH_NOMATCH);
1904
}
1905
GETCHARINC(d, Feptr);
1906
if (Lc == d) RRETURN(MATCH_NOMATCH);
1907
}
1908
}
1909
else
1910
#endif
1911
/* Not UTF mode */
1912
{
1913
for (i = 1; i <= Lmin; i++)
1914
{
1915
if (Feptr >= mb->end_subject)
1916
{
1917
SCHECK_PARTIAL();
1918
RRETURN(MATCH_NOMATCH);
1919
}
1920
if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
1921
}
1922
}
1923
1924
if (Lmin == Lmax) continue;
1925
1926
if (reptype == REPTYPE_MIN)
1927
{
1928
#ifdef SUPPORT_UNICODE
1929
if (utf)
1930
{
1931
uint32_t d;
1932
for (;;)
1933
{
1934
RMATCH(Fecode, RM206);
1935
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1936
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1937
if (Feptr >= mb->end_subject)
1938
{
1939
SCHECK_PARTIAL();
1940
RRETURN(MATCH_NOMATCH);
1941
}
1942
GETCHARINC(d, Feptr);
1943
if (Lc == d) RRETURN(MATCH_NOMATCH);
1944
}
1945
}
1946
else
1947
#endif
1948
/* Not UTF mode */
1949
{
1950
for (;;)
1951
{
1952
RMATCH(Fecode, RM31);
1953
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1954
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1955
if (Feptr >= mb->end_subject)
1956
{
1957
SCHECK_PARTIAL();
1958
RRETURN(MATCH_NOMATCH);
1959
}
1960
if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
1961
}
1962
}
1963
PCRE2_UNREACHABLE(); /* Control never reaches here */
1964
}
1965
1966
/* Maximize case */
1967
1968
else
1969
{
1970
Lstart_eptr = Feptr;
1971
1972
#ifdef SUPPORT_UNICODE
1973
if (utf)
1974
{
1975
uint32_t d;
1976
for (i = Lmin; i < Lmax; i++)
1977
{
1978
int len = 1;
1979
if (Feptr >= mb->end_subject)
1980
{
1981
SCHECK_PARTIAL();
1982
break;
1983
}
1984
GETCHARLEN(d, Feptr, len);
1985
if (Lc == d) break;
1986
Feptr += len;
1987
}
1988
1989
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
1990
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1991
go too far. */
1992
1993
if (reptype != REPTYPE_POS) for(;;)
1994
{
1995
if (Feptr <= Lstart_eptr) break;
1996
RMATCH(Fecode, RM207);
1997
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1998
Feptr--;
1999
BACKCHAR(Feptr);
2000
}
2001
}
2002
else
2003
#endif
2004
/* Not UTF mode */
2005
{
2006
for (i = Lmin; i < Lmax; i++)
2007
{
2008
if (Feptr >= mb->end_subject)
2009
{
2010
SCHECK_PARTIAL();
2011
break;
2012
}
2013
if (Lc == *Feptr) break;
2014
Feptr++;
2015
}
2016
if (reptype != REPTYPE_POS) for (;;)
2017
{
2018
if (Feptr == Lstart_eptr) break;
2019
RMATCH(Fecode, RM32);
2020
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2021
Feptr--;
2022
}
2023
}
2024
}
2025
}
2026
break;
2027
2028
#undef Lstart_eptr
2029
#undef Lmin
2030
#undef Lmax
2031
#undef Lc
2032
#undef Loc
2033
2034
2035
/* ===================================================================== */
2036
/* Match a bit-mapped character class, possibly repeatedly. These opcodes
2037
are used when all the characters in the class have values in the range
2038
0-255, and either the matching is caseful, or the characters are in the
2039
range 0-127 when UTF processing is enabled. The only difference between
2040
OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2041
encountered. */
2042
2043
#define Lmin F->temp_32[0]
2044
#define Lmax F->temp_32[1]
2045
#define Lstart_eptr F->temp_sptr[0]
2046
#define Lbyte_map_address F->temp_sptr[1]
2047
#define Lbyte_map ((const unsigned char *)Lbyte_map_address)
2048
2049
case OP_NCLASS:
2050
case OP_CLASS:
2051
{
2052
Lbyte_map_address = Fecode + 1; /* Save for matching */
2053
Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */
2054
2055
/* Look past the end of the item to see if there is repeat information
2056
following. Then obey similar code to character type repeats. */
2057
2058
switch (*Fecode)
2059
{
2060
case OP_CRSTAR:
2061
case OP_CRMINSTAR:
2062
case OP_CRPLUS:
2063
case OP_CRMINPLUS:
2064
case OP_CRQUERY:
2065
case OP_CRMINQUERY:
2066
case OP_CRPOSSTAR:
2067
case OP_CRPOSPLUS:
2068
case OP_CRPOSQUERY:
2069
fc = *Fecode++ - OP_CRSTAR;
2070
Lmin = rep_min[fc];
2071
Lmax = rep_max[fc];
2072
reptype = rep_typ[fc];
2073
break;
2074
2075
case OP_CRRANGE:
2076
case OP_CRMINRANGE:
2077
case OP_CRPOSRANGE:
2078
Lmin = GET2(Fecode, 1);
2079
Lmax = GET2(Fecode, 1 + IMM2_SIZE);
2080
if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
2081
reptype = rep_typ[*Fecode - OP_CRSTAR];
2082
Fecode += 1 + 2 * IMM2_SIZE;
2083
break;
2084
2085
default: /* No repeat follows */
2086
Lmin = Lmax = 1;
2087
break;
2088
}
2089
2090
/* First, ensure the minimum number of matches are present. */
2091
2092
#ifdef SUPPORT_UNICODE
2093
if (utf)
2094
{
2095
for (i = 1; i <= Lmin; i++)
2096
{
2097
if (Feptr >= mb->end_subject)
2098
{
2099
SCHECK_PARTIAL();
2100
RRETURN(MATCH_NOMATCH);
2101
}
2102
GETCHARINC(fc, Feptr);
2103
if (fc > 255)
2104
{
2105
if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2106
}
2107
else
2108
if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2109
}
2110
}
2111
else
2112
#endif
2113
/* Not UTF mode */
2114
{
2115
for (i = 1; i <= Lmin; i++)
2116
{
2117
if (Feptr >= mb->end_subject)
2118
{
2119
SCHECK_PARTIAL();
2120
RRETURN(MATCH_NOMATCH);
2121
}
2122
fc = *Feptr++;
2123
#if PCRE2_CODE_UNIT_WIDTH != 8
2124
if (fc > 255)
2125
{
2126
if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2127
}
2128
else
2129
#endif
2130
if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2131
}
2132
}
2133
2134
/* If Lmax == Lmin we are done. Continue with main loop. */
2135
2136
if (Lmin == Lmax) continue;
2137
2138
/* If minimizing, keep testing the rest of the expression and advancing
2139
the pointer while it matches the class. */
2140
2141
if (reptype == REPTYPE_MIN)
2142
{
2143
#ifdef SUPPORT_UNICODE
2144
if (utf)
2145
{
2146
for (;;)
2147
{
2148
RMATCH(Fecode, RM200);
2149
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2150
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
2151
if (Feptr >= mb->end_subject)
2152
{
2153
SCHECK_PARTIAL();
2154
RRETURN(MATCH_NOMATCH);
2155
}
2156
GETCHARINC(fc, Feptr);
2157
if (fc > 255)
2158
{
2159
if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2160
}
2161
else
2162
if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2163
}
2164
}
2165
else
2166
#endif
2167
/* Not UTF mode */
2168
{
2169
for (;;)
2170
{
2171
RMATCH(Fecode, RM23);
2172
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2173
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
2174
if (Feptr >= mb->end_subject)
2175
{
2176
SCHECK_PARTIAL();
2177
RRETURN(MATCH_NOMATCH);
2178
}
2179
fc = *Feptr++;
2180
#if PCRE2_CODE_UNIT_WIDTH != 8
2181
if (fc > 255)
2182
{
2183
if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2184
}
2185
else
2186
#endif
2187
if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2188
}
2189
}
2190
PCRE2_UNREACHABLE(); /* Control never reaches here */
2191
}
2192
2193
/* If maximizing, find the longest possible run, then work backwards. */
2194
2195
else
2196
{
2197
Lstart_eptr = Feptr;
2198
2199
#ifdef SUPPORT_UNICODE
2200
if (utf)
2201
{
2202
for (i = Lmin; i < Lmax; i++)
2203
{
2204
int len = 1;
2205
if (Feptr >= mb->end_subject)
2206
{
2207
SCHECK_PARTIAL();
2208
break;
2209
}
2210
GETCHARLEN(fc, Feptr, len);
2211
if (fc > 255)
2212
{
2213
if (Fop == OP_CLASS) break;
2214
}
2215
else
2216
if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
2217
Feptr += len;
2218
}
2219
2220
if (reptype == REPTYPE_POS) continue; /* No backtracking */
2221
2222
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
2223
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2224
go too far. */
2225
2226
for (;;)
2227
{
2228
RMATCH(Fecode, RM201);
2229
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2230
if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2231
BACKCHAR(Feptr);
2232
}
2233
}
2234
else
2235
#endif
2236
/* Not UTF mode */
2237
{
2238
for (i = Lmin; i < Lmax; i++)
2239
{
2240
if (Feptr >= mb->end_subject)
2241
{
2242
SCHECK_PARTIAL();
2243
break;
2244
}
2245
fc = *Feptr;
2246
#if PCRE2_CODE_UNIT_WIDTH != 8
2247
if (fc > 255)
2248
{
2249
if (Fop == OP_CLASS) break;
2250
}
2251
else
2252
#endif
2253
if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
2254
Feptr++;
2255
}
2256
2257
if (reptype == REPTYPE_POS) continue; /* No backtracking */
2258
2259
while (Feptr >= Lstart_eptr)
2260
{
2261
RMATCH(Fecode, RM24);
2262
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2263
Feptr--;
2264
}
2265
}
2266
2267
RRETURN(MATCH_NOMATCH);
2268
}
2269
}
2270
2271
PCRE2_UNREACHABLE(); /* Control never reaches here */
2272
2273
#undef Lbyte_map_address
2274
#undef Lbyte_map
2275
#undef Lstart_eptr
2276
#undef Lmin
2277
#undef Lmax
2278
2279
2280
/* ===================================================================== */
2281
/* Match an extended character class. In the 8-bit library, this opcode is
2282
encountered only when UTF-8 mode is supported. In the 16-bit and
2283
32-bit libraries, codepoints greater than 255 may be encountered even when
2284
UTF is not supported. */
2285
2286
#define Lstart_eptr F->temp_sptr[0]
2287
#define Lxclass_data F->temp_sptr[1]
2288
#define Lmin F->temp_32[0]
2289
#define Lmax F->temp_32[1]
2290
2291
#ifdef SUPPORT_WIDE_CHARS
2292
case OP_XCLASS:
2293
{
2294
Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */
2295
Fecode += GET(Fecode, 1); /* Advance past the item */
2296
2297
switch (*Fecode)
2298
{
2299
case OP_CRSTAR:
2300
case OP_CRMINSTAR:
2301
case OP_CRPLUS:
2302
case OP_CRMINPLUS:
2303
case OP_CRQUERY:
2304
case OP_CRMINQUERY:
2305
case OP_CRPOSSTAR:
2306
case OP_CRPOSPLUS:
2307
case OP_CRPOSQUERY:
2308
fc = *Fecode++ - OP_CRSTAR;
2309
Lmin = rep_min[fc];
2310
Lmax = rep_max[fc];
2311
reptype = rep_typ[fc];
2312
break;
2313
2314
case OP_CRRANGE:
2315
case OP_CRMINRANGE:
2316
case OP_CRPOSRANGE:
2317
Lmin = GET2(Fecode, 1);
2318
Lmax = GET2(Fecode, 1 + IMM2_SIZE);
2319
if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
2320
reptype = rep_typ[*Fecode - OP_CRSTAR];
2321
Fecode += 1 + 2 * IMM2_SIZE;
2322
break;
2323
2324
default: /* No repeat follows */
2325
Lmin = Lmax = 1;
2326
break;
2327
}
2328
2329
/* First, ensure the minimum number of matches are present. */
2330
2331
for (i = 1; i <= Lmin; i++)
2332
{
2333
if (Feptr >= mb->end_subject)
2334
{
2335
SCHECK_PARTIAL();
2336
RRETURN(MATCH_NOMATCH);
2337
}
2338
GETCHARINCTEST(fc, Feptr);
2339
if (!PRIV(xclass)(fc, Lxclass_data,
2340
(const uint8_t*)mb->start_code, utf))
2341
RRETURN(MATCH_NOMATCH);
2342
}
2343
2344
/* If Lmax == Lmin we can just continue with the main loop. */
2345
2346
if (Lmin == Lmax) continue;
2347
2348
/* If minimizing, keep testing the rest of the expression and advancing
2349
the pointer while it matches the class. */
2350
2351
if (reptype == REPTYPE_MIN)
2352
{
2353
for (;;)
2354
{
2355
RMATCH(Fecode, RM100);
2356
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2357
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
2358
if (Feptr >= mb->end_subject)
2359
{
2360
SCHECK_PARTIAL();
2361
RRETURN(MATCH_NOMATCH);
2362
}
2363
GETCHARINCTEST(fc, Feptr);
2364
if (!PRIV(xclass)(fc, Lxclass_data,
2365
(const uint8_t*)mb->start_code, utf))
2366
RRETURN(MATCH_NOMATCH);
2367
}
2368
PCRE2_UNREACHABLE(); /* Control never reaches here */
2369
}
2370
2371
/* If maximizing, find the longest possible run, then work backwards. */
2372
2373
else
2374
{
2375
Lstart_eptr = Feptr;
2376
for (i = Lmin; i < Lmax; i++)
2377
{
2378
int len = 1;
2379
if (Feptr >= mb->end_subject)
2380
{
2381
SCHECK_PARTIAL();
2382
break;
2383
}
2384
#ifdef SUPPORT_UNICODE
2385
GETCHARLENTEST(fc, Feptr, len);
2386
#else
2387
fc = *Feptr;
2388
#endif
2389
if (!PRIV(xclass)(fc, Lxclass_data,
2390
(const uint8_t*)mb->start_code, utf)) break;
2391
Feptr += len;
2392
}
2393
2394
if (reptype == REPTYPE_POS) continue; /* No backtracking */
2395
2396
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
2397
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2398
go too far. */
2399
2400
for(;;)
2401
{
2402
RMATCH(Fecode, RM101);
2403
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2404
if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2405
#ifdef SUPPORT_UNICODE
2406
if (utf) BACKCHAR(Feptr);
2407
#endif
2408
}
2409
RRETURN(MATCH_NOMATCH);
2410
}
2411
2412
PCRE2_UNREACHABLE(); /* Control never reaches here */
2413
}
2414
#endif /* SUPPORT_WIDE_CHARS: end of XCLASS */
2415
2416
#undef Lstart_eptr
2417
#undef Lxclass_data
2418
#undef Lmin
2419
#undef Lmax
2420
2421
2422
/* ===================================================================== */
2423
/* Match a complex, set-based character class. This opcode is used when
2424
there is complex nesting or logical operations within the character
2425
class. */
2426
2427
#define Lstart_eptr F->temp_sptr[0]
2428
#define Leclass_data F->temp_sptr[1]
2429
#define Leclass_len F->temp_size
2430
#define Lmin F->temp_32[0]
2431
#define Lmax F->temp_32[1]
2432
2433
#ifdef SUPPORT_WIDE_CHARS
2434
case OP_ECLASS:
2435
{
2436
Leclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */
2437
Fecode += GET(Fecode, 1); /* Advance past the item */
2438
Leclass_len = (PCRE2_SIZE)(Fecode - Leclass_data);
2439
2440
switch (*Fecode)
2441
{
2442
case OP_CRSTAR:
2443
case OP_CRMINSTAR:
2444
case OP_CRPLUS:
2445
case OP_CRMINPLUS:
2446
case OP_CRQUERY:
2447
case OP_CRMINQUERY:
2448
case OP_CRPOSSTAR:
2449
case OP_CRPOSPLUS:
2450
case OP_CRPOSQUERY:
2451
fc = *Fecode++ - OP_CRSTAR;
2452
Lmin = rep_min[fc];
2453
Lmax = rep_max[fc];
2454
reptype = rep_typ[fc];
2455
break;
2456
2457
case OP_CRRANGE:
2458
case OP_CRMINRANGE:
2459
case OP_CRPOSRANGE:
2460
Lmin = GET2(Fecode, 1);
2461
Lmax = GET2(Fecode, 1 + IMM2_SIZE);
2462
if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
2463
reptype = rep_typ[*Fecode - OP_CRSTAR];
2464
Fecode += 1 + 2 * IMM2_SIZE;
2465
break;
2466
2467
default: /* No repeat follows */
2468
Lmin = Lmax = 1;
2469
break;
2470
}
2471
2472
/* First, ensure the minimum number of matches are present. */
2473
2474
for (i = 1; i <= Lmin; i++)
2475
{
2476
if (Feptr >= mb->end_subject)
2477
{
2478
SCHECK_PARTIAL();
2479
RRETURN(MATCH_NOMATCH);
2480
}
2481
GETCHARINCTEST(fc, Feptr);
2482
if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len,
2483
(const uint8_t*)mb->start_code, utf))
2484
RRETURN(MATCH_NOMATCH);
2485
}
2486
2487
/* If Lmax == Lmin we can just continue with the main loop. */
2488
2489
if (Lmin == Lmax) continue;
2490
2491
/* If minimizing, keep testing the rest of the expression and advancing
2492
the pointer while it matches the class. */
2493
2494
if (reptype == REPTYPE_MIN)
2495
{
2496
for (;;)
2497
{
2498
RMATCH(Fecode, RM102);
2499
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2500
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
2501
if (Feptr >= mb->end_subject)
2502
{
2503
SCHECK_PARTIAL();
2504
RRETURN(MATCH_NOMATCH);
2505
}
2506
GETCHARINCTEST(fc, Feptr);
2507
if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len,
2508
(const uint8_t*)mb->start_code, utf))
2509
RRETURN(MATCH_NOMATCH);
2510
}
2511
PCRE2_UNREACHABLE(); /* Control never reaches here */
2512
}
2513
2514
/* If maximizing, find the longest possible run, then work backwards. */
2515
2516
else
2517
{
2518
Lstart_eptr = Feptr;
2519
for (i = Lmin; i < Lmax; i++)
2520
{
2521
int len = 1;
2522
if (Feptr >= mb->end_subject)
2523
{
2524
SCHECK_PARTIAL();
2525
break;
2526
}
2527
#ifdef SUPPORT_UNICODE
2528
GETCHARLENTEST(fc, Feptr, len);
2529
#else
2530
fc = *Feptr;
2531
#endif
2532
if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len,
2533
(const uint8_t*)mb->start_code, utf))
2534
break;
2535
Feptr += len;
2536
}
2537
2538
if (reptype == REPTYPE_POS) continue; /* No backtracking */
2539
2540
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
2541
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2542
go too far. */
2543
2544
for(;;)
2545
{
2546
RMATCH(Fecode, RM103);
2547
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2548
if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2549
#ifdef SUPPORT_UNICODE
2550
if (utf) BACKCHAR(Feptr);
2551
#endif
2552
}
2553
RRETURN(MATCH_NOMATCH);
2554
}
2555
2556
PCRE2_UNREACHABLE(); /* Control never reaches here */
2557
}
2558
#endif /* SUPPORT_WIDE_CHARS: end of ECLASS */
2559
2560
#undef Lstart_eptr
2561
#undef Leclass_data
2562
#undef Leclass_len
2563
#undef Lmin
2564
#undef Lmax
2565
2566
2567
/* ===================================================================== */
2568
/* Match various character types when PCRE2_UCP is not set. These opcodes
2569
are not generated when PCRE2_UCP is set - instead appropriate property
2570
tests are compiled. */
2571
2572
case OP_NOT_DIGIT:
2573
if (Feptr >= mb->end_subject)
2574
{
2575
SCHECK_PARTIAL();
2576
RRETURN(MATCH_NOMATCH);
2577
}
2578
GETCHARINCTEST(fc, Feptr);
2579
if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
2580
RRETURN(MATCH_NOMATCH);
2581
Fecode++;
2582
break;
2583
2584
case OP_DIGIT:
2585
if (Feptr >= mb->end_subject)
2586
{
2587
SCHECK_PARTIAL();
2588
RRETURN(MATCH_NOMATCH);
2589
}
2590
GETCHARINCTEST(fc, Feptr);
2591
if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
2592
RRETURN(MATCH_NOMATCH);
2593
Fecode++;
2594
break;
2595
2596
case OP_NOT_WHITESPACE:
2597
if (Feptr >= mb->end_subject)
2598
{
2599
SCHECK_PARTIAL();
2600
RRETURN(MATCH_NOMATCH);
2601
}
2602
GETCHARINCTEST(fc, Feptr);
2603
if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
2604
RRETURN(MATCH_NOMATCH);
2605
Fecode++;
2606
break;
2607
2608
case OP_WHITESPACE:
2609
if (Feptr >= mb->end_subject)
2610
{
2611
SCHECK_PARTIAL();
2612
RRETURN(MATCH_NOMATCH);
2613
}
2614
GETCHARINCTEST(fc, Feptr);
2615
if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
2616
RRETURN(MATCH_NOMATCH);
2617
Fecode++;
2618
break;
2619
2620
case OP_NOT_WORDCHAR:
2621
if (Feptr >= mb->end_subject)
2622
{
2623
SCHECK_PARTIAL();
2624
RRETURN(MATCH_NOMATCH);
2625
}
2626
GETCHARINCTEST(fc, Feptr);
2627
if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
2628
RRETURN(MATCH_NOMATCH);
2629
Fecode++;
2630
break;
2631
2632
case OP_WORDCHAR:
2633
if (Feptr >= mb->end_subject)
2634
{
2635
SCHECK_PARTIAL();
2636
RRETURN(MATCH_NOMATCH);
2637
}
2638
GETCHARINCTEST(fc, Feptr);
2639
if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
2640
RRETURN(MATCH_NOMATCH);
2641
Fecode++;
2642
break;
2643
2644
case OP_ANYNL:
2645
if (Feptr >= mb->end_subject)
2646
{
2647
SCHECK_PARTIAL();
2648
RRETURN(MATCH_NOMATCH);
2649
}
2650
GETCHARINCTEST(fc, Feptr);
2651
switch(fc)
2652
{
2653
default: RRETURN(MATCH_NOMATCH);
2654
2655
case CHAR_CR:
2656
if (Feptr >= mb->end_subject)
2657
{
2658
SCHECK_PARTIAL();
2659
}
2660
else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++;
2661
break;
2662
2663
case CHAR_LF:
2664
break;
2665
2666
case CHAR_VT:
2667
case CHAR_FF:
2668
case CHAR_NEL:
2669
#ifndef EBCDIC
2670
case 0x2028:
2671
case 0x2029:
2672
#endif /* Not EBCDIC */
2673
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2674
break;
2675
}
2676
Fecode++;
2677
break;
2678
2679
case OP_NOT_HSPACE:
2680
if (Feptr >= mb->end_subject)
2681
{
2682
SCHECK_PARTIAL();
2683
RRETURN(MATCH_NOMATCH);
2684
}
2685
GETCHARINCTEST(fc, Feptr);
2686
switch(fc)
2687
{
2688
HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2689
default: break;
2690
}
2691
Fecode++;
2692
break;
2693
2694
case OP_HSPACE:
2695
if (Feptr >= mb->end_subject)
2696
{
2697
SCHECK_PARTIAL();
2698
RRETURN(MATCH_NOMATCH);
2699
}
2700
GETCHARINCTEST(fc, Feptr);
2701
switch(fc)
2702
{
2703
HSPACE_CASES: break; /* Byte and multibyte cases */
2704
default: RRETURN(MATCH_NOMATCH);
2705
}
2706
Fecode++;
2707
break;
2708
2709
case OP_NOT_VSPACE:
2710
if (Feptr >= mb->end_subject)
2711
{
2712
SCHECK_PARTIAL();
2713
RRETURN(MATCH_NOMATCH);
2714
}
2715
GETCHARINCTEST(fc, Feptr);
2716
switch(fc)
2717
{
2718
VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2719
default: break;
2720
}
2721
Fecode++;
2722
break;
2723
2724
case OP_VSPACE:
2725
if (Feptr >= mb->end_subject)
2726
{
2727
SCHECK_PARTIAL();
2728
RRETURN(MATCH_NOMATCH);
2729
}
2730
GETCHARINCTEST(fc, Feptr);
2731
switch(fc)
2732
{
2733
VSPACE_CASES: break;
2734
default: RRETURN(MATCH_NOMATCH);
2735
}
2736
Fecode++;
2737
break;
2738
2739
2740
#ifdef SUPPORT_UNICODE
2741
2742
/* ===================================================================== */
2743
/* Check the next character by Unicode property. We will get here only
2744
if the support is in the binary; otherwise a compile-time error occurs. */
2745
2746
case OP_PROP:
2747
case OP_NOTPROP:
2748
if (Feptr >= mb->end_subject)
2749
{
2750
SCHECK_PARTIAL();
2751
RRETURN(MATCH_NOMATCH);
2752
}
2753
GETCHARINCTEST(fc, Feptr);
2754
{
2755
const uint32_t *cp;
2756
uint32_t chartype;
2757
const ucd_record *prop = GET_UCD(fc);
2758
BOOL notmatch = Fop == OP_NOTPROP;
2759
2760
switch(Fecode[1])
2761
{
2762
case PT_LAMP:
2763
chartype = prop->chartype;
2764
if ((chartype == ucp_Lu ||
2765
chartype == ucp_Ll ||
2766
chartype == ucp_Lt) == notmatch)
2767
RRETURN(MATCH_NOMATCH);
2768
break;
2769
2770
case PT_GC:
2771
if ((Fecode[2] == PRIV(ucp_gentype)[prop->chartype]) == notmatch)
2772
RRETURN(MATCH_NOMATCH);
2773
break;
2774
2775
case PT_PC:
2776
if ((Fecode[2] == prop->chartype) == notmatch)
2777
RRETURN(MATCH_NOMATCH);
2778
break;
2779
2780
case PT_SC:
2781
if ((Fecode[2] == prop->script) == notmatch)
2782
RRETURN(MATCH_NOMATCH);
2783
break;
2784
2785
case PT_SCX:
2786
{
2787
BOOL ok = (Fecode[2] == prop->script ||
2788
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Fecode[2]) != 0);
2789
if (ok == notmatch) RRETURN(MATCH_NOMATCH);
2790
}
2791
break;
2792
2793
/* These are specials */
2794
2795
case PT_ALNUM:
2796
chartype = prop->chartype;
2797
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
2798
PRIV(ucp_gentype)[chartype] == ucp_N) == notmatch)
2799
RRETURN(MATCH_NOMATCH);
2800
break;
2801
2802
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
2803
which means that Perl space and POSIX space are now identical. PCRE
2804
was changed at release 8.34. */
2805
2806
case PT_SPACE: /* Perl space */
2807
case PT_PXSPACE: /* POSIX space */
2808
switch(fc)
2809
{
2810
HSPACE_CASES:
2811
VSPACE_CASES:
2812
if (notmatch) RRETURN(MATCH_NOMATCH);
2813
break;
2814
2815
default:
2816
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == notmatch)
2817
RRETURN(MATCH_NOMATCH);
2818
break;
2819
}
2820
break;
2821
2822
case PT_WORD:
2823
chartype = prop->chartype;
2824
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
2825
PRIV(ucp_gentype)[chartype] == ucp_N ||
2826
chartype == ucp_Mn ||
2827
chartype == ucp_Pc) == notmatch)
2828
RRETURN(MATCH_NOMATCH);
2829
break;
2830
2831
case PT_CLIST:
2832
#if PCRE2_CODE_UNIT_WIDTH == 32
2833
if (fc > MAX_UTF_CODE_POINT)
2834
{
2835
if (notmatch) break;;
2836
RRETURN(MATCH_NOMATCH);
2837
}
2838
#endif
2839
cp = PRIV(ucd_caseless_sets) + Fecode[2];
2840
for (;;)
2841
{
2842
if (fc < *cp)
2843
{ if (notmatch) break; else { RRETURN(MATCH_NOMATCH); } }
2844
if (fc == *cp++)
2845
{ if (notmatch) { RRETURN(MATCH_NOMATCH); } else break; }
2846
}
2847
break;
2848
2849
case PT_UCNC:
2850
if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2851
fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
2852
fc >= 0xe000) == notmatch)
2853
RRETURN(MATCH_NOMATCH);
2854
break;
2855
2856
case PT_BIDICL:
2857
if ((UCD_BIDICLASS_PROP(prop) == Fecode[2]) == notmatch)
2858
RRETURN(MATCH_NOMATCH);
2859
break;
2860
2861
case PT_BOOL:
2862
{
2863
BOOL ok = MAPBIT(PRIV(ucd_boolprop_sets) +
2864
UCD_BPROPS_PROP(prop), Fecode[2]) != 0;
2865
if (ok == notmatch) RRETURN(MATCH_NOMATCH);
2866
}
2867
break;
2868
2869
/* This should never occur */
2870
2871
/* LCOV_EXCL_START */
2872
default:
2873
PCRE2_DEBUG_UNREACHABLE();
2874
return PCRE2_ERROR_INTERNAL;
2875
/* LCOV_EXCL_STOP */
2876
}
2877
2878
Fecode += 3;
2879
}
2880
break;
2881
2882
2883
/* ===================================================================== */
2884
/* Match an extended Unicode sequence. We will get here only if the support
2885
is in the binary; otherwise a compile-time error occurs. */
2886
2887
case OP_EXTUNI:
2888
if (Feptr >= mb->end_subject)
2889
{
2890
SCHECK_PARTIAL();
2891
RRETURN(MATCH_NOMATCH);
2892
}
2893
else
2894
{
2895
GETCHARINCTEST(fc, Feptr);
2896
Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf,
2897
NULL);
2898
}
2899
CHECK_PARTIAL();
2900
Fecode++;
2901
break;
2902
2903
#endif /* SUPPORT_UNICODE */
2904
2905
2906
/* ===================================================================== */
2907
/* Match a single character type repeatedly. Note that the property type
2908
does not need to be in a stack frame as it is not used within an RMATCH()
2909
loop. */
2910
2911
#define Lstart_eptr F->temp_sptr[0]
2912
#define Lmin F->temp_32[0]
2913
#define Lmax F->temp_32[1]
2914
#define Lctype F->temp_32[2]
2915
#define Lpropvalue F->temp_32[3]
2916
2917
case OP_TYPEEXACT:
2918
Lmin = Lmax = GET2(Fecode, 1);
2919
Fecode += 1 + IMM2_SIZE;
2920
goto REPEATTYPE;
2921
2922
case OP_TYPEUPTO:
2923
case OP_TYPEMINUPTO:
2924
Lmin = 0;
2925
Lmax = GET2(Fecode, 1);
2926
reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX;
2927
Fecode += 1 + IMM2_SIZE;
2928
goto REPEATTYPE;
2929
2930
case OP_TYPEPOSSTAR:
2931
reptype = REPTYPE_POS;
2932
Lmin = 0;
2933
Lmax = UINT32_MAX;
2934
Fecode++;
2935
goto REPEATTYPE;
2936
2937
case OP_TYPEPOSPLUS:
2938
reptype = REPTYPE_POS;
2939
Lmin = 1;
2940
Lmax = UINT32_MAX;
2941
Fecode++;
2942
goto REPEATTYPE;
2943
2944
case OP_TYPEPOSQUERY:
2945
reptype = REPTYPE_POS;
2946
Lmin = 0;
2947
Lmax = 1;
2948
Fecode++;
2949
goto REPEATTYPE;
2950
2951
case OP_TYPEPOSUPTO:
2952
reptype = REPTYPE_POS;
2953
Lmin = 0;
2954
Lmax = GET2(Fecode, 1);
2955
Fecode += 1 + IMM2_SIZE;
2956
goto REPEATTYPE;
2957
2958
case OP_TYPESTAR:
2959
case OP_TYPEMINSTAR:
2960
case OP_TYPEPLUS:
2961
case OP_TYPEMINPLUS:
2962
case OP_TYPEQUERY:
2963
case OP_TYPEMINQUERY:
2964
fc = *Fecode++ - OP_TYPESTAR;
2965
Lmin = rep_min[fc];
2966
Lmax = rep_max[fc];
2967
reptype = rep_typ[fc];
2968
2969
/* Common code for all repeated character type matches. */
2970
2971
REPEATTYPE:
2972
Lctype = *Fecode++; /* Code for the character type */
2973
2974
#ifdef SUPPORT_UNICODE
2975
if (Lctype == OP_PROP || Lctype == OP_NOTPROP)
2976
{
2977
proptype = *Fecode++;
2978
Lpropvalue = *Fecode++;
2979
}
2980
else proptype = -1;
2981
#endif
2982
2983
/* First, ensure the minimum number of matches are present. Use inline
2984
code for maximizing the speed, and do the type test once at the start
2985
(i.e. keep it out of the loops). As there are no calls to RMATCH in the
2986
loops, we can use an ordinary variable for "notmatch". The code for UTF
2987
mode is separated out for tidiness, except for Unicode property tests. */
2988
2989
if (Lmin > 0)
2990
{
2991
#ifdef SUPPORT_UNICODE
2992
if (proptype >= 0) /* Property tests in all modes */
2993
{
2994
BOOL notmatch = Lctype == OP_NOTPROP;
2995
switch(proptype)
2996
{
2997
case PT_LAMP:
2998
for (i = 1; i <= Lmin; i++)
2999
{
3000
int chartype;
3001
if (Feptr >= mb->end_subject)
3002
{
3003
SCHECK_PARTIAL();
3004
RRETURN(MATCH_NOMATCH);
3005
}
3006
GETCHARINCTEST(fc, Feptr);
3007
chartype = UCD_CHARTYPE(fc);
3008
if ((chartype == ucp_Lu ||
3009
chartype == ucp_Ll ||
3010
chartype == ucp_Lt) == notmatch)
3011
RRETURN(MATCH_NOMATCH);
3012
}
3013
break;
3014
3015
case PT_GC:
3016
for (i = 1; i <= Lmin; i++)
3017
{
3018
if (Feptr >= mb->end_subject)
3019
{
3020
SCHECK_PARTIAL();
3021
RRETURN(MATCH_NOMATCH);
3022
}
3023
GETCHARINCTEST(fc, Feptr);
3024
if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch)
3025
RRETURN(MATCH_NOMATCH);
3026
}
3027
break;
3028
3029
case PT_PC:
3030
for (i = 1; i <= Lmin; i++)
3031
{
3032
if (Feptr >= mb->end_subject)
3033
{
3034
SCHECK_PARTIAL();
3035
RRETURN(MATCH_NOMATCH);
3036
}
3037
GETCHARINCTEST(fc, Feptr);
3038
if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch)
3039
RRETURN(MATCH_NOMATCH);
3040
}
3041
break;
3042
3043
case PT_SC:
3044
for (i = 1; i <= Lmin; i++)
3045
{
3046
if (Feptr >= mb->end_subject)
3047
{
3048
SCHECK_PARTIAL();
3049
RRETURN(MATCH_NOMATCH);
3050
}
3051
GETCHARINCTEST(fc, Feptr);
3052
if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch)
3053
RRETURN(MATCH_NOMATCH);
3054
}
3055
break;
3056
3057
case PT_SCX:
3058
for (i = 1; i <= Lmin; i++)
3059
{
3060
BOOL ok;
3061
const ucd_record *prop;
3062
if (Feptr >= mb->end_subject)
3063
{
3064
SCHECK_PARTIAL();
3065
RRETURN(MATCH_NOMATCH);
3066
}
3067
GETCHARINCTEST(fc, Feptr);
3068
prop = GET_UCD(fc);
3069
ok = (prop->script == Lpropvalue ||
3070
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
3071
if (ok == notmatch)
3072
RRETURN(MATCH_NOMATCH);
3073
}
3074
break;
3075
3076
case PT_ALNUM:
3077
for (i = 1; i <= Lmin; i++)
3078
{
3079
int category;
3080
if (Feptr >= mb->end_subject)
3081
{
3082
SCHECK_PARTIAL();
3083
RRETURN(MATCH_NOMATCH);
3084
}
3085
GETCHARINCTEST(fc, Feptr);
3086
category = UCD_CATEGORY(fc);
3087
if ((category == ucp_L || category == ucp_N) == notmatch)
3088
RRETURN(MATCH_NOMATCH);
3089
}
3090
break;
3091
3092
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
3093
which means that Perl space and POSIX space are now identical. PCRE
3094
was changed at release 8.34. */
3095
3096
case PT_SPACE: /* Perl space */
3097
case PT_PXSPACE: /* POSIX space */
3098
for (i = 1; i <= Lmin; i++)
3099
{
3100
if (Feptr >= mb->end_subject)
3101
{
3102
SCHECK_PARTIAL();
3103
RRETURN(MATCH_NOMATCH);
3104
}
3105
GETCHARINCTEST(fc, Feptr);
3106
switch(fc)
3107
{
3108
HSPACE_CASES:
3109
VSPACE_CASES:
3110
if (notmatch) RRETURN(MATCH_NOMATCH);
3111
break;
3112
3113
default:
3114
if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)
3115
RRETURN(MATCH_NOMATCH);
3116
break;
3117
}
3118
}
3119
break;
3120
3121
case PT_WORD:
3122
for (i = 1; i <= Lmin; i++)
3123
{
3124
int chartype, category;
3125
if (Feptr >= mb->end_subject)
3126
{
3127
SCHECK_PARTIAL();
3128
RRETURN(MATCH_NOMATCH);
3129
}
3130
GETCHARINCTEST(fc, Feptr);
3131
chartype = UCD_CHARTYPE(fc);
3132
category = PRIV(ucp_gentype)[chartype];
3133
if ((category == ucp_L || category == ucp_N ||
3134
chartype == ucp_Mn || chartype == ucp_Pc) == notmatch)
3135
RRETURN(MATCH_NOMATCH);
3136
}
3137
break;
3138
3139
case PT_CLIST:
3140
for (i = 1; i <= Lmin; i++)
3141
{
3142
const uint32_t *cp;
3143
if (Feptr >= mb->end_subject)
3144
{
3145
SCHECK_PARTIAL();
3146
RRETURN(MATCH_NOMATCH);
3147
}
3148
GETCHARINCTEST(fc, Feptr);
3149
#if PCRE2_CODE_UNIT_WIDTH == 32
3150
if (fc > MAX_UTF_CODE_POINT)
3151
{
3152
if (notmatch) continue;
3153
RRETURN(MATCH_NOMATCH);
3154
}
3155
#endif
3156
cp = PRIV(ucd_caseless_sets) + Lpropvalue;
3157
for (;;)
3158
{
3159
if (fc < *cp)
3160
{
3161
if (notmatch) break;
3162
RRETURN(MATCH_NOMATCH);
3163
}
3164
if (fc == *cp++)
3165
{
3166
if (notmatch) RRETURN(MATCH_NOMATCH);
3167
break;
3168
}
3169
}
3170
}
3171
break;
3172
3173
case PT_UCNC:
3174
for (i = 1; i <= Lmin; i++)
3175
{
3176
if (Feptr >= mb->end_subject)
3177
{
3178
SCHECK_PARTIAL();
3179
RRETURN(MATCH_NOMATCH);
3180
}
3181
GETCHARINCTEST(fc, Feptr);
3182
if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
3183
fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
3184
fc >= 0xe000) == notmatch)
3185
RRETURN(MATCH_NOMATCH);
3186
}
3187
break;
3188
3189
case PT_BIDICL:
3190
for (i = 1; i <= Lmin; i++)
3191
{
3192
if (Feptr >= mb->end_subject)
3193
{
3194
SCHECK_PARTIAL();
3195
RRETURN(MATCH_NOMATCH);
3196
}
3197
GETCHARINCTEST(fc, Feptr);
3198
if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch)
3199
RRETURN(MATCH_NOMATCH);
3200
}
3201
break;
3202
3203
case PT_BOOL:
3204
for (i = 1; i <= Lmin; i++)
3205
{
3206
BOOL ok;
3207
const ucd_record *prop;
3208
if (Feptr >= mb->end_subject)
3209
{
3210
SCHECK_PARTIAL();
3211
RRETURN(MATCH_NOMATCH);
3212
}
3213
GETCHARINCTEST(fc, Feptr);
3214
prop = GET_UCD(fc);
3215
ok = MAPBIT(PRIV(ucd_boolprop_sets) +
3216
UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
3217
if (ok == notmatch)
3218
RRETURN(MATCH_NOMATCH);
3219
}
3220
break;
3221
3222
/* This should not occur */
3223
3224
/* LCOV_EXCL_START */
3225
default:
3226
PCRE2_DEBUG_UNREACHABLE();
3227
return PCRE2_ERROR_INTERNAL;
3228
/* LCOV_EXCL_STOP */
3229
}
3230
}
3231
3232
/* Match extended Unicode sequences. We will get here only if the
3233
support is in the binary; otherwise a compile-time error occurs. */
3234
3235
else if (Lctype == OP_EXTUNI)
3236
{
3237
for (i = 1; i <= Lmin; i++)
3238
{
3239
if (Feptr >= mb->end_subject)
3240
{
3241
SCHECK_PARTIAL();
3242
RRETURN(MATCH_NOMATCH);
3243
}
3244
else
3245
{
3246
GETCHARINCTEST(fc, Feptr);
3247
Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject,
3248
mb->end_subject, utf, NULL);
3249
}
3250
CHECK_PARTIAL();
3251
}
3252
}
3253
else
3254
#endif /* SUPPORT_UNICODE */
3255
3256
/* Handle all other cases in UTF mode */
3257
3258
#ifdef SUPPORT_UNICODE
3259
if (utf) switch(Lctype)
3260
{
3261
case OP_ANY:
3262
for (i = 1; i <= Lmin; i++)
3263
{
3264
if (Feptr >= mb->end_subject)
3265
{
3266
SCHECK_PARTIAL();
3267
RRETURN(MATCH_NOMATCH);
3268
}
3269
if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3270
if (mb->partial != 0 &&
3271
Feptr + 1 >= mb->end_subject &&
3272
NLBLOCK->nltype == NLTYPE_FIXED &&
3273
NLBLOCK->nllen == 2 &&
3274
UCHAR21(Feptr) == NLBLOCK->nl[0])
3275
{
3276
mb->hitend = TRUE;
3277
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3278
}
3279
Feptr++;
3280
ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3281
}
3282
break;
3283
3284
case OP_ALLANY:
3285
for (i = 1; i <= Lmin; i++)
3286
{
3287
if (Feptr >= mb->end_subject)
3288
{
3289
SCHECK_PARTIAL();
3290
RRETURN(MATCH_NOMATCH);
3291
}
3292
Feptr++;
3293
ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3294
}
3295
break;
3296
3297
case OP_ANYBYTE:
3298
if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH);
3299
Feptr += Lmin;
3300
break;
3301
3302
case OP_ANYNL:
3303
for (i = 1; i <= Lmin; i++)
3304
{
3305
if (Feptr >= mb->end_subject)
3306
{
3307
SCHECK_PARTIAL();
3308
RRETURN(MATCH_NOMATCH);
3309
}
3310
GETCHARINC(fc, Feptr);
3311
switch(fc)
3312
{
3313
default: RRETURN(MATCH_NOMATCH);
3314
3315
case CHAR_CR:
3316
if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
3317
break;
3318
3319
case CHAR_LF:
3320
break;
3321
3322
case CHAR_VT:
3323
case CHAR_FF:
3324
case CHAR_NEL:
3325
#ifndef EBCDIC
3326
case 0x2028:
3327
case 0x2029:
3328
#endif /* Not EBCDIC */
3329
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
3330
break;
3331
}
3332
}
3333
break;
3334
3335
case OP_NOT_HSPACE:
3336
for (i = 1; i <= Lmin; i++)
3337
{
3338
if (Feptr >= mb->end_subject)
3339
{
3340
SCHECK_PARTIAL();
3341
RRETURN(MATCH_NOMATCH);
3342
}
3343
GETCHARINC(fc, Feptr);
3344
switch(fc)
3345
{
3346
HSPACE_CASES: RRETURN(MATCH_NOMATCH);
3347
default: break;
3348
}
3349
}
3350
break;
3351
3352
case OP_HSPACE:
3353
for (i = 1; i <= Lmin; i++)
3354
{
3355
if (Feptr >= mb->end_subject)
3356
{
3357
SCHECK_PARTIAL();
3358
RRETURN(MATCH_NOMATCH);
3359
}
3360
GETCHARINC(fc, Feptr);
3361
switch(fc)
3362
{
3363
HSPACE_CASES: break;
3364
default: RRETURN(MATCH_NOMATCH);
3365
}
3366
}
3367
break;
3368
3369
case OP_NOT_VSPACE:
3370
for (i = 1; i <= Lmin; i++)
3371
{
3372
if (Feptr >= mb->end_subject)
3373
{
3374
SCHECK_PARTIAL();
3375
RRETURN(MATCH_NOMATCH);
3376
}
3377
GETCHARINC(fc, Feptr);
3378
switch(fc)
3379
{
3380
VSPACE_CASES: RRETURN(MATCH_NOMATCH);
3381
default: break;
3382
}
3383
}
3384
break;
3385
3386
case OP_VSPACE:
3387
for (i = 1; i <= Lmin; i++)
3388
{
3389
if (Feptr >= mb->end_subject)
3390
{
3391
SCHECK_PARTIAL();
3392
RRETURN(MATCH_NOMATCH);
3393
}
3394
GETCHARINC(fc, Feptr);
3395
switch(fc)
3396
{
3397
VSPACE_CASES: break;
3398
default: RRETURN(MATCH_NOMATCH);
3399
}
3400
}
3401
break;
3402
3403
case OP_NOT_DIGIT:
3404
for (i = 1; i <= Lmin; i++)
3405
{
3406
if (Feptr >= mb->end_subject)
3407
{
3408
SCHECK_PARTIAL();
3409
RRETURN(MATCH_NOMATCH);
3410
}
3411
GETCHARINC(fc, Feptr);
3412
if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0)
3413
RRETURN(MATCH_NOMATCH);
3414
}
3415
break;
3416
3417
case OP_DIGIT:
3418
for (i = 1; i <= Lmin; i++)
3419
{
3420
uint32_t cc;
3421
if (Feptr >= mb->end_subject)
3422
{
3423
SCHECK_PARTIAL();
3424
RRETURN(MATCH_NOMATCH);
3425
}
3426
cc = UCHAR21(Feptr);
3427
if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0)
3428
RRETURN(MATCH_NOMATCH);
3429
Feptr++;
3430
/* No need to skip more code units - we know it has only one. */
3431
}
3432
break;
3433
3434
case OP_NOT_WHITESPACE:
3435
for (i = 1; i <= Lmin; i++)
3436
{
3437
uint32_t cc;
3438
if (Feptr >= mb->end_subject)
3439
{
3440
SCHECK_PARTIAL();
3441
RRETURN(MATCH_NOMATCH);
3442
}
3443
cc = UCHAR21(Feptr);
3444
if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0)
3445
RRETURN(MATCH_NOMATCH);
3446
Feptr++;
3447
ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3448
}
3449
break;
3450
3451
case OP_WHITESPACE:
3452
for (i = 1; i <= Lmin; i++)
3453
{
3454
uint32_t cc;
3455
if (Feptr >= mb->end_subject)
3456
{
3457
SCHECK_PARTIAL();
3458
RRETURN(MATCH_NOMATCH);
3459
}
3460
cc = UCHAR21(Feptr);
3461
if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0)
3462
RRETURN(MATCH_NOMATCH);
3463
Feptr++;
3464
/* No need to skip more code units - we know it has only one. */
3465
}
3466
break;
3467
3468
case OP_NOT_WORDCHAR:
3469
for (i = 1; i <= Lmin; i++)
3470
{
3471
uint32_t cc;
3472
if (Feptr >= mb->end_subject)
3473
{
3474
SCHECK_PARTIAL();
3475
RRETURN(MATCH_NOMATCH);
3476
}
3477
cc = UCHAR21(Feptr);
3478
if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0)
3479
RRETURN(MATCH_NOMATCH);
3480
Feptr++;
3481
ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3482
}
3483
break;
3484
3485
case OP_WORDCHAR:
3486
for (i = 1; i <= Lmin; i++)
3487
{
3488
uint32_t cc;
3489
if (Feptr >= mb->end_subject)
3490
{
3491
SCHECK_PARTIAL();
3492
RRETURN(MATCH_NOMATCH);
3493
}
3494
cc = UCHAR21(Feptr);
3495
if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0)
3496
RRETURN(MATCH_NOMATCH);
3497
Feptr++;
3498
/* No need to skip more code units - we know it has only one. */
3499
}
3500
break;
3501
3502
/* LCOV_EXCL_START */
3503
default:
3504
PCRE2_DEBUG_UNREACHABLE();
3505
return PCRE2_ERROR_INTERNAL;
3506
/* LCOV_EXCL_STOP */
3507
} /* End switch(Lctype) */
3508
3509
else
3510
#endif /* SUPPORT_UNICODE */
3511
3512
/* Code for the non-UTF case for minimum matching of operators other
3513
than OP_PROP and OP_NOTPROP. */
3514
3515
switch(Lctype)
3516
{
3517
case OP_ANY:
3518
for (i = 1; i <= Lmin; i++)
3519
{
3520
if (Feptr >= mb->end_subject)
3521
{
3522
SCHECK_PARTIAL();
3523
RRETURN(MATCH_NOMATCH);
3524
}
3525
if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3526
if (mb->partial != 0 &&
3527
Feptr + 1 >= mb->end_subject &&
3528
NLBLOCK->nltype == NLTYPE_FIXED &&
3529
NLBLOCK->nllen == 2 &&
3530
*Feptr == NLBLOCK->nl[0])
3531
{
3532
mb->hitend = TRUE;
3533
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3534
}
3535
Feptr++;
3536
}
3537
break;
3538
3539
case OP_ALLANY:
3540
if (Feptr > mb->end_subject - Lmin)
3541
{
3542
SCHECK_PARTIAL();
3543
RRETURN(MATCH_NOMATCH);
3544
}
3545
Feptr += Lmin;
3546
break;
3547
3548
/* This OP_ANYBYTE case will never be reached because \C gets turned
3549
into OP_ALLANY in non-UTF mode. Cut out the code so that coverage
3550
reports don't complain about its never being used. */
3551
3552
/* case OP_ANYBYTE:
3553
* if (Feptr > mb->end_subject - Lmin)
3554
* {
3555
* SCHECK_PARTIAL();
3556
* RRETURN(MATCH_NOMATCH);
3557
* }
3558
* Feptr += Lmin;
3559
* break;
3560
*/
3561
case OP_ANYNL:
3562
for (i = 1; i <= Lmin; i++)
3563
{
3564
if (Feptr >= mb->end_subject)
3565
{
3566
SCHECK_PARTIAL();
3567
RRETURN(MATCH_NOMATCH);
3568
}
3569
switch(*Feptr++)
3570
{
3571
default: RRETURN(MATCH_NOMATCH);
3572
3573
case CHAR_CR:
3574
if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3575
break;
3576
3577
case CHAR_LF:
3578
break;
3579
3580
case CHAR_VT:
3581
case CHAR_FF:
3582
case CHAR_NEL:
3583
#if PCRE2_CODE_UNIT_WIDTH != 8
3584
case 0x2028:
3585
case 0x2029:
3586
#endif
3587
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
3588
break;
3589
}
3590
}
3591
break;
3592
3593
case OP_NOT_HSPACE:
3594
for (i = 1; i <= Lmin; i++)
3595
{
3596
if (Feptr >= mb->end_subject)
3597
{
3598
SCHECK_PARTIAL();
3599
RRETURN(MATCH_NOMATCH);
3600
}
3601
switch(*Feptr++)
3602
{
3603
default: break;
3604
HSPACE_BYTE_CASES:
3605
#if PCRE2_CODE_UNIT_WIDTH != 8
3606
HSPACE_MULTIBYTE_CASES:
3607
#endif
3608
RRETURN(MATCH_NOMATCH);
3609
}
3610
}
3611
break;
3612
3613
case OP_HSPACE:
3614
for (i = 1; i <= Lmin; i++)
3615
{
3616
if (Feptr >= mb->end_subject)
3617
{
3618
SCHECK_PARTIAL();
3619
RRETURN(MATCH_NOMATCH);
3620
}
3621
switch(*Feptr++)
3622
{
3623
default: RRETURN(MATCH_NOMATCH);
3624
HSPACE_BYTE_CASES:
3625
#if PCRE2_CODE_UNIT_WIDTH != 8
3626
HSPACE_MULTIBYTE_CASES:
3627
#endif
3628
break;
3629
}
3630
}
3631
break;
3632
3633
case OP_NOT_VSPACE:
3634
for (i = 1; i <= Lmin; i++)
3635
{
3636
if (Feptr >= mb->end_subject)
3637
{
3638
SCHECK_PARTIAL();
3639
RRETURN(MATCH_NOMATCH);
3640
}
3641
switch(*Feptr++)
3642
{
3643
VSPACE_BYTE_CASES:
3644
#if PCRE2_CODE_UNIT_WIDTH != 8
3645
VSPACE_MULTIBYTE_CASES:
3646
#endif
3647
RRETURN(MATCH_NOMATCH);
3648
default: break;
3649
}
3650
}
3651
break;
3652
3653
case OP_VSPACE:
3654
for (i = 1; i <= Lmin; i++)
3655
{
3656
if (Feptr >= mb->end_subject)
3657
{
3658
SCHECK_PARTIAL();
3659
RRETURN(MATCH_NOMATCH);
3660
}
3661
switch(*Feptr++)
3662
{
3663
default: RRETURN(MATCH_NOMATCH);
3664
VSPACE_BYTE_CASES:
3665
#if PCRE2_CODE_UNIT_WIDTH != 8
3666
VSPACE_MULTIBYTE_CASES:
3667
#endif
3668
break;
3669
}
3670
}
3671
break;
3672
3673
case OP_NOT_DIGIT:
3674
for (i = 1; i <= Lmin; i++)
3675
{
3676
if (Feptr >= mb->end_subject)
3677
{
3678
SCHECK_PARTIAL();
3679
RRETURN(MATCH_NOMATCH);
3680
}
3681
if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
3682
RRETURN(MATCH_NOMATCH);
3683
Feptr++;
3684
}
3685
break;
3686
3687
case OP_DIGIT:
3688
for (i = 1; i <= Lmin; i++)
3689
{
3690
if (Feptr >= mb->end_subject)
3691
{
3692
SCHECK_PARTIAL();
3693
RRETURN(MATCH_NOMATCH);
3694
}
3695
if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
3696
RRETURN(MATCH_NOMATCH);
3697
Feptr++;
3698
}
3699
break;
3700
3701
case OP_NOT_WHITESPACE:
3702
for (i = 1; i <= Lmin; i++)
3703
{
3704
if (Feptr >= mb->end_subject)
3705
{
3706
SCHECK_PARTIAL();
3707
RRETURN(MATCH_NOMATCH);
3708
}
3709
if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
3710
RRETURN(MATCH_NOMATCH);
3711
Feptr++;
3712
}
3713
break;
3714
3715
case OP_WHITESPACE:
3716
for (i = 1; i <= Lmin; i++)
3717
{
3718
if (Feptr >= mb->end_subject)
3719
{
3720
SCHECK_PARTIAL();
3721
RRETURN(MATCH_NOMATCH);
3722
}
3723
if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
3724
RRETURN(MATCH_NOMATCH);
3725
Feptr++;
3726
}
3727
break;
3728
3729
case OP_NOT_WORDCHAR:
3730
for (i = 1; i <= Lmin; i++)
3731
{
3732
if (Feptr >= mb->end_subject)
3733
{
3734
SCHECK_PARTIAL();
3735
RRETURN(MATCH_NOMATCH);
3736
}
3737
if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
3738
RRETURN(MATCH_NOMATCH);
3739
Feptr++;
3740
}
3741
break;
3742
3743
case OP_WORDCHAR:
3744
for (i = 1; i <= Lmin; i++)
3745
{
3746
if (Feptr >= mb->end_subject)
3747
{
3748
SCHECK_PARTIAL();
3749
RRETURN(MATCH_NOMATCH);
3750
}
3751
if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
3752
RRETURN(MATCH_NOMATCH);
3753
Feptr++;
3754
}
3755
break;
3756
3757
/* LCOV_EXCL_START */
3758
default:
3759
PCRE2_DEBUG_UNREACHABLE();
3760
return PCRE2_ERROR_INTERNAL;
3761
/* LCOV_EXCL_STOP */
3762
}
3763
}
3764
3765
/* If Lmin = Lmax we are done. Continue with the main loop. */
3766
3767
if (Lmin == Lmax) continue;
3768
3769
/* If minimizing, we have to test the rest of the pattern before each
3770
subsequent match. This means we cannot use a local "notmatch" variable as
3771
in the other cases. As all 4 temporary 32-bit values in the frame are
3772
already in use, just test the type each time. */
3773
3774
if (reptype == REPTYPE_MIN)
3775
{
3776
#ifdef SUPPORT_UNICODE
3777
if (proptype >= 0)
3778
{
3779
switch(proptype)
3780
{
3781
case PT_LAMP:
3782
for (;;)
3783
{
3784
int chartype;
3785
RMATCH(Fecode, RM208);
3786
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3787
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3788
if (Feptr >= mb->end_subject)
3789
{
3790
SCHECK_PARTIAL();
3791
RRETURN(MATCH_NOMATCH);
3792
}
3793
GETCHARINCTEST(fc, Feptr);
3794
chartype = UCD_CHARTYPE(fc);
3795
if ((chartype == ucp_Lu ||
3796
chartype == ucp_Ll ||
3797
chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
3798
RRETURN(MATCH_NOMATCH);
3799
}
3800
PCRE2_UNREACHABLE(); /* Control never reaches here */
3801
3802
case PT_GC:
3803
for (;;)
3804
{
3805
RMATCH(Fecode, RM209);
3806
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3807
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3808
if (Feptr >= mb->end_subject)
3809
{
3810
SCHECK_PARTIAL();
3811
RRETURN(MATCH_NOMATCH);
3812
}
3813
GETCHARINCTEST(fc, Feptr);
3814
if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3815
RRETURN(MATCH_NOMATCH);
3816
}
3817
PCRE2_UNREACHABLE(); /* Control never reaches here */
3818
3819
case PT_PC:
3820
for (;;)
3821
{
3822
RMATCH(Fecode, RM210);
3823
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3824
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3825
if (Feptr >= mb->end_subject)
3826
{
3827
SCHECK_PARTIAL();
3828
RRETURN(MATCH_NOMATCH);
3829
}
3830
GETCHARINCTEST(fc, Feptr);
3831
if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3832
RRETURN(MATCH_NOMATCH);
3833
}
3834
PCRE2_UNREACHABLE(); /* Control never reaches here */
3835
3836
case PT_SC:
3837
for (;;)
3838
{
3839
RMATCH(Fecode, RM211);
3840
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3841
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3842
if (Feptr >= mb->end_subject)
3843
{
3844
SCHECK_PARTIAL();
3845
RRETURN(MATCH_NOMATCH);
3846
}
3847
GETCHARINCTEST(fc, Feptr);
3848
if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3849
RRETURN(MATCH_NOMATCH);
3850
}
3851
PCRE2_UNREACHABLE(); /* Control never reaches here */
3852
3853
case PT_SCX:
3854
for (;;)
3855
{
3856
BOOL ok;
3857
const ucd_record *prop;
3858
RMATCH(Fecode, RM224);
3859
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3860
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3861
if (Feptr >= mb->end_subject)
3862
{
3863
SCHECK_PARTIAL();
3864
RRETURN(MATCH_NOMATCH);
3865
}
3866
GETCHARINCTEST(fc, Feptr);
3867
prop = GET_UCD(fc);
3868
ok = (prop->script == Lpropvalue
3869
|| MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
3870
if (ok == (Lctype == OP_NOTPROP))
3871
RRETURN(MATCH_NOMATCH);
3872
}
3873
PCRE2_UNREACHABLE(); /* Control never reaches here */
3874
3875
case PT_ALNUM:
3876
for (;;)
3877
{
3878
int category;
3879
RMATCH(Fecode, RM212);
3880
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3881
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3882
if (Feptr >= mb->end_subject)
3883
{
3884
SCHECK_PARTIAL();
3885
RRETURN(MATCH_NOMATCH);
3886
}
3887
GETCHARINCTEST(fc, Feptr);
3888
category = UCD_CATEGORY(fc);
3889
if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP))
3890
RRETURN(MATCH_NOMATCH);
3891
}
3892
PCRE2_UNREACHABLE(); /* Control never reaches here */
3893
3894
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
3895
which means that Perl space and POSIX space are now identical. PCRE
3896
was changed at release 8.34. */
3897
3898
case PT_SPACE: /* Perl space */
3899
case PT_PXSPACE: /* POSIX space */
3900
for (;;)
3901
{
3902
RMATCH(Fecode, RM213);
3903
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3904
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3905
if (Feptr >= mb->end_subject)
3906
{
3907
SCHECK_PARTIAL();
3908
RRETURN(MATCH_NOMATCH);
3909
}
3910
GETCHARINCTEST(fc, Feptr);
3911
switch(fc)
3912
{
3913
HSPACE_CASES:
3914
VSPACE_CASES:
3915
if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3916
break;
3917
3918
default:
3919
if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
3920
RRETURN(MATCH_NOMATCH);
3921
break;
3922
}
3923
}
3924
PCRE2_UNREACHABLE(); /* Control never reaches here */
3925
3926
case PT_WORD:
3927
for (;;)
3928
{
3929
int chartype, category;
3930
RMATCH(Fecode, RM214);
3931
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3932
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3933
if (Feptr >= mb->end_subject)
3934
{
3935
SCHECK_PARTIAL();
3936
RRETURN(MATCH_NOMATCH);
3937
}
3938
GETCHARINCTEST(fc, Feptr);
3939
chartype = UCD_CHARTYPE(fc);
3940
category = PRIV(ucp_gentype)[chartype];
3941
if ((category == ucp_L ||
3942
category == ucp_N ||
3943
chartype == ucp_Mn ||
3944
chartype == ucp_Pc) == (Lctype == OP_NOTPROP))
3945
RRETURN(MATCH_NOMATCH);
3946
}
3947
PCRE2_UNREACHABLE(); /* Control never reaches here */
3948
3949
case PT_CLIST:
3950
for (;;)
3951
{
3952
const uint32_t *cp;
3953
RMATCH(Fecode, RM215);
3954
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3955
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3956
if (Feptr >= mb->end_subject)
3957
{
3958
SCHECK_PARTIAL();
3959
RRETURN(MATCH_NOMATCH);
3960
}
3961
GETCHARINCTEST(fc, Feptr);
3962
#if PCRE2_CODE_UNIT_WIDTH == 32
3963
if (fc > MAX_UTF_CODE_POINT)
3964
{
3965
if (Lctype == OP_NOTPROP) continue;
3966
RRETURN(MATCH_NOMATCH);
3967
}
3968
#endif
3969
cp = PRIV(ucd_caseless_sets) + Lpropvalue;
3970
for (;;)
3971
{
3972
if (fc < *cp)
3973
{
3974
if (Lctype == OP_NOTPROP) break;
3975
RRETURN(MATCH_NOMATCH);
3976
}
3977
if (fc == *cp++)
3978
{
3979
if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3980
break;
3981
}
3982
}
3983
}
3984
PCRE2_UNREACHABLE(); /* Control never reaches here */
3985
3986
case PT_UCNC:
3987
for (;;)
3988
{
3989
RMATCH(Fecode, RM216);
3990
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3991
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3992
if (Feptr >= mb->end_subject)
3993
{
3994
SCHECK_PARTIAL();
3995
RRETURN(MATCH_NOMATCH);
3996
}
3997
GETCHARINCTEST(fc, Feptr);
3998
if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
3999
fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
4000
fc >= 0xe000) == (Lctype == OP_NOTPROP))
4001
RRETURN(MATCH_NOMATCH);
4002
}
4003
PCRE2_UNREACHABLE(); /* Control never reaches here */
4004
4005
case PT_BIDICL:
4006
for (;;)
4007
{
4008
RMATCH(Fecode, RM223);
4009
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4010
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
4011
if (Feptr >= mb->end_subject)
4012
{
4013
SCHECK_PARTIAL();
4014
RRETURN(MATCH_NOMATCH);
4015
}
4016
GETCHARINCTEST(fc, Feptr);
4017
if ((UCD_BIDICLASS(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
4018
RRETURN(MATCH_NOMATCH);
4019
}
4020
PCRE2_UNREACHABLE(); /* Control never reaches here */
4021
4022
case PT_BOOL:
4023
for (;;)
4024
{
4025
BOOL ok;
4026
const ucd_record *prop;
4027
RMATCH(Fecode, RM222);
4028
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4029
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
4030
if (Feptr >= mb->end_subject)
4031
{
4032
SCHECK_PARTIAL();
4033
RRETURN(MATCH_NOMATCH);
4034
}
4035
GETCHARINCTEST(fc, Feptr);
4036
prop = GET_UCD(fc);
4037
ok = MAPBIT(PRIV(ucd_boolprop_sets) +
4038
UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
4039
if (ok == (Lctype == OP_NOTPROP))
4040
RRETURN(MATCH_NOMATCH);
4041
}
4042
PCRE2_UNREACHABLE(); /* Control never reaches here */
4043
4044
/* This should never occur */
4045
4046
/* LCOV_EXCL_START */
4047
default:
4048
PCRE2_DEBUG_UNREACHABLE();
4049
return PCRE2_ERROR_INTERNAL;
4050
/* LCOV_EXCL_STOP */
4051
}
4052
}
4053
4054
/* Match extended Unicode sequences. We will get here only if the
4055
support is in the binary; otherwise a compile-time error occurs. */
4056
4057
else if (Lctype == OP_EXTUNI)
4058
{
4059
for (;;)
4060
{
4061
RMATCH(Fecode, RM217);
4062
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4063
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
4064
if (Feptr >= mb->end_subject)
4065
{
4066
SCHECK_PARTIAL();
4067
RRETURN(MATCH_NOMATCH);
4068
}
4069
else
4070
{
4071
GETCHARINCTEST(fc, Feptr);
4072
Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
4073
utf, NULL);
4074
}
4075
CHECK_PARTIAL();
4076
}
4077
}
4078
else
4079
#endif /* SUPPORT_UNICODE */
4080
4081
/* UTF mode for non-property testing character types. */
4082
4083
#ifdef SUPPORT_UNICODE
4084
if (utf)
4085
{
4086
for (;;)
4087
{
4088
RMATCH(Fecode, RM218);
4089
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4090
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
4091
if (Feptr >= mb->end_subject)
4092
{
4093
SCHECK_PARTIAL();
4094
RRETURN(MATCH_NOMATCH);
4095
}
4096
if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
4097
GETCHARINC(fc, Feptr);
4098
switch(Lctype)
4099
{
4100
case OP_ANY: /* This is the non-NL case */
4101
if (mb->partial != 0 && /* Take care with CRLF partial */
4102
Feptr >= mb->end_subject &&
4103
NLBLOCK->nltype == NLTYPE_FIXED &&
4104
NLBLOCK->nllen == 2 &&
4105
fc == NLBLOCK->nl[0])
4106
{
4107
mb->hitend = TRUE;
4108
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4109
}
4110
break;
4111
4112
case OP_ALLANY:
4113
case OP_ANYBYTE:
4114
break;
4115
4116
case OP_ANYNL:
4117
switch(fc)
4118
{
4119
default: RRETURN(MATCH_NOMATCH);
4120
4121
case CHAR_CR:
4122
if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
4123
break;
4124
4125
case CHAR_LF:
4126
break;
4127
4128
case CHAR_VT:
4129
case CHAR_FF:
4130
case CHAR_NEL:
4131
#ifndef EBCDIC
4132
case 0x2028:
4133
case 0x2029:
4134
#endif /* Not EBCDIC */
4135
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
4136
RRETURN(MATCH_NOMATCH);
4137
break;
4138
}
4139
break;
4140
4141
case OP_NOT_HSPACE:
4142
switch(fc)
4143
{
4144
HSPACE_CASES: RRETURN(MATCH_NOMATCH);
4145
default: break;
4146
}
4147
break;
4148
4149
case OP_HSPACE:
4150
switch(fc)
4151
{
4152
HSPACE_CASES: break;
4153
default: RRETURN(MATCH_NOMATCH);
4154
}
4155
break;
4156
4157
case OP_NOT_VSPACE:
4158
switch(fc)
4159
{
4160
VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4161
default: break;
4162
}
4163
break;
4164
4165
case OP_VSPACE:
4166
switch(fc)
4167
{
4168
VSPACE_CASES: break;
4169
default: RRETURN(MATCH_NOMATCH);
4170
}
4171
break;
4172
4173
case OP_NOT_DIGIT:
4174
if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0)
4175
RRETURN(MATCH_NOMATCH);
4176
break;
4177
4178
case OP_DIGIT:
4179
if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0)
4180
RRETURN(MATCH_NOMATCH);
4181
break;
4182
4183
case OP_NOT_WHITESPACE:
4184
if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0)
4185
RRETURN(MATCH_NOMATCH);
4186
break;
4187
4188
case OP_WHITESPACE:
4189
if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0)
4190
RRETURN(MATCH_NOMATCH);
4191
break;
4192
4193
case OP_NOT_WORDCHAR:
4194
if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0)
4195
RRETURN(MATCH_NOMATCH);
4196
break;
4197
4198
case OP_WORDCHAR:
4199
if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0)
4200
RRETURN(MATCH_NOMATCH);
4201
break;
4202
4203
/* LCOV_EXCL_START */
4204
default:
4205
PCRE2_DEBUG_UNREACHABLE();
4206
return PCRE2_ERROR_INTERNAL;
4207
/* LCOV_EXCL_STOP */
4208
}
4209
}
4210
}
4211
else
4212
#endif /* SUPPORT_UNICODE */
4213
4214
/* Not UTF mode */
4215
{
4216
for (;;)
4217
{
4218
RMATCH(Fecode, RM33);
4219
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4220
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
4221
if (Feptr >= mb->end_subject)
4222
{
4223
SCHECK_PARTIAL();
4224
RRETURN(MATCH_NOMATCH);
4225
}
4226
if (Lctype == OP_ANY && IS_NEWLINE(Feptr))
4227
RRETURN(MATCH_NOMATCH);
4228
fc = *Feptr++;
4229
switch(Lctype)
4230
{
4231
case OP_ANY: /* This is the non-NL case */
4232
if (mb->partial != 0 && /* Take care with CRLF partial */
4233
Feptr >= mb->end_subject &&
4234
NLBLOCK->nltype == NLTYPE_FIXED &&
4235
NLBLOCK->nllen == 2 &&
4236
fc == NLBLOCK->nl[0])
4237
{
4238
mb->hitend = TRUE;
4239
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4240
}
4241
break;
4242
4243
case OP_ALLANY:
4244
case OP_ANYBYTE:
4245
break;
4246
4247
case OP_ANYNL:
4248
switch(fc)
4249
{
4250
default: RRETURN(MATCH_NOMATCH);
4251
4252
case CHAR_CR:
4253
if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
4254
break;
4255
4256
case CHAR_LF:
4257
break;
4258
4259
case CHAR_VT:
4260
case CHAR_FF:
4261
case CHAR_NEL:
4262
#if PCRE2_CODE_UNIT_WIDTH != 8
4263
case 0x2028:
4264
case 0x2029:
4265
#endif
4266
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
4267
RRETURN(MATCH_NOMATCH);
4268
break;
4269
}
4270
break;
4271
4272
case OP_NOT_HSPACE:
4273
switch(fc)
4274
{
4275
default: break;
4276
HSPACE_BYTE_CASES:
4277
#if PCRE2_CODE_UNIT_WIDTH != 8
4278
HSPACE_MULTIBYTE_CASES:
4279
#endif
4280
RRETURN(MATCH_NOMATCH);
4281
}
4282
break;
4283
4284
case OP_HSPACE:
4285
switch(fc)
4286
{
4287
default: RRETURN(MATCH_NOMATCH);
4288
HSPACE_BYTE_CASES:
4289
#if PCRE2_CODE_UNIT_WIDTH != 8
4290
HSPACE_MULTIBYTE_CASES:
4291
#endif
4292
break;
4293
}
4294
break;
4295
4296
case OP_NOT_VSPACE:
4297
switch(fc)
4298
{
4299
default: break;
4300
VSPACE_BYTE_CASES:
4301
#if PCRE2_CODE_UNIT_WIDTH != 8
4302
VSPACE_MULTIBYTE_CASES:
4303
#endif
4304
RRETURN(MATCH_NOMATCH);
4305
}
4306
break;
4307
4308
case OP_VSPACE:
4309
switch(fc)
4310
{
4311
default: RRETURN(MATCH_NOMATCH);
4312
VSPACE_BYTE_CASES:
4313
#if PCRE2_CODE_UNIT_WIDTH != 8
4314
VSPACE_MULTIBYTE_CASES:
4315
#endif
4316
break;
4317
}
4318
break;
4319
4320
case OP_NOT_DIGIT:
4321
if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
4322
RRETURN(MATCH_NOMATCH);
4323
break;
4324
4325
case OP_DIGIT:
4326
if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
4327
RRETURN(MATCH_NOMATCH);
4328
break;
4329
4330
case OP_NOT_WHITESPACE:
4331
if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
4332
RRETURN(MATCH_NOMATCH);
4333
break;
4334
4335
case OP_WHITESPACE:
4336
if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
4337
RRETURN(MATCH_NOMATCH);
4338
break;
4339
4340
case OP_NOT_WORDCHAR:
4341
if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
4342
RRETURN(MATCH_NOMATCH);
4343
break;
4344
4345
case OP_WORDCHAR:
4346
if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
4347
RRETURN(MATCH_NOMATCH);
4348
break;
4349
4350
/* LCOV_EXCL_START */
4351
default:
4352
PCRE2_DEBUG_UNREACHABLE();
4353
return PCRE2_ERROR_INTERNAL;
4354
/* LCOV_EXCL_STOP */
4355
}
4356
}
4357
}
4358
4359
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
4360
}
4361
4362
/* If maximizing, it is worth using inline code for speed, doing the type
4363
test once at the start (i.e. keep it out of the loops). Once again,
4364
"notmatch" can be an ordinary local variable because the loops do not call
4365
RMATCH. */
4366
4367
else
4368
{
4369
Lstart_eptr = Feptr; /* Remember where we started */
4370
4371
#ifdef SUPPORT_UNICODE
4372
if (proptype >= 0)
4373
{
4374
BOOL notmatch = Lctype == OP_NOTPROP;
4375
switch(proptype)
4376
{
4377
case PT_LAMP:
4378
for (i = Lmin; i < Lmax; i++)
4379
{
4380
int chartype;
4381
int len = 1;
4382
if (Feptr >= mb->end_subject)
4383
{
4384
SCHECK_PARTIAL();
4385
break;
4386
}
4387
GETCHARLENTEST(fc, Feptr, len);
4388
chartype = UCD_CHARTYPE(fc);
4389
if ((chartype == ucp_Lu ||
4390
chartype == ucp_Ll ||
4391
chartype == ucp_Lt) == notmatch)
4392
break;
4393
Feptr+= len;
4394
}
4395
break;
4396
4397
case PT_GC:
4398
for (i = Lmin; i < Lmax; i++)
4399
{
4400
int len = 1;
4401
if (Feptr >= mb->end_subject)
4402
{
4403
SCHECK_PARTIAL();
4404
break;
4405
}
4406
GETCHARLENTEST(fc, Feptr, len);
4407
if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) break;
4408
Feptr+= len;
4409
}
4410
break;
4411
4412
case PT_PC:
4413
for (i = Lmin; i < Lmax; i++)
4414
{
4415
int len = 1;
4416
if (Feptr >= mb->end_subject)
4417
{
4418
SCHECK_PARTIAL();
4419
break;
4420
}
4421
GETCHARLENTEST(fc, Feptr, len);
4422
if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) break;
4423
Feptr+= len;
4424
}
4425
break;
4426
4427
case PT_SC:
4428
for (i = Lmin; i < Lmax; i++)
4429
{
4430
int len = 1;
4431
if (Feptr >= mb->end_subject)
4432
{
4433
SCHECK_PARTIAL();
4434
break;
4435
}
4436
GETCHARLENTEST(fc, Feptr, len);
4437
if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) break;
4438
Feptr+= len;
4439
}
4440
break;
4441
4442
case PT_SCX:
4443
for (i = Lmin; i < Lmax; i++)
4444
{
4445
BOOL ok;
4446
const ucd_record *prop;
4447
int len = 1;
4448
if (Feptr >= mb->end_subject)
4449
{
4450
SCHECK_PARTIAL();
4451
break;
4452
}
4453
GETCHARLENTEST(fc, Feptr, len);
4454
prop = GET_UCD(fc);
4455
ok = (prop->script == Lpropvalue ||
4456
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
4457
if (ok == notmatch) break;
4458
Feptr+= len;
4459
}
4460
break;
4461
4462
case PT_ALNUM:
4463
for (i = Lmin; i < Lmax; i++)
4464
{
4465
int category;
4466
int len = 1;
4467
if (Feptr >= mb->end_subject)
4468
{
4469
SCHECK_PARTIAL();
4470
break;
4471
}
4472
GETCHARLENTEST(fc, Feptr, len);
4473
category = UCD_CATEGORY(fc);
4474
if ((category == ucp_L || category == ucp_N) == notmatch)
4475
break;
4476
Feptr+= len;
4477
}
4478
break;
4479
4480
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
4481
which means that Perl space and POSIX space are now identical. PCRE
4482
was changed at release 8.34. */
4483
4484
case PT_SPACE: /* Perl space */
4485
case PT_PXSPACE: /* POSIX space */
4486
for (i = Lmin; i < Lmax; i++)
4487
{
4488
int len = 1;
4489
if (Feptr >= mb->end_subject)
4490
{
4491
SCHECK_PARTIAL();
4492
break;
4493
}
4494
GETCHARLENTEST(fc, Feptr, len);
4495
switch(fc)
4496
{
4497
HSPACE_CASES:
4498
VSPACE_CASES:
4499
if (notmatch) goto ENDLOOP99; /* Break the loop */
4500
break;
4501
4502
default:
4503
if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)
4504
goto ENDLOOP99; /* Break the loop */
4505
break;
4506
}
4507
Feptr+= len;
4508
}
4509
ENDLOOP99:
4510
break;
4511
4512
case PT_WORD:
4513
for (i = Lmin; i < Lmax; i++)
4514
{
4515
int chartype, category;
4516
int len = 1;
4517
if (Feptr >= mb->end_subject)
4518
{
4519
SCHECK_PARTIAL();
4520
break;
4521
}
4522
GETCHARLENTEST(fc, Feptr, len);
4523
chartype = UCD_CHARTYPE(fc);
4524
category = PRIV(ucp_gentype)[chartype];
4525
if ((category == ucp_L ||
4526
category == ucp_N ||
4527
chartype == ucp_Mn ||
4528
chartype == ucp_Pc) == notmatch)
4529
break;
4530
Feptr+= len;
4531
}
4532
break;
4533
4534
case PT_CLIST:
4535
for (i = Lmin; i < Lmax; i++)
4536
{
4537
const uint32_t *cp;
4538
int len = 1;
4539
if (Feptr >= mb->end_subject)
4540
{
4541
SCHECK_PARTIAL();
4542
break;
4543
}
4544
GETCHARLENTEST(fc, Feptr, len);
4545
#if PCRE2_CODE_UNIT_WIDTH == 32
4546
if (fc > MAX_UTF_CODE_POINT)
4547
{
4548
if (!notmatch) goto GOT_MAX;
4549
}
4550
else
4551
#endif
4552
{
4553
cp = PRIV(ucd_caseless_sets) + Lpropvalue;
4554
for (;;)
4555
{
4556
if (fc < *cp)
4557
{ if (notmatch) break; else goto GOT_MAX; }
4558
if (fc == *cp++)
4559
{ if (notmatch) goto GOT_MAX; else break; }
4560
}
4561
}
4562
4563
Feptr += len;
4564
}
4565
GOT_MAX:
4566
break;
4567
4568
case PT_UCNC:
4569
for (i = Lmin; i < Lmax; i++)
4570
{
4571
int len = 1;
4572
if (Feptr >= mb->end_subject)
4573
{
4574
SCHECK_PARTIAL();
4575
break;
4576
}
4577
GETCHARLENTEST(fc, Feptr, len);
4578
if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
4579
fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
4580
fc >= 0xe000) == notmatch)
4581
break;
4582
Feptr += len;
4583
}
4584
break;
4585
4586
case PT_BIDICL:
4587
for (i = Lmin; i < Lmax; i++)
4588
{
4589
int len = 1;
4590
if (Feptr >= mb->end_subject)
4591
{
4592
SCHECK_PARTIAL();
4593
break;
4594
}
4595
GETCHARLENTEST(fc, Feptr, len);
4596
if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) break;
4597
Feptr+= len;
4598
}
4599
break;
4600
4601
case PT_BOOL:
4602
for (i = Lmin; i < Lmax; i++)
4603
{
4604
BOOL ok;
4605
const ucd_record *prop;
4606
int len = 1;
4607
if (Feptr >= mb->end_subject)
4608
{
4609
SCHECK_PARTIAL();
4610
break;
4611
}
4612
GETCHARLENTEST(fc, Feptr, len);
4613
prop = GET_UCD(fc);
4614
ok = MAPBIT(PRIV(ucd_boolprop_sets) +
4615
UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
4616
if (ok == notmatch) break;
4617
Feptr+= len;
4618
}
4619
break;
4620
4621
/* LCOV_EXCL_START */
4622
default:
4623
PCRE2_DEBUG_UNREACHABLE();
4624
return PCRE2_ERROR_INTERNAL;
4625
/* LCOV_EXCL_STOP */
4626
}
4627
4628
/* Feptr is now past the end of the maximum run */
4629
4630
if (reptype == REPTYPE_POS) continue; /* No backtracking */
4631
4632
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
4633
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
4634
go too far. */
4635
4636
for(;;)
4637
{
4638
if (Feptr <= Lstart_eptr) break;
4639
RMATCH(Fecode, RM221);
4640
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4641
Feptr--;
4642
if (utf) BACKCHAR(Feptr);
4643
}
4644
}
4645
4646
/* Match extended Unicode grapheme clusters. We will get here only if the
4647
support is in the binary; otherwise a compile-time error occurs. */
4648
4649
else if (Lctype == OP_EXTUNI)
4650
{
4651
for (i = Lmin; i < Lmax; i++)
4652
{
4653
if (Feptr >= mb->end_subject)
4654
{
4655
SCHECK_PARTIAL();
4656
break;
4657
}
4658
else
4659
{
4660
GETCHARINCTEST(fc, Feptr);
4661
Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
4662
utf, NULL);
4663
}
4664
CHECK_PARTIAL();
4665
}
4666
4667
/* Feptr is now past the end of the maximum run */
4668
4669
if (reptype == REPTYPE_POS) continue; /* No backtracking */
4670
4671
/* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start
4672
of the run while backtracking because the use of \C in UTF mode can
4673
cause BACKCHAR to move back past Lstart_eptr. This is just palliative;
4674
the use of \C in UTF mode is fraught with danger. */
4675
4676
for(;;)
4677
{
4678
int lgb, rgb;
4679
PCRE2_SPTR fptr;
4680
4681
if (Feptr <= Lstart_eptr) break; /* At start of char run */
4682
RMATCH(Fecode, RM219);
4683
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4684
4685
/* Backtracking over an extended grapheme cluster involves inspecting
4686
the previous two characters (if present) to see if a break is
4687
permitted between them. */
4688
4689
Feptr--;
4690
if (!utf) fc = *Feptr; else
4691
{
4692
BACKCHAR(Feptr);
4693
GETCHAR(fc, Feptr);
4694
}
4695
rgb = UCD_GRAPHBREAK(fc);
4696
4697
for (;;)
4698
{
4699
if (Feptr <= Lstart_eptr) break; /* At start of char run */
4700
fptr = Feptr - 1;
4701
if (!utf) fc = *fptr; else
4702
{
4703
BACKCHAR(fptr);
4704
GETCHAR(fc, fptr);
4705
}
4706
lgb = UCD_GRAPHBREAK(fc);
4707
if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
4708
Feptr = fptr;
4709
rgb = lgb;
4710
}
4711
}
4712
}
4713
4714
else
4715
#endif /* SUPPORT_UNICODE */
4716
4717
#ifdef SUPPORT_UNICODE
4718
if (utf)
4719
{
4720
switch(Lctype)
4721
{
4722
case OP_ANY:
4723
for (i = Lmin; i < Lmax; i++)
4724
{
4725
if (Feptr >= mb->end_subject)
4726
{
4727
SCHECK_PARTIAL();
4728
break;
4729
}
4730
if (IS_NEWLINE(Feptr)) break;
4731
if (mb->partial != 0 && /* Take care with CRLF partial */
4732
Feptr + 1 >= mb->end_subject &&
4733
NLBLOCK->nltype == NLTYPE_FIXED &&
4734
NLBLOCK->nllen == 2 &&
4735
UCHAR21(Feptr) == NLBLOCK->nl[0])
4736
{
4737
mb->hitend = TRUE;
4738
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4739
}
4740
Feptr++;
4741
ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4742
}
4743
break;
4744
4745
case OP_ALLANY:
4746
if (Lmax < UINT32_MAX)
4747
{
4748
for (i = Lmin; i < Lmax; i++)
4749
{
4750
if (Feptr >= mb->end_subject)
4751
{
4752
SCHECK_PARTIAL();
4753
break;
4754
}
4755
Feptr++;
4756
ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4757
}
4758
}
4759
else
4760
{
4761
Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */
4762
SCHECK_PARTIAL();
4763
}
4764
break;
4765
4766
/* The "byte" (i.e. "code unit") case is the same as non-UTF */
4767
4768
case OP_ANYBYTE:
4769
fc = Lmax - Lmin;
4770
if (fc > (uint32_t)(mb->end_subject - Feptr))
4771
{
4772
Feptr = mb->end_subject;
4773
SCHECK_PARTIAL();
4774
}
4775
else Feptr += fc;
4776
break;
4777
4778
case OP_ANYNL:
4779
for (i = Lmin; i < Lmax; i++)
4780
{
4781
int len = 1;
4782
if (Feptr >= mb->end_subject)
4783
{
4784
SCHECK_PARTIAL();
4785
break;
4786
}
4787
GETCHARLEN(fc, Feptr, len);
4788
if (fc == CHAR_CR)
4789
{
4790
if (++Feptr >= mb->end_subject) break;
4791
if (UCHAR21(Feptr) == CHAR_LF) Feptr++;
4792
}
4793
else
4794
{
4795
if (fc != CHAR_LF &&
4796
(mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4797
(fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4798
#ifndef EBCDIC
4799
&& fc != 0x2028 && fc != 0x2029
4800
#endif /* Not EBCDIC */
4801
)))
4802
break;
4803
Feptr += len;
4804
}
4805
}
4806
break;
4807
4808
case OP_NOT_HSPACE:
4809
case OP_HSPACE:
4810
for (i = Lmin; i < Lmax; i++)
4811
{
4812
BOOL gotspace;
4813
int len = 1;
4814
if (Feptr >= mb->end_subject)
4815
{
4816
SCHECK_PARTIAL();
4817
break;
4818
}
4819
GETCHARLEN(fc, Feptr, len);
4820
switch(fc)
4821
{
4822
HSPACE_CASES: gotspace = TRUE; break;
4823
default: gotspace = FALSE; break;
4824
}
4825
if (gotspace == (Lctype == OP_NOT_HSPACE)) break;
4826
Feptr += len;
4827
}
4828
break;
4829
4830
case OP_NOT_VSPACE:
4831
case OP_VSPACE:
4832
for (i = Lmin; i < Lmax; i++)
4833
{
4834
BOOL gotspace;
4835
int len = 1;
4836
if (Feptr >= mb->end_subject)
4837
{
4838
SCHECK_PARTIAL();
4839
break;
4840
}
4841
GETCHARLEN(fc, Feptr, len);
4842
switch(fc)
4843
{
4844
VSPACE_CASES: gotspace = TRUE; break;
4845
default: gotspace = FALSE; break;
4846
}
4847
if (gotspace == (Lctype == OP_NOT_VSPACE)) break;
4848
Feptr += len;
4849
}
4850
break;
4851
4852
case OP_NOT_DIGIT:
4853
for (i = Lmin; i < Lmax; i++)
4854
{
4855
int len = 1;
4856
if (Feptr >= mb->end_subject)
4857
{
4858
SCHECK_PARTIAL();
4859
break;
4860
}
4861
GETCHARLEN(fc, Feptr, len);
4862
if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break;
4863
Feptr+= len;
4864
}
4865
break;
4866
4867
case OP_DIGIT:
4868
for (i = Lmin; i < Lmax; i++)
4869
{
4870
int len = 1;
4871
if (Feptr >= mb->end_subject)
4872
{
4873
SCHECK_PARTIAL();
4874
break;
4875
}
4876
GETCHARLEN(fc, Feptr, len);
4877
if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break;
4878
Feptr+= len;
4879
}
4880
break;
4881
4882
case OP_NOT_WHITESPACE:
4883
for (i = Lmin; i < Lmax; i++)
4884
{
4885
int len = 1;
4886
if (Feptr >= mb->end_subject)
4887
{
4888
SCHECK_PARTIAL();
4889
break;
4890
}
4891
GETCHARLEN(fc, Feptr, len);
4892
if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break;
4893
Feptr+= len;
4894
}
4895
break;
4896
4897
case OP_WHITESPACE:
4898
for (i = Lmin; i < Lmax; i++)
4899
{
4900
int len = 1;
4901
if (Feptr >= mb->end_subject)
4902
{
4903
SCHECK_PARTIAL();
4904
break;
4905
}
4906
GETCHARLEN(fc, Feptr, len);
4907
if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break;
4908
Feptr+= len;
4909
}
4910
break;
4911
4912
case OP_NOT_WORDCHAR:
4913
for (i = Lmin; i < Lmax; i++)
4914
{
4915
int len = 1;
4916
if (Feptr >= mb->end_subject)
4917
{
4918
SCHECK_PARTIAL();
4919
break;
4920
}
4921
GETCHARLEN(fc, Feptr, len);
4922
if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break;
4923
Feptr+= len;
4924
}
4925
break;
4926
4927
case OP_WORDCHAR:
4928
for (i = Lmin; i < Lmax; i++)
4929
{
4930
int len = 1;
4931
if (Feptr >= mb->end_subject)
4932
{
4933
SCHECK_PARTIAL();
4934
break;
4935
}
4936
GETCHARLEN(fc, Feptr, len);
4937
if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break;
4938
Feptr+= len;
4939
}
4940
break;
4941
4942
/* LCOV_EXCL_START */
4943
default:
4944
PCRE2_DEBUG_UNREACHABLE();
4945
return PCRE2_ERROR_INTERNAL;
4946
/* LCOV_EXCL_STOP */
4947
}
4948
4949
if (reptype == REPTYPE_POS) continue; /* No backtracking */
4950
4951
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
4952
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go
4953
too far. */
4954
4955
for(;;)
4956
{
4957
if (Feptr <= Lstart_eptr) break;
4958
RMATCH(Fecode, RM220);
4959
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4960
Feptr--;
4961
BACKCHAR(Feptr);
4962
if (Lctype == OP_ANYNL && Feptr > Lstart_eptr &&
4963
UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR)
4964
Feptr--;
4965
}
4966
}
4967
else
4968
#endif /* SUPPORT_UNICODE */
4969
4970
/* Not UTF mode */
4971
{
4972
switch(Lctype)
4973
{
4974
case OP_ANY:
4975
for (i = Lmin; i < Lmax; i++)
4976
{
4977
if (Feptr >= mb->end_subject)
4978
{
4979
SCHECK_PARTIAL();
4980
break;
4981
}
4982
if (IS_NEWLINE(Feptr)) break;
4983
if (mb->partial != 0 && /* Take care with CRLF partial */
4984
Feptr + 1 >= mb->end_subject &&
4985
NLBLOCK->nltype == NLTYPE_FIXED &&
4986
NLBLOCK->nllen == 2 &&
4987
*Feptr == NLBLOCK->nl[0])
4988
{
4989
mb->hitend = TRUE;
4990
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4991
}
4992
Feptr++;
4993
}
4994
break;
4995
4996
case OP_ALLANY:
4997
case OP_ANYBYTE:
4998
fc = Lmax - Lmin;
4999
if (fc > (uint32_t)(mb->end_subject - Feptr))
5000
{
5001
Feptr = mb->end_subject;
5002
SCHECK_PARTIAL();
5003
}
5004
else Feptr += fc;
5005
break;
5006
5007
case OP_ANYNL:
5008
for (i = Lmin; i < Lmax; i++)
5009
{
5010
if (Feptr >= mb->end_subject)
5011
{
5012
SCHECK_PARTIAL();
5013
break;
5014
}
5015
fc = *Feptr;
5016
if (fc == CHAR_CR)
5017
{
5018
if (++Feptr >= mb->end_subject) break;
5019
if (*Feptr == CHAR_LF) Feptr++;
5020
}
5021
else
5022
{
5023
if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
5024
(fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
5025
#if PCRE2_CODE_UNIT_WIDTH != 8
5026
&& fc != 0x2028 && fc != 0x2029
5027
#endif
5028
))) break;
5029
Feptr++;
5030
}
5031
}
5032
break;
5033
5034
case OP_NOT_HSPACE:
5035
for (i = Lmin; i < Lmax; i++)
5036
{
5037
if (Feptr >= mb->end_subject)
5038
{
5039
SCHECK_PARTIAL();
5040
break;
5041
}
5042
switch(*Feptr)
5043
{
5044
default: Feptr++; break;
5045
HSPACE_BYTE_CASES:
5046
#if PCRE2_CODE_UNIT_WIDTH != 8
5047
HSPACE_MULTIBYTE_CASES:
5048
#endif
5049
goto ENDLOOP00;
5050
}
5051
}
5052
ENDLOOP00:
5053
break;
5054
5055
case OP_HSPACE:
5056
for (i = Lmin; i < Lmax; i++)
5057
{
5058
if (Feptr >= mb->end_subject)
5059
{
5060
SCHECK_PARTIAL();
5061
break;
5062
}
5063
switch(*Feptr)
5064
{
5065
default: goto ENDLOOP01;
5066
HSPACE_BYTE_CASES:
5067
#if PCRE2_CODE_UNIT_WIDTH != 8
5068
HSPACE_MULTIBYTE_CASES:
5069
#endif
5070
Feptr++; break;
5071
}
5072
}
5073
ENDLOOP01:
5074
break;
5075
5076
case OP_NOT_VSPACE:
5077
for (i = Lmin; i < Lmax; i++)
5078
{
5079
if (Feptr >= mb->end_subject)
5080
{
5081
SCHECK_PARTIAL();
5082
break;
5083
}
5084
switch(*Feptr)
5085
{
5086
default: Feptr++; break;
5087
VSPACE_BYTE_CASES:
5088
#if PCRE2_CODE_UNIT_WIDTH != 8
5089
VSPACE_MULTIBYTE_CASES:
5090
#endif
5091
goto ENDLOOP02;
5092
}
5093
}
5094
ENDLOOP02:
5095
break;
5096
5097
case OP_VSPACE:
5098
for (i = Lmin; i < Lmax; i++)
5099
{
5100
if (Feptr >= mb->end_subject)
5101
{
5102
SCHECK_PARTIAL();
5103
break;
5104
}
5105
switch(*Feptr)
5106
{
5107
default: goto ENDLOOP03;
5108
VSPACE_BYTE_CASES:
5109
#if PCRE2_CODE_UNIT_WIDTH != 8
5110
VSPACE_MULTIBYTE_CASES:
5111
#endif
5112
Feptr++; break;
5113
}
5114
}
5115
ENDLOOP03:
5116
break;
5117
5118
case OP_NOT_DIGIT:
5119
for (i = Lmin; i < Lmax; i++)
5120
{
5121
if (Feptr >= mb->end_subject)
5122
{
5123
SCHECK_PARTIAL();
5124
break;
5125
}
5126
if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
5127
break;
5128
Feptr++;
5129
}
5130
break;
5131
5132
case OP_DIGIT:
5133
for (i = Lmin; i < Lmax; i++)
5134
{
5135
if (Feptr >= mb->end_subject)
5136
{
5137
SCHECK_PARTIAL();
5138
break;
5139
}
5140
if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
5141
break;
5142
Feptr++;
5143
}
5144
break;
5145
5146
case OP_NOT_WHITESPACE:
5147
for (i = Lmin; i < Lmax; i++)
5148
{
5149
if (Feptr >= mb->end_subject)
5150
{
5151
SCHECK_PARTIAL();
5152
break;
5153
}
5154
if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
5155
break;
5156
Feptr++;
5157
}
5158
break;
5159
5160
case OP_WHITESPACE:
5161
for (i = Lmin; i < Lmax; i++)
5162
{
5163
if (Feptr >= mb->end_subject)
5164
{
5165
SCHECK_PARTIAL();
5166
break;
5167
}
5168
if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
5169
break;
5170
Feptr++;
5171
}
5172
break;
5173
5174
case OP_NOT_WORDCHAR:
5175
for (i = Lmin; i < Lmax; i++)
5176
{
5177
if (Feptr >= mb->end_subject)
5178
{
5179
SCHECK_PARTIAL();
5180
break;
5181
}
5182
if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
5183
break;
5184
Feptr++;
5185
}
5186
break;
5187
5188
case OP_WORDCHAR:
5189
for (i = Lmin; i < Lmax; i++)
5190
{
5191
if (Feptr >= mb->end_subject)
5192
{
5193
SCHECK_PARTIAL();
5194
break;
5195
}
5196
if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
5197
break;
5198
Feptr++;
5199
}
5200
break;
5201
5202
/* LCOV_EXCL_START */
5203
default:
5204
PCRE2_DEBUG_UNREACHABLE();
5205
return PCRE2_ERROR_INTERNAL;
5206
/* LCOV_EXCL_STOP */
5207
}
5208
5209
if (reptype == REPTYPE_POS) continue; /* No backtracking */
5210
5211
for (;;)
5212
{
5213
if (Feptr == Lstart_eptr) break;
5214
RMATCH(Fecode, RM34);
5215
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5216
Feptr--;
5217
if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF &&
5218
Feptr[-1] == CHAR_CR) Feptr--;
5219
}
5220
}
5221
}
5222
break; /* End of repeat character type processing */
5223
5224
#undef Lstart_eptr
5225
#undef Lmin
5226
#undef Lmax
5227
#undef Lctype
5228
#undef Lpropvalue
5229
5230
5231
/* ===================================================================== */
5232
/* Match a back reference, possibly repeatedly. Look past the end of the
5233
item to see if there is repeat information following. The OP_REF and
5234
OP_REFI opcodes are used for a reference to a numbered group or to a
5235
non-duplicated named group. For a duplicated named group, OP_DNREF and
5236
OP_DNREFI are used. In this case we must scan the list of groups to which
5237
the name refers, and use the first one that is set. */
5238
5239
#define Lmin F->temp_32[0]
5240
#define Lmax F->temp_32[1]
5241
#define Lcaseless F->temp_32[2]
5242
#define Lcaseopts F->temp_32[3]
5243
#define Lstart F->temp_sptr[0]
5244
#define Loffset F->temp_size
5245
5246
case OP_DNREF:
5247
case OP_DNREFI:
5248
Lcaseless = (Fop == OP_DNREFI);
5249
Lcaseopts = (Fop == OP_DNREFI)? Fecode[1 + 2*IMM2_SIZE] : 0;
5250
{
5251
int count = GET2(Fecode, 1+IMM2_SIZE);
5252
PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5253
Fecode += 1 + 2*IMM2_SIZE + (Fop == OP_DNREFI? 1 : 0);
5254
5255
while (count-- > 0)
5256
{
5257
Loffset = (GET2(slot, 0) << 1) - 2;
5258
if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break;
5259
slot += mb->name_entry_size;
5260
}
5261
}
5262
goto REF_REPEAT;
5263
5264
case OP_REF:
5265
case OP_REFI:
5266
Lcaseless = (Fop == OP_REFI);
5267
Lcaseopts = (Fop == OP_REFI)? Fecode[1 + IMM2_SIZE] : 0;
5268
Loffset = (GET2(Fecode, 1) << 1) - 2;
5269
Fecode += 1 + IMM2_SIZE + (Fop == OP_REFI? 1 : 0);
5270
5271
/* Set up for repetition, or handle the non-repeated case. The maximum and
5272
minimum must be in the heap frame, but as they are short-term values, we
5273
use temporary fields. */
5274
5275
REF_REPEAT:
5276
switch (*Fecode)
5277
{
5278
case OP_CRSTAR:
5279
case OP_CRMINSTAR:
5280
case OP_CRPLUS:
5281
case OP_CRMINPLUS:
5282
case OP_CRQUERY:
5283
case OP_CRMINQUERY:
5284
fc = *Fecode++ - OP_CRSTAR;
5285
Lmin = rep_min[fc];
5286
Lmax = rep_max[fc];
5287
reptype = rep_typ[fc];
5288
break;
5289
5290
case OP_CRRANGE:
5291
case OP_CRMINRANGE:
5292
Lmin = GET2(Fecode, 1);
5293
Lmax = GET2(Fecode, 1 + IMM2_SIZE);
5294
reptype = rep_typ[*Fecode - OP_CRSTAR];
5295
if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
5296
Fecode += 1 + 2 * IMM2_SIZE;
5297
break;
5298
5299
default: /* No repeat follows */
5300
{
5301
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &length);
5302
if (rrc != 0)
5303
{
5304
if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
5305
CHECK_PARTIAL();
5306
RRETURN(MATCH_NOMATCH);
5307
}
5308
}
5309
Feptr += length;
5310
continue; /* With the main loop */
5311
}
5312
5313
/* Handle repeated back references. If a set group has length zero, just
5314
continue with the main loop, because it matches however many times. For an
5315
unset reference, if the minimum is zero, we can also just continue. We can
5316
also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an unset
5317
group behave as a zero-length group. For any other unset cases, carrying
5318
on will result in NOMATCH. */
5319
5320
if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET)
5321
{
5322
if (Fovector[Loffset] == Fovector[Loffset + 1]) continue;
5323
}
5324
else /* Group is not set */
5325
{
5326
if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
5327
continue;
5328
}
5329
5330
/* First, ensure the minimum number of matches are present. */
5331
5332
for (i = 1; i <= Lmin; i++)
5333
{
5334
PCRE2_SIZE slength;
5335
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
5336
if (rrc != 0)
5337
{
5338
if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
5339
CHECK_PARTIAL();
5340
RRETURN(MATCH_NOMATCH);
5341
}
5342
Feptr += slength;
5343
}
5344
5345
/* If min = max, we are done. They are not both allowed to be zero. */
5346
5347
if (Lmin == Lmax) continue;
5348
5349
/* If minimizing, keep trying and advancing the pointer. */
5350
5351
if (reptype == REPTYPE_MIN)
5352
{
5353
for (;;)
5354
{
5355
PCRE2_SIZE slength;
5356
RMATCH(Fecode, RM20);
5357
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5358
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
5359
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
5360
if (rrc != 0)
5361
{
5362
if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
5363
CHECK_PARTIAL();
5364
RRETURN(MATCH_NOMATCH);
5365
}
5366
Feptr += slength;
5367
}
5368
5369
PCRE2_UNREACHABLE(); /* Control never reaches here */
5370
}
5371
5372
/* If maximizing, find the longest string and work backwards, as long as
5373
the matched lengths for each iteration are the same. */
5374
5375
else
5376
{
5377
BOOL samelengths = TRUE;
5378
Lstart = Feptr; /* Starting position */
5379
Flength = Fovector[Loffset+1] - Fovector[Loffset];
5380
5381
for (i = Lmin; i < Lmax; i++)
5382
{
5383
PCRE2_SIZE slength;
5384
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
5385
if (rrc != 0)
5386
{
5387
/* Can't use CHECK_PARTIAL because we don't want to update Feptr in
5388
the soft partial matching case. */
5389
5390
if (rrc > 0 && mb->partial != 0 &&
5391
mb->end_subject > mb->start_used_ptr)
5392
{
5393
mb->hitend = TRUE;
5394
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5395
}
5396
break;
5397
}
5398
5399
if (slength != Flength) samelengths = FALSE;
5400
Feptr += slength;
5401
}
5402
5403
/* If the length matched for each repetition is the same as the length of
5404
the captured group, we can easily work backwards. This is the normal
5405
case. However, in caseless UTF-8 mode there are pairs of case-equivalent
5406
characters whose lengths (in terms of code units) differ. As this is very
5406
rare, we handle it by re-matching fewer and fewer times. */
5408
5409
if (samelengths)
5410
{
5411
while (Feptr >= Lstart)
5412
{
5413
RMATCH(Fecode, RM21);
5414
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5415
Feptr -= Flength;
5416
}
5417
}
5418
5419
/* The rare case of non-matching lengths. Re-scan the repetition for each
5420
iteration. We know that match_ref() will succeed every time. */
5421
5422
else
5423
{
5424
Lmax = i;
5425
for (;;)
5426
{
5427
RMATCH(Fecode, RM22);
5428
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5429
if (Feptr == Lstart) break; /* Failed after minimal repetition */
5430
Feptr = Lstart;
5431
Lmax--;
5432
for (i = Lmin; i < Lmax; i++)
5433
{
5434
PCRE2_SIZE slength;
5435
(void)match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
5436
Feptr += slength;
5437
}
5438
}
5439
}
5440
5441
RRETURN(MATCH_NOMATCH);
5442
}
5443
5444
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
5445
5446
#undef Lcaseless
5447
#undef Lmin
5448
#undef Lmax
5449
#undef Lstart
5450
#undef Loffset
5451
5452
5453
5454
/* ========================================================================= */
5455
/* Opcodes for the start of various parenthesized items */
5456
/* ========================================================================= */
5457
5458
/* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the
5459
(*THEN) is within the current branch by comparing the address of OP_THEN
5460
that is passed back with the end of the branch. If (*THEN) is within the
5461
current branch, and the branch is one of two or more alternatives (it
5462
either starts or ends with OP_ALT), we have reached the limit of THEN's
5463
action, so convert the return code to NOMATCH, which will cause normal
5464
backtracking to happen from now on. Otherwise, THEN is passed back to an
5465
outer alternative. This implements Perl's treatment of parenthesized
5466
groups, where a group not containing | does not affect the current
5467
alternative, that is, (X) is NOT the same as (X|(*F)). */
5468
5469
5470
/* ===================================================================== */
5471
/* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive
5472
bracket group, indicating that it may occur zero times. It may repeat
5473
infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in
5474
the pattern. Brackets with fixed upper repeat limits are compiled as a
5475
number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO.
5476
Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */
5477
5478
#define Lnext_ecode F->temp_sptr[0]
5479
5480
case OP_BRAZERO:
5481
Lnext_ecode = Fecode + 1;
5482
RMATCH(Lnext_ecode, RM9);
5483
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5484
do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
5485
Fecode = Lnext_ecode + 1 + LINK_SIZE;
5486
break;
5487
5488
case OP_BRAMINZERO:
5489
Lnext_ecode = Fecode + 1;
5490
do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
5491
RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10);
5492
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5493
Fecode++;
5494
break;
5495
5496
#undef Lnext_ecode
5497
5498
case OP_SKIPZERO:
5499
Fecode++;
5500
do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
5501
Fecode += 1 + LINK_SIZE;
5502
break;
5503
5504
5505
/* ===================================================================== */
5506
/* Handle possessive brackets with an unlimited repeat. The end of these
5507
brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without
5508
going further in the pattern. */
5509
5510
#define Lframe_type F->temp_32[0]
5511
#define Lmatched_once F->temp_32[1]
5512
#define Lzero_allowed F->temp_32[2]
5513
#define Lstart_eptr F->temp_sptr[0]
5514
#define Lstart_group F->temp_sptr[1]
5515
5516
case OP_BRAPOSZERO:
5517
Lzero_allowed = TRUE; /* Zero repeat is allowed */
5518
Fecode += 1;
5519
if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS)
5520
goto POSSESSIVE_CAPTURE;
5521
goto POSSESSIVE_NON_CAPTURE;
5522
5523
case OP_BRAPOS:
5524
case OP_SBRAPOS:
5525
Lzero_allowed = FALSE; /* Zero repeat not allowed */
5526
5527
POSSESSIVE_NON_CAPTURE:
5528
Lframe_type = GF_NOCAPTURE; /* Remembered frame type */
5529
goto POSSESSIVE_GROUP;
5530
5531
case OP_CBRAPOS:
5532
case OP_SCBRAPOS:
5533
Lzero_allowed = FALSE; /* Zero repeat not allowed */
5534
5535
POSSESSIVE_CAPTURE:
5536
number = GET2(Fecode, 1+LINK_SIZE);
5537
Lframe_type = GF_CAPTURE | number; /* Remembered frame type */
5538
5539
POSSESSIVE_GROUP:
5540
Lmatched_once = FALSE; /* Never matched */
5541
Lstart_group = Fecode; /* Start of this group */
5542
5543
for (;;)
5544
{
5545
Lstart_eptr = Feptr; /* Position at group start */
5546
group_frame_type = Lframe_type;
5547
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8);
5548
if (rrc == MATCH_KETRPOS)
5549
{
5550
Lmatched_once = TRUE; /* Matched at least once */
5551
if (Feptr == Lstart_eptr) /* Empty match; skip to end */
5552
{
5553
do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5554
break;
5555
}
5556
5557
Fecode = Lstart_group;
5558
continue;
5559
}
5560
5561
/* See comment above about handling THEN. */
5562
5563
if (rrc == MATCH_THEN)
5564
{
5565
PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5566
if (mb->verb_ecode_ptr < next_ecode &&
5567
(*Fecode == OP_ALT || *next_ecode == OP_ALT))
5568
rrc = MATCH_NOMATCH;
5569
}
5570
5571
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5572
Fecode += GET(Fecode, 1);
5573
if (*Fecode != OP_ALT) break;
5574
}
5575
5576
/* Success if matched something or zero repeat allowed */
5577
5578
if (Lmatched_once || Lzero_allowed)
5579
{
5580
Fecode += 1 + LINK_SIZE;
5581
break;
5582
}
5583
5584
RRETURN(MATCH_NOMATCH);
5585
5586
#undef Lmatched_once
5587
#undef Lzero_allowed
5588
#undef Lframe_type
5589
#undef Lstart_eptr
5590
#undef Lstart_group
5591
5592
5593
/* ===================================================================== */
5594
/* Handle non-capturing brackets that cannot match an empty string. When we
5595
get to the final alternative within the brackets, as long as there are no
5596
THEN's in the pattern, we can optimize by not recording a new backtracking
5597
point. (Ideally we should test for a THEN within this group, but we don't
5598
have that information.) Don't do this if we are at the very top level,
5599
however, because that would make handling assertions and once-only brackets
5600
messier when there is nothing to go back to. */
5601
5602
#define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */
5603
#define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */
5604
5605
case OP_BRA:
5606
if (mb->hasthen || Frdepth == 0)
5607
{
5608
Lframe_type = 0;
5609
goto GROUPLOOP;
5610
}
5611
5612
for (;;)
5613
{
5614
Lnext_branch = Fecode + GET(Fecode, 1);
5615
if (*Lnext_branch != OP_ALT) break;
5616
5617
/* This is never the final branch. We do not need to test for MATCH_THEN
5618
here because this code is not used when there is a THEN in the pattern. */
5619
5620
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1);
5621
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5622
Fecode = Lnext_branch;
5623
}
5624
5625
/* Hit the start of the final branch. Continue at this level. */
5626
5627
Fecode += PRIV(OP_lengths)[*Fecode];
5628
break;
5629
5630
#undef Lnext_branch
5631
5632
5633
/* ===================================================================== */
5634
/* Handle a capturing bracket, other than those that are possessive with an
5635
unlimited repeat. */
5636
5637
case OP_CBRA:
5638
case OP_SCBRA:
5639
Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE);
5640
goto GROUPLOOP;
5641
5642
5643
/* ===================================================================== */
5644
/* Atomic groups and non-capturing brackets that can match an empty string
5645
must record a backtracking point and also set up a chained frame. */
5646
5647
case OP_ONCE:
5648
case OP_SCRIPT_RUN:
5649
case OP_SBRA:
5650
Lframe_type = GF_NOCAPTURE | Fop;
5651
5652
GROUPLOOP:
5653
for (;;)
5654
{
5655
group_frame_type = Lframe_type;
5656
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2);
5657
if (rrc == MATCH_THEN)
5658
{
5659
PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5660
if (mb->verb_ecode_ptr < next_ecode &&
5661
(*Fecode == OP_ALT || *next_ecode == OP_ALT))
5662
rrc = MATCH_NOMATCH;
5663
}
5664
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5665
Fecode += GET(Fecode, 1);
5666
if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5667
}
5668
PCRE2_UNREACHABLE(); /* Control never reaches here */
5669
5670
#undef Lframe_type
5671
5672
5673
/* ===================================================================== */
5674
/* Pattern recursion either matches the current regex, or some
5675
subexpression. The offset data is the offset to the starting bracket from
5676
the start of the whole pattern. This is so that it works from duplicated
5677
subpatterns. For a whole-pattern recursion, we have to infer the number
5678
zero. */
5679
5680
#define Lframe_type F->temp_32[0]
5681
#define Lstart_branch F->temp_sptr[0]
5682
5683
case OP_RECURSE:
5684
bracode = mb->start_code + GET(Fecode, 1);
5685
number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE);
5686
5687
/* If we are already in a pattern recursion, check for repeating the same
5688
one without changing the subject pointer or the last referenced character
5689
in the subject. This should catch convoluted mutual recursions; some
5690
simple cases are caught at compile time. However, there are rare cases when
5691
this check needs to be turned off. In this case, actual recursion loops
5692
will be caught by the match or heap limits. */
5693
5694
if (Fcurrent_recurse != RECURSE_UNSET)
5695
{
5696
offset = Flast_group_offset;
5697
while (offset != PCRE2_UNSET)
5698
{
5699
N = (heapframe *)((char *)match_data->heapframes + offset);
5700
P = (heapframe *)((char *)N - frame_size);
5701
if (N->group_frame_type == (GF_RECURSE | number))
5702
{
5703
if (Feptr == P->eptr && mb->last_used_ptr == P->recurse_last_used &&
5704
(mb->moptions & PCRE2_DISABLE_RECURSELOOP_CHECK) == 0)
5705
return PCRE2_ERROR_RECURSELOOP;
5706
break;
5707
}
5708
offset = P->last_group_offset;
5709
}
5710
}
5711
5712
/* Remember the current last referenced character and then run the
5713
recursion branch by branch. */
5714
5715
F->recurse_last_used = mb->last_used_ptr;
5716
Lstart_branch = bracode;
5717
Lframe_type = GF_RECURSE | number;
5718
5719
for (;;)
5720
{
5721
PCRE2_SPTR next_ecode;
5722
5723
group_frame_type = Lframe_type;
5724
RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);
5725
next_ecode = Lstart_branch + GET(Lstart_branch,1);
5726
5727
/* Handle backtracking verbs, which are defined in a range that can
5728
easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to
5729
escape beyond a recursion; they cause a NOMATCH for the entire recursion.
5730
5731
When one of these verbs triggers, the current recursion group number is
5732
recorded. If it matches the recursion we are processing, the verb
5733
happened within the recursion and we must deal with it. Otherwise it must
5734
have happened after the recursion completed, and so has to be passed
5735
back. See comment above about handling THEN. */
5736
5737
if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&
5738
mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))
5739
{
5740
if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&
5741
(*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))
5742
rrc = MATCH_NOMATCH;
5743
else RRETURN(MATCH_NOMATCH);
5744
}
5745
5746
/* Note that carrying on after (*ACCEPT) in a recursion is handled in the
5747
OP_ACCEPT code. Nothing needs to be done here. */
5748
5749
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5750
Lstart_branch = next_ecode;
5751
if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);
5752
}
5753
PCRE2_UNREACHABLE(); /* Control never reaches here */
5754
5755
#undef Lframe_type
5756
#undef Lstart_branch
5757
5758
5759
/* ===================================================================== */
5760
/* Positive assertions are like other groups except that PCRE doesn't allow
5761
the effect of (*THEN) to escape beyond an assertion; it is therefore
5762
treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its
5763
captures and mark retained. Any other return is an error. */
5764
5765
#define Lframe_type F->temp_32[0]
5766
5767
case OP_ASSERT:
5768
case OP_ASSERTBACK:
5769
case OP_ASSERT_NA:
5770
case OP_ASSERTBACK_NA:
5771
Lframe_type = GF_NOCAPTURE | Fop;
5772
for (;;)
5773
{
5774
group_frame_type = Lframe_type;
5775
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3);
5776
if (rrc == MATCH_ACCEPT)
5777
{
5778
memcpy(Fovector,
5779
(char *)assert_accept_frame + offsetof(heapframe, ovector),
5780
assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5781
Foffset_top = assert_accept_frame->offset_top;
5782
Fmark = assert_accept_frame->mark;
5783
break;
5784
}
5785
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
5786
Fecode += GET(Fecode, 1);
5787
if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5788
}
5789
5790
do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5791
Fecode += 1 + LINK_SIZE;
5792
break;
5793
5794
#undef Lframe_type
5795
5796
5797
/* ===================================================================== */
5798
/* Handle negative assertions. Loop for each non-matching branch as for
5799
positive assertions. */
5800
5801
#define Lframe_type F->temp_32[0]
5802
5803
case OP_ASSERT_NOT:
5804
case OP_ASSERTBACK_NOT:
5805
Lframe_type = GF_NOCAPTURE | Fop;
5806
5807
for (;;)
5808
{
5809
group_frame_type = Lframe_type;
5810
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4);
5811
switch(rrc)
5812
{
5813
case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */
5814
case MATCH_MATCH:
5815
RRETURN (MATCH_NOMATCH);
5816
5817
case MATCH_NOMATCH: /* Branch failed, try next if present. */
5818
case MATCH_THEN:
5819
Fecode += GET(Fecode, 1);
5820
if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED;
5821
break;
5822
5823
case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */
5824
case MATCH_SKIP:
5825
case MATCH_PRUNE:
5826
do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5827
goto ASSERT_NOT_FAILED;
5828
5829
default: /* Pass back any other return */
5830
RRETURN(rrc);
5831
}
5832
}
5833
5834
/* None of the branches have matched or there was a backtrack to (*COMMIT),
5835
(*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a
5836
negative assertion, so carry on. */
5837
5838
ASSERT_NOT_FAILED:
5839
Fecode += 1 + LINK_SIZE;
5840
break;
5841
5842
#undef Lframe_type
5843
5844
/* ===================================================================== */
5845
/* Handle the scan substring operation. */
5846
5847
#define Lframe_type F->temp_32[0]
5848
#define Lextra_size F->temp_32[1]
5849
#define Lsaved_moptions F->temp_32[2]
5850
#define Lsaved_end_subject F->temp_sptr[0]
5851
#define Lsaved_eptr F->temp_sptr[1]
5852
#define Ltrue_end_extra F->temp_size
5853
5854
case OP_ASSERT_SCS:
5855
{
5856
PCRE2_SPTR ecode = Fecode + 1 + LINK_SIZE;
5857
uint32_t extra_size = 0;
5858
int count;
5859
PCRE2_SPTR slot;
5860
5861
/* Disable compiler warning. */
5862
offset = 0;
5863
(void)offset;
5864
5865
for (;;)
5866
{
5867
if (*ecode == OP_CREF)
5868
{
5869
extra_size += 1+IMM2_SIZE;
5870
offset = (GET2(ecode, 1) << 1) - 2;
5871
ecode += 1+IMM2_SIZE;
5872
if (offset < Foffset_top && Fovector[offset] != PCRE2_UNSET)
5873
goto SCS_OFFSET_FOUND;
5874
continue;
5875
}
5876
5877
if (*ecode != OP_DNCREF) RRETURN(MATCH_NOMATCH);
5878
5879
count = GET2(ecode, 1 + IMM2_SIZE);
5880
slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size;
5881
extra_size += 1+2*IMM2_SIZE;
5882
ecode += 1+2*IMM2_SIZE;
5883
5884
while (count > 0)
5885
{
5886
offset = (GET2(slot, 0) << 1) - 2;
5887
if (offset < Foffset_top && Fovector[offset] != PCRE2_UNSET)
5888
goto SCS_OFFSET_FOUND;
5889
slot += mb->name_entry_size;
5890
count--;
5891
}
5892
}
5893
5894
SCS_OFFSET_FOUND:
5895
5896
/* Skip the remaining OP_CREF/OP_DNCREF items, counting their size. */
5897
for (;;)
5898
{
5899
if (*ecode == OP_CREF)
5900
{
5901
extra_size += 1+IMM2_SIZE;
5902
ecode += 1+IMM2_SIZE;
5903
}
5904
else if (*ecode == OP_DNCREF)
5905
{
5906
extra_size += 1+2*IMM2_SIZE;
5907
ecode += 1+2*IMM2_SIZE;
5908
}
5909
else break;
5910
}
5911
5912
Lextra_size = extra_size;
5913
}
5914
5915
Lsaved_end_subject = mb->end_subject;
5916
Ltrue_end_extra = mb->true_end_subject - mb->end_subject;
5917
Lsaved_eptr = Feptr;
5918
Lsaved_moptions = mb->moptions;
5919
5920
Feptr = mb->start_subject + Fovector[offset];
5921
mb->true_end_subject = mb->end_subject =
5922
mb->start_subject + Fovector[offset + 1];
5923
mb->moptions &= ~PCRE2_NOTEOL;
5924
5925
Lframe_type = GF_NOCAPTURE | Fop;
5926
for (;;)
5927
{
5928
group_frame_type = Lframe_type;
5929
RMATCH(Fecode + 1 + LINK_SIZE + Lextra_size, RM38);
5930
if (rrc == MATCH_ACCEPT)
5931
{
5932
memcpy(Fovector,
5933
(char *)assert_accept_frame + offsetof(heapframe, ovector),
5934
assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5935
Foffset_top = assert_accept_frame->offset_top;
5936
Fmark = assert_accept_frame->mark;
5937
mb->end_subject = Lsaved_end_subject;
5938
mb->true_end_subject = mb->end_subject + Ltrue_end_extra;
5939
mb->moptions = Lsaved_moptions;
5940
break;
5941
}
5942
5943
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
5944
{
5945
mb->end_subject = Lsaved_end_subject;
5946
mb->true_end_subject = mb->end_subject + Ltrue_end_extra;
5947
mb->moptions = Lsaved_moptions;
5948
RRETURN(rrc);
5949
}
5950
5951
Fecode += GET(Fecode, 1);
5952
if (*Fecode != OP_ALT)
5953
{
5954
mb->end_subject = Lsaved_end_subject;
5955
mb->true_end_subject = mb->end_subject + Ltrue_end_extra;
5956
mb->moptions = Lsaved_moptions;
5957
RRETURN(MATCH_NOMATCH);
5958
}
5959
Lextra_size = 0;
5960
}
5961
5962
do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5963
Fecode += 1 + LINK_SIZE;
5964
Feptr = Lsaved_eptr;
5965
break;
5966
5967
#undef Lframe_type
5968
#undef Lextra_size
5969
#undef Lsaved_end_subject
5970
#undef Lsaved_eptr
5971
#undef Ltrue_end_extra
5972
#undef Lsave_moptions
5973
5974
/* ===================================================================== */
5975
/* The callout item calls an external function, if one is provided, passing
5976
details of the match so far. This is mainly for debugging, though the
5977
function is able to force a failure. */
5978
5979
case OP_CALLOUT:
5980
case OP_CALLOUT_STR:
5981
rrc = do_callout(F, mb, &length);
5982
if (rrc > 0) RRETURN(MATCH_NOMATCH);
5983
if (rrc < 0) RRETURN(rrc);
5984
Fecode += length;
5985
break;
5986
5987
5988
/* ===================================================================== */
5989
/* Conditional group: compilation checked that there are no more than two
5990
branches. If the condition is false, skipping the first branch takes us
5991
past the end of the item if there is only one branch, but that's exactly
5992
what we want. */
5993
5994
case OP_COND:
5995
case OP_SCOND:
5996
5997
/* The variable Flength will be added to Fecode when the condition is
5998
false, to get to the second branch. Setting it to the offset to the ALT or
5999
KET, then incrementing Fecode achieves this effect. However, if the second
6000
branch is non-existent, we must point to the KET so that the end of the
6001
group is correctly processed. We now have Fecode pointing to the condition
6002
or callout. */
6003
6004
Flength = GET(Fecode, 1); /* Offset to the second branch */
6005
if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE;
6006
Fecode += 1 + LINK_SIZE; /* From this opcode */
6007
6008
/* Because of the way auto-callout works during compile, a callout item is
6009
inserted between OP_COND and an assertion condition. Such a callout can
6010
also be inserted manually. */
6011
6012
if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR)
6013
{
6014
rrc = do_callout(F, mb, &length);
6015
if (rrc > 0) RRETURN(MATCH_NOMATCH);
6016
if (rrc < 0) RRETURN(rrc);
6017
6018
/* Advance Fecode past the callout, so it now points to the condition. We
6019
must adjust Flength so that the value of Fecode+Flength is unchanged. */
6020
6021
Fecode += length;
6022
Flength -= length;
6023
}
6024
6025
/* Test the various possible conditions */
6026
6027
condition = FALSE;
6028
switch(*Fecode)
6029
{
6030
case OP_RREF: /* Group recursion test */
6031
if (Fcurrent_recurse != RECURSE_UNSET)
6032
{
6033
number = GET2(Fecode, 1);
6034
condition = (number == RREF_ANY || number == Fcurrent_recurse);
6035
}
6036
break;
6037
6038
case OP_DNRREF: /* Duplicate named group recursion test */
6039
if (Fcurrent_recurse != RECURSE_UNSET)
6040
{
6041
int count = GET2(Fecode, 1 + IMM2_SIZE);
6042
PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
6043
while (count-- > 0)
6044
{
6045
number = GET2(slot, 0);
6046
condition = number == Fcurrent_recurse;
6047
if (condition) break;
6048
slot += mb->name_entry_size;
6049
}
6050
}
6051
break;
6052
6053
case OP_CREF: /* Numbered group used test */
6054
offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */
6055
condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
6056
break;
6057
6058
case OP_DNCREF: /* Duplicate named group used test */
6059
{
6060
int count = GET2(Fecode, 1 + IMM2_SIZE);
6061
PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
6062
while (count-- > 0)
6063
{
6064
offset = (GET2(slot, 0) << 1) - 2;
6065
condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
6066
if (condition) break;
6067
slot += mb->name_entry_size;
6068
}
6069
}
6070
break;
6071
6072
case OP_FALSE:
6073
case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
6074
break;
6075
6076
case OP_TRUE:
6077
condition = TRUE;
6078
break;
6079
6080
/* The condition is an assertion. Run code similar to the assertion code
6081
above. */
6082
6083
#define Lpositive F->temp_32[0]
6084
#define Lstart_branch F->temp_sptr[0]
6085
6086
default:
6087
Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK);
6088
Lstart_branch = Fecode;
6089
6090
for (;;)
6091
{
6092
group_frame_type = GF_CONDASSERT | *Fecode;
6093
RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5);
6094
6095
switch(rrc)
6096
{
6097
case MATCH_ACCEPT: /* Save captures */
6098
memcpy(Fovector,
6099
(char *)assert_accept_frame + offsetof(heapframe, ovector),
6100
assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
6101
Foffset_top = assert_accept_frame->offset_top;
6102
6103
PCRE2_FALLTHROUGH /* Fall through */
6104
/* In the case of a match, the captures have already been put into
6105
the current frame. */
6106
6107
case MATCH_MATCH:
6108
condition = Lpositive; /* TRUE for positive assertion */
6109
break;
6110
6111
/* PCRE doesn't allow the effect of (*THEN) to escape beyond an
6112
assertion; it is therefore always treated as NOMATCH. */
6113
6114
case MATCH_NOMATCH:
6115
case MATCH_THEN:
6116
Lstart_branch += GET(Lstart_branch, 1);
6117
if (*Lstart_branch == OP_ALT) continue; /* Try next branch */
6118
condition = !Lpositive; /* TRUE for negative assertion */
6119
break;
6120
6121
/* These force no match without checking other branches. */
6122
6123
case MATCH_COMMIT:
6124
case MATCH_SKIP:
6125
case MATCH_PRUNE:
6126
condition = !Lpositive;
6127
break;
6128
6129
default:
6130
RRETURN(rrc);
6131
}
6132
break; /* Out of the branch loop */
6133
}
6134
6135
/* If the condition is true, find the end of the assertion so that
6136
advancing past it gets us to the start of the first branch. */
6137
6138
if (condition)
6139
{
6140
do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
6141
}
6142
break; /* End of assertion condition */
6143
}
6144
6145
#undef Lpositive
6146
#undef Lstart_branch
6147
6148
/* Choose branch according to the condition. */
6149
6150
Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength;
6151
6152
/* If the opcode is OP_SCOND it means we are at a repeated conditional
6153
group that might match an empty string. We must therefore descend a level
6154
so that the start is remembered for checking. For OP_COND we can just
6155
continue at this level. */
6156
6157
if (Fop == OP_SCOND)
6158
{
6159
group_frame_type = GF_NOCAPTURE | Fop;
6160
RMATCH(Fecode, RM35);
6161
RRETURN(rrc);
6162
}
6163
break;
6164
6165
6166
6167
/* ========================================================================= */
6168
/* End of start of parenthesis opcodes */
6169
/* ========================================================================= */
6170
6171
6172
/* ===================================================================== */
6173
/* Move the subject pointer back by one fixed amount. This occurs at the
6174
start of each branch that has a fixed length in a lookbehind assertion. If
6175
we are too close to the start to move back, fail. When working with UTF-8
6176
we move back a number of characters, not bytes. */
6177
6178
case OP_REVERSE:
6179
number = GET2(Fecode, 1);
6180
#ifdef SUPPORT_UNICODE
6181
if (utf)
6182
{
6183
/* We used to do a simpler `while (number-- > 0)` but that triggers
6184
clang's unsigned integer overflow sanitizer. */
6185
while (number > 0)
6186
{
6187
--number;
6188
if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH);
6189
Feptr--;
6190
BACKCHAR(Feptr);
6191
}
6192
}
6193
else
6194
#endif
6195
6196
/* No UTF support, or not in UTF mode: count is code unit count */
6197
6198
{
6199
if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
6200
Feptr -= number;
6201
}
6202
6203
/* Save the earliest consulted character, then skip to next opcode */
6204
6205
if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;
6206
Fecode += 1 + IMM2_SIZE;
6207
break;
6208
6209
6210
/* ===================================================================== */
6211
/* Move the subject pointer back by a variable amount. This occurs at the
6212
start of each branch of a lookbehind assertion when the branch has a
6213
variable, but limited, length. A loop is needed to try matching the branch
6214
after moving back different numbers of characters. If we are too close to
6215
the start to move back even the minimum amount, fail. When working with
6216
UTF-8 we move back a number of characters, not bytes. */
6217
6218
#define Lmin F->temp_32[0]
6219
#define Lmax F->temp_32[1]
6220
#define Leptr F->temp_sptr[0]
6221
6222
case OP_VREVERSE:
6223
Lmin = GET2(Fecode, 1);
6224
Lmax = GET2(Fecode, 1 + IMM2_SIZE);
6225
Leptr = Feptr;
6226
6227
/* Move back by the maximum branch length and then work forwards. This
6228
ensures that items such as \d{3,5} get the maximum length, which is
6229
relevant for captures, and makes for Perl compatibility. */
6230
6231
#ifdef SUPPORT_UNICODE
6232
if (utf)
6233
{
6234
for (i = 0; i < Lmax; i++)
6235
{
6236
if (Feptr == mb->start_subject)
6237
{
6238
if (i < Lmin) RRETURN(MATCH_NOMATCH);
6239
Lmax = i;
6240
break;
6241
}
6242
Feptr--;
6243
BACKCHAR(Feptr);
6244
}
6245
}
6246
else
6247
#endif
6248
6249
/* No UTF support or not in UTF mode */
6250
6251
{
6252
ptrdiff_t diff = Feptr - mb->start_subject;
6253
uint32_t available = (diff > 65535)? 65535 : ((diff > 0)? (int)diff : 0);
6254
if (Lmin > available) RRETURN(MATCH_NOMATCH);
6255
if (Lmax > available) Lmax = available;
6256
Feptr -= Lmax;
6257
}
6258
6259
/* Now try matching, moving forward one character on failure, until we
6260
reach the minimum back length. */
6261
6262
for (;;)
6263
{
6264
RMATCH(Fecode + 1 + 2 * IMM2_SIZE, RM37);
6265
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6266
if (Lmax-- <= Lmin) RRETURN(MATCH_NOMATCH);
6267
Feptr++;
6268
#ifdef SUPPORT_UNICODE
6269
if (utf) { FORWARDCHARTEST(Feptr, mb->end_subject); }
6270
#endif
6271
}
6272
PCRE2_UNREACHABLE(); /* Control never reaches here */
6273
6274
#undef Lmin
6275
#undef Lmax
6276
#undef Leptr
6277
6278
/* ===================================================================== */
6279
/* An alternation is the end of a branch; scan along to find the end of the
6280
bracketed group. */
6281
6282
case OP_ALT:
6283
branch_end = Fecode;
6284
do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
6285
break;
6286
6287
6288
/* ===================================================================== */
6289
/* The end of a parenthesized group. For all but OP_BRA and OP_COND, the
6290
starting frame was added to the chained frames in order to remember the
6291
starting subject position for the group. (Not true for OP_BRA when it's a
6292
whole pattern recursion, but that is handled separately below.) */
6293
6294
case OP_KET:
6295
case OP_KETRMIN:
6296
case OP_KETRMAX:
6297
case OP_KETRPOS:
6298
6299
bracode = Fecode - GET(Fecode, 1);
6300
6301
if (branch_end == NULL) branch_end = Fecode;
6302
branch_start = bracode;
6303
while (branch_start + GET(branch_start, 1) != branch_end)
6304
branch_start += GET(branch_start, 1);
6305
branch_end = NULL;
6306
6307
/* Point N to the frame at the start of the most recent group, and P to its
6308
predecessor. Remember the subject pointer at the start of the group. */
6309
6310
if (*bracode != OP_BRA && *bracode != OP_COND)
6311
{
6312
N = (heapframe *)((char *)match_data->heapframes + Flast_group_offset);
6313
P = (heapframe *)((char *)N - frame_size);
6314
Flast_group_offset = P->last_group_offset;
6315
6316
#ifdef DEBUG_SHOW_RMATCH
6317
fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n",
6318
N->rdepth, N->group_frame_type,
6319
(char *)P->eptr - (char *)mb->start_subject);
6320
#endif
6321
6322
/* If we are at the end of an assertion that is a condition, first check
6323
to see if we are at the end of a variable-length branch in a lookbehind.
6324
If this is the case and we have not landed on the current character,
6325
return no match. Compare code below for non-condition lookbehinds. In
6326
other cases, return a match, discarding any intermediate backtracking
6327
points. Copy back the mark setting and the captures into the frame before
6328
N so that they are set on return. Doing this for all assertions, both
6329
positive and negative, seems to match what Perl does. */
6330
6331
if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT)
6332
{
6333
if ((*bracode == OP_ASSERTBACK || *bracode == OP_ASSERTBACK_NOT) &&
6334
branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
6335
RRETURN(MATCH_NOMATCH);
6336
memcpy((char *)P + offsetof(heapframe, ovector), Fovector,
6337
Foffset_top * sizeof(PCRE2_SIZE));
6338
P->offset_top = Foffset_top;
6339
P->mark = Fmark;
6340
Fback_frame = (char *)F - (char *)P;
6341
RRETURN(MATCH_MATCH);
6342
}
6343
}
6344
else P = NULL; /* Indicates starting frame not recorded */
6345
6346
/* The group was not a conditional assertion. */
6347
6348
switch (*bracode)
6349
{
6350
/* Whole pattern recursion is handled as a recursion into group 0, but
6351
the entire pattern is wrapped in OP_BRA/OP_KET rather than a capturing
6352
group - a design mistake: it should perhaps have been capture group 0.
6353
Anyway, that means the end of such recursion must be handled here. It is
6354
detected by checking for an immediately following OP_END when we are
6355
recursing in group 0. If this is not the end of a whole-pattern
6356
recursion, there is nothing to be done. */
6357
6358
case OP_BRA:
6359
if (Fcurrent_recurse != 0 || Fecode[1+LINK_SIZE] != OP_END) break;
6360
6361
/* It is the end of whole-pattern recursion. */
6362
6363
offset = Flast_group_offset;
6364
6365
/* Corrupted heapframes? Trigger an assert and return an error. */
6366
PCRE2_ASSERT(offset != PCRE2_UNSET);
6367
if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
6368
6369
N = (heapframe *)((char *)match_data->heapframes + offset);
6370
P = (heapframe *)((char *)N - frame_size);
6371
Flast_group_offset = P->last_group_offset;
6372
6373
/* Reinstate the previous set of captures and then carry on after the
6374
recursion call. */
6375
6376
Fecode = P->ecode + 1 + LINK_SIZE;
6377
6378
if (*Fecode != OP_CREF)
6379
{
6380
memcpy(F->ovector, P->ovector, Foffset_top * sizeof(PCRE2_SIZE));
6381
Foffset_top = P->offset_top;
6382
}
6383
else
6384
recurse_update_offsets(F, P);
6385
6386
Fcapture_last = P->capture_last;
6387
Fcurrent_recurse = P->current_recurse;
6388
continue; /* With next opcode */
6389
6390
case OP_COND: /* No need to do anything for these */
6391
case OP_SCOND:
6392
break;
6393
6394
/* Non-atomic positive assertions are like OP_BRA, except that the
6395
subject pointer must be put back to where it was at the start of the
6396
assertion. For a variable lookbehind, check its end point. */
6397
6398
case OP_ASSERTBACK_NA:
6399
if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
6400
RRETURN(MATCH_NOMATCH);
6401
PCRE2_FALLTHROUGH /* Fall through */
6402
6403
case OP_ASSERT_NA:
6404
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
6405
Feptr = P->eptr;
6406
break;
6407
6408
/* Atomic positive assertions are like OP_ONCE, except that in addition
6409
the subject pointer must be put back to where it was at the start of the
6410
assertion. For a variable lookbehind, check its end point. */
6411
6412
case OP_ASSERTBACK:
6413
if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
6414
RRETURN(MATCH_NOMATCH);
6415
PCRE2_FALLTHROUGH /* Fall through */
6416
6417
case OP_ASSERT:
6418
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
6419
Feptr = P->eptr;
6420
PCRE2_FALLTHROUGH /* Fall through */
6421
6422
/* For an atomic group, discard internal backtracking points. We must
6423
also ensure that any remaining branches within the top-level of the group
6424
are not tried. Do this by adjusting the code pointer within the backtrack
6425
frame so that it points to the final branch. */
6426
6427
case OP_ONCE:
6428
Fback_frame = ((char *)F - (char *)P);
6429
for (;;)
6430
{
6431
uint32_t y = GET(P->ecode,1);
6432
if ((P->ecode)[y] != OP_ALT) break;
6433
P->ecode += y;
6434
}
6435
break;
6436
6437
/* A matching negative assertion returns MATCH, which is turned into
6438
NOMATCH at the assertion level. For a variable lookbehind, check its end
6439
point. */
6440
6441
case OP_ASSERTBACK_NOT:
6442
if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
6443
RRETURN(MATCH_NOMATCH);
6444
PCRE2_FALLTHROUGH /* Fall through */
6445
6446
case OP_ASSERT_NOT:
6447
RRETURN(MATCH_MATCH);
6448
6449
/* A scan substring group must preserve the current end_subject,
6450
and restore it before the backtracking is performed into its sub
6451
pattern. */
6452
6453
case OP_ASSERT_SCS:
6454
F->temp_sptr[0] = mb->end_subject;
6455
mb->end_subject = P->temp_sptr[0];
6456
mb->true_end_subject = mb->end_subject + P->temp_size;
6457
Feptr = P->temp_sptr[1];
6458
6459
RMATCH(Fecode + 1 + LINK_SIZE, RM39);
6460
6461
mb->end_subject = F->temp_sptr[0];
6462
mb->true_end_subject = mb->end_subject;
6463
RRETURN(rrc);
6464
break;
6465
6466
/* At the end of a script run, apply the script-checking rules. This code
6467
will never be exercised if Unicode support is not compiled, because in
6468
that environment script runs cause an error at compile time. */
6469
6470
case OP_SCRIPT_RUN:
6471
if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
6472
break;
6473
6474
/* Whole-pattern recursion is coded as a recurse into group 0, and is
6475
handled with OP_BRA above. Other recursion is handled here. */
6476
6477
case OP_CBRA:
6478
case OP_CBRAPOS:
6479
case OP_SCBRA:
6480
case OP_SCBRAPOS:
6481
number = GET2(bracode, 1+LINK_SIZE);
6482
6483
/* Handle a recursively called group. We reinstate the previous set of
6484
captures and then carry on after the recursion call. */
6485
6486
if (Fcurrent_recurse == number)
6487
{
6488
P = (heapframe *)((char *)N - frame_size);
6489
Fecode = P->ecode + 1 + LINK_SIZE;
6490
6491
if (*Fecode != OP_CREF)
6492
{
6493
memcpy(F->ovector, P->ovector, Foffset_top * sizeof(PCRE2_SIZE));
6494
Foffset_top = P->offset_top;
6495
}
6496
else
6497
recurse_update_offsets(F, P);
6498
6499
Fcapture_last = P->capture_last;
6500
Fcurrent_recurse = P->current_recurse;
6501
continue; /* With next opcode */
6502
}
6503
6504
/* Deal with actual capturing. */
6505
6506
offset = (number << 1) - 2;
6507
Fcapture_last = number;
6508
Fovector[offset] = P->eptr - mb->start_subject;
6509
Fovector[offset+1] = Feptr - mb->start_subject;
6510
if (offset >= Foffset_top) Foffset_top = offset + 2;
6511
break;
6512
} /* End actions relating to the starting opcode */
6513
6514
/* OP_KETRPOS is a possessive repeating ket. Remember the current position,
6515
and return the MATCH_KETRPOS. This makes it possible to do the repeats one
6516
at a time from the outer level. This must precede the empty string test -
6517
in this case that test is done at the outer level. */
6518
6519
if (*Fecode == OP_KETRPOS)
6520
{
6521
memcpy((char *)P + offsetof(heapframe, eptr),
6522
(char *)F + offsetof(heapframe, eptr),
6523
frame_copy_size);
6524
RRETURN(MATCH_KETRPOS);
6525
}
6526
6527
/* Handle the different kinds of closing brackets. A non-repeating ket
6528
needs no special action, just continuing at this level. This also happens
6529
for the repeating kets if the group matched no characters, in order to
6530
forcibly break infinite loops. Otherwise, the repeating kets try the rest
6531
of the pattern or restart from the preceding bracket, in the appropriate
6532
order. */
6533
6534
if (Fop != OP_KET && (P == NULL || Feptr != P->eptr))
6535
{
6536
if (Fop == OP_KETRMIN)
6537
{
6538
RMATCH(Fecode + 1 + LINK_SIZE, RM6);
6539
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6540
Fecode -= GET(Fecode, 1);
6541
break; /* End of ket processing */
6542
}
6543
6544
/* Repeat the maximum number of times (KETRMAX) */
6545
6546
RMATCH(bracode, RM7);
6547
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6548
}
6549
6550
/* Carry on at this level for a non-repeating ket, or after matching an
6551
empty string, or after repeating for a maximum number of times. */
6552
6553
Fecode += 1 + LINK_SIZE;
6554
break;
6555
6556
6557
/* ===================================================================== */
6558
/* Start and end of line assertions, not multiline mode. */
6559
6560
case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */
6561
if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0)
6562
RRETURN(MATCH_NOMATCH);
6563
Fecode++;
6564
break;
6565
6566
case OP_SOD: /* Unconditional start of subject */
6567
if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH);
6568
Fecode++;
6569
break;
6570
6571
/* When PCRE2_NOTEOL is unset, assert before the subject end, or a
6572
terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */
6573
6574
case OP_DOLL:
6575
if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
6576
if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
6577
6578
PCRE2_FALLTHROUGH /* Fall through */
6579
/* Unconditional end of subject assertion (\z). */
6580
6581
case OP_EOD:
6582
if (Feptr < mb->true_end_subject) RRETURN(MATCH_NOMATCH);
6583
if (mb->partial != 0)
6584
{
6585
mb->hitend = TRUE;
6586
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
6587
}
6588
Fecode++;
6589
break;
6590
6591
/* End of subject or ending \n assertion (\Z) */
6592
6593
case OP_EODN:
6594
ASSERT_NL_OR_EOS:
6595
if (Feptr < mb->true_end_subject &&
6596
(!IS_NEWLINE(Feptr) || Feptr != mb->true_end_subject - mb->nllen))
6597
{
6598
if (mb->partial != 0 &&
6599
Feptr + 1 >= mb->end_subject &&
6600
NLBLOCK->nltype == NLTYPE_FIXED &&
6601
NLBLOCK->nllen == 2 &&
6602
UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
6603
{
6604
mb->hitend = TRUE;
6605
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
6606
}
6607
RRETURN(MATCH_NOMATCH);
6608
}
6609
6610
/* Either at end of string or \n before end. */
6611
6612
if (mb->partial != 0)
6613
{
6614
mb->hitend = TRUE;
6615
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
6616
}
6617
Fecode++;
6618
break;
6619
6620
6621
/* ===================================================================== */
6622
/* Start and end of line assertions, multiline mode. */
6623
6624
/* Start of subject unless notbol, or after any newline except for one at
6625
the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */
6626
6627
case OP_CIRCM:
6628
if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject)
6629
RRETURN(MATCH_NOMATCH);
6630
if (Feptr != mb->start_subject &&
6631
((Feptr == mb->end_subject &&
6632
(mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||
6633
!WAS_NEWLINE(Feptr)))
6634
RRETURN(MATCH_NOMATCH);
6635
Fecode++;
6636
break;
6637
6638
/* Assert before any newline, or before end of subject unless noteol is
6639
set. */
6640
6641
case OP_DOLLM:
6642
if (Feptr < mb->end_subject)
6643
{
6644
if (!IS_NEWLINE(Feptr))
6645
{
6646
if (mb->partial != 0 &&
6647
Feptr + 1 >= mb->end_subject &&
6648
NLBLOCK->nltype == NLTYPE_FIXED &&
6649
NLBLOCK->nllen == 2 &&
6650
UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
6651
{
6652
mb->hitend = TRUE;
6653
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
6654
}
6655
RRETURN(MATCH_NOMATCH);
6656
}
6657
}
6658
else
6659
{
6660
if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
6661
SCHECK_PARTIAL();
6662
}
6663
Fecode++;
6664
break;
6665
6666
6667
/* ===================================================================== */
6668
/* Start of match assertion */
6669
6670
case OP_SOM:
6671
if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH);
6672
Fecode++;
6673
break;
6674
6675
6676
/* ===================================================================== */
6677
/* Reset the start of match point */
6678
6679
case OP_SET_SOM:
6680
Fstart_match = Feptr;
6681
Fecode++;
6682
break;
6683
6684
6685
/* ===================================================================== */
6686
/* Word boundary assertions. Find out if the previous and current
6687
characters are "word" characters. It takes a bit more work in UTF mode.
6688
Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is
6689
not set. When it is set, use Unicode properties if available, even when not
6690
in UTF mode. Remember the earliest and latest consulted characters. */
6691
6692
case OP_NOT_WORD_BOUNDARY:
6693
case OP_WORD_BOUNDARY:
6694
case OP_NOT_UCP_WORD_BOUNDARY:
6695
case OP_UCP_WORD_BOUNDARY:
6696
if (Feptr == mb->check_subject) prev_is_word = FALSE; else
6697
{
6698
PCRE2_SPTR lastptr = Feptr - 1;
6699
#ifdef SUPPORT_UNICODE
6700
if (utf)
6701
{
6702
BACKCHAR(lastptr);
6703
GETCHAR(fc, lastptr);
6704
}
6705
else
6706
#endif /* SUPPORT_UNICODE */
6707
fc = *lastptr;
6708
if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
6709
#ifdef SUPPORT_UNICODE
6710
if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)
6711
{
6712
int chartype = UCD_CHARTYPE(fc);
6713
int category = PRIV(ucp_gentype)[chartype];
6714
prev_is_word = (category == ucp_L || category == ucp_N ||
6715
chartype == ucp_Mn || chartype == ucp_Pc);
6716
}
6717
else
6718
#endif /* SUPPORT_UNICODE */
6719
prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
6720
}
6721
6722
/* Get status of next character */
6723
6724
if (Feptr >= mb->end_subject)
6725
{
6726
SCHECK_PARTIAL();
6727
cur_is_word = FALSE;
6728
}
6729
else
6730
{
6731
PCRE2_SPTR nextptr = Feptr + 1;
6732
#ifdef SUPPORT_UNICODE
6733
if (utf)
6734
{
6735
FORWARDCHARTEST(nextptr, mb->end_subject);
6736
GETCHAR(fc, Feptr);
6737
}
6738
else
6739
#endif /* SUPPORT_UNICODE */
6740
fc = *Feptr;
6741
if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
6742
#ifdef SUPPORT_UNICODE
6743
if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)
6744
{
6745
int chartype = UCD_CHARTYPE(fc);
6746
int category = PRIV(ucp_gentype)[chartype];
6747
cur_is_word = (category == ucp_L || category == ucp_N ||
6748
chartype == ucp_Mn || chartype == ucp_Pc);
6749
}
6750
else
6751
#endif /* SUPPORT_UNICODE */
6752
cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
6753
}
6754
6755
/* Now see if the situation is what we want */
6756
6757
if ((*Fecode++ == OP_WORD_BOUNDARY || Fop == OP_UCP_WORD_BOUNDARY)?
6758
cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6759
RRETURN(MATCH_NOMATCH);
6760
break;
6761
6762
6763
/* ===================================================================== */
6764
/* Backtracking (*VERB)s, with and without arguments. Note that if the
6765
pattern is successfully matched, we do not come back from RMATCH. */
6766
6767
case OP_MARK:
6768
Fmark = mb->nomatch_mark = Fecode + 2;
6769
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12);
6770
6771
/* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
6772
argument, and we must check whether that argument matches this MARK's
6773
argument. It is passed back in mb->verb_skip_ptr. If it does match, we
6774
return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject
6775
position that corresponds to this mark. Otherwise, pass back the return
6776
code unaltered. */
6777
6778
if (rrc == MATCH_SKIP_ARG &&
6779
PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0)
6780
{
6781
mb->verb_skip_ptr = Feptr; /* Pass back current position */
6782
RRETURN(MATCH_SKIP);
6783
}
6784
RRETURN(rrc);
6785
6786
case OP_FAIL:
6787
RRETURN(MATCH_NOMATCH);
6788
6789
/* Record the current recursing group number in mb->verb_current_recurse
6790
when a backtracking return such as MATCH_COMMIT is given. This enables the
6791
recurse processing to catch verbs from within the recursion. */
6792
6793
case OP_COMMIT:
6794
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);
6795
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6796
mb->verb_current_recurse = Fcurrent_recurse;
6797
RRETURN(MATCH_COMMIT);
6798
6799
case OP_COMMIT_ARG:
6800
Fmark = mb->nomatch_mark = Fecode + 2;
6801
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36);
6802
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6803
mb->verb_current_recurse = Fcurrent_recurse;
6804
RRETURN(MATCH_COMMIT);
6805
6806
case OP_PRUNE:
6807
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);
6808
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6809
mb->verb_current_recurse = Fcurrent_recurse;
6810
RRETURN(MATCH_PRUNE);
6811
6812
case OP_PRUNE_ARG:
6813
Fmark = mb->nomatch_mark = Fecode + 2;
6814
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);
6815
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6816
mb->verb_current_recurse = Fcurrent_recurse;
6817
RRETURN(MATCH_PRUNE);
6818
6819
case OP_SKIP:
6820
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);
6821
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6822
mb->verb_skip_ptr = Feptr; /* Pass back current position */
6823
mb->verb_current_recurse = Fcurrent_recurse;
6824
RRETURN(MATCH_SKIP);
6825
6826
/* Note that, for Perl compatibility, SKIP with an argument does NOT set
6827
nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
6828
not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
6829
that failed and any that precede it (either they also failed, or were not
6830
triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
6831
SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg
6832
set to the count of the one that failed. */
6833
6834
case OP_SKIP_ARG:
6835
mb->skip_arg_count++;
6836
if (mb->skip_arg_count <= mb->ignore_skip_arg)
6837
{
6838
Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1];
6839
break;
6840
}
6841
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17);
6842
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6843
6844
/* Pass back the current skip name and return the special MATCH_SKIP_ARG
6845
return code. This will either be caught by a matching MARK, or get to the
6846
top, where it causes a rematch with mb->ignore_skip_arg set to the value of
6847
mb->skip_arg_count. */
6848
6849
mb->verb_skip_ptr = Fecode + 2;
6850
mb->verb_current_recurse = Fcurrent_recurse;
6851
RRETURN(MATCH_SKIP_ARG);
6852
6853
/* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
6854
the branch in which it occurs can be determined. */
6855
6856
case OP_THEN:
6857
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);
6858
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6859
mb->verb_ecode_ptr = Fecode;
6860
mb->verb_current_recurse = Fcurrent_recurse;
6861
RRETURN(MATCH_THEN);
6862
6863
case OP_THEN_ARG:
6864
Fmark = mb->nomatch_mark = Fecode + 2;
6865
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);
6866
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6867
mb->verb_ecode_ptr = Fecode;
6868
mb->verb_current_recurse = Fcurrent_recurse;
6869
RRETURN(MATCH_THEN);
6870
6871
6872
/* ===================================================================== */
6873
/* There's been some horrible disaster. Arrival here can only mean there is
6874
something seriously wrong in the code above or the OP_xxx definitions. */
6875
6876
/* LCOV_EXCL_START */
6877
default:
6878
PCRE2_DEBUG_UNREACHABLE();
6879
return PCRE2_ERROR_INTERNAL;
6880
/* LCOV_EXCL_STOP */
6881
}
6882
6883
/* Do not insert any code in here without much thought; it is assumed
6884
that "continue" in the code above comes out to here to repeat the main
6885
loop. */
6886
6887
} /* End of main loop */
6888
6889
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
6890
6891
/* ========================================================================= */
6892
/* The RRETURN() macro jumps here. The number that is saved in Freturn_id
6893
indicates which label we actually want to return to. The value in Frdepth is
6894
the index number of the frame in the vector. The return value has been placed
6895
in rrc. */
6896
6897
#define LBL(val) case val: goto L_RM##val;
6898
6899
RETURN_SWITCH:
6900
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
6901
if (Frdepth == 0) return rrc; /* Exit from the top level */
6902
F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */
6903
mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */
6904
6905
#ifdef DEBUG_SHOW_RMATCH
6906
fprintf(stderr, "++ RETURN %d to RM%d\n", rrc, Freturn_id);
6907
#endif
6908
6909
switch (Freturn_id)
6910
{
6911
LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6912
LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
6913
LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
6914
LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
6915
LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39)
6916
6917
#ifdef SUPPORT_WIDE_CHARS
6918
LBL(100) LBL(101) LBL(102) LBL(103)
6919
#endif
6920
6921
#ifdef SUPPORT_UNICODE
6922
LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206)
6923
LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213)
6924
LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220)
6925
LBL(221) LBL(222) LBL(223) LBL(224)
6926
#endif
6927
6928
/* LCOV_EXCL_START */
6929
default:
6930
PCRE2_DEBUG_UNREACHABLE();
6931
return PCRE2_ERROR_INTERNAL;
6932
/* LCOV_EXCL_STOP */
6933
}
6934
#undef LBL
6935
}
6936
6937
6938
/*************************************************
6939
* Match a Regular Expression *
6940
*************************************************/
6941
6942
/* This function applies a compiled pattern to a subject string and picks out
6943
portions of the string if it matches. Two elements in the vector are set for
6944
each substring: the offsets to the start and end of the substring.
6945
6946
Arguments:
6947
code points to the compiled expression
6948
subject points to the subject string
6949
length length of subject string (may contain binary zeros)
6950
start_offset where to start in the subject string
6951
options option bits
6952
match_data points to a match_data block
6953
mcontext points to a PCRE2 match context
6954
6955
Returns: > 0 => success; value is the number of ovector pairs filled
6956
= 0 => success, but ovector is not big enough
6957
= -1 => failed to match (PCRE2_ERROR_NOMATCH)
6958
= -2 => partial match (PCRE2_ERROR_PARTIAL)
6959
< -2 => some kind of unexpected problem
6960
*/
6961
6962
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
6963
pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
6964
PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
6965
pcre2_match_context *mcontext)
6966
{
6967
int rc;
6968
const uint8_t *start_bits = NULL;
6969
const pcre2_real_code *re = (const pcre2_real_code *)code;
6970
uint32_t original_options = options;
6971
6972
BOOL anchored;
6973
BOOL firstline;
6974
BOOL has_first_cu = FALSE;
6975
BOOL has_req_cu = FALSE;
6976
BOOL startline;
6977
6978
#if PCRE2_CODE_UNIT_WIDTH == 8
6979
PCRE2_SPTR memchr_found_first_cu;
6980
PCRE2_SPTR memchr_found_first_cu2;
6981
#endif
6982
6983
PCRE2_UCHAR first_cu = 0;
6984
PCRE2_UCHAR first_cu2 = 0;
6985
PCRE2_UCHAR req_cu = 0;
6986
PCRE2_UCHAR req_cu2 = 0;
6987
6988
PCRE2_UCHAR null_str[1] = { 0xcd };
6989
PCRE2_SPTR original_subject = subject;
6990
PCRE2_SPTR bumpalong_limit;
6991
PCRE2_SPTR end_subject;
6992
PCRE2_SPTR true_end_subject;
6993
PCRE2_SPTR start_match;
6994
PCRE2_SPTR req_cu_ptr;
6995
PCRE2_SPTR start_partial;
6996
PCRE2_SPTR match_partial;
6997
6998
#ifdef SUPPORT_JIT
6999
BOOL use_jit;
7000
#endif
7001
7002
/* This flag is needed even when Unicode is not supported for convenience
7003
(it is used by the IS_NEWLINE macro). */
7004
7005
BOOL utf = FALSE;
7006
7007
#ifdef SUPPORT_UNICODE
7008
BOOL ucp = FALSE;
7009
BOOL allow_invalid;
7010
uint32_t fragment_options = 0;
7011
#ifdef SUPPORT_JIT
7012
BOOL jit_checked_utf = FALSE;
7013
#endif
7014
#endif /* SUPPORT_UNICODE */
7015
7016
PCRE2_SIZE frame_size;
7017
PCRE2_SIZE heapframes_size;
7018
7019
/* We need to have mb as a pointer to a match block, because the IS_NEWLINE
7020
macro is used below, and it expects NLBLOCK to be defined as a pointer. */
7021
7022
pcre2_callout_block cb;
7023
match_block actual_match_block;
7024
match_block *mb = &actual_match_block;
7025
7026
/* Recognize NULL, length 0 as an empty string. */
7027
7028
if (subject == NULL && length == 0) subject = null_str;
7029
7030
/* Plausibility checks */
7031
7032
if (match_data == NULL) return PCRE2_ERROR_NULL;
7033
if (code == NULL || subject == NULL)
7034
return match_data->rc = PCRE2_ERROR_NULL;
7035
if ((options & ~PUBLIC_MATCH_OPTIONS) != 0)
7036
return match_data->rc = PCRE2_ERROR_BADOPTION;
7037
7038
start_match = subject + start_offset;
7039
req_cu_ptr = start_match - 1;
7040
if (length == PCRE2_ZERO_TERMINATED)
7041
{
7042
length = PRIV(strlen)(subject);
7043
}
7044
true_end_subject = end_subject = subject + length;
7045
7046
if (start_offset > length) return match_data->rc = PCRE2_ERROR_BADOFFSET;
7047
7048
/* Check that the first field in the block is the magic number. */
7049
7050
if (re->magic_number != MAGIC_NUMBER)
7051
return match_data->rc = PCRE2_ERROR_BADMAGIC;
7052
7053
/* Check the code unit width. */
7054
7055
if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
7056
return match_data->rc = PCRE2_ERROR_BADMODE;
7057
7058
/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
7059
options variable for this function. Users of PCRE2 who are not calling the
7060
function directly would like to have a way of setting these flags, in the same
7061
way that they can set pcre2_compile() flags like PCRE2_NO_AUTO_POSSESS with
7062
constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
7063
(*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field, which we now
7064
transfer to the options for this function. The bits are guaranteed to be
7065
adjacent, but do not have the same values. This bit of Boolean trickery assumes
7066
that the match-time bits are not more significant than the flag bits. If by
7067
accident this is not the case, a compile-time division by zero error will
7068
occur. */
7069
7070
#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
7071
#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
7072
options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
7073
#undef FF
7074
#undef OO
7075
7076
/* If the pattern was successfully studied with JIT support, we will run the
7077
JIT executable instead of the rest of this function. Most options must be set
7078
at compile time for the JIT code to be usable. */
7079
7080
#ifdef SUPPORT_JIT
7081
use_jit = (re->executable_jit != NULL &&
7082
(options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
7083
#endif
7084
7085
/* Initialize UTF/UCP parameters. */
7086
7087
#ifdef SUPPORT_UNICODE
7088
utf = (re->overall_options & PCRE2_UTF) != 0;
7089
allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
7090
ucp = (re->overall_options & PCRE2_UCP) != 0;
7091
#endif /* SUPPORT_UNICODE */
7092
7093
/* Convert the partial matching flags into an integer. */
7094
7095
mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
7096
((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;
7097
7098
/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
7099
time. */
7100
7101
if (mb->partial != 0 &&
7102
((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
7103
return match_data->rc = PCRE2_ERROR_BADOPTION;
7104
7105
/* It is an error to set an offset limit without setting the flag at compile
7106
time. */
7107
7108
if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET &&
7109
(re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
7110
return match_data->rc = PCRE2_ERROR_BADOFFSETLIMIT;
7111
7112
/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
7113
free the memory that was obtained. Set the field to NULL for match error
7114
cases. */
7115
7116
if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
7117
{
7118
match_data->memctl.free((void *)match_data->subject,
7119
match_data->memctl.memory_data);
7120
match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
7121
}
7122
match_data->subject = NULL;
7123
7124
/* Zero the error offset in case the first code unit is invalid UTF. */
7125
7126
match_data->startchar = 0;
7127
7128
7129
/* ============================= JIT matching ============================== */
7130
7131
/* Prepare for JIT matching. Check a UTF string for validity unless no check is
7132
requested or invalid UTF can be handled. We check only the portion of the
7133
subject that might be inspected during matching - from the offset minus the
7134
maximum lookbehind to the given length. This saves time when a small part of a
7135
large subject is being matched by the use of a starting offset. Note that the
7136
maximum lookbehind is a number of characters, not code units. */
7137
7138
#ifdef SUPPORT_JIT
7139
if (use_jit)
7140
{
7141
#ifdef SUPPORT_UNICODE
7142
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid)
7143
{
7144
7145
/* For 8-bit and 16-bit UTF, check that the first code unit is a valid
7146
character start. */
7147
7148
#if PCRE2_CODE_UNIT_WIDTH != 32
7149
if (start_match < end_subject && NOT_FIRSTCU(*start_match))
7150
{
7151
if (start_offset > 0) return match_data->rc = PCRE2_ERROR_BADUTFOFFSET;
7152
#if PCRE2_CODE_UNIT_WIDTH == 8
7153
return match_data->rc = PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */
7154
#else
7155
return match_data->rc = PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */
7156
#endif
7157
}
7158
#endif /* WIDTH != 32 */
7159
7160
/* Move back by the maximum lookbehind, just in case it happens at the very
7161
start of matching. */
7162
7163
#if PCRE2_CODE_UNIT_WIDTH != 32
7164
for (unsigned int i = re->max_lookbehind; i > 0 && start_match > subject; i--)
7165
{
7166
start_match--;
7167
while (start_match > subject &&
7168
#if PCRE2_CODE_UNIT_WIDTH == 8
7169
(*start_match & 0xc0) == 0x80)
7170
#else /* 16-bit */
7171
(*start_match & 0xfc00) == 0xdc00)
7172
#endif
7173
start_match--;
7174
}
7175
#else /* PCRE2_CODE_UNIT_WIDTH != 32 */
7176
7177
/* In the 32-bit library, one code unit equals one character. However,
7178
we cannot just subtract the lookbehind and then compare pointers, because
7179
a very large lookbehind could create an invalid pointer. */
7180
7181
if (start_offset >= re->max_lookbehind)
7182
start_match -= re->max_lookbehind;
7183
else
7184
start_match = subject;
7185
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
7186
7187
/* Validate the relevant portion of the subject. Adjust the offset of an
7188
invalid code point to be an absolute offset in the whole string. */
7189
7190
rc = PRIV(valid_utf)(start_match,
7191
length - (start_match - subject), &(match_data->startchar));
7192
if (rc != 0)
7193
{
7194
match_data->startchar += start_match - subject;
7195
return match_data->rc = rc;
7196
}
7197
jit_checked_utf = TRUE;
7198
}
7199
#endif /* SUPPORT_UNICODE */
7200
7201
/* If JIT returns BADOPTION, which means that the selected complete or
7202
partial matching mode was not compiled, fall through to the interpreter. */
7203
7204
rc = pcre2_jit_match(code, subject, length, start_offset, options,
7205
match_data, mcontext);
7206
if (rc != PCRE2_ERROR_JIT_BADOPTION)
7207
{
7208
match_data->options = original_options;
7209
if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
7210
{
7211
if (length != 0)
7212
{
7213
match_data->subject = match_data->memctl.malloc(CU2BYTES(length),
7214
match_data->memctl.memory_data);
7215
if (match_data->subject == NULL)
7216
return match_data->rc = PCRE2_ERROR_NOMEMORY;
7217
memcpy((void *)match_data->subject, subject, CU2BYTES(length));
7218
}
7219
else
7220
match_data->subject = NULL;
7221
match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
7222
}
7223
else
7224
{
7225
/* When pcre2_jit_match sets the subject, it doesn't know what the
7226
original passed-in pointer was. */
7227
if (match_data->subject != NULL) match_data->subject = original_subject;
7228
}
7229
return rc;
7230
}
7231
}
7232
#endif /* SUPPORT_JIT */
7233
7234
/* ========================= End of JIT matching ========================== */
7235
7236
7237
/* Proceed with non-JIT matching. The default is to allow lookbehinds to the
7238
start of the subject. A UTF check when there is a non-zero offset may change
7239
this. */
7240
7241
mb->check_subject = subject;
7242
7243
/* If a UTF subject string was not checked for validity in the JIT code above,
7244
check it here, and handle support for invalid UTF strings. The check above
7245
happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset.
7246
If we get here in those circumstances, it means the subject string is valid,
7247
but for some reason JIT matching was not successful. There is no need to check
7248
the subject again.
7249
7250
We check only the portion of the subject that might be inspected during
7251
matching - from the offset minus the maximum lookbehind to the given length.
7252
This saves time when a small part of a large subject is being matched by the
7253
use of a starting offset. Note that the maximum lookbehind is a number of
7254
characters, not code units.
7255
7256
Note also that support for invalid UTF forces a check, overriding the setting
7257
of PCRE2_NO_UTF_CHECK. */
7258
7259
#ifdef SUPPORT_UNICODE
7260
if (utf &&
7261
#ifdef SUPPORT_JIT
7262
!jit_checked_utf &&
7263
#endif
7264
((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid))
7265
{
7266
#if PCRE2_CODE_UNIT_WIDTH != 32
7267
BOOL skipped_bad_start = FALSE;
7268
#endif
7269
7270
/* For 8-bit and 16-bit UTF, check that the first code unit is a valid
7271
character start. If we are handling invalid UTF, just skip over such code
7272
units. Otherwise, give an appropriate error. */
7273
7274
#if PCRE2_CODE_UNIT_WIDTH != 32
7275
if (allow_invalid)
7276
{
7277
while (start_match < end_subject && NOT_FIRSTCU(*start_match))
7278
{
7279
start_match++;
7280
skipped_bad_start = TRUE;
7281
}
7282
}
7283
else if (start_match < end_subject && NOT_FIRSTCU(*start_match))
7284
{
7285
if (start_offset > 0) return match_data->rc = PCRE2_ERROR_BADUTFOFFSET;
7286
#if PCRE2_CODE_UNIT_WIDTH == 8
7287
return match_data->rc = PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */
7288
#else
7289
return match_data->rc = PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */
7290
#endif
7291
}
7292
#endif /* WIDTH != 32 */
7293
7294
/* The mb->check_subject field points to the start of UTF checking;
7295
lookbehinds can go back no further than this. */
7296
7297
mb->check_subject = start_match;
7298
7299
/* Move back by the maximum lookbehind, just in case it happens at the very
7300
start of matching, but don't do this if we skipped bad 8-bit or 16-bit code
7301
units above. */
7302
7303
#if PCRE2_CODE_UNIT_WIDTH != 32
7304
if (!skipped_bad_start)
7305
{
7306
unsigned int i;
7307
for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--)
7308
{
7309
mb->check_subject--;
7310
while (mb->check_subject > subject &&
7311
#if PCRE2_CODE_UNIT_WIDTH == 8
7312
(*mb->check_subject & 0xc0) == 0x80)
7313
#else /* 16-bit */
7314
(*mb->check_subject & 0xfc00) == 0xdc00)
7315
#endif
7316
mb->check_subject--;
7317
}
7318
}
7319
#else /* PCRE2_CODE_UNIT_WIDTH != 32 */
7320
7321
/* In the 32-bit library, one code unit equals one character. However,
7322
we cannot just subtract the lookbehind and then compare pointers, because
7323
a very large lookbehind could create an invalid pointer. */
7324
7325
if (start_offset >= re->max_lookbehind)
7326
mb->check_subject -= re->max_lookbehind;
7327
else
7328
mb->check_subject = subject;
7329
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
7330
7331
/* Validate the relevant portion of the subject. There's a loop in case we
7332
encounter bad UTF in the characters preceding start_match which we are
7333
scanning because of a lookbehind. */
7334
7335
for (;;)
7336
{
7337
rc = PRIV(valid_utf)(mb->check_subject,
7338
length - (mb->check_subject - subject), &(match_data->startchar));
7339
7340
if (rc == 0) break; /* Valid UTF string */
7341
7342
/* Invalid UTF string. Adjust the offset to be an absolute offset in the
7343
whole string. If we are handling invalid UTF strings, set end_subject to
7344
stop before the bad code unit, and set the options to "not end of line".
7345
Otherwise return the error. */
7346
7347
match_data->startchar += mb->check_subject - subject;
7348
if (!allow_invalid || rc > 0) return match_data->rc = rc;
7349
end_subject = subject + match_data->startchar;
7350
7351
/* If the end precedes start_match, it means there is invalid UTF in the
7352
extra code units we reversed over because of a lookbehind. Advance past the
7353
first bad code unit, and then skip invalid character starting code units in
7354
8-bit and 16-bit modes, and try again with the original end point. */
7355
7356
if (end_subject < start_match)
7357
{
7358
mb->check_subject = end_subject + 1;
7359
#if PCRE2_CODE_UNIT_WIDTH != 32
7360
while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject))
7361
mb->check_subject++;
7362
#endif
7363
end_subject = true_end_subject;
7364
}
7365
7366
/* Otherwise, set the not end of line option, and do the match. */
7367
7368
else
7369
{
7370
fragment_options = PCRE2_NOTEOL;
7371
break;
7372
}
7373
}
7374
}
7375
#endif /* SUPPORT_UNICODE */
7376
7377
/* A NULL match context means "use a default context", but we take the memory
7378
control functions from the pattern. */
7379
7380
if (mcontext == NULL)
7381
{
7382
mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
7383
mb->memctl = re->memctl;
7384
}
7385
else mb->memctl = mcontext->memctl;
7386
7387
anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;
7388
firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
7389
startline = (re->flags & PCRE2_STARTLINE) != 0;
7390
bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
7391
true_end_subject : subject + mcontext->offset_limit;
7392
7393
/* Initialize and set up the fixed fields in the callout block, with a pointer
7394
in the match block. */
7395
7396
mb->cb = &cb;
7397
cb.version = 2;
7398
cb.subject = subject;
7399
cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
7400
cb.callout_flags = 0;
7401
7402
/* Fill in the remaining fields in the match block, except for moptions, which
7403
gets set later. */
7404
7405
mb->callout = mcontext->callout;
7406
mb->callout_data = mcontext->callout_data;
7407
7408
mb->start_subject = subject;
7409
mb->start_offset = start_offset;
7410
mb->end_subject = end_subject;
7411
mb->true_end_subject = true_end_subject;
7412
mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
7413
mb->hasbsk = (re->flags & PCRE2_HASBSK) != 0;
7414
mb->allowemptypartial = (re->max_lookbehind > 0) ||
7415
(re->flags & PCRE2_MATCH_EMPTY) != 0;
7416
mb->allowlookaroundbsk =
7417
(re->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) != 0;
7418
mb->poptions = re->overall_options; /* Pattern options */
7419
mb->ignore_skip_arg = 0;
7420
mb->mark = mb->nomatch_mark = NULL; /* In case never set */
7421
7422
/* The name table is needed for finding all the numbers associated with a
7423
given name, for condition testing. The code follows the name table. */
7424
7425
mb->name_table = (PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code));
7426
mb->name_count = re->name_count;
7427
mb->name_entry_size = re->name_entry_size;
7428
mb->start_code = (PCRE2_SPTR)((const uint8_t *)re + re->code_start);
7429
7430
/* Process the \R and newline settings. */
7431
7432
mb->bsr_convention = re->bsr_convention;
7433
mb->nltype = NLTYPE_FIXED;
7434
switch(re->newline_convention)
7435
{
7436
case PCRE2_NEWLINE_CR:
7437
mb->nllen = 1;
7438
mb->nl[0] = CHAR_CR;
7439
break;
7440
7441
case PCRE2_NEWLINE_LF:
7442
mb->nllen = 1;
7443
mb->nl[0] = CHAR_NL;
7444
break;
7445
7446
case PCRE2_NEWLINE_NUL:
7447
mb->nllen = 1;
7448
mb->nl[0] = CHAR_NUL;
7449
break;
7450
7451
case PCRE2_NEWLINE_CRLF:
7452
mb->nllen = 2;
7453
mb->nl[0] = CHAR_CR;
7454
mb->nl[1] = CHAR_NL;
7455
break;
7456
7457
case PCRE2_NEWLINE_ANY:
7458
mb->nltype = NLTYPE_ANY;
7459
break;
7460
7461
case PCRE2_NEWLINE_ANYCRLF:
7462
mb->nltype = NLTYPE_ANYCRLF;
7463
break;
7464
7465
/* LCOV_EXCL_START */
7466
default:
7467
PCRE2_DEBUG_UNREACHABLE();
7468
return match_data->rc = PCRE2_ERROR_INTERNAL;
7469
/* LCOV_EXCL_STOP */
7470
}
7471
7472
/* The backtracking frames have fixed data at the front, and a PCRE2_SIZE
7473
vector at the end, whose size depends on the number of capturing parentheses in
7474
the pattern. It is not used at all if there are no capturing parentheses.
7475
7476
frame_size is the total size of each frame
7477
match_data->heapframes is the pointer to the frames vector
7478
match_data->heapframes_size is the allocated size of the vector
7479
7480
We must pad the frame_size for alignment to ensure subsequent frames are as
7481
aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE
7482
array, that does not guarantee it is suitably aligned for pointers, as some
7483
architectures have pointers that are larger than a size_t. */
7484
7485
frame_size = (offsetof(heapframe, ovector) +
7486
re->top_bracket * 2 * sizeof(PCRE2_SIZE) + HEAPFRAME_ALIGNMENT - 1) &
7487
~(HEAPFRAME_ALIGNMENT - 1);
7488
7489
/* Limits set in the pattern override the match context only if they are
7490
smaller. */
7491
7492
mb->heap_limit = ((mcontext->heap_limit < re->limit_heap)?
7493
mcontext->heap_limit : re->limit_heap);
7494
7495
mb->match_limit = (mcontext->match_limit < re->limit_match)?
7496
mcontext->match_limit : re->limit_match;
7497
7498
mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
7499
mcontext->depth_limit : re->limit_depth;
7500
7501
/* If a pattern has very many capturing parentheses, the frame size may be very
7502
large. Set the initial frame vector size to ensure that there are at least 10
7503
available frames, but enforce a minimum of START_FRAMES_SIZE. If this is
7504
greater than the heap limit, get as large a vector as possible. */
7505
7506
heapframes_size = frame_size * 10;
7507
if (heapframes_size < START_FRAMES_SIZE) heapframes_size = START_FRAMES_SIZE;
7508
if (heapframes_size / 1024 > mb->heap_limit)
7509
{
7510
PCRE2_SIZE max_size = 1024 * mb->heap_limit;
7511
if (max_size < frame_size) return match_data->rc = PCRE2_ERROR_HEAPLIMIT;
7512
heapframes_size = max_size;
7513
}
7514
7515
/* If an existing frame vector in the match_data block is large enough, we can
7516
use it. Otherwise, free any pre-existing vector and get a new one. */
7517
7518
if (match_data->heapframes_size < heapframes_size)
7519
{
7520
match_data->memctl.free(match_data->heapframes,
7521
match_data->memctl.memory_data);
7522
match_data->heapframes = match_data->memctl.malloc(heapframes_size,
7523
match_data->memctl.memory_data);
7524
if (match_data->heapframes == NULL)
7525
{
7526
match_data->heapframes_size = 0;
7527
return match_data->rc = PCRE2_ERROR_NOMEMORY;
7528
}
7529
match_data->heapframes_size = heapframes_size;
7530
}
7531
7532
/* Write to the ovector within the first frame to mark every capture unset and
7533
to avoid uninitialized memory read errors when it is copied to a new frame. */
7534
7535
memset((char *)(match_data->heapframes) + offsetof(heapframe, ovector), 0xff,
7536
frame_size - offsetof(heapframe, ovector));
7537
7538
/* Pointers to the individual character tables */
7539
7540
mb->lcc = re->tables + lcc_offset;
7541
mb->fcc = re->tables + fcc_offset;
7542
mb->ctypes = re->tables + ctypes_offset;
7543
7544
/* Set up the first code unit to match, if available. If there's no first code
7545
unit there may be a bitmap of possible first characters. */
7546
7547
if ((re->flags & PCRE2_FIRSTSET) != 0)
7548
{
7549
has_first_cu = TRUE;
7550
first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
7551
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
7552
{
7553
first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
7554
#ifdef SUPPORT_UNICODE
7555
#if PCRE2_CODE_UNIT_WIDTH == 8
7556
if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
7557
#else
7558
if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
7559
#endif
7560
#endif /* SUPPORT_UNICODE */
7561
}
7562
}
7563
else
7564
if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
7565
start_bits = re->start_bitmap;
7566
7567
/* There may also be a "last known required character" set. */
7568
7569
if ((re->flags & PCRE2_LASTSET) != 0)
7570
{
7571
has_req_cu = TRUE;
7572
req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
7573
if ((re->flags & PCRE2_LASTCASELESS) != 0)
7574
{
7575
req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
7576
#ifdef SUPPORT_UNICODE
7577
#if PCRE2_CODE_UNIT_WIDTH == 8
7578
if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
7579
#else
7580
if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
7581
#endif
7582
#endif /* SUPPORT_UNICODE */
7583
}
7584
}
7585
7586
7587
/* ==========================================================================*/
7588
7589
/* Loop for handling unanchored repeated matching attempts; for anchored regexs
7590
the loop runs just once. */
7591
7592
#ifdef SUPPORT_UNICODE
7593
FRAGMENT_RESTART:
7594
#endif
7595
7596
start_partial = match_partial = NULL;
7597
mb->hitend = FALSE;
7598
7599
#if PCRE2_CODE_UNIT_WIDTH == 8
7600
memchr_found_first_cu = NULL;
7601
memchr_found_first_cu2 = NULL;
7602
#endif
7603
7604
for(;;)
7605
{
7606
PCRE2_SPTR new_start_match;
7607
7608
/* ----------------- Start of match optimizations ---------------- */
7609
7610
/* There are some optimizations that avoid running the match if a known
7611
starting point is not found, or if a known later code unit is not present.
7612
However, there is an option (settable at compile time) that disables these,
7613
for testing and for ensuring that all callouts do actually occur. */
7614
7615
if ((re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)
7616
{
7617
/* If firstline is TRUE, the start of the match is constrained to the first
7618
line of a multiline string. That is, the match must be before or at the
7619
first newline following the start of matching. Temporarily adjust
7620
end_subject so that we stop the scans for a first code unit at a newline.
7621
If the match fails at the newline, later code breaks the loop. */
7622
7623
if (firstline)
7624
{
7625
PCRE2_SPTR t = start_match;
7626
#ifdef SUPPORT_UNICODE
7627
if (utf)
7628
{
7629
while (t < end_subject && !IS_NEWLINE(t))
7630
{
7631
t++;
7632
ACROSSCHAR(t < end_subject, t, t++);
7633
}
7634
}
7635
else
7636
#endif
7637
while (t < end_subject && !IS_NEWLINE(t)) t++;
7638
end_subject = t;
7639
}
7640
7641
/* Anchored: check the first code unit if one is recorded. This may seem
7642
pointless but it can help in detecting a no match case without scanning for
7643
the required code unit. */
7644
7645
if (anchored)
7646
{
7647
if (has_first_cu || start_bits != NULL)
7648
{
7649
BOOL ok = start_match < end_subject;
7650
if (ok)
7651
{
7652
PCRE2_UCHAR c = UCHAR21TEST(start_match);
7653
ok = has_first_cu && (c == first_cu || c == first_cu2);
7654
if (!ok && start_bits != NULL)
7655
{
7656
#if PCRE2_CODE_UNIT_WIDTH != 8
7657
if (c > 255) c = 255;
7658
#endif
7659
ok = (start_bits[c/8] & (1u << (c&7))) != 0;
7660
}
7661
}
7662
if (!ok)
7663
{
7664
rc = MATCH_NOMATCH;
7665
break;
7666
}
7667
}
7668
}
7669
7670
/* Not anchored. Advance to a unique first code unit if there is one. */
7671
7672
else
7673
{
7674
if (has_first_cu)
7675
{
7676
if (first_cu != first_cu2) /* Caseless */
7677
{
7678
/* In 16-bit and 32-bit modes we have to do our own search, so can
7679
look for both cases at once. */
7680
7681
#if PCRE2_CODE_UNIT_WIDTH != 8
7682
PCRE2_UCHAR smc;
7683
while (start_match < end_subject &&
7684
(smc = UCHAR21TEST(start_match)) != first_cu &&
7685
smc != first_cu2)
7686
start_match++;
7687
#else
7688
/* In 8-bit mode, the use of memchr() gives a big speed up, even
7689
though we have to call it twice in order to find the earliest
7690
occurrence of the code unit in either of its cases. Caching is used
7691
to remember the positions of previously found code units. This can
7692
make a huge difference when the strings are very long and only one
7693
case is actually present. */
7694
7695
PCRE2_SPTR pp1 = NULL;
7696
PCRE2_SPTR pp2 = NULL;
7697
PCRE2_SIZE searchlength = end_subject - start_match;
7698
7699
/* If we haven't got a previously found position for first_cu, or if
7700
the current starting position is later, we need to do a search. If
7701
the code unit is not found, set it to the end. */
7702
7703
if (memchr_found_first_cu == NULL ||
7704
start_match > memchr_found_first_cu)
7705
{
7706
pp1 = memchr(start_match, first_cu, searchlength);
7707
memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
7708
}
7709
7710
/* If the start is before a previously found position, use the
7711
previous position, or NULL if a previous search failed. */
7712
7713
else pp1 = (memchr_found_first_cu == end_subject)? NULL :
7714
memchr_found_first_cu;
7715
7716
/* Do the same thing for the other case. */
7717
7718
if (memchr_found_first_cu2 == NULL ||
7719
start_match > memchr_found_first_cu2)
7720
{
7721
pp2 = memchr(start_match, first_cu2, searchlength);
7722
memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
7723
}
7724
7725
else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
7726
memchr_found_first_cu2;
7727
7728
/* Set the start to the end of the subject if neither case was found.
7729
Otherwise, use the earlier found point. */
7730
7731
if (pp1 == NULL)
7732
start_match = (pp2 == NULL)? end_subject : pp2;
7733
else
7734
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
7735
7736
#endif /* 8-bit handling */
7737
}
7738
7739
/* The caseful case is much simpler. */
7740
7741
else
7742
{
7743
#if PCRE2_CODE_UNIT_WIDTH != 8
7744
while (start_match < end_subject && UCHAR21TEST(start_match) !=
7745
first_cu)
7746
start_match++;
7747
#else
7748
start_match = memchr(start_match, first_cu, end_subject - start_match);
7749
if (start_match == NULL) start_match = end_subject;
7750
#endif
7751
}
7752
7753
/* If we can't find the required first code unit, having reached the
7754
true end of the subject, break the bumpalong loop, to force a match
7755
failure, except when doing partial matching, when we let the next cycle
7756
run at the end of the subject. To see why, consider the pattern
7757
/(?<=abc)def/, which partially matches "abc", even though the string
7758
does not contain the starting character "d". If we have not reached the
7759
true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
7760
temporarily modified) we also let the cycle run, because the matching
7761
string is legitimately allowed to start with the first code unit of a
7762
newline. */
7763
7764
if (mb->partial == 0 && start_match >= mb->end_subject)
7765
{
7766
rc = MATCH_NOMATCH;
7767
break;
7768
}
7769
}
7770
7771
/* If there's no first code unit, advance to just after a linebreak for a
7772
multiline match if required. */
7773
7774
else if (startline)
7775
{
7776
if (start_match > mb->start_subject + start_offset)
7777
{
7778
#ifdef SUPPORT_UNICODE
7779
if (utf)
7780
{
7781
while (start_match < end_subject && !WAS_NEWLINE(start_match))
7782
{
7783
start_match++;
7784
ACROSSCHAR(start_match < end_subject, start_match, start_match++);
7785
}
7786
}
7787
else
7788
#endif
7789
while (start_match < end_subject && !WAS_NEWLINE(start_match))
7790
start_match++;
7791
7792
/* If we have just passed a CR and the newline option is ANY or
7793
ANYCRLF, and we are now at a LF, advance the match position by one
7794
more code unit. */
7795
7796
if (start_match[-1] == CHAR_CR &&
7797
(mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
7798
start_match < end_subject &&
7799
UCHAR21TEST(start_match) == CHAR_NL)
7800
start_match++;
7801
}
7802
}
7803
7804
/* If there's no first code unit or a requirement for a multiline line
7805
start, advance to a non-unique first code unit if any have been
7806
identified. The bitmap contains only 256 bits. When code units are 16 or
7807
32 bits wide, all code units greater than 254 set the 255 bit. */
7808
7809
else if (start_bits != NULL)
7810
{
7811
while (start_match < end_subject)
7812
{
7813
uint32_t c = UCHAR21TEST(start_match);
7814
#if PCRE2_CODE_UNIT_WIDTH != 8
7815
if (c > 255) c = 255;
7816
#endif
7817
if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
7818
start_match++;
7819
}
7820
7821
/* See comment above in first_cu checking about the next few lines. */
7822
7823
if (mb->partial == 0 && start_match >= mb->end_subject)
7824
{
7825
rc = MATCH_NOMATCH;
7826
break;
7827
}
7828
}
7829
} /* End first code unit handling */
7830
7831
/* Restore fudged end_subject */
7832
7833
end_subject = mb->end_subject;
7834
7835
/* The following two optimizations must be disabled for partial matching. */
7836
7837
if (mb->partial == 0)
7838
{
7839
PCRE2_SPTR p;
7840
7841
/* The minimum matching length is a lower bound; no string of that length
7842
may actually match the pattern. Although the value is, strictly, in
7843
characters, we treat it as code units to avoid spending too much time in
7844
this optimization. */
7845
7846
if (end_subject - start_match < re->minlength)
7847
{
7848
rc = MATCH_NOMATCH;
7849
break;
7850
}
7851
7852
/* If req_cu is set, we know that that code unit must appear in the
7853
subject for the (non-partial) match to succeed. If the first code unit is
7854
set, req_cu must be later in the subject; otherwise the test starts at
7855
the match point. This optimization can save a huge amount of backtracking
7856
in patterns with nested unlimited repeats that aren't going to match.
7857
Writing separate code for caseful/caseless versions makes it go faster,
7858
as does using an autoincrement and backing off on a match. As in the case
7859
of the first code unit, using memchr() in the 8-bit library gives a big
7860
speed up. Unlike the first_cu check above, we do not need to call
7861
memchr() twice in the caseless case because we only need to check for the
7862
presence of the character in either case, not find the first occurrence.
7863
7864
The search can be skipped if the code unit was found later than the
7865
current starting point in a previous iteration of the bumpalong loop.
7866
7867
HOWEVER: when the subject string is very, very long, searching to its end
7868
can take a long time, and give bad performance on quite ordinary
7869
anchored patterns. This showed up when somebody was matching something
7870
like /^\d+C/ on a 32-megabyte string... so we don't do this when the
7871
string is sufficiently long, but it's worth searching a lot more for
7872
unanchored patterns. */
7873
7874
p = start_match + (has_first_cu? 1:0);
7875
if (has_req_cu && p > req_cu_ptr)
7876
{
7877
PCRE2_SIZE check_length = end_subject - start_match;
7878
7879
if (check_length < REQ_CU_MAX ||
7880
(!anchored && check_length < REQ_CU_MAX * 1000))
7881
{
7882
if (req_cu != req_cu2) /* Caseless */
7883
{
7884
#if PCRE2_CODE_UNIT_WIDTH != 8
7885
while (p < end_subject)
7886
{
7887
uint32_t pp = UCHAR21INCTEST(p);
7888
if (pp == req_cu || pp == req_cu2) { p--; break; }
7889
}
7890
#else /* 8-bit code units */
7891
PCRE2_SPTR pp = p;
7892
p = memchr(pp, req_cu, end_subject - pp);
7893
if (p == NULL)
7894
{
7895
p = memchr(pp, req_cu2, end_subject - pp);
7896
if (p == NULL) p = end_subject;
7897
}
7898
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
7899
}
7900
7901
/* The caseful case */
7902
7903
else
7904
{
7905
#if PCRE2_CODE_UNIT_WIDTH != 8
7906
while (p < end_subject)
7907
{
7908
if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
7909
}
7910
7911
#else /* 8-bit code units */
7912
p = memchr(p, req_cu, end_subject - p);
7913
if (p == NULL) p = end_subject;
7914
#endif
7915
}
7916
7917
/* If we can't find the required code unit, break the bumpalong loop,
7918
forcing a match failure. */
7919
7920
if (p >= end_subject)
7921
{
7922
rc = MATCH_NOMATCH;
7923
break;
7924
}
7925
7926
/* If we have found the required code unit, save the point where we
7927
found it, so that we don't search again next time round the bumpalong
7928
loop if the start hasn't yet passed this code unit. */
7929
7930
req_cu_ptr = p;
7931
}
7932
}
7933
}
7934
}
7935
7936
/* ------------ End of start of match optimizations ------------ */
7937
7938
/* Give no match if we have passed the bumpalong limit. */
7939
7940
if (start_match > bumpalong_limit)
7941
{
7942
rc = MATCH_NOMATCH;
7943
break;
7944
}
7945
7946
/* OK, we can now run the match. If "hitend" is set afterwards, remember the
7947
first starting point for which a partial match was found. */
7948
7949
cb.start_match = (PCRE2_SIZE)(start_match - subject);
7950
cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
7951
7952
mb->start_used_ptr = start_match;
7953
mb->last_used_ptr = start_match;
7954
#ifdef SUPPORT_UNICODE
7955
mb->moptions = options | fragment_options;
7956
#else
7957
mb->moptions = options;
7958
#endif
7959
mb->match_call_count = 0;
7960
mb->end_offset_top = 0;
7961
mb->skip_arg_count = 0;
7962
7963
#ifdef DEBUG_SHOW_OPS
7964
fprintf(stderr, "++ Calling match()\n");
7965
#endif
7966
7967
rc = match(start_match, mb->start_code, re->top_bracket, frame_size,
7968
match_data, mb);
7969
7970
#ifdef DEBUG_SHOW_OPS
7971
fprintf(stderr, "++ match() returned %d\n\n", rc);
7972
#endif
7973
7974
if (mb->hitend && start_partial == NULL)
7975
{
7976
start_partial = mb->start_used_ptr;
7977
match_partial = start_match;
7978
}
7979
7980
switch(rc)
7981
{
7982
/* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
7983
the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
7984
entirely. The only way we can do that is to re-do the match at the same
7985
point, with a flag to force SKIP with an argument to be ignored. Just
7986
treating this case as NOMATCH does not work because it does not check other
7987
alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
7988
7989
case MATCH_SKIP_ARG:
7990
new_start_match = start_match;
7991
mb->ignore_skip_arg = mb->skip_arg_count;
7992
break;
7993
7994
/* SKIP passes back the next starting point explicitly, but if it is no
7995
greater than the match we have just done, treat it as NOMATCH. */
7996
7997
case MATCH_SKIP:
7998
if (mb->verb_skip_ptr > start_match)
7999
{
8000
new_start_match = mb->verb_skip_ptr;
8001
break;
8002
}
8003
PCRE2_FALLTHROUGH /* Fall through */
8004
8005
/* NOMATCH and PRUNE advance by one character. THEN at this level acts
8006
exactly like PRUNE. Unset ignore SKIP-with-argument. */
8007
8008
case MATCH_NOMATCH:
8009
case MATCH_PRUNE:
8010
case MATCH_THEN:
8011
mb->ignore_skip_arg = 0;
8012
new_start_match = start_match + 1;
8013
#ifdef SUPPORT_UNICODE
8014
if (utf)
8015
ACROSSCHAR(new_start_match < end_subject, new_start_match,
8016
new_start_match++);
8017
#endif
8018
break;
8019
8020
/* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
8021
8022
case MATCH_COMMIT:
8023
rc = MATCH_NOMATCH;
8024
goto ENDLOOP;
8025
8026
/* Any other return is either a match, or some kind of error. */
8027
8028
default:
8029
goto ENDLOOP;
8030
}
8031
8032
/* Control reaches here for the various types of "no match at this point"
8033
result. Reset the code to MATCH_NOMATCH for subsequent checking. */
8034
8035
rc = MATCH_NOMATCH;
8036
8037
/* If PCRE2_FIRSTLINE is set, the match must happen before or at the first
8038
newline in the subject (though it may continue over the newline). Therefore,
8039
if we have just failed to match, starting at a newline, do not continue. */
8040
8041
if (firstline && IS_NEWLINE(start_match)) break;
8042
8043
/* Advance to new matching position */
8044
8045
start_match = new_start_match;
8046
8047
/* Break the loop if the pattern is anchored or if we have passed the end of
8048
the subject. */
8049
8050
if (anchored || start_match > end_subject) break;
8051
8052
/* If we have just passed a CR and we are now at a LF, and the pattern does
8053
not contain any explicit matches for \r or \n, and the newline option is CRLF
8054
or ANY or ANYCRLF, advance the match position by one more code unit. In
8055
normal matching start_match will always be greater than the first position at
8056
this stage, but a failed *SKIP can cause a return at the same point, which is
8057
why the first test exists. */
8058
8059
if (start_match > subject + start_offset &&
8060
start_match[-1] == CHAR_CR &&
8061
start_match < end_subject &&
8062
*start_match == CHAR_NL &&
8063
(re->flags & PCRE2_HASCRORLF) == 0 &&
8064
(mb->nltype == NLTYPE_ANY ||
8065
mb->nltype == NLTYPE_ANYCRLF ||
8066
mb->nllen == 2))
8067
start_match++;
8068
8069
mb->mark = NULL; /* Reset for start of next match attempt */
8070
} /* End of for(;;) "bumpalong" loop */
8071
8072
/* ==========================================================================*/
8073
8074
/* When we reach here, one of the following stopping conditions is true:
8075
8076
(1) The match succeeded, either completely, or partially;
8077
8078
(2) The pattern is anchored or the match was failed after (*COMMIT);
8079
8080
(3) We are past the end of the subject or the bumpalong limit;
8081
8082
(4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because
8083
this option requests that a match occur at or before the first newline in
8084
the subject.
8085
8086
(5) Some kind of error occurred.
8087
8088
*/
8089
8090
ENDLOOP:
8091
8092
/* If end_subject != true_end_subject, it means we are handling invalid UTF,
8093
and have just processed a non-terminal fragment. If this resulted in no match
8094
or a partial match we must carry on to the next fragment (a partial match is
8095
returned to the caller only at the very end of the subject). A loop is used to
8096
avoid trying to match against empty fragments; if the pattern can match an
8097
empty string it would have done so already. */
8098
8099
#ifdef SUPPORT_UNICODE
8100
if (utf && end_subject != true_end_subject &&
8101
(rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL))
8102
{
8103
for (;;)
8104
{
8105
/* Advance past the first bad code unit, and then skip invalid character
8106
starting code units in 8-bit and 16-bit modes. */
8107
8108
start_match = end_subject + 1;
8109
8110
#if PCRE2_CODE_UNIT_WIDTH != 32
8111
while (start_match < true_end_subject && NOT_FIRSTCU(*start_match))
8112
start_match++;
8113
#endif
8114
8115
/* If we have hit the end of the subject, there isn't another non-empty
8116
fragment, so give up. */
8117
8118
if (start_match >= true_end_subject)
8119
{
8120
rc = MATCH_NOMATCH; /* In case it was partial */
8121
match_partial = NULL;
8122
break;
8123
}
8124
8125
/* Check the rest of the subject */
8126
8127
mb->check_subject = start_match;
8128
rc = PRIV(valid_utf)(start_match, length - (start_match - subject),
8129
&(match_data->startchar));
8130
8131
/* The rest of the subject is valid UTF. */
8132
8133
if (rc == 0)
8134
{
8135
mb->end_subject = end_subject = true_end_subject;
8136
fragment_options = PCRE2_NOTBOL;
8137
goto FRAGMENT_RESTART;
8138
}
8139
8140
/* A subsequent UTF error has been found; if the next fragment is
8141
non-empty, set up to process it. Otherwise, let the loop advance. */
8142
8143
else if (rc < 0)
8144
{
8145
mb->end_subject = end_subject = start_match + match_data->startchar;
8146
if (end_subject > start_match)
8147
{
8148
fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL;
8149
goto FRAGMENT_RESTART;
8150
}
8151
}
8152
}
8153
}
8154
#endif /* SUPPORT_UNICODE */
8155
8156
/* Fill in fields that are always returned in the match data. */
8157
8158
match_data->code = re;
8159
match_data->mark = mb->mark;
8160
match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER;
8161
match_data->options = original_options;
8162
8163
/* Handle a fully successful match. Set the return code to the number of
8164
captured strings, or 0 if there were too many to fit into the ovector, and then
8165
set the remaining returned values before returning. Make a copy of the subject
8166
string if requested. */
8167
8168
if (rc == MATCH_MATCH)
8169
{
8170
match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)?
8171
0 : (int)mb->end_offset_top/2 + 1;
8172
match_data->subject_length = length;
8173
match_data->start_offset = start_offset;
8174
match_data->startchar = start_match - subject;
8175
match_data->leftchar = mb->start_used_ptr - subject;
8176
match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?
8177
mb->last_used_ptr : mb->end_match_ptr) - subject;
8178
if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
8179
{
8180
if (length != 0)
8181
{
8182
match_data->subject = match_data->memctl.malloc(CU2BYTES(length),
8183
match_data->memctl.memory_data);
8184
if (match_data->subject == NULL)
8185
return match_data->rc = PCRE2_ERROR_NOMEMORY;
8186
memcpy((void *)match_data->subject, subject, CU2BYTES(length));
8187
}
8188
else
8189
match_data->subject = NULL;
8190
match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
8191
}
8192
else match_data->subject = original_subject;
8193
8194
return match_data->rc;
8195
}
8196
8197
/* Control gets here if there has been a partial match, an error, or if the
8198
overall match attempt has failed at all permitted starting positions. Any mark
8199
data is in the nomatch_mark field. */
8200
8201
match_data->mark = mb->nomatch_mark;
8202
8203
/* For anything other than nomatch or partial match, just return the code. */
8204
8205
if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc;
8206
8207
/* Handle a partial match. If a "soft" partial match was requested, searching
8208
for a complete match will have continued, and the value of rc at this point
8209
will be MATCH_NOMATCH. For a "hard" partial match, it will already be
8210
PCRE2_ERROR_PARTIAL. */
8211
8212
else if (match_partial != NULL)
8213
{
8214
match_data->subject = original_subject;
8215
match_data->subject_length = length;
8216
match_data->start_offset = start_offset;
8217
match_data->ovector[0] = match_partial - subject;
8218
match_data->ovector[1] = end_subject - subject;
8219
match_data->startchar = match_partial - subject;
8220
match_data->leftchar = start_partial - subject;
8221
match_data->rightchar = end_subject - subject;
8222
match_data->rc = PCRE2_ERROR_PARTIAL;
8223
}
8224
8225
/* Else this is the classic nomatch case. */
8226
8227
else
8228
{
8229
match_data->subject = original_subject;
8230
match_data->subject_length = length;
8231
match_data->start_offset = start_offset;
8232
match_data->rc = PCRE2_ERROR_NOMATCH;
8233
}
8234
8235
return match_data->rc;
8236
}
8237
8238
/* These #undefs are here to enable unity builds with CMake. In a unity build
several source files are compiled together as one translation unit, so the
file-local helper macros below are removed once this file no longer needs them.
NOTE(review): presumably each of these is #defined earlier in this file and
would otherwise stay visible to sources compiled after it - confirm. */
8239
8240
#undef NLBLOCK /* Block containing newline information */
8241
#undef PSSTART /* Field containing processed string start */
8242
#undef PSEND /* Field containing processed string end */
8243
8244
/* End of pcre2_match.c */
8245
8246