Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_match.c
9903 views
1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
Written by Philip Hazel
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
New API code Copyright (c) 2015-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
* Redistributions of source code must retain the above copyright notice,
17
this list of conditions and the following disclaimer.
18
19
* Redistributions in binary form must reproduce the above copyright
20
notice, this list of conditions and the following disclaimer in the
21
documentation and/or other materials provided with the distribution.
22
23
* Neither the name of the University of Cambridge nor the names of its
24
contributors may be used to endorse or promote products derived from
25
this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#ifdef HAVE_CONFIG_H
43
#include "config.h"
44
#endif
45
46
#include "pcre2_internal.h"
47
48
/* These defines enable debugging code */
49
50
/* #define DEBUG_FRAMES_DISPLAY */
51
/* #define DEBUG_SHOW_OPS */
52
/* #define DEBUG_SHOW_RMATCH */
53
54
#ifdef DEBUG_FRAMES_DISPLAY
55
#include <stdarg.h>
56
#endif
57
58
#ifdef DEBUG_SHOW_OPS
59
static const char *OP_names[] = { OP_NAME_LIST };
60
#endif
61
62
/* These defines identify the name of the block containing "static"
63
information, and fields within it. */
64
65
#define NLBLOCK mb /* Block containing newline information */
66
#define PSSTART start_subject /* Field containing processed string start */
67
#define PSEND end_subject /* Field containing processed string end */
68
69
#define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */
70
71
/* Masks for identifying the public options that are permitted at match time. */
72
73
#define PUBLIC_MATCH_OPTIONS \
74
(PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
75
PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
76
PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT| \
77
PCRE2_DISABLE_RECURSELOOP_CHECK)
78
79
#define PUBLIC_JIT_MATCH_OPTIONS \
80
(PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\
81
PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD|\
82
PCRE2_COPY_MATCHED_SUBJECT)
83
84
/* Non-error returns from and within the match() function. Error returns are
85
externally defined PCRE2_ERROR_xxx codes, which are all negative. */
86
87
#define MATCH_MATCH 1
88
#define MATCH_NOMATCH 0
89
90
/* Special internal returns used in the match() function. Make them
91
sufficiently negative to avoid the external error codes. */
92
93
#define MATCH_ACCEPT (-999)
94
#define MATCH_KETRPOS (-998)
95
/* The next 5 must be kept together and in sequence so that a test that checks
96
for any one of them can use a range. */
97
#define MATCH_COMMIT (-997)
98
#define MATCH_PRUNE (-996)
99
#define MATCH_SKIP (-995)
100
#define MATCH_SKIP_ARG (-994)
101
#define MATCH_THEN (-993)
102
#define MATCH_BACKTRACK_MAX MATCH_THEN
103
#define MATCH_BACKTRACK_MIN MATCH_COMMIT
104
105
/* Group frame type values. Zero means the frame is not a group frame. The
106
lower 16 bits are used for data (e.g. the capture number). Group frames are
107
used for most groups so that information about the start is easily available at
108
the end without having to scan back through intermediate frames (backtrack
109
points). */
110
111
#define GF_CAPTURE 0x00010000u
112
#define GF_NOCAPTURE 0x00020000u
113
#define GF_CONDASSERT 0x00030000u
114
#define GF_RECURSE 0x00040000u
115
116
/* Masks for the identity and data parts of the group frame type. */
117
118
#define GF_IDMASK(a) ((a) & 0xffff0000u)
119
#define GF_DATAMASK(a) ((a) & 0x0000ffffu)
120
121
/* Repetition types */
122
123
enum {
  REPTYPE_MIN,    /* Minimizing (lazy) repeat */
  REPTYPE_MAX,    /* Maximizing (greedy) repeat */
  REPTYPE_POS     /* Possessive repeat */
};

/* Lower and upper repeat counts for the simple quantifiers, one row per
operator in the same order in all three tables. An upper count of UINT32_MAX
stands for "unlimited". The OP_CR[MIN]RANGE rows are only placeholders in the
first two tables. */

static const uint32_t rep_min[] = {
  0,    /* *  */
  0,    /* *? */
  1,    /* +  */
  1,    /* +? */
  0,    /* ?  */
  0,    /* ?? */
  0,    /* dummy placefiller for OP_CRRANGE */
  0,    /* dummy placefiller for OP_CRMINRANGE */
  0,    /* OP_CRPOSSTAR */
  1,    /* OP_CRPOSPLUS */
  0     /* OP_CRPOSQUERY */
};

static const uint32_t rep_max[] = {
  UINT32_MAX,   /* *  */
  UINT32_MAX,   /* *? */
  UINT32_MAX,   /* +  */
  UINT32_MAX,   /* +? */
  1,            /* ?  */
  1,            /* ?? */
  0,            /* dummy placefiller for OP_CRRANGE */
  0,            /* dummy placefiller for OP_CRMINRANGE */
  UINT32_MAX,   /* OP_CRPOSSTAR */
  UINT32_MAX,   /* OP_CRPOSPLUS */
  1             /* OP_CRPOSQUERY */
};

/* Repetition type for each operator. Unlike the two tables above, this one
also has an entry for OP_CRPOSRANGE. */

static const uint32_t rep_typ[] = {
  REPTYPE_MAX,  /* *  */
  REPTYPE_MIN,  /* *? */
  REPTYPE_MAX,  /* +  */
  REPTYPE_MIN,  /* +? */
  REPTYPE_MAX,  /* ?  */
  REPTYPE_MIN,  /* ?? */
  REPTYPE_MAX,  /* OP_CRRANGE */
  REPTYPE_MIN,  /* OP_CRMINRANGE */
  REPTYPE_POS,  /* OP_CRPOSSTAR */
  REPTYPE_POS,  /* OP_CRPOSPLUS */
  REPTYPE_POS,  /* OP_CRPOSQUERY */
  REPTYPE_POS   /* OP_CRPOSRANGE */
};
151
152
/* Numbers for RMATCH calls at backtracking points. When these lists are
153
changed, the code at RETURN_SWITCH below must be updated in sync. */
154
155
enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
156
RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
157
RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
158
RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39 };
159
160
#ifdef SUPPORT_WIDE_CHARS
161
enum { RM100=100, RM101, RM102, RM103 };
162
#endif
163
164
#ifdef SUPPORT_UNICODE
165
enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207,
166
RM208, RM209, RM210, RM211, RM212, RM213, RM214, RM215,
167
RM216, RM217, RM218, RM219, RM220, RM221, RM222, RM223,
168
RM224 };
169
#endif
170
171
/* Define short names for general fields in the current backtrack frame, which
172
is always pointed to by the F variable. Occasional references to fields in
173
other frames are written out explicitly. There are also some fields in the
174
current frame whose names start with "temp" that are used for short-term,
175
localised backtracking memory. These are #defined with Lxxx names at the point
176
of use and undefined afterwards. */
177
178
#define Fback_frame F->back_frame
179
#define Fcapture_last F->capture_last
180
#define Fcurrent_recurse F->current_recurse
181
#define Fecode F->ecode
182
#define Feptr F->eptr
183
#define Fgroup_frame_type F->group_frame_type
184
#define Flast_group_offset F->last_group_offset
185
#define Flength F->length
186
#define Fmark F->mark
187
#define Frdepth F->rdepth
188
#define Fstart_match F->start_match
189
#define Foffset_top F->offset_top
190
#define Foccu F->occu
191
#define Fop F->op
192
#define Fovector F->ovector
193
#define Freturn_id F->return_id
194
195
196
#ifdef DEBUG_FRAMES_DISPLAY
197
/*************************************************
198
* Display current frames and contents *
199
*************************************************/
200
201
/* This debugging function displays the current set of frames and their
202
contents. It is not called automatically from anywhere, the intention being
203
that calls can be inserted where necessary when debugging frame-related
204
problems.
205
206
Arguments:
207
f the file to write to
208
F the current top frame
209
P a previous frame of interest
210
frame_size the frame size
211
mb points to the match block
212
match_data points to the match data block
213
s identification text
214
215
Returns: nothing
216
*/
217
218
static void
219
display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,
220
match_block *mb, pcre2_match_data *match_data, const char *s, ...)
221
{
222
uint32_t i;
223
heapframe *Q;
224
va_list ap;
225
va_start(ap, s);
226
227
fprintf(f, "FRAMES ");
228
vfprintf(f, s, ap);
229
va_end(ap);
230
231
if (P != NULL) fprintf(f, " P=%lu",
232
((char *)P - (char *)(match_data->heapframes))/frame_size);
233
fprintf(f, "\n");
234
235
for (i = 0, Q = match_data->heapframes;
236
Q <= F;
237
i++, Q = (heapframe *)((char *)Q + frame_size))
238
{
239
fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d",
240
i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode),
241
Q->back_frame, Q->return_id);
242
243
if (Q->last_group_offset == PCRE2_UNSET)
244
fprintf(f, " lgoffset=unset\n");
245
else
246
fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size);
247
}
248
}
249
250
#endif
251
252
253
254
/*************************************************
255
* Process a callout *
256
*************************************************/
257
258
/* This function is called for all callouts, whether "standalone" or at the
259
start of a conditional group. Fecode will be pointing to either OP_CALLOUT or
260
OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized
261
with fixed values.
262
263
Arguments:
264
F points to the current backtracking frame
265
mb points to the match block
266
lengthptr where to return the length of the callout item
267
268
Returns: the return from the callout
269
or 0 if no callout function exists
270
*/
271
272
static int
273
do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr)
274
{
275
int rc;
276
PCRE2_SIZE save0, save1;
277
PCRE2_SIZE *callout_ovector;
278
pcre2_callout_block *cb;
279
280
*lengthptr = (*Fecode == OP_CALLOUT)?
281
PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);
282
283
if (mb->callout == NULL) return 0; /* No callout function provided */
284
285
/* The original matching code (pre 10.30) worked directly with the ovector
286
passed by the user, and this was passed to callouts. Now that the working
287
ovector is in the backtracking frame, it no longer needs to reserve space for
288
the overall match offsets (which would waste space in the frame). For backward
289
compatibility, however, we pass capture_top and offset_vector to the callout as
290
if for the extended ovector, and we ensure that the first two slots are unset
291
by preserving and restoring their current contents. Picky compilers complain if
292
references such as Fovector[-2] are use directly, so we set up a separate
293
pointer. */
294
295
callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;
296
297
/* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields
298
are set externally. The first 3 never change; the last is updated for each
299
bumpalong. */
300
301
cb = mb->cb;
302
cb->capture_top = (uint32_t)Foffset_top/2 + 1;
303
cb->capture_last = Fcapture_last;
304
cb->offset_vector = callout_ovector;
305
cb->mark = mb->nomatch_mark;
306
cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject);
307
cb->pattern_position = GET(Fecode, 1);
308
cb->next_item_length = GET(Fecode, 1 + LINK_SIZE);
309
310
if (*Fecode == OP_CALLOUT) /* Numerical callout */
311
{
312
cb->callout_number = Fecode[1 + 2*LINK_SIZE];
313
cb->callout_string_offset = 0;
314
cb->callout_string = NULL;
315
cb->callout_string_length = 0;
316
}
317
else /* String callout */
318
{
319
cb->callout_number = 0;
320
cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE);
321
cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1;
322
cb->callout_string_length =
323
*lengthptr - (1 + 4*LINK_SIZE) - 2;
324
}
325
326
save0 = callout_ovector[0];
327
save1 = callout_ovector[1];
328
callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET;
329
rc = mb->callout(cb, mb->callout_data);
330
callout_ovector[0] = save0;
331
callout_ovector[1] = save1;
332
cb->callout_flags = 0;
333
return rc;
334
}
335
336
337
338
/*************************************************
339
* Match a back-reference *
340
*************************************************/
341
342
/* This function is called only when it is known that the offset lies within
343
the offsets that have so far been used in the match. Note that in caseless
344
UTF-8 mode, the number of subject bytes matched may be different to the number
345
of reference bytes. (In theory this could also happen in UTF-16 mode, but it
346
seems unlikely.)
347
348
Arguments:
349
offset index into the offset vector
350
caseless TRUE if caseless
351
caseopts bitmask of REFI_FLAG_XYZ values
352
F the current backtracking frame pointer
353
mb points to match block
354
lengthptr pointer for returning the length matched
355
356
Returns: = 0 successful match; number of code units matched is set
357
< 0 no match
358
> 0 partial match
359
*/
360
361
static int
362
match_ref(PCRE2_SIZE offset, BOOL caseless, int caseopts, heapframe *F,
363
match_block *mb, PCRE2_SIZE *lengthptr)
364
{
365
PCRE2_SPTR p;
366
PCRE2_SIZE length;
367
PCRE2_SPTR eptr;
368
PCRE2_SPTR eptr_start;
369
370
/* Deal with an unset group. The default is no match, but there is an option to
371
match an empty string. */
372
373
if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET)
374
{
375
if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
376
{
377
*lengthptr = 0;
378
return 0; /* Match */
379
}
380
else return -1; /* No match */
381
}
382
383
/* Separate the caseless and UTF cases for speed. */
384
385
eptr = eptr_start = Feptr;
386
p = mb->start_subject + Fovector[offset];
387
length = Fovector[offset+1] - Fovector[offset];
388
389
if (caseless)
390
{
391
#if defined SUPPORT_UNICODE
392
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
393
BOOL caseless_restrict = (caseopts & REFI_FLAG_CASELESS_RESTRICT) != 0;
394
BOOL turkish_casing = !caseless_restrict && (caseopts & REFI_FLAG_TURKISH_CASING) != 0;
395
396
if (utf || (mb->poptions & PCRE2_UCP) != 0)
397
{
398
PCRE2_SPTR endptr = p + length;
399
400
/* Match characters up to the end of the reference. NOTE: the number of
401
code units matched may differ, because in UTF-8 there are some characters
402
whose upper and lower case codes have different numbers of bytes. For
403
example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3
404
bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
405
sequence of two of the latter. It is important, therefore, to check the
406
length along the reference, not along the subject (earlier code did this
407
wrong). UCP without uses Unicode properties but without UTF encoding. */
408
409
while (p < endptr)
410
{
411
uint32_t c, d;
412
const ucd_record *ur;
413
if (eptr >= mb->end_subject) return 1; /* Partial match */
414
415
if (utf)
416
{
417
GETCHARINC(c, eptr);
418
GETCHARINC(d, p);
419
}
420
else
421
{
422
c = *eptr++;
423
d = *p++;
424
}
425
426
if (turkish_casing && UCD_ANY_I(d))
427
{
428
c = UCD_FOLD_I_TURKISH(c);
429
d = UCD_FOLD_I_TURKISH(d);
430
if (c != d) return -1; /* No match */
431
}
432
else if (c != d && c != (uint32_t)((int)d + (ur = GET_UCD(d))->other_case))
433
{
434
const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
435
436
/* When PCRE2_EXTRA_CASELESS_RESTRICT is set, ignore any caseless sets
437
that start with an ASCII character. */
438
if (caseless_restrict && *pp < 128) return -1; /* No match */
439
440
for (;;)
441
{
442
if (c < *pp) return -1; /* No match */
443
if (c == *pp++) break;
444
}
445
}
446
}
447
}
448
else
449
#endif
450
451
/* Not in UTF or UCP mode */
452
{
453
for (; length > 0; length--)
454
{
455
uint32_t cc, cp;
456
if (eptr >= mb->end_subject) return 1; /* Partial match */
457
cc = UCHAR21TEST(eptr);
458
cp = UCHAR21TEST(p);
459
if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc))
460
return -1; /* No match */
461
p++;
462
eptr++;
463
}
464
}
465
}
466
467
/* In the caseful case, we can just compare the code units, whether or not we
468
are in UTF and/or UCP mode. When partial matching, we have to do this unit by
469
unit. */
470
471
else
472
{
473
if (mb->partial != 0)
474
{
475
for (; length > 0; length--)
476
{
477
if (eptr >= mb->end_subject) return 1; /* Partial match */
478
if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */
479
}
480
}
481
482
/* Not partial matching */
483
484
else
485
{
486
if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */
487
if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */
488
eptr += length;
489
}
490
}
491
492
*lengthptr = eptr - eptr_start;
493
return 0; /* Match */
494
}
495
496
497
498
/******************************************************************************
499
*******************************************************************************
500
"Recursion" in the match() function
501
502
The original match() function was highly recursive, but this proved to be the
503
source of a number of problems over the years, mostly because of the relatively
504
small system stacks that are commonly found. As new features were added to
505
patterns, various kludges were invented to reduce the amount of stack used,
506
making the code hard to understand in places.
507
508
A version did exist that used individual frames on the heap instead of calling
509
match() recursively, but this ran substantially slower. The current version is
510
a refactoring that uses a vector of frames to remember backtracking points.
511
This runs no slower, and possibly even a bit faster than the original recursive
512
implementation.
513
514
At first, an initial vector of size START_FRAMES_SIZE (enough for maybe 50
515
frames) was allocated on the system stack. If this was not big enough, the heap
516
was used for a larger vector. However, it turns out that there are environments
517
where taking as little as 20KiB from the system stack is an embarrassment.
518
After another refactoring, the heap is used exclusively, but a pointer to the
519
frames vector and its size are cached in the match_data block, so that there is
520
no new memory allocation if the same match_data block is used for multiple
521
matches (unless the frames vector has to be extended).
522
*******************************************************************************
523
******************************************************************************/
524
525
526
527
528
/*************************************************
529
* Macros for the match() function *
530
*************************************************/
531
532
/* These macros pack up tests that are used for partial matching several times
533
in the code. The second one is used when we already know we are past the end of
534
the subject. We set the "hit end" flag if the pointer is at the end of the
535
subject and either (a) the pointer is past the earliest inspected character
536
(i.e. something has been matched, even if not part of the actual matched
537
string), or (b) the pattern contains a lookbehind. These are the conditions for
538
which adding more characters may allow the current match to continue.
539
540
For hard partial matching, we immediately return a partial match. Otherwise,
541
carrying on means that a complete match on the current subject will be sought.
542
A partial match is returned only if no complete match can be found. */
543
544
/* CHECK_PARTIAL() applies the partial-match test only when the current
subject pointer has reached the end of the subject. */

#define CHECK_PARTIAL() \
  do \
    { \
    if (Feptr >= mb->end_subject) SCHECK_PARTIAL(); \
    } \
  while (0)

/* SCHECK_PARTIAL() is for use when the end of the subject is already known to
have been reached. When partial matching is on and either something has been
inspected or empty partial matches are allowed, it records the hit; for hard
partial matching (mb->partial > 1) it returns PCRE2_ERROR_PARTIAL at once. */

#define SCHECK_PARTIAL() \
  do \
    { \
    if (mb->partial != 0) \
      { \
      if (Feptr > mb->start_used_ptr || mb->allowemptypartial) \
        { \
        mb->hitend = TRUE; \
        if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
        } \
      } \
    } \
  while (0)
563
564
565
/* These macros are used to implement backtracking. They simulate a recursive
566
call to the match() function by means of a local vector of frames which
567
remember the backtracking points. */
568
569
/* RMATCH(code, id): remember "id" in the current frame, then jump to the code
that creates a new frame starting at "code". The label L_<id> generated here is
the point at which processing resumes afterwards. */

#define RMATCH(next_code,rm_id) \
  do \
    { \
    start_ecode = (next_code); \
    Freturn_id = (rm_id); \
    goto MATCH_RECURSE; \
    L_##rm_id:; \
    } \
  while (0)

/* RRETURN(value): set the backtracking return value and go to the common
return dispatch code. */

#define RRETURN(value) \
  do \
    { \
    rrc = (value); \
    goto RETURN_SWITCH; \
    } \
  while (0)
584
585
586
587
/*************************************************
588
* Match from current position *
589
*************************************************/
590
591
/* This function is called to run one match attempt at a single starting point
592
in the subject.
593
594
Performance note: It might be tempting to extract commonly used fields from the
595
mb structure (e.g. end_subject) into individual variables to improve
596
performance. Tests using gcc on a SPARC disproved this; in the first case, it
597
made performance worse.
598
599
Arguments:
600
start_eptr starting character in subject
601
start_ecode starting position in compiled code
602
top_bracket number of capturing parentheses in the pattern
603
frame_size size of each backtracking frame
604
match_data pointer to the match_data block
605
mb pointer to "static" variables block
606
607
Returns: MATCH_MATCH if matched ) these values are >= 0
608
MATCH_NOMATCH if failed to match )
609
negative MATCH_xxx value for PRUNE, SKIP, etc
610
negative PCRE2_ERROR_xxx value if aborted by an error condition
611
(e.g. stopped by repeated call or depth limit)
612
*/
613
614
static int
615
match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket,
616
PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
617
{
618
/* Frame-handling variables */
619
620
heapframe *F; /* Current frame pointer */
621
heapframe *N = NULL; /* Temporary frame pointers */
622
heapframe *P = NULL;
623
624
heapframe *frames_top; /* End of frames vector */
625
heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */
626
PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
627
628
/* Local variables that do not need to be preserved over calls to RMATCH(). */
629
630
PCRE2_SPTR branch_end = NULL;
631
PCRE2_SPTR branch_start;
632
PCRE2_SPTR bracode; /* Temp pointer to start of group */
633
PCRE2_SIZE offset; /* Used for group offsets */
634
PCRE2_SIZE length; /* Used for various length calculations */
635
636
int rrc; /* Return from functions & backtracking "recursions" */
637
#ifdef SUPPORT_UNICODE
638
int proptype; /* Type of character property */
639
#endif
640
641
uint32_t i; /* Used for local loops */
642
uint32_t fc; /* Character values */
643
uint32_t number; /* Used for group and other numbers */
644
uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */
645
uint32_t group_frame_type; /* Specifies type for new group frames */
646
647
BOOL condition; /* Used in conditional groups */
648
BOOL cur_is_word; /* Used in "word" tests */
649
BOOL prev_is_word; /* Used in "word" tests */
650
651
/* UTF and UCP flags */
652
653
#ifdef SUPPORT_UNICODE
654
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
655
BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
656
#else
657
BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
658
#endif
659
660
/* This is the length of the last part of a backtracking frame that must be
661
copied when a new frame is created. */
662
663
frame_copy_size = frame_size - offsetof(heapframe, eptr);
664
665
/* Set up the first frame and the end of the frames vector. */
666
667
F = match_data->heapframes;
668
frames_top = (heapframe *)((char *)F + match_data->heapframes_size);
669
670
Frdepth = 0; /* "Recursion" depth */
671
Fcapture_last = 0; /* Number of most recent capture */
672
Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */
673
Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */
674
Fmark = NULL; /* Most recent mark */
675
Foffset_top = 0; /* End of captures within the frame */
676
Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */
677
group_frame_type = 0; /* Not a start of group frame */
678
goto NEW_FRAME; /* Start processing with this frame */
679
680
/* Come back here when we want to create a new frame for remembering a
681
backtracking point. */
682
683
MATCH_RECURSE:
684
685
/* Set up a new backtracking frame. If the vector is full, get a new one,
686
doubling the size, but constrained by the heap limit (which is in KiB). */
687
688
N = (heapframe *)((char *)F + frame_size);
689
if ((heapframe *)((char *)N + frame_size) >= frames_top)
690
{
691
heapframe *new;
692
PCRE2_SIZE newsize;
693
PCRE2_SIZE usedsize = (char *)N - (char *)(match_data->heapframes);
694
695
if (match_data->heapframes_size >= PCRE2_SIZE_MAX / 2)
696
{
697
if (match_data->heapframes_size == PCRE2_SIZE_MAX - 1)
698
return PCRE2_ERROR_NOMEMORY;
699
newsize = PCRE2_SIZE_MAX - 1;
700
}
701
else
702
newsize = match_data->heapframes_size * 2;
703
704
if (newsize / 1024 >= mb->heap_limit)
705
{
706
PCRE2_SIZE old_size = match_data->heapframes_size / 1024;
707
if (mb->heap_limit <= old_size)
708
return PCRE2_ERROR_HEAPLIMIT;
709
else
710
{
711
PCRE2_SIZE max_delta = 1024 * (mb->heap_limit - old_size);
712
int over_bytes = match_data->heapframes_size % 1024;
713
if (over_bytes) max_delta -= (1024 - over_bytes);
714
newsize = match_data->heapframes_size + max_delta;
715
}
716
}
717
718
/* With a heap limit set, the permitted additional size may not be enough for
719
another frame, so do a final check. */
720
721
if (newsize - usedsize < frame_size) return PCRE2_ERROR_HEAPLIMIT;
722
new = match_data->memctl.malloc(newsize, match_data->memctl.memory_data);
723
if (new == NULL) return PCRE2_ERROR_NOMEMORY;
724
memcpy(new, match_data->heapframes, usedsize);
725
726
N = (heapframe *)((char *)new + usedsize);
727
F = (heapframe *)((char *)N - frame_size);
728
729
match_data->memctl.free(match_data->heapframes, match_data->memctl.memory_data);
730
match_data->heapframes = new;
731
match_data->heapframes_size = newsize;
732
frames_top = (heapframe *)((char *)new + newsize);
733
}
734
735
#ifdef DEBUG_SHOW_RMATCH
736
fprintf(stderr, "++ RMATCH %d frame=%d", Freturn_id, Frdepth + 1);
737
if (group_frame_type != 0)
738
{
739
fprintf(stderr, " type=%x ", group_frame_type);
740
switch (GF_IDMASK(group_frame_type))
741
{
742
case GF_CAPTURE:
743
fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type));
744
break;
745
746
case GF_NOCAPTURE:
747
fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type));
748
break;
749
750
case GF_CONDASSERT:
751
fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type));
752
break;
753
754
case GF_RECURSE:
755
fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type));
756
break;
757
758
default:
759
fprintf(stderr, "*** unknown ***");
760
break;
761
}
762
}
763
fprintf(stderr, "\n");
764
#endif
765
766
/* Copy those fields that must be copied into the new frame, increase the
767
"recursion" depth (i.e. the new frame's index) and then make the new frame
768
current. */
769
770
memcpy((char *)N + offsetof(heapframe, eptr),
771
(char *)F + offsetof(heapframe, eptr),
772
frame_copy_size);
773
774
N->rdepth = Frdepth + 1;
775
F = N;
776
777
/* Carry on processing with a new frame. */
778
779
NEW_FRAME:
780
Fgroup_frame_type = group_frame_type;
781
Fecode = start_ecode; /* Starting code pointer */
782
Fback_frame = frame_size; /* Default is go back one frame */
783
784
/* If this is a special type of group frame, remember its offset for quick
785
access at the end of the group. If this is a recursion, set a new current
786
recursion value. */
787
788
if (group_frame_type != 0)
789
{
790
Flast_group_offset = (char *)F - (char *)match_data->heapframes;
791
if (GF_IDMASK(group_frame_type) == GF_RECURSE)
792
Fcurrent_recurse = GF_DATAMASK(group_frame_type);
793
group_frame_type = 0;
794
}
795
796
797
/* ========================================================================= */
798
/* This is the main processing loop. First check that we haven't recorded too
799
many backtracks (search tree is too large), or that we haven't exceeded the
800
recursive depth limit (used too many backtracking frames). If not, process the
801
opcodes. */
802
803
if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
804
if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
805
806
#ifdef DEBUG_SHOW_OPS
807
fprintf(stderr, "\n++ New frame: type=0x%x subject offset %ld\n",
808
GF_IDMASK(Fgroup_frame_type), Feptr - mb->start_subject);
809
#endif
810
811
for (;;)
812
{
813
#ifdef DEBUG_SHOW_OPS
814
fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
815
OP_names[*Fecode]);
816
#endif
817
818
Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */
819
switch(Fop)
820
{
821
/* ===================================================================== */
822
/* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close
823
any currently open capturing brackets. Unlike reaching the end of a group,
824
where we know the starting frame is at the top of the chained frames, in
825
this case we have to search back for the relevant frame in case other types
826
of group that use chained frames have intervened. Multiple OP_CLOSEs always
827
come innermost first, which matches the chain order. We can ignore this in
828
a recursion, because captures are not passed out of recursions. */
829
830
case OP_CLOSE:
831
if (Fcurrent_recurse == RECURSE_UNSET)
832
{
833
number = GET2(Fecode, 1);
834
offset = Flast_group_offset;
835
for(;;)
836
{
837
/* Corrupted heapframes? Trigger an assert and return an error */
838
PCRE2_ASSERT(offset != PCRE2_UNSET);
839
if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
840
841
N = (heapframe *)((char *)match_data->heapframes + offset);
842
P = (heapframe *)((char *)N - frame_size);
843
if (N->group_frame_type == (GF_CAPTURE | number)) break;
844
offset = P->last_group_offset;
845
}
846
offset = (number << 1) - 2;
847
Fcapture_last = number;
848
Fovector[offset] = P->eptr - mb->start_subject;
849
Fovector[offset+1] = Feptr - mb->start_subject;
850
if (offset >= Foffset_top) Foffset_top = offset + 2;
851
}
852
Fecode += PRIV(OP_lengths)[*Fecode];
853
break;
854
855
856
/* ===================================================================== */
857
/* Real or forced end of the pattern, assertion, or recursion. In an
858
assertion ACCEPT, update the last used pointer and remember the current
859
frame so that the captures and mark can be fished out of it. */
860
861
case OP_ASSERT_ACCEPT:
862
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
863
assert_accept_frame = F;
864
RRETURN(MATCH_ACCEPT);
865
866
/* For ACCEPT within a recursion, we have to find the most recent
867
recursion. If not in a recursion, fall through to code that is common with
868
OP_END. */
869
870
case OP_ACCEPT:
871
if (Fcurrent_recurse != RECURSE_UNSET)
872
{
873
#ifdef DEBUG_SHOW_OPS
874
fprintf(stderr, "++ Accept within recursion\n");
875
#endif
876
offset = Flast_group_offset;
877
for(;;)
878
{
879
/* Corrupted heapframes? Trigger an assert and return an error. */
880
PCRE2_ASSERT(offset != PCRE2_UNSET);
881
if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
882
883
N = (heapframe *)((char *)match_data->heapframes + offset);
884
P = (heapframe *)((char *)N - frame_size);
885
if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;
886
offset = P->last_group_offset;
887
}
888
889
/* N is now the frame of the recursion; the previous frame is at the
890
OP_RECURSE position. Go back there, copying the current subject position
891
and mark, and the start_match position (\K might have changed it), and
892
then move on past the OP_RECURSE. */
893
894
P->eptr = Feptr;
895
P->mark = Fmark;
896
P->start_match = Fstart_match;
897
F = P;
898
Fecode += 1 + LINK_SIZE;
899
continue;
900
}
901
/* Fall through */
902
903
/* OP_END itself can never be reached within a recursion because that is
904
picked up when the OP_KET that always precedes OP_END is reached. */
905
906
case OP_END:
907
908
/* Fail for an empty string match if either PCRE2_NOTEMPTY is set, or if
909
PCRE2_NOTEMPTY_ATSTART is set and we have matched at the start of the
910
subject. In both cases, backtracking will then try other alternatives, if
911
any. */
912
913
if (Feptr == Fstart_match &&
914
((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
915
((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&
916
Fstart_match == mb->start_subject + mb->start_offset)))
917
{
918
#ifdef DEBUG_SHOW_OPS
919
fprintf(stderr, "++ Backtrack because empty string\n");
920
#endif
921
RRETURN(MATCH_NOMATCH);
922
}
923
924
/* Fail if PCRE2_ENDANCHORED is set and the end of the match is not
925
the end of the subject. After (*ACCEPT) we fail the entire match (at this
926
position) but backtrack if we've reached the end of the pattern. This
927
applies whether or not we are in a recursion. */
928
929
if (Feptr < mb->end_subject &&
930
((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)
931
{
932
if (Fop == OP_END)
933
{
934
#ifdef DEBUG_SHOW_OPS
935
fprintf(stderr, "++ Backtrack because not at end (endanchored set)\n");
936
#endif
937
RRETURN(MATCH_NOMATCH);
938
}
939
940
#ifdef DEBUG_SHOW_OPS
941
fprintf(stderr, "++ Failed ACCEPT not at end (endanchnored set)\n");
942
#endif
943
return MATCH_NOMATCH; /* (*ACCEPT) */
944
}
945
946
/* We have a successful match of the whole pattern. Record the result and
947
then do a direct return from the function. If there is space in the offset
948
vector, set any pairs that follow the highest-numbered captured string but
949
are less than the number of capturing groups in the pattern to PCRE2_UNSET.
950
It is documented that this happens. "Gaps" are set to PCRE2_UNSET
951
dynamically. It is only those at the end that need setting here. */
952
953
mb->end_match_ptr = Feptr; /* Record where we ended */
954
mb->end_offset_top = Foffset_top; /* and how many extracts were taken */
955
mb->mark = Fmark; /* and the last success mark */
956
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
957
958
match_data->ovector[0] = Fstart_match - mb->start_subject;
959
match_data->ovector[1] = Feptr - mb->start_subject;
960
961
/* Set i to the smaller of the sizes of the external and frame ovectors. */
962
963
i = 2 * ((top_bracket + 1 > match_data->oveccount)?
964
match_data->oveccount : top_bracket + 1);
965
memcpy(match_data->ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));
966
while (--i >= Foffset_top + 2) match_data->ovector[i] = PCRE2_UNSET;
967
return MATCH_MATCH; /* Note: NOT RRETURN */
968
969
970
/*===================================================================== */
971
/* Match any single character type except newline; have to take care with
972
CRLF newlines and partial matching. */
973
974
case OP_ANY:
975
if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
976
if (mb->partial != 0 &&
977
Feptr == mb->end_subject - 1 &&
978
NLBLOCK->nltype == NLTYPE_FIXED &&
979
NLBLOCK->nllen == 2 &&
980
UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
981
{
982
mb->hitend = TRUE;
983
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
984
}
985
/* Fall through */
986
987
/* Match any single character whatsoever. */
988
989
case OP_ALLANY:
990
if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
991
{ /* not be updated before SCHECK_PARTIAL. */
992
SCHECK_PARTIAL();
993
RRETURN(MATCH_NOMATCH);
994
}
995
Feptr++;
996
#ifdef SUPPORT_UNICODE
997
if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
998
#endif
999
Fecode++;
1000
break;
1001
1002
1003
/* ===================================================================== */
1004
/* Match a single code unit, even in UTF mode. This opcode really does
1005
match any code unit, even newline. (It really should be called ANYCODEUNIT,
1006
of course - the byte name is from pre-16 bit days.) */
1007
1008
case OP_ANYBYTE:
1009
if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
1010
{ /* not be updated before SCHECK_PARTIAL. */
1011
SCHECK_PARTIAL();
1012
RRETURN(MATCH_NOMATCH);
1013
}
1014
Feptr++;
1015
Fecode++;
1016
break;
1017
1018
1019
/* ===================================================================== */
1020
/* Match a single character, casefully */
1021
1022
case OP_CHAR:
1023
#ifdef SUPPORT_UNICODE
1024
if (utf)
1025
{
1026
Flength = 1;
1027
Fecode++;
1028
GETCHARLEN(fc, Fecode, Flength);
1029
if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr))
1030
{
1031
CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
1032
RRETURN(MATCH_NOMATCH);
1033
}
1034
for (; Flength > 0; Flength--)
1035
{
1036
if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH);
1037
}
1038
}
1039
else
1040
#endif
1041
1042
/* Not UTF mode */
1043
{
1044
if (mb->end_subject - Feptr < 1)
1045
{
1046
SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
1047
RRETURN(MATCH_NOMATCH);
1048
}
1049
if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH);
1050
Fecode += 2;
1051
}
1052
break;
1053
1054
1055
/* ===================================================================== */
1056
/* Match a single character, caselessly. If we are at the end of the
1057
subject, give up immediately. We get here only when the pattern character
1058
has at most one other case. Characters with more than two cases are coded
1059
as OP_PROP with the pseudo-property PT_CLIST. */
1060
1061
case OP_CHARI:
1062
if (Feptr >= mb->end_subject)
1063
{
1064
SCHECK_PARTIAL();
1065
RRETURN(MATCH_NOMATCH);
1066
}
1067
1068
#ifdef SUPPORT_UNICODE
1069
if (utf)
1070
{
1071
Flength = 1;
1072
Fecode++;
1073
GETCHARLEN(fc, Fecode, Flength);
1074
1075
/* If the pattern character's value is < 128, we know that its other case
1076
(if any) is also < 128 (and therefore only one code unit long in all
1077
code-unit widths), so we can use the fast lookup table. We checked above
1078
that there is at least one character left in the subject. */
1079
1080
if (fc < 128)
1081
{
1082
uint32_t cc = UCHAR21(Feptr);
1083
if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
1084
Fecode++;
1085
Feptr++;
1086
}
1087
1088
/* Otherwise we must pick up the subject character and use Unicode
1089
property support to test its other case. Note that we cannot use the
1090
value of "Flength" to check for sufficient bytes left, because the other
1091
case of the character may have more or fewer code units. */
1092
1093
else
1094
{
1095
uint32_t dc;
1096
GETCHARINC(dc, Feptr);
1097
Fecode += Flength;
1098
if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
1099
}
1100
}
1101
1102
/* If UCP is set without UTF we must do the same as above, but with one
1103
character per code unit. */
1104
1105
else if (ucp)
1106
{
1107
uint32_t cc = UCHAR21(Feptr);
1108
fc = Fecode[1];
1109
if (fc < 128)
1110
{
1111
if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
1112
}
1113
else
1114
{
1115
if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
1116
}
1117
Feptr++;
1118
Fecode += 2;
1119
}
1120
1121
else
1122
#endif /* SUPPORT_UNICODE */
1123
1124
/* Not UTF or UCP mode; use the table for characters < 256. */
1125
{
1126
if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
1127
!= TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
1128
Feptr++;
1129
Fecode += 2;
1130
}
1131
break;
1132
1133
1134
/* ===================================================================== */
1135
/* Match not a single character. */
1136
1137
case OP_NOT:
1138
case OP_NOTI:
1139
if (Feptr >= mb->end_subject)
1140
{
1141
SCHECK_PARTIAL();
1142
RRETURN(MATCH_NOMATCH);
1143
}
1144
1145
#ifdef SUPPORT_UNICODE
1146
if (utf)
1147
{
1148
uint32_t ch;
1149
Fecode++;
1150
GETCHARINC(ch, Fecode);
1151
GETCHARINC(fc, Feptr);
1152
if (ch == fc)
1153
{
1154
RRETURN(MATCH_NOMATCH); /* Caseful match */
1155
}
1156
else if (Fop == OP_NOTI) /* If caseless */
1157
{
1158
if (ch > 127)
1159
ch = UCD_OTHERCASE(ch);
1160
else
1161
ch = (mb->fcc)[ch];
1162
if (ch == fc) RRETURN(MATCH_NOMATCH);
1163
}
1164
}
1165
1166
/* UCP without UTF is as above, but with one character per code unit. */
1167
1168
else if (ucp)
1169
{
1170
uint32_t ch;
1171
fc = UCHAR21INC(Feptr);
1172
ch = Fecode[1];
1173
Fecode += 2;
1174
1175
if (ch == fc)
1176
{
1177
RRETURN(MATCH_NOMATCH); /* Caseful match */
1178
}
1179
else if (Fop == OP_NOTI) /* If caseless */
1180
{
1181
if (ch > 127)
1182
ch = UCD_OTHERCASE(ch);
1183
else
1184
ch = (mb->fcc)[ch];
1185
if (ch == fc) RRETURN(MATCH_NOMATCH);
1186
}
1187
}
1188
1189
else
1190
#endif /* SUPPORT_UNICODE */
1191
1192
/* Neither UTF nor UCP is set */
1193
1194
{
1195
uint32_t ch = Fecode[1];
1196
fc = UCHAR21INC(Feptr);
1197
if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
1198
RRETURN(MATCH_NOMATCH);
1199
Fecode += 2;
1200
}
1201
break;
1202
1203
1204
/* ===================================================================== */
1205
/* Match a single character repeatedly. */
1206
1207
#define Loclength F->temp_size
1208
#define Lstart_eptr F->temp_sptr[0]
1209
#define Lcharptr F->temp_sptr[1]
1210
#define Lmin F->temp_32[0]
1211
#define Lmax F->temp_32[1]
1212
#define Lc F->temp_32[2]
1213
#define Loc F->temp_32[3]
1214
1215
case OP_EXACT:
1216
case OP_EXACTI:
1217
Lmin = Lmax = GET2(Fecode, 1);
1218
Fecode += 1 + IMM2_SIZE;
1219
goto REPEATCHAR;
1220
1221
case OP_POSUPTO:
1222
case OP_POSUPTOI:
1223
reptype = REPTYPE_POS;
1224
Lmin = 0;
1225
Lmax = GET2(Fecode, 1);
1226
Fecode += 1 + IMM2_SIZE;
1227
goto REPEATCHAR;
1228
1229
case OP_UPTO:
1230
case OP_UPTOI:
1231
reptype = REPTYPE_MAX;
1232
Lmin = 0;
1233
Lmax = GET2(Fecode, 1);
1234
Fecode += 1 + IMM2_SIZE;
1235
goto REPEATCHAR;
1236
1237
case OP_MINUPTO:
1238
case OP_MINUPTOI:
1239
reptype = REPTYPE_MIN;
1240
Lmin = 0;
1241
Lmax = GET2(Fecode, 1);
1242
Fecode += 1 + IMM2_SIZE;
1243
goto REPEATCHAR;
1244
1245
case OP_POSSTAR:
1246
case OP_POSSTARI:
1247
reptype = REPTYPE_POS;
1248
Lmin = 0;
1249
Lmax = UINT32_MAX;
1250
Fecode++;
1251
goto REPEATCHAR;
1252
1253
case OP_POSPLUS:
1254
case OP_POSPLUSI:
1255
reptype = REPTYPE_POS;
1256
Lmin = 1;
1257
Lmax = UINT32_MAX;
1258
Fecode++;
1259
goto REPEATCHAR;
1260
1261
case OP_POSQUERY:
1262
case OP_POSQUERYI:
1263
reptype = REPTYPE_POS;
1264
Lmin = 0;
1265
Lmax = 1;
1266
Fecode++;
1267
goto REPEATCHAR;
1268
1269
case OP_STAR:
1270
case OP_STARI:
1271
case OP_MINSTAR:
1272
case OP_MINSTARI:
1273
case OP_PLUS:
1274
case OP_PLUSI:
1275
case OP_MINPLUS:
1276
case OP_MINPLUSI:
1277
case OP_QUERY:
1278
case OP_QUERYI:
1279
case OP_MINQUERY:
1280
case OP_MINQUERYI:
1281
fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI);
1282
Lmin = rep_min[fc];
1283
Lmax = rep_max[fc];
1284
reptype = rep_typ[fc];
1285
1286
/* Common code for all repeated single-character matches. We first check
1287
for the minimum number of characters. If the minimum equals the maximum, we
1288
are done. Otherwise, if minimizing, check the rest of the pattern for a
1289
match; if there isn't one, advance up to the maximum, one character at a
1290
time.
1291
1292
If maximizing, advance up to the maximum number of matching characters,
1293
until Feptr is past the end of the maximum run. If possessive, we are
1294
then done (no backing up). Otherwise, match at this position; anything
1295
other than no match is immediately returned. For nomatch, back up one
1296
character, unless we are matching \R and the last thing matched was
1297
\r\n, in which case, back up two code units until we reach the first
1298
optional character position.
1299
1300
The various UTF/non-UTF and caseful/caseless cases are handled separately,
1301
for speed. */
1302
1303
REPEATCHAR:
1304
#ifdef SUPPORT_UNICODE
1305
if (utf)
1306
{
1307
Flength = 1;
1308
Lcharptr = Fecode;
1309
GETCHARLEN(fc, Fecode, Flength);
1310
Fecode += Flength;
1311
1312
/* Handle multi-code-unit character matching, caseful and caseless. */
1313
1314
if (Flength > 1)
1315
{
1316
uint32_t othercase;
1317
1318
if (Fop >= OP_STARI && /* Caseless */
1319
(othercase = UCD_OTHERCASE(fc)) != fc)
1320
Loclength = PRIV(ord2utf)(othercase, Foccu);
1321
else Loclength = 0;
1322
1323
for (i = 1; i <= Lmin; i++)
1324
{
1325
if (Feptr <= mb->end_subject - Flength &&
1326
memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
1327
else if (Loclength > 0 &&
1328
Feptr <= mb->end_subject - Loclength &&
1329
memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1330
Feptr += Loclength;
1331
else
1332
{
1333
CHECK_PARTIAL();
1334
RRETURN(MATCH_NOMATCH);
1335
}
1336
}
1337
1338
if (Lmin == Lmax) continue;
1339
1340
if (reptype == REPTYPE_MIN)
1341
{
1342
for (;;)
1343
{
1344
RMATCH(Fecode, RM202);
1345
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1346
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1347
if (Feptr <= mb->end_subject - Flength &&
1348
memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
1349
else if (Loclength > 0 &&
1350
Feptr <= mb->end_subject - Loclength &&
1351
memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1352
Feptr += Loclength;
1353
else
1354
{
1355
CHECK_PARTIAL();
1356
RRETURN(MATCH_NOMATCH);
1357
}
1358
}
1359
PCRE2_UNREACHABLE(); /* Control never reaches here */
1360
}
1361
1362
else /* Maximize */
1363
{
1364
Lstart_eptr = Feptr;
1365
for (i = Lmin; i < Lmax; i++)
1366
{
1367
if (Feptr <= mb->end_subject - Flength &&
1368
memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0)
1369
Feptr += Flength;
1370
else if (Loclength > 0 &&
1371
Feptr <= mb->end_subject - Loclength &&
1372
memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1373
Feptr += Loclength;
1374
else
1375
{
1376
CHECK_PARTIAL();
1377
break;
1378
}
1379
}
1380
1381
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
1382
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1383
go too far. */
1384
1385
if (reptype != REPTYPE_POS) for(;;)
1386
{
1387
if (Feptr <= Lstart_eptr) break;
1388
RMATCH(Fecode, RM203);
1389
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1390
Feptr--;
1391
BACKCHAR(Feptr);
1392
}
1393
}
1394
break; /* End of repeated wide character handling */
1395
}
1396
1397
/* Length of UTF character is 1. Put it into the preserved variable and
1398
fall through to the non-UTF code. */
1399
1400
Lc = fc;
1401
}
1402
else
1403
#endif /* SUPPORT_UNICODE */
1404
1405
/* When not in UTF mode, load a single-code-unit character. Then proceed as
1406
above, using Unicode casing if either UTF or UCP is set. */
1407
1408
Lc = *Fecode++;
1409
1410
/* Caseless comparison */
1411
1412
if (Fop >= OP_STARI)
1413
{
1414
#if PCRE2_CODE_UNIT_WIDTH == 8
1415
#ifdef SUPPORT_UNICODE
1416
if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
1417
else
1418
#endif /* SUPPORT_UNICODE */
1419
/* Lc will be < 128 in UTF-8 mode. */
1420
Loc = mb->fcc[Lc];
1421
#else /* 16-bit & 32-bit */
1422
#ifdef SUPPORT_UNICODE
1423
if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
1424
else
1425
#endif /* SUPPORT_UNICODE */
1426
Loc = TABLE_GET(Lc, mb->fcc, Lc);
1427
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
1428
1429
for (i = 1; i <= Lmin; i++)
1430
{
1431
uint32_t cc; /* Faster than PCRE2_UCHAR */
1432
if (Feptr >= mb->end_subject)
1433
{
1434
SCHECK_PARTIAL();
1435
RRETURN(MATCH_NOMATCH);
1436
}
1437
cc = UCHAR21TEST(Feptr);
1438
if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
1439
Feptr++;
1440
}
1441
if (Lmin == Lmax) continue;
1442
1443
if (reptype == REPTYPE_MIN)
1444
{
1445
for (;;)
1446
{
1447
uint32_t cc; /* Faster than PCRE2_UCHAR */
1448
RMATCH(Fecode, RM25);
1449
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1450
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1451
if (Feptr >= mb->end_subject)
1452
{
1453
SCHECK_PARTIAL();
1454
RRETURN(MATCH_NOMATCH);
1455
}
1456
cc = UCHAR21TEST(Feptr);
1457
if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
1458
Feptr++;
1459
}
1460
PCRE2_UNREACHABLE(); /* Control never reaches here */
1461
}
1462
1463
else /* Maximize */
1464
{
1465
Lstart_eptr = Feptr;
1466
for (i = Lmin; i < Lmax; i++)
1467
{
1468
uint32_t cc; /* Faster than PCRE2_UCHAR */
1469
if (Feptr >= mb->end_subject)
1470
{
1471
SCHECK_PARTIAL();
1472
break;
1473
}
1474
cc = UCHAR21TEST(Feptr);
1475
if (Lc != cc && Loc != cc) break;
1476
Feptr++;
1477
}
1478
if (reptype != REPTYPE_POS) for (;;)
1479
{
1480
if (Feptr == Lstart_eptr) break;
1481
RMATCH(Fecode, RM26);
1482
Feptr--;
1483
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1484
}
1485
}
1486
}
1487
1488
/* Caseful comparisons (includes all multi-byte characters) */
1489
1490
else
1491
{
1492
for (i = 1; i <= Lmin; i++)
1493
{
1494
if (Feptr >= mb->end_subject)
1495
{
1496
SCHECK_PARTIAL();
1497
RRETURN(MATCH_NOMATCH);
1498
}
1499
if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
1500
}
1501
1502
if (Lmin == Lmax) continue;
1503
1504
if (reptype == REPTYPE_MIN)
1505
{
1506
for (;;)
1507
{
1508
RMATCH(Fecode, RM27);
1509
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1510
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1511
if (Feptr >= mb->end_subject)
1512
{
1513
SCHECK_PARTIAL();
1514
RRETURN(MATCH_NOMATCH);
1515
}
1516
if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
1517
}
1518
PCRE2_UNREACHABLE(); /* Control never reaches here */
1519
}
1520
else /* Maximize */
1521
{
1522
Lstart_eptr = Feptr;
1523
for (i = Lmin; i < Lmax; i++)
1524
{
1525
if (Feptr >= mb->end_subject)
1526
{
1527
SCHECK_PARTIAL();
1528
break;
1529
}
1530
1531
if (Lc != UCHAR21TEST(Feptr)) break;
1532
Feptr++;
1533
}
1534
1535
if (reptype != REPTYPE_POS) for (;;)
1536
{
1537
if (Feptr <= Lstart_eptr) break;
1538
RMATCH(Fecode, RM28);
1539
Feptr--;
1540
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1541
}
1542
}
1543
}
1544
break;
1545
1546
#undef Loclength
1547
#undef Lstart_eptr
1548
#undef Lcharptr
1549
#undef Lmin
1550
#undef Lmax
1551
#undef Lc
1552
#undef Loc
1553
1554
1555
/* ===================================================================== */
1556
/* Match a negated single one-byte character repeatedly. This is almost a
1557
repeat of the code for a repeated single character, but I haven't found a
1558
nice way of commoning these up that doesn't require a test of the
1559
positive/negative option for each character match. Maybe that wouldn't add
1560
very much to the time taken, but character matching *is* what this is all
1561
about... */
1562
1563
#define Lstart_eptr F->temp_sptr[0]
1564
#define Lmin F->temp_32[0]
1565
#define Lmax F->temp_32[1]
1566
#define Lc F->temp_32[2]
1567
#define Loc F->temp_32[3]
1568
1569
case OP_NOTEXACT:
1570
case OP_NOTEXACTI:
1571
Lmin = Lmax = GET2(Fecode, 1);
1572
Fecode += 1 + IMM2_SIZE;
1573
goto REPEATNOTCHAR;
1574
1575
case OP_NOTUPTO:
1576
case OP_NOTUPTOI:
1577
Lmin = 0;
1578
Lmax = GET2(Fecode, 1);
1579
reptype = REPTYPE_MAX;
1580
Fecode += 1 + IMM2_SIZE;
1581
goto REPEATNOTCHAR;
1582
1583
case OP_NOTMINUPTO:
1584
case OP_NOTMINUPTOI:
1585
Lmin = 0;
1586
Lmax = GET2(Fecode, 1);
1587
reptype = REPTYPE_MIN;
1588
Fecode += 1 + IMM2_SIZE;
1589
goto REPEATNOTCHAR;
1590
1591
case OP_NOTPOSSTAR:
1592
case OP_NOTPOSSTARI:
1593
reptype = REPTYPE_POS;
1594
Lmin = 0;
1595
Lmax = UINT32_MAX;
1596
Fecode++;
1597
goto REPEATNOTCHAR;
1598
1599
case OP_NOTPOSPLUS:
1600
case OP_NOTPOSPLUSI:
1601
reptype = REPTYPE_POS;
1602
Lmin = 1;
1603
Lmax = UINT32_MAX;
1604
Fecode++;
1605
goto REPEATNOTCHAR;
1606
1607
case OP_NOTPOSQUERY:
1608
case OP_NOTPOSQUERYI:
1609
reptype = REPTYPE_POS;
1610
Lmin = 0;
1611
Lmax = 1;
1612
Fecode++;
1613
goto REPEATNOTCHAR;
1614
1615
case OP_NOTPOSUPTO:
1616
case OP_NOTPOSUPTOI:
1617
reptype = REPTYPE_POS;
1618
Lmin = 0;
1619
Lmax = GET2(Fecode, 1);
1620
Fecode += 1 + IMM2_SIZE;
1621
goto REPEATNOTCHAR;
1622
1623
case OP_NOTSTAR:
1624
case OP_NOTSTARI:
1625
case OP_NOTMINSTAR:
1626
case OP_NOTMINSTARI:
1627
case OP_NOTPLUS:
1628
case OP_NOTPLUSI:
1629
case OP_NOTMINPLUS:
1630
case OP_NOTMINPLUSI:
1631
case OP_NOTQUERY:
1632
case OP_NOTQUERYI:
1633
case OP_NOTMINQUERY:
1634
case OP_NOTMINQUERYI:
1635
fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
1636
Lmin = rep_min[fc];
1637
Lmax = rep_max[fc];
1638
reptype = rep_typ[fc];
1639
1640
/* Common code for all repeated single-character non-matches. */
1641
1642
REPEATNOTCHAR:
1643
GETCHARINCTEST(Lc, Fecode);
1644
1645
/* The code is duplicated for the caseless and caseful cases, for speed,
1646
since matching characters is likely to be quite common. First, ensure the
1647
minimum number of matches are present. If Lmin = Lmax, we are done.
1648
Otherwise, if minimizing, keep trying the rest of the expression and
1649
advancing one matching character if failing, up to the maximum.
1650
Alternatively, if maximizing, find the maximum number of characters and
1651
work backwards. */
1652
1653
if (Fop >= OP_NOTSTARI) /* Caseless */
1654
{
1655
#ifdef SUPPORT_UNICODE
1656
if ((utf || ucp) && Lc > 127)
1657
Loc = UCD_OTHERCASE(Lc);
1658
else
1659
#endif /* SUPPORT_UNICODE */
1660
1661
Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */
1662
1663
#ifdef SUPPORT_UNICODE
1664
if (utf)
1665
{
1666
uint32_t d;
1667
for (i = 1; i <= Lmin; i++)
1668
{
1669
if (Feptr >= mb->end_subject)
1670
{
1671
SCHECK_PARTIAL();
1672
RRETURN(MATCH_NOMATCH);
1673
}
1674
GETCHARINC(d, Feptr);
1675
if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
1676
}
1677
}
1678
else
1679
#endif /* SUPPORT_UNICODE */
1680
1681
/* Not UTF mode */
1682
{
1683
for (i = 1; i <= Lmin; i++)
1684
{
1685
if (Feptr >= mb->end_subject)
1686
{
1687
SCHECK_PARTIAL();
1688
RRETURN(MATCH_NOMATCH);
1689
}
1690
if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
1691
Feptr++;
1692
}
1693
}
1694
1695
if (Lmin == Lmax) continue; /* Finished for exact count */
1696
1697
if (reptype == REPTYPE_MIN)
1698
{
1699
#ifdef SUPPORT_UNICODE
1700
if (utf)
1701
{
1702
uint32_t d;
1703
for (;;)
1704
{
1705
RMATCH(Fecode, RM204);
1706
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1707
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1708
if (Feptr >= mb->end_subject)
1709
{
1710
SCHECK_PARTIAL();
1711
RRETURN(MATCH_NOMATCH);
1712
}
1713
GETCHARINC(d, Feptr);
1714
if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
1715
}
1716
}
1717
else
1718
#endif /*SUPPORT_UNICODE */
1719
1720
/* Not UTF mode */
1721
{
1722
for (;;)
1723
{
1724
RMATCH(Fecode, RM29);
1725
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1726
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1727
if (Feptr >= mb->end_subject)
1728
{
1729
SCHECK_PARTIAL();
1730
RRETURN(MATCH_NOMATCH);
1731
}
1732
if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
1733
Feptr++;
1734
}
1735
}
1736
PCRE2_UNREACHABLE(); /* Control never reaches here */
1737
}
1738
1739
/* Maximize case */
1740
1741
else
1742
{
1743
Lstart_eptr = Feptr;
1744
1745
#ifdef SUPPORT_UNICODE
1746
if (utf)
1747
{
1748
uint32_t d;
1749
for (i = Lmin; i < Lmax; i++)
1750
{
1751
int len = 1;
1752
if (Feptr >= mb->end_subject)
1753
{
1754
SCHECK_PARTIAL();
1755
break;
1756
}
1757
GETCHARLEN(d, Feptr, len);
1758
if (Lc == d || Loc == d) break;
1759
Feptr += len;
1760
}
1761
1762
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
1763
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1764
go too far. */
1765
1766
if (reptype != REPTYPE_POS) for(;;)
1767
{
1768
if (Feptr <= Lstart_eptr) break;
1769
RMATCH(Fecode, RM205);
1770
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1771
Feptr--;
1772
BACKCHAR(Feptr);
1773
}
1774
}
1775
else
1776
#endif /* SUPPORT_UNICODE */
1777
1778
/* Not UTF mode */
1779
{
1780
for (i = Lmin; i < Lmax; i++)
1781
{
1782
if (Feptr >= mb->end_subject)
1783
{
1784
SCHECK_PARTIAL();
1785
break;
1786
}
1787
if (Lc == *Feptr || Loc == *Feptr) break;
1788
Feptr++;
1789
}
1790
if (reptype != REPTYPE_POS) for (;;)
1791
{
1792
if (Feptr == Lstart_eptr) break;
1793
RMATCH(Fecode, RM30);
1794
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1795
Feptr--;
1796
}
1797
}
1798
}
1799
}
1800
1801
/* Caseful comparisons */
1802
1803
else
1804
{
1805
#ifdef SUPPORT_UNICODE
1806
if (utf)
1807
{
1808
uint32_t d;
1809
for (i = 1; i <= Lmin; i++)
1810
{
1811
if (Feptr >= mb->end_subject)
1812
{
1813
SCHECK_PARTIAL();
1814
RRETURN(MATCH_NOMATCH);
1815
}
1816
GETCHARINC(d, Feptr);
1817
if (Lc == d) RRETURN(MATCH_NOMATCH);
1818
}
1819
}
1820
else
1821
#endif
1822
/* Not UTF mode */
1823
{
1824
for (i = 1; i <= Lmin; i++)
1825
{
1826
if (Feptr >= mb->end_subject)
1827
{
1828
SCHECK_PARTIAL();
1829
RRETURN(MATCH_NOMATCH);
1830
}
1831
if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
1832
}
1833
}
1834
1835
if (Lmin == Lmax) continue;
1836
1837
if (reptype == REPTYPE_MIN)
1838
{
1839
#ifdef SUPPORT_UNICODE
1840
if (utf)
1841
{
1842
uint32_t d;
1843
for (;;)
1844
{
1845
RMATCH(Fecode, RM206);
1846
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1847
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1848
if (Feptr >= mb->end_subject)
1849
{
1850
SCHECK_PARTIAL();
1851
RRETURN(MATCH_NOMATCH);
1852
}
1853
GETCHARINC(d, Feptr);
1854
if (Lc == d) RRETURN(MATCH_NOMATCH);
1855
}
1856
}
1857
else
1858
#endif
1859
/* Not UTF mode */
1860
{
1861
for (;;)
1862
{
1863
RMATCH(Fecode, RM31);
1864
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1865
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1866
if (Feptr >= mb->end_subject)
1867
{
1868
SCHECK_PARTIAL();
1869
RRETURN(MATCH_NOMATCH);
1870
}
1871
if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
1872
}
1873
}
1874
PCRE2_UNREACHABLE(); /* Control never reaches here */
1875
}
1876
1877
/* Maximize case */
1878
1879
else
1880
{
1881
Lstart_eptr = Feptr;
1882
1883
#ifdef SUPPORT_UNICODE
1884
if (utf)
1885
{
1886
uint32_t d;
1887
for (i = Lmin; i < Lmax; i++)
1888
{
1889
int len = 1;
1890
if (Feptr >= mb->end_subject)
1891
{
1892
SCHECK_PARTIAL();
1893
break;
1894
}
1895
GETCHARLEN(d, Feptr, len);
1896
if (Lc == d) break;
1897
Feptr += len;
1898
}
1899
1900
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
1901
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1902
go too far. */
1903
1904
if (reptype != REPTYPE_POS) for(;;)
1905
{
1906
if (Feptr <= Lstart_eptr) break;
1907
RMATCH(Fecode, RM207);
1908
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1909
Feptr--;
1910
BACKCHAR(Feptr);
1911
}
1912
}
1913
else
1914
#endif
1915
/* Not UTF mode */
1916
{
1917
for (i = Lmin; i < Lmax; i++)
1918
{
1919
if (Feptr >= mb->end_subject)
1920
{
1921
SCHECK_PARTIAL();
1922
break;
1923
}
1924
if (Lc == *Feptr) break;
1925
Feptr++;
1926
}
1927
if (reptype != REPTYPE_POS) for (;;)
1928
{
1929
if (Feptr == Lstart_eptr) break;
1930
RMATCH(Fecode, RM32);
1931
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1932
Feptr--;
1933
}
1934
}
1935
}
1936
}
1937
break;
1938
1939
#undef Lstart_eptr
1940
#undef Lmin
1941
#undef Lmax
1942
#undef Lc
1943
#undef Loc
1944
1945
1946
/* ===================================================================== */
1947
/* Match a bit-mapped character class, possibly repeatedly. These opcodes
1948
are used when all the characters in the class have values in the range
1949
0-255, and either the matching is caseful, or the characters are in the
1950
range 0-127 when UTF processing is enabled. The only difference between
1951
OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1952
encountered. */
1953
1954
#define Lmin F->temp_32[0]
1955
#define Lmax F->temp_32[1]
1956
#define Lstart_eptr F->temp_sptr[0]
1957
#define Lbyte_map_address F->temp_sptr[1]
1958
#define Lbyte_map ((const unsigned char *)Lbyte_map_address)
1959
1960
case OP_NCLASS:
1961
case OP_CLASS:
1962
{
1963
Lbyte_map_address = Fecode + 1; /* Save for matching */
1964
Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */
1965
1966
/* Look past the end of the item to see if there is repeat information
1967
following. Then obey similar code to character type repeats. */
1968
1969
switch (*Fecode)
1970
{
1971
case OP_CRSTAR:
1972
case OP_CRMINSTAR:
1973
case OP_CRPLUS:
1974
case OP_CRMINPLUS:
1975
case OP_CRQUERY:
1976
case OP_CRMINQUERY:
1977
case OP_CRPOSSTAR:
1978
case OP_CRPOSPLUS:
1979
case OP_CRPOSQUERY:
1980
fc = *Fecode++ - OP_CRSTAR;
1981
Lmin = rep_min[fc];
1982
Lmax = rep_max[fc];
1983
reptype = rep_typ[fc];
1984
break;
1985
1986
case OP_CRRANGE:
1987
case OP_CRMINRANGE:
1988
case OP_CRPOSRANGE:
1989
Lmin = GET2(Fecode, 1);
1990
Lmax = GET2(Fecode, 1 + IMM2_SIZE);
1991
if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
1992
reptype = rep_typ[*Fecode - OP_CRSTAR];
1993
Fecode += 1 + 2 * IMM2_SIZE;
1994
break;
1995
1996
default: /* No repeat follows */
1997
Lmin = Lmax = 1;
1998
break;
1999
}
2000
2001
/* First, ensure the minimum number of matches are present. */
2002
2003
#ifdef SUPPORT_UNICODE
2004
if (utf)
2005
{
2006
for (i = 1; i <= Lmin; i++)
2007
{
2008
if (Feptr >= mb->end_subject)
2009
{
2010
SCHECK_PARTIAL();
2011
RRETURN(MATCH_NOMATCH);
2012
}
2013
GETCHARINC(fc, Feptr);
2014
if (fc > 255)
2015
{
2016
if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2017
}
2018
else
2019
if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2020
}
2021
}
2022
else
2023
#endif
2024
/* Not UTF mode */
2025
{
2026
for (i = 1; i <= Lmin; i++)
2027
{
2028
if (Feptr >= mb->end_subject)
2029
{
2030
SCHECK_PARTIAL();
2031
RRETURN(MATCH_NOMATCH);
2032
}
2033
fc = *Feptr++;
2034
#if PCRE2_CODE_UNIT_WIDTH != 8
2035
if (fc > 255)
2036
{
2037
if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2038
}
2039
else
2040
#endif
2041
if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2042
}
2043
}
2044
2045
/* If Lmax == Lmin we are done. Continue with main loop. */
2046
2047
if (Lmin == Lmax) continue;
2048
2049
/* If minimizing, keep testing the rest of the expression and advancing
2050
the pointer while it matches the class. */
2051
2052
if (reptype == REPTYPE_MIN)
2053
{
2054
#ifdef SUPPORT_UNICODE
2055
if (utf)
2056
{
2057
for (;;)
2058
{
2059
RMATCH(Fecode, RM200);
2060
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2061
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
2062
if (Feptr >= mb->end_subject)
2063
{
2064
SCHECK_PARTIAL();
2065
RRETURN(MATCH_NOMATCH);
2066
}
2067
GETCHARINC(fc, Feptr);
2068
if (fc > 255)
2069
{
2070
if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2071
}
2072
else
2073
if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2074
}
2075
}
2076
else
2077
#endif
2078
/* Not UTF mode */
2079
{
2080
for (;;)
2081
{
2082
RMATCH(Fecode, RM23);
2083
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2084
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
2085
if (Feptr >= mb->end_subject)
2086
{
2087
SCHECK_PARTIAL();
2088
RRETURN(MATCH_NOMATCH);
2089
}
2090
fc = *Feptr++;
2091
#if PCRE2_CODE_UNIT_WIDTH != 8
2092
if (fc > 255)
2093
{
2094
if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2095
}
2096
else
2097
#endif
2098
if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2099
}
2100
}
2101
PCRE2_UNREACHABLE(); /* Control never reaches here */
2102
}
2103
2104
/* If maximizing, find the longest possible run, then work backwards. */
2105
2106
else
2107
{
2108
Lstart_eptr = Feptr;
2109
2110
#ifdef SUPPORT_UNICODE
2111
if (utf)
2112
{
2113
for (i = Lmin; i < Lmax; i++)
2114
{
2115
int len = 1;
2116
if (Feptr >= mb->end_subject)
2117
{
2118
SCHECK_PARTIAL();
2119
break;
2120
}
2121
GETCHARLEN(fc, Feptr, len);
2122
if (fc > 255)
2123
{
2124
if (Fop == OP_CLASS) break;
2125
}
2126
else
2127
if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
2128
Feptr += len;
2129
}
2130
2131
if (reptype == REPTYPE_POS) continue; /* No backtracking */
2132
2133
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
2134
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2135
go too far. */
2136
2137
for (;;)
2138
{
2139
RMATCH(Fecode, RM201);
2140
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2141
if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2142
BACKCHAR(Feptr);
2143
}
2144
}
2145
else
2146
#endif
2147
/* Not UTF mode */
2148
{
2149
for (i = Lmin; i < Lmax; i++)
2150
{
2151
if (Feptr >= mb->end_subject)
2152
{
2153
SCHECK_PARTIAL();
2154
break;
2155
}
2156
fc = *Feptr;
2157
#if PCRE2_CODE_UNIT_WIDTH != 8
2158
if (fc > 255)
2159
{
2160
if (Fop == OP_CLASS) break;
2161
}
2162
else
2163
#endif
2164
if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
2165
Feptr++;
2166
}
2167
2168
if (reptype == REPTYPE_POS) continue; /* No backtracking */
2169
2170
while (Feptr >= Lstart_eptr)
2171
{
2172
RMATCH(Fecode, RM24);
2173
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2174
Feptr--;
2175
}
2176
}
2177
2178
RRETURN(MATCH_NOMATCH);
2179
}
2180
}
2181
2182
PCRE2_UNREACHABLE(); /* Control never reaches here */
2183
2184
#undef Lbyte_map_address
2185
#undef Lbyte_map
2186
#undef Lstart_eptr
2187
#undef Lmin
2188
#undef Lmax
2189
2190
2191
/* ===================================================================== */
2192
/* Match an extended character class. In the 8-bit library, this opcode is
2193
encountered only when UTF-8 mode is supported. In the 16-bit and
2194
32-bit libraries, codepoints greater than 255 may be encountered even when
2195
UTF is not supported. */
2196
2197
#define Lstart_eptr F->temp_sptr[0]
2198
#define Lxclass_data F->temp_sptr[1]
2199
#define Lmin F->temp_32[0]
2200
#define Lmax F->temp_32[1]
2201
2202
#ifdef SUPPORT_WIDE_CHARS
2203
case OP_XCLASS:
2204
{
2205
Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */
2206
Fecode += GET(Fecode, 1); /* Advance past the item */
2207
2208
switch (*Fecode)
2209
{
2210
case OP_CRSTAR:
2211
case OP_CRMINSTAR:
2212
case OP_CRPLUS:
2213
case OP_CRMINPLUS:
2214
case OP_CRQUERY:
2215
case OP_CRMINQUERY:
2216
case OP_CRPOSSTAR:
2217
case OP_CRPOSPLUS:
2218
case OP_CRPOSQUERY:
2219
fc = *Fecode++ - OP_CRSTAR;
2220
Lmin = rep_min[fc];
2221
Lmax = rep_max[fc];
2222
reptype = rep_typ[fc];
2223
break;
2224
2225
case OP_CRRANGE:
2226
case OP_CRMINRANGE:
2227
case OP_CRPOSRANGE:
2228
Lmin = GET2(Fecode, 1);
2229
Lmax = GET2(Fecode, 1 + IMM2_SIZE);
2230
if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
2231
reptype = rep_typ[*Fecode - OP_CRSTAR];
2232
Fecode += 1 + 2 * IMM2_SIZE;
2233
break;
2234
2235
default: /* No repeat follows */
2236
Lmin = Lmax = 1;
2237
break;
2238
}
2239
2240
/* First, ensure the minimum number of matches are present. */
2241
2242
for (i = 1; i <= Lmin; i++)
2243
{
2244
if (Feptr >= mb->end_subject)
2245
{
2246
SCHECK_PARTIAL();
2247
RRETURN(MATCH_NOMATCH);
2248
}
2249
GETCHARINCTEST(fc, Feptr);
2250
if (!PRIV(xclass)(fc, Lxclass_data,
2251
(const uint8_t*)mb->start_code, utf))
2252
RRETURN(MATCH_NOMATCH);
2253
}
2254
2255
/* If Lmax == Lmin we can just continue with the main loop. */
2256
2257
if (Lmin == Lmax) continue;
2258
2259
/* If minimizing, keep testing the rest of the expression and advancing
2260
the pointer while it matches the class. */
2261
2262
if (reptype == REPTYPE_MIN)
2263
{
2264
for (;;)
2265
{
2266
RMATCH(Fecode, RM100);
2267
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2268
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
2269
if (Feptr >= mb->end_subject)
2270
{
2271
SCHECK_PARTIAL();
2272
RRETURN(MATCH_NOMATCH);
2273
}
2274
GETCHARINCTEST(fc, Feptr);
2275
if (!PRIV(xclass)(fc, Lxclass_data,
2276
(const uint8_t*)mb->start_code, utf))
2277
RRETURN(MATCH_NOMATCH);
2278
}
2279
PCRE2_UNREACHABLE(); /* Control never reaches here */
2280
}
2281
2282
/* If maximizing, find the longest possible run, then work backwards. */
2283
2284
else
2285
{
2286
Lstart_eptr = Feptr;
2287
for (i = Lmin; i < Lmax; i++)
2288
{
2289
int len = 1;
2290
if (Feptr >= mb->end_subject)
2291
{
2292
SCHECK_PARTIAL();
2293
break;
2294
}
2295
#ifdef SUPPORT_UNICODE
2296
GETCHARLENTEST(fc, Feptr, len);
2297
#else
2298
fc = *Feptr;
2299
#endif
2300
if (!PRIV(xclass)(fc, Lxclass_data,
2301
(const uint8_t*)mb->start_code, utf)) break;
2302
Feptr += len;
2303
}
2304
2305
if (reptype == REPTYPE_POS) continue; /* No backtracking */
2306
2307
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
2308
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2309
go too far. */
2310
2311
for(;;)
2312
{
2313
RMATCH(Fecode, RM101);
2314
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2315
if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2316
#ifdef SUPPORT_UNICODE
2317
if (utf) BACKCHAR(Feptr);
2318
#endif
2319
}
2320
RRETURN(MATCH_NOMATCH);
2321
}
2322
2323
PCRE2_UNREACHABLE(); /* Control never reaches here */
2324
}
2325
#endif /* SUPPORT_WIDE_CHARS: end of XCLASS */
2326
2327
#undef Lstart_eptr
2328
#undef Lxclass_data
2329
#undef Lmin
2330
#undef Lmax
2331
2332
2333
/* ===================================================================== */
2334
/* Match a complex, set-based character class. These opcodes are used when
2335
there is complex nesting or logical operations within the character
2336
class. */
2337
2338
#define Lstart_eptr F->temp_sptr[0]
2339
#define Leclass_data F->temp_sptr[1]
2340
#define Leclass_len F->temp_size
2341
#define Lmin F->temp_32[0]
2342
#define Lmax F->temp_32[1]
2343
2344
#ifdef SUPPORT_WIDE_CHARS
2345
case OP_ECLASS:
2346
{
2347
Leclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */
2348
Fecode += GET(Fecode, 1); /* Advance past the item */
2349
Leclass_len = (PCRE2_SIZE)(Fecode - Leclass_data);
2350
2351
switch (*Fecode)
2352
{
2353
case OP_CRSTAR:
2354
case OP_CRMINSTAR:
2355
case OP_CRPLUS:
2356
case OP_CRMINPLUS:
2357
case OP_CRQUERY:
2358
case OP_CRMINQUERY:
2359
case OP_CRPOSSTAR:
2360
case OP_CRPOSPLUS:
2361
case OP_CRPOSQUERY:
2362
fc = *Fecode++ - OP_CRSTAR;
2363
Lmin = rep_min[fc];
2364
Lmax = rep_max[fc];
2365
reptype = rep_typ[fc];
2366
break;
2367
2368
case OP_CRRANGE:
2369
case OP_CRMINRANGE:
2370
case OP_CRPOSRANGE:
2371
Lmin = GET2(Fecode, 1);
2372
Lmax = GET2(Fecode, 1 + IMM2_SIZE);
2373
if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
2374
reptype = rep_typ[*Fecode - OP_CRSTAR];
2375
Fecode += 1 + 2 * IMM2_SIZE;
2376
break;
2377
2378
default: /* No repeat follows */
2379
Lmin = Lmax = 1;
2380
break;
2381
}
2382
2383
/* First, ensure the minimum number of matches are present. */
2384
2385
for (i = 1; i <= Lmin; i++)
2386
{
2387
if (Feptr >= mb->end_subject)
2388
{
2389
SCHECK_PARTIAL();
2390
RRETURN(MATCH_NOMATCH);
2391
}
2392
GETCHARINCTEST(fc, Feptr);
2393
if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len,
2394
(const uint8_t*)mb->start_code, utf))
2395
RRETURN(MATCH_NOMATCH);
2396
}
2397
2398
/* If Lmax == Lmin we can just continue with the main loop. */
2399
2400
if (Lmin == Lmax) continue;
2401
2402
/* If minimizing, keep testing the rest of the expression and advancing
2403
the pointer while it matches the class. */
2404
2405
if (reptype == REPTYPE_MIN)
2406
{
2407
for (;;)
2408
{
2409
RMATCH(Fecode, RM102);
2410
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2411
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
2412
if (Feptr >= mb->end_subject)
2413
{
2414
SCHECK_PARTIAL();
2415
RRETURN(MATCH_NOMATCH);
2416
}
2417
GETCHARINCTEST(fc, Feptr);
2418
if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len,
2419
(const uint8_t*)mb->start_code, utf))
2420
RRETURN(MATCH_NOMATCH);
2421
}
2422
PCRE2_UNREACHABLE(); /* Control never reaches here */
2423
}
2424
2425
/* If maximizing, find the longest possible run, then work backwards. */
2426
2427
else
2428
{
2429
Lstart_eptr = Feptr;
2430
for (i = Lmin; i < Lmax; i++)
2431
{
2432
int len = 1;
2433
if (Feptr >= mb->end_subject)
2434
{
2435
SCHECK_PARTIAL();
2436
break;
2437
}
2438
#ifdef SUPPORT_UNICODE
2439
GETCHARLENTEST(fc, Feptr, len);
2440
#else
2441
fc = *Feptr;
2442
#endif
2443
if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len,
2444
(const uint8_t*)mb->start_code, utf))
2445
break;
2446
Feptr += len;
2447
}
2448
2449
if (reptype == REPTYPE_POS) continue; /* No backtracking */
2450
2451
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
2452
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2453
go too far. */
2454
2455
for(;;)
2456
{
2457
RMATCH(Fecode, RM103);
2458
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2459
if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2460
#ifdef SUPPORT_UNICODE
2461
if (utf) BACKCHAR(Feptr);
2462
#endif
2463
}
2464
RRETURN(MATCH_NOMATCH);
2465
}
2466
2467
PCRE2_UNREACHABLE(); /* Control never reaches here */
2468
}
2469
#endif /* SUPPORT_WIDE_CHARS: end of ECLASS */
2470
2471
#undef Lstart_eptr
2472
#undef Leclass_data
2473
#undef Leclass_len
2474
#undef Lmin
2475
#undef Lmax
2476
2477
2478
/* ===================================================================== */
2479
/* Match various character types when PCRE2_UCP is not set. These opcodes
2480
are not generated when PCRE2_UCP is set - instead appropriate property
2481
tests are compiled. */
2482
2483
case OP_NOT_DIGIT:
2484
if (Feptr >= mb->end_subject)
2485
{
2486
SCHECK_PARTIAL();
2487
RRETURN(MATCH_NOMATCH);
2488
}
2489
GETCHARINCTEST(fc, Feptr);
2490
if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
2491
RRETURN(MATCH_NOMATCH);
2492
Fecode++;
2493
break;
2494
2495
case OP_DIGIT:
2496
if (Feptr >= mb->end_subject)
2497
{
2498
SCHECK_PARTIAL();
2499
RRETURN(MATCH_NOMATCH);
2500
}
2501
GETCHARINCTEST(fc, Feptr);
2502
if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
2503
RRETURN(MATCH_NOMATCH);
2504
Fecode++;
2505
break;
2506
2507
case OP_NOT_WHITESPACE:
2508
if (Feptr >= mb->end_subject)
2509
{
2510
SCHECK_PARTIAL();
2511
RRETURN(MATCH_NOMATCH);
2512
}
2513
GETCHARINCTEST(fc, Feptr);
2514
if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
2515
RRETURN(MATCH_NOMATCH);
2516
Fecode++;
2517
break;
2518
2519
case OP_WHITESPACE:
2520
if (Feptr >= mb->end_subject)
2521
{
2522
SCHECK_PARTIAL();
2523
RRETURN(MATCH_NOMATCH);
2524
}
2525
GETCHARINCTEST(fc, Feptr);
2526
if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
2527
RRETURN(MATCH_NOMATCH);
2528
Fecode++;
2529
break;
2530
2531
case OP_NOT_WORDCHAR:
2532
if (Feptr >= mb->end_subject)
2533
{
2534
SCHECK_PARTIAL();
2535
RRETURN(MATCH_NOMATCH);
2536
}
2537
GETCHARINCTEST(fc, Feptr);
2538
if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
2539
RRETURN(MATCH_NOMATCH);
2540
Fecode++;
2541
break;
2542
2543
case OP_WORDCHAR:
2544
if (Feptr >= mb->end_subject)
2545
{
2546
SCHECK_PARTIAL();
2547
RRETURN(MATCH_NOMATCH);
2548
}
2549
GETCHARINCTEST(fc, Feptr);
2550
if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
2551
RRETURN(MATCH_NOMATCH);
2552
Fecode++;
2553
break;
2554
2555
case OP_ANYNL:
2556
if (Feptr >= mb->end_subject)
2557
{
2558
SCHECK_PARTIAL();
2559
RRETURN(MATCH_NOMATCH);
2560
}
2561
GETCHARINCTEST(fc, Feptr);
2562
switch(fc)
2563
{
2564
default: RRETURN(MATCH_NOMATCH);
2565
2566
case CHAR_CR:
2567
if (Feptr >= mb->end_subject)
2568
{
2569
SCHECK_PARTIAL();
2570
}
2571
else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++;
2572
break;
2573
2574
case CHAR_LF:
2575
break;
2576
2577
case CHAR_VT:
2578
case CHAR_FF:
2579
case CHAR_NEL:
2580
#ifndef EBCDIC
2581
case 0x2028:
2582
case 0x2029:
2583
#endif /* Not EBCDIC */
2584
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2585
break;
2586
}
2587
Fecode++;
2588
break;
2589
2590
case OP_NOT_HSPACE:
2591
if (Feptr >= mb->end_subject)
2592
{
2593
SCHECK_PARTIAL();
2594
RRETURN(MATCH_NOMATCH);
2595
}
2596
GETCHARINCTEST(fc, Feptr);
2597
switch(fc)
2598
{
2599
HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2600
default: break;
2601
}
2602
Fecode++;
2603
break;
2604
2605
case OP_HSPACE:
2606
if (Feptr >= mb->end_subject)
2607
{
2608
SCHECK_PARTIAL();
2609
RRETURN(MATCH_NOMATCH);
2610
}
2611
GETCHARINCTEST(fc, Feptr);
2612
switch(fc)
2613
{
2614
HSPACE_CASES: break; /* Byte and multibyte cases */
2615
default: RRETURN(MATCH_NOMATCH);
2616
}
2617
Fecode++;
2618
break;
2619
2620
case OP_NOT_VSPACE:
2621
if (Feptr >= mb->end_subject)
2622
{
2623
SCHECK_PARTIAL();
2624
RRETURN(MATCH_NOMATCH);
2625
}
2626
GETCHARINCTEST(fc, Feptr);
2627
switch(fc)
2628
{
2629
VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2630
default: break;
2631
}
2632
Fecode++;
2633
break;
2634
2635
case OP_VSPACE:
2636
if (Feptr >= mb->end_subject)
2637
{
2638
SCHECK_PARTIAL();
2639
RRETURN(MATCH_NOMATCH);
2640
}
2641
GETCHARINCTEST(fc, Feptr);
2642
switch(fc)
2643
{
2644
VSPACE_CASES: break;
2645
default: RRETURN(MATCH_NOMATCH);
2646
}
2647
Fecode++;
2648
break;
2649
2650
2651
#ifdef SUPPORT_UNICODE
2652
2653
/* ===================================================================== */
2654
/* Check the next character by Unicode property. We will get here only
2655
if the support is in the binary; otherwise a compile-time error occurs. */
2656
2657
case OP_PROP:
2658
case OP_NOTPROP:
2659
if (Feptr >= mb->end_subject)
2660
{
2661
SCHECK_PARTIAL();
2662
RRETURN(MATCH_NOMATCH);
2663
}
2664
GETCHARINCTEST(fc, Feptr);
2665
{
2666
const uint32_t *cp;
2667
uint32_t chartype;
2668
const ucd_record *prop = GET_UCD(fc);
2669
BOOL notmatch = Fop == OP_NOTPROP;
2670
2671
switch(Fecode[1])
2672
{
2673
case PT_LAMP:
2674
chartype = prop->chartype;
2675
if ((chartype == ucp_Lu ||
2676
chartype == ucp_Ll ||
2677
chartype == ucp_Lt) == notmatch)
2678
RRETURN(MATCH_NOMATCH);
2679
break;
2680
2681
case PT_GC:
2682
if ((Fecode[2] == PRIV(ucp_gentype)[prop->chartype]) == notmatch)
2683
RRETURN(MATCH_NOMATCH);
2684
break;
2685
2686
case PT_PC:
2687
if ((Fecode[2] == prop->chartype) == notmatch)
2688
RRETURN(MATCH_NOMATCH);
2689
break;
2690
2691
case PT_SC:
2692
if ((Fecode[2] == prop->script) == notmatch)
2693
RRETURN(MATCH_NOMATCH);
2694
break;
2695
2696
case PT_SCX:
2697
{
2698
BOOL ok = (Fecode[2] == prop->script ||
2699
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Fecode[2]) != 0);
2700
if (ok == notmatch) RRETURN(MATCH_NOMATCH);
2701
}
2702
break;
2703
2704
/* These are specials */
2705
2706
case PT_ALNUM:
2707
chartype = prop->chartype;
2708
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
2709
PRIV(ucp_gentype)[chartype] == ucp_N) == notmatch)
2710
RRETURN(MATCH_NOMATCH);
2711
break;
2712
2713
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
2714
which means that Perl space and POSIX space are now identical. PCRE
2715
was changed at release 8.34. */
2716
2717
case PT_SPACE: /* Perl space */
2718
case PT_PXSPACE: /* POSIX space */
2719
switch(fc)
2720
{
2721
HSPACE_CASES:
2722
VSPACE_CASES:
2723
if (notmatch) RRETURN(MATCH_NOMATCH);
2724
break;
2725
2726
default:
2727
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == notmatch)
2728
RRETURN(MATCH_NOMATCH);
2729
break;
2730
}
2731
break;
2732
2733
case PT_WORD:
2734
chartype = prop->chartype;
2735
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
2736
PRIV(ucp_gentype)[chartype] == ucp_N ||
2737
chartype == ucp_Mn ||
2738
chartype == ucp_Pc) == notmatch)
2739
RRETURN(MATCH_NOMATCH);
2740
break;
2741
2742
case PT_CLIST:
2743
#if PCRE2_CODE_UNIT_WIDTH == 32
2744
if (fc > MAX_UTF_CODE_POINT)
2745
{
2746
if (notmatch) break;;
2747
RRETURN(MATCH_NOMATCH);
2748
}
2749
#endif
2750
cp = PRIV(ucd_caseless_sets) + Fecode[2];
2751
for (;;)
2752
{
2753
if (fc < *cp)
2754
{ if (notmatch) break; else { RRETURN(MATCH_NOMATCH); } }
2755
if (fc == *cp++)
2756
{ if (notmatch) { RRETURN(MATCH_NOMATCH); } else break; }
2757
}
2758
break;
2759
2760
case PT_UCNC:
2761
if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2762
fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
2763
fc >= 0xe000) == notmatch)
2764
RRETURN(MATCH_NOMATCH);
2765
break;
2766
2767
case PT_BIDICL:
2768
if ((UCD_BIDICLASS_PROP(prop) == Fecode[2]) == notmatch)
2769
RRETURN(MATCH_NOMATCH);
2770
break;
2771
2772
case PT_BOOL:
2773
{
2774
BOOL ok = MAPBIT(PRIV(ucd_boolprop_sets) +
2775
UCD_BPROPS_PROP(prop), Fecode[2]) != 0;
2776
if (ok == notmatch) RRETURN(MATCH_NOMATCH);
2777
}
2778
break;
2779
2780
/* This should never occur */
2781
2782
default:
2783
PCRE2_DEBUG_UNREACHABLE();
2784
return PCRE2_ERROR_INTERNAL;
2785
}
2786
2787
Fecode += 3;
2788
}
2789
break;
2790
2791
2792
/* ===================================================================== */
2793
/* Match an extended Unicode sequence. We will get here only if the support
2794
is in the binary; otherwise a compile-time error occurs. */
2795
2796
case OP_EXTUNI:
2797
if (Feptr >= mb->end_subject)
2798
{
2799
SCHECK_PARTIAL();
2800
RRETURN(MATCH_NOMATCH);
2801
}
2802
else
2803
{
2804
GETCHARINCTEST(fc, Feptr);
2805
Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf,
2806
NULL);
2807
}
2808
CHECK_PARTIAL();
2809
Fecode++;
2810
break;
2811
2812
#endif /* SUPPORT_UNICODE */
2813
2814
2815
/* ===================================================================== */
2816
/* Match a single character type repeatedly. Note that the property type
2817
does not need to be in a stack frame as it is not used within an RMATCH()
2818
loop. */
2819
2820
#define Lstart_eptr F->temp_sptr[0]
2821
#define Lmin F->temp_32[0]
2822
#define Lmax F->temp_32[1]
2823
#define Lctype F->temp_32[2]
2824
#define Lpropvalue F->temp_32[3]
2825
2826
case OP_TYPEEXACT:
2827
Lmin = Lmax = GET2(Fecode, 1);
2828
Fecode += 1 + IMM2_SIZE;
2829
goto REPEATTYPE;
2830
2831
case OP_TYPEUPTO:
2832
case OP_TYPEMINUPTO:
2833
Lmin = 0;
2834
Lmax = GET2(Fecode, 1);
2835
reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX;
2836
Fecode += 1 + IMM2_SIZE;
2837
goto REPEATTYPE;
2838
2839
case OP_TYPEPOSSTAR:
2840
reptype = REPTYPE_POS;
2841
Lmin = 0;
2842
Lmax = UINT32_MAX;
2843
Fecode++;
2844
goto REPEATTYPE;
2845
2846
case OP_TYPEPOSPLUS:
2847
reptype = REPTYPE_POS;
2848
Lmin = 1;
2849
Lmax = UINT32_MAX;
2850
Fecode++;
2851
goto REPEATTYPE;
2852
2853
case OP_TYPEPOSQUERY:
2854
reptype = REPTYPE_POS;
2855
Lmin = 0;
2856
Lmax = 1;
2857
Fecode++;
2858
goto REPEATTYPE;
2859
2860
case OP_TYPEPOSUPTO:
2861
reptype = REPTYPE_POS;
2862
Lmin = 0;
2863
Lmax = GET2(Fecode, 1);
2864
Fecode += 1 + IMM2_SIZE;
2865
goto REPEATTYPE;
2866
2867
case OP_TYPESTAR:
2868
case OP_TYPEMINSTAR:
2869
case OP_TYPEPLUS:
2870
case OP_TYPEMINPLUS:
2871
case OP_TYPEQUERY:
2872
case OP_TYPEMINQUERY:
2873
fc = *Fecode++ - OP_TYPESTAR;
2874
Lmin = rep_min[fc];
2875
Lmax = rep_max[fc];
2876
reptype = rep_typ[fc];
2877
2878
/* Common code for all repeated character type matches. */
2879
2880
REPEATTYPE:
2881
Lctype = *Fecode++; /* Code for the character type */
2882
2883
#ifdef SUPPORT_UNICODE
2884
if (Lctype == OP_PROP || Lctype == OP_NOTPROP)
2885
{
2886
proptype = *Fecode++;
2887
Lpropvalue = *Fecode++;
2888
}
2889
else proptype = -1;
2890
#endif
2891
2892
/* First, ensure the minimum number of matches are present. Use inline
2893
code for maximizing the speed, and do the type test once at the start
2894
(i.e. keep it out of the loops). As there are no calls to RMATCH in the
2895
loops, we can use an ordinary variable for "notmatch". The code for UTF
2896
mode is separated out for tidiness, except for Unicode property tests. */
2897
2898
if (Lmin > 0)
2899
{
2900
#ifdef SUPPORT_UNICODE
2901
if (proptype >= 0) /* Property tests in all modes */
2902
{
2903
BOOL notmatch = Lctype == OP_NOTPROP;
2904
switch(proptype)
2905
{
2906
case PT_LAMP:
2907
for (i = 1; i <= Lmin; i++)
2908
{
2909
int chartype;
2910
if (Feptr >= mb->end_subject)
2911
{
2912
SCHECK_PARTIAL();
2913
RRETURN(MATCH_NOMATCH);
2914
}
2915
GETCHARINCTEST(fc, Feptr);
2916
chartype = UCD_CHARTYPE(fc);
2917
if ((chartype == ucp_Lu ||
2918
chartype == ucp_Ll ||
2919
chartype == ucp_Lt) == notmatch)
2920
RRETURN(MATCH_NOMATCH);
2921
}
2922
break;
2923
2924
case PT_GC:
2925
for (i = 1; i <= Lmin; i++)
2926
{
2927
if (Feptr >= mb->end_subject)
2928
{
2929
SCHECK_PARTIAL();
2930
RRETURN(MATCH_NOMATCH);
2931
}
2932
GETCHARINCTEST(fc, Feptr);
2933
if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch)
2934
RRETURN(MATCH_NOMATCH);
2935
}
2936
break;
2937
2938
case PT_PC:
2939
for (i = 1; i <= Lmin; i++)
2940
{
2941
if (Feptr >= mb->end_subject)
2942
{
2943
SCHECK_PARTIAL();
2944
RRETURN(MATCH_NOMATCH);
2945
}
2946
GETCHARINCTEST(fc, Feptr);
2947
if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch)
2948
RRETURN(MATCH_NOMATCH);
2949
}
2950
break;
2951
2952
case PT_SC:
2953
for (i = 1; i <= Lmin; i++)
2954
{
2955
if (Feptr >= mb->end_subject)
2956
{
2957
SCHECK_PARTIAL();
2958
RRETURN(MATCH_NOMATCH);
2959
}
2960
GETCHARINCTEST(fc, Feptr);
2961
if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch)
2962
RRETURN(MATCH_NOMATCH);
2963
}
2964
break;
2965
2966
case PT_SCX:
2967
for (i = 1; i <= Lmin; i++)
2968
{
2969
BOOL ok;
2970
const ucd_record *prop;
2971
if (Feptr >= mb->end_subject)
2972
{
2973
SCHECK_PARTIAL();
2974
RRETURN(MATCH_NOMATCH);
2975
}
2976
GETCHARINCTEST(fc, Feptr);
2977
prop = GET_UCD(fc);
2978
ok = (prop->script == Lpropvalue ||
2979
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
2980
if (ok == notmatch)
2981
RRETURN(MATCH_NOMATCH);
2982
}
2983
break;
2984
2985
case PT_ALNUM:
2986
for (i = 1; i <= Lmin; i++)
2987
{
2988
int category;
2989
if (Feptr >= mb->end_subject)
2990
{
2991
SCHECK_PARTIAL();
2992
RRETURN(MATCH_NOMATCH);
2993
}
2994
GETCHARINCTEST(fc, Feptr);
2995
category = UCD_CATEGORY(fc);
2996
if ((category == ucp_L || category == ucp_N) == notmatch)
2997
RRETURN(MATCH_NOMATCH);
2998
}
2999
break;
3000
3001
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
3002
which means that Perl space and POSIX space are now identical. PCRE
3003
was changed at release 8.34. */
3004
3005
case PT_SPACE: /* Perl space */
3006
case PT_PXSPACE: /* POSIX space */
3007
for (i = 1; i <= Lmin; i++)
3008
{
3009
if (Feptr >= mb->end_subject)
3010
{
3011
SCHECK_PARTIAL();
3012
RRETURN(MATCH_NOMATCH);
3013
}
3014
GETCHARINCTEST(fc, Feptr);
3015
switch(fc)
3016
{
3017
HSPACE_CASES:
3018
VSPACE_CASES:
3019
if (notmatch) RRETURN(MATCH_NOMATCH);
3020
break;
3021
3022
default:
3023
if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)
3024
RRETURN(MATCH_NOMATCH);
3025
break;
3026
}
3027
}
3028
break;
3029
3030
case PT_WORD:
3031
for (i = 1; i <= Lmin; i++)
3032
{
3033
int chartype, category;
3034
if (Feptr >= mb->end_subject)
3035
{
3036
SCHECK_PARTIAL();
3037
RRETURN(MATCH_NOMATCH);
3038
}
3039
GETCHARINCTEST(fc, Feptr);
3040
chartype = UCD_CHARTYPE(fc);
3041
category = PRIV(ucp_gentype)[chartype];
3042
if ((category == ucp_L || category == ucp_N ||
3043
chartype == ucp_Mn || chartype == ucp_Pc) == notmatch)
3044
RRETURN(MATCH_NOMATCH);
3045
}
3046
break;
3047
3048
case PT_CLIST:
3049
for (i = 1; i <= Lmin; i++)
3050
{
3051
const uint32_t *cp;
3052
if (Feptr >= mb->end_subject)
3053
{
3054
SCHECK_PARTIAL();
3055
RRETURN(MATCH_NOMATCH);
3056
}
3057
GETCHARINCTEST(fc, Feptr);
3058
#if PCRE2_CODE_UNIT_WIDTH == 32
3059
if (fc > MAX_UTF_CODE_POINT)
3060
{
3061
if (notmatch) continue;
3062
RRETURN(MATCH_NOMATCH);
3063
}
3064
#endif
3065
cp = PRIV(ucd_caseless_sets) + Lpropvalue;
3066
for (;;)
3067
{
3068
if (fc < *cp)
3069
{
3070
if (notmatch) break;
3071
RRETURN(MATCH_NOMATCH);
3072
}
3073
if (fc == *cp++)
3074
{
3075
if (notmatch) RRETURN(MATCH_NOMATCH);
3076
break;
3077
}
3078
}
3079
}
3080
break;
3081
3082
case PT_UCNC:
3083
for (i = 1; i <= Lmin; i++)
3084
{
3085
if (Feptr >= mb->end_subject)
3086
{
3087
SCHECK_PARTIAL();
3088
RRETURN(MATCH_NOMATCH);
3089
}
3090
GETCHARINCTEST(fc, Feptr);
3091
if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
3092
fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
3093
fc >= 0xe000) == notmatch)
3094
RRETURN(MATCH_NOMATCH);
3095
}
3096
break;
3097
3098
case PT_BIDICL:
3099
for (i = 1; i <= Lmin; i++)
3100
{
3101
if (Feptr >= mb->end_subject)
3102
{
3103
SCHECK_PARTIAL();
3104
RRETURN(MATCH_NOMATCH);
3105
}
3106
GETCHARINCTEST(fc, Feptr);
3107
if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch)
3108
RRETURN(MATCH_NOMATCH);
3109
}
3110
break;
3111
3112
case PT_BOOL:
3113
for (i = 1; i <= Lmin; i++)
3114
{
3115
BOOL ok;
3116
const ucd_record *prop;
3117
if (Feptr >= mb->end_subject)
3118
{
3119
SCHECK_PARTIAL();
3120
RRETURN(MATCH_NOMATCH);
3121
}
3122
GETCHARINCTEST(fc, Feptr);
3123
prop = GET_UCD(fc);
3124
ok = MAPBIT(PRIV(ucd_boolprop_sets) +
3125
UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
3126
if (ok == notmatch)
3127
RRETURN(MATCH_NOMATCH);
3128
}
3129
break;
3130
3131
/* This should not occur */
3132
3133
default:
3134
PCRE2_DEBUG_UNREACHABLE();
3135
return PCRE2_ERROR_INTERNAL;
3136
}
3137
}
3138
3139
/* Match extended Unicode sequences. We will get here only if the
3140
support is in the binary; otherwise a compile-time error occurs. */
3141
3142
else if (Lctype == OP_EXTUNI)
3143
{
3144
for (i = 1; i <= Lmin; i++)
3145
{
3146
if (Feptr >= mb->end_subject)
3147
{
3148
SCHECK_PARTIAL();
3149
RRETURN(MATCH_NOMATCH);
3150
}
3151
else
3152
{
3153
GETCHARINCTEST(fc, Feptr);
3154
Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject,
3155
mb->end_subject, utf, NULL);
3156
}
3157
CHECK_PARTIAL();
3158
}
3159
}
3160
else
3161
#endif /* SUPPORT_UNICODE */
3162
3163
/* Handle all other cases in UTF mode */
3164
3165
#ifdef SUPPORT_UNICODE
3166
if (utf) switch(Lctype)
3167
{
3168
case OP_ANY:
3169
for (i = 1; i <= Lmin; i++)
3170
{
3171
if (Feptr >= mb->end_subject)
3172
{
3173
SCHECK_PARTIAL();
3174
RRETURN(MATCH_NOMATCH);
3175
}
3176
if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3177
if (mb->partial != 0 &&
3178
Feptr + 1 >= mb->end_subject &&
3179
NLBLOCK->nltype == NLTYPE_FIXED &&
3180
NLBLOCK->nllen == 2 &&
3181
UCHAR21(Feptr) == NLBLOCK->nl[0])
3182
{
3183
mb->hitend = TRUE;
3184
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3185
}
3186
Feptr++;
3187
ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3188
}
3189
break;
3190
3191
case OP_ALLANY:
3192
for (i = 1; i <= Lmin; i++)
3193
{
3194
if (Feptr >= mb->end_subject)
3195
{
3196
SCHECK_PARTIAL();
3197
RRETURN(MATCH_NOMATCH);
3198
}
3199
Feptr++;
3200
ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3201
}
3202
break;
3203
3204
case OP_ANYBYTE:
3205
if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH);
3206
Feptr += Lmin;
3207
break;
3208
3209
case OP_ANYNL:
3210
for (i = 1; i <= Lmin; i++)
3211
{
3212
if (Feptr >= mb->end_subject)
3213
{
3214
SCHECK_PARTIAL();
3215
RRETURN(MATCH_NOMATCH);
3216
}
3217
GETCHARINC(fc, Feptr);
3218
switch(fc)
3219
{
3220
default: RRETURN(MATCH_NOMATCH);
3221
3222
case CHAR_CR:
3223
if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
3224
break;
3225
3226
case CHAR_LF:
3227
break;
3228
3229
case CHAR_VT:
3230
case CHAR_FF:
3231
case CHAR_NEL:
3232
#ifndef EBCDIC
3233
case 0x2028:
3234
case 0x2029:
3235
#endif /* Not EBCDIC */
3236
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
3237
break;
3238
}
3239
}
3240
break;
3241
3242
case OP_NOT_HSPACE:
3243
for (i = 1; i <= Lmin; i++)
3244
{
3245
if (Feptr >= mb->end_subject)
3246
{
3247
SCHECK_PARTIAL();
3248
RRETURN(MATCH_NOMATCH);
3249
}
3250
GETCHARINC(fc, Feptr);
3251
switch(fc)
3252
{
3253
HSPACE_CASES: RRETURN(MATCH_NOMATCH);
3254
default: break;
3255
}
3256
}
3257
break;
3258
3259
case OP_HSPACE:
3260
for (i = 1; i <= Lmin; i++)
3261
{
3262
if (Feptr >= mb->end_subject)
3263
{
3264
SCHECK_PARTIAL();
3265
RRETURN(MATCH_NOMATCH);
3266
}
3267
GETCHARINC(fc, Feptr);
3268
switch(fc)
3269
{
3270
HSPACE_CASES: break;
3271
default: RRETURN(MATCH_NOMATCH);
3272
}
3273
}
3274
break;
3275
3276
case OP_NOT_VSPACE:
3277
for (i = 1; i <= Lmin; i++)
3278
{
3279
if (Feptr >= mb->end_subject)
3280
{
3281
SCHECK_PARTIAL();
3282
RRETURN(MATCH_NOMATCH);
3283
}
3284
GETCHARINC(fc, Feptr);
3285
switch(fc)
3286
{
3287
VSPACE_CASES: RRETURN(MATCH_NOMATCH);
3288
default: break;
3289
}
3290
}
3291
break;
3292
3293
case OP_VSPACE:
3294
for (i = 1; i <= Lmin; i++)
3295
{
3296
if (Feptr >= mb->end_subject)
3297
{
3298
SCHECK_PARTIAL();
3299
RRETURN(MATCH_NOMATCH);
3300
}
3301
GETCHARINC(fc, Feptr);
3302
switch(fc)
3303
{
3304
VSPACE_CASES: break;
3305
default: RRETURN(MATCH_NOMATCH);
3306
}
3307
}
3308
break;
3309
3310
case OP_NOT_DIGIT:
3311
for (i = 1; i <= Lmin; i++)
3312
{
3313
if (Feptr >= mb->end_subject)
3314
{
3315
SCHECK_PARTIAL();
3316
RRETURN(MATCH_NOMATCH);
3317
}
3318
GETCHARINC(fc, Feptr);
3319
if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0)
3320
RRETURN(MATCH_NOMATCH);
3321
}
3322
break;
3323
3324
case OP_DIGIT:
3325
for (i = 1; i <= Lmin; i++)
3326
{
3327
uint32_t cc;
3328
if (Feptr >= mb->end_subject)
3329
{
3330
SCHECK_PARTIAL();
3331
RRETURN(MATCH_NOMATCH);
3332
}
3333
cc = UCHAR21(Feptr);
3334
if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0)
3335
RRETURN(MATCH_NOMATCH);
3336
Feptr++;
3337
/* No need to skip more code units - we know it has only one. */
3338
}
3339
break;
3340
3341
case OP_NOT_WHITESPACE:
3342
for (i = 1; i <= Lmin; i++)
3343
{
3344
uint32_t cc;
3345
if (Feptr >= mb->end_subject)
3346
{
3347
SCHECK_PARTIAL();
3348
RRETURN(MATCH_NOMATCH);
3349
}
3350
cc = UCHAR21(Feptr);
3351
if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0)
3352
RRETURN(MATCH_NOMATCH);
3353
Feptr++;
3354
ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3355
}
3356
break;
3357
3358
case OP_WHITESPACE:
3359
for (i = 1; i <= Lmin; i++)
3360
{
3361
uint32_t cc;
3362
if (Feptr >= mb->end_subject)
3363
{
3364
SCHECK_PARTIAL();
3365
RRETURN(MATCH_NOMATCH);
3366
}
3367
cc = UCHAR21(Feptr);
3368
if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0)
3369
RRETURN(MATCH_NOMATCH);
3370
Feptr++;
3371
/* No need to skip more code units - we know it has only one. */
3372
}
3373
break;
3374
3375
case OP_NOT_WORDCHAR:
3376
for (i = 1; i <= Lmin; i++)
3377
{
3378
uint32_t cc;
3379
if (Feptr >= mb->end_subject)
3380
{
3381
SCHECK_PARTIAL();
3382
RRETURN(MATCH_NOMATCH);
3383
}
3384
cc = UCHAR21(Feptr);
3385
if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0)
3386
RRETURN(MATCH_NOMATCH);
3387
Feptr++;
3388
ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3389
}
3390
break;
3391
3392
case OP_WORDCHAR:
3393
for (i = 1; i <= Lmin; i++)
3394
{
3395
uint32_t cc;
3396
if (Feptr >= mb->end_subject)
3397
{
3398
SCHECK_PARTIAL();
3399
RRETURN(MATCH_NOMATCH);
3400
}
3401
cc = UCHAR21(Feptr);
3402
if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0)
3403
RRETURN(MATCH_NOMATCH);
3404
Feptr++;
3405
/* No need to skip more code units - we know it has only one. */
3406
}
3407
break;
3408
3409
default:
3410
PCRE2_DEBUG_UNREACHABLE();
3411
return PCRE2_ERROR_INTERNAL;
3412
} /* End switch(Lctype) */
3413
3414
else
3415
#endif /* SUPPORT_UNICODE */
3416
3417
/* Code for the non-UTF case for minimum matching of operators other
3418
than OP_PROP and OP_NOTPROP. */
3419
3420
switch(Lctype)
3421
{
3422
case OP_ANY:
3423
for (i = 1; i <= Lmin; i++)
3424
{
3425
if (Feptr >= mb->end_subject)
3426
{
3427
SCHECK_PARTIAL();
3428
RRETURN(MATCH_NOMATCH);
3429
}
3430
if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3431
if (mb->partial != 0 &&
3432
Feptr + 1 >= mb->end_subject &&
3433
NLBLOCK->nltype == NLTYPE_FIXED &&
3434
NLBLOCK->nllen == 2 &&
3435
*Feptr == NLBLOCK->nl[0])
3436
{
3437
mb->hitend = TRUE;
3438
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3439
}
3440
Feptr++;
3441
}
3442
break;
3443
3444
case OP_ALLANY:
3445
if (Feptr > mb->end_subject - Lmin)
3446
{
3447
SCHECK_PARTIAL();
3448
RRETURN(MATCH_NOMATCH);
3449
}
3450
Feptr += Lmin;
3451
break;
3452
3453
/* This OP_ANYBYTE case will never be reached because \C gets turned
3454
into OP_ALLANY in non-UTF mode. Cut out the code so that coverage
3455
reports don't complain about its never being used. */
3456
3457
/* case OP_ANYBYTE:
3458
* if (Feptr > mb->end_subject - Lmin)
3459
* {
3460
* SCHECK_PARTIAL();
3461
* RRETURN(MATCH_NOMATCH);
3462
* }
3463
* Feptr += Lmin;
3464
* break;
3465
*/
3466
case OP_ANYNL:
3467
for (i = 1; i <= Lmin; i++)
3468
{
3469
if (Feptr >= mb->end_subject)
3470
{
3471
SCHECK_PARTIAL();
3472
RRETURN(MATCH_NOMATCH);
3473
}
3474
switch(*Feptr++)
3475
{
3476
default: RRETURN(MATCH_NOMATCH);
3477
3478
case CHAR_CR:
3479
if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3480
break;
3481
3482
case CHAR_LF:
3483
break;
3484
3485
case CHAR_VT:
3486
case CHAR_FF:
3487
case CHAR_NEL:
3488
#if PCRE2_CODE_UNIT_WIDTH != 8
3489
case 0x2028:
3490
case 0x2029:
3491
#endif
3492
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
3493
break;
3494
}
3495
}
3496
break;
3497
3498
case OP_NOT_HSPACE:
3499
for (i = 1; i <= Lmin; i++)
3500
{
3501
if (Feptr >= mb->end_subject)
3502
{
3503
SCHECK_PARTIAL();
3504
RRETURN(MATCH_NOMATCH);
3505
}
3506
switch(*Feptr++)
3507
{
3508
default: break;
3509
HSPACE_BYTE_CASES:
3510
#if PCRE2_CODE_UNIT_WIDTH != 8
3511
HSPACE_MULTIBYTE_CASES:
3512
#endif
3513
RRETURN(MATCH_NOMATCH);
3514
}
3515
}
3516
break;
3517
3518
case OP_HSPACE:
3519
for (i = 1; i <= Lmin; i++)
3520
{
3521
if (Feptr >= mb->end_subject)
3522
{
3523
SCHECK_PARTIAL();
3524
RRETURN(MATCH_NOMATCH);
3525
}
3526
switch(*Feptr++)
3527
{
3528
default: RRETURN(MATCH_NOMATCH);
3529
HSPACE_BYTE_CASES:
3530
#if PCRE2_CODE_UNIT_WIDTH != 8
3531
HSPACE_MULTIBYTE_CASES:
3532
#endif
3533
break;
3534
}
3535
}
3536
break;
3537
3538
case OP_NOT_VSPACE:
3539
for (i = 1; i <= Lmin; i++)
3540
{
3541
if (Feptr >= mb->end_subject)
3542
{
3543
SCHECK_PARTIAL();
3544
RRETURN(MATCH_NOMATCH);
3545
}
3546
switch(*Feptr++)
3547
{
3548
VSPACE_BYTE_CASES:
3549
#if PCRE2_CODE_UNIT_WIDTH != 8
3550
VSPACE_MULTIBYTE_CASES:
3551
#endif
3552
RRETURN(MATCH_NOMATCH);
3553
default: break;
3554
}
3555
}
3556
break;
3557
3558
case OP_VSPACE:
3559
for (i = 1; i <= Lmin; i++)
3560
{
3561
if (Feptr >= mb->end_subject)
3562
{
3563
SCHECK_PARTIAL();
3564
RRETURN(MATCH_NOMATCH);
3565
}
3566
switch(*Feptr++)
3567
{
3568
default: RRETURN(MATCH_NOMATCH);
3569
VSPACE_BYTE_CASES:
3570
#if PCRE2_CODE_UNIT_WIDTH != 8
3571
VSPACE_MULTIBYTE_CASES:
3572
#endif
3573
break;
3574
}
3575
}
3576
break;
3577
3578
case OP_NOT_DIGIT:
3579
for (i = 1; i <= Lmin; i++)
3580
{
3581
if (Feptr >= mb->end_subject)
3582
{
3583
SCHECK_PARTIAL();
3584
RRETURN(MATCH_NOMATCH);
3585
}
3586
if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
3587
RRETURN(MATCH_NOMATCH);
3588
Feptr++;
3589
}
3590
break;
3591
3592
case OP_DIGIT:
3593
for (i = 1; i <= Lmin; i++)
3594
{
3595
if (Feptr >= mb->end_subject)
3596
{
3597
SCHECK_PARTIAL();
3598
RRETURN(MATCH_NOMATCH);
3599
}
3600
if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
3601
RRETURN(MATCH_NOMATCH);
3602
Feptr++;
3603
}
3604
break;
3605
3606
case OP_NOT_WHITESPACE:
3607
for (i = 1; i <= Lmin; i++)
3608
{
3609
if (Feptr >= mb->end_subject)
3610
{
3611
SCHECK_PARTIAL();
3612
RRETURN(MATCH_NOMATCH);
3613
}
3614
if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
3615
RRETURN(MATCH_NOMATCH);
3616
Feptr++;
3617
}
3618
break;
3619
3620
case OP_WHITESPACE:
3621
for (i = 1; i <= Lmin; i++)
3622
{
3623
if (Feptr >= mb->end_subject)
3624
{
3625
SCHECK_PARTIAL();
3626
RRETURN(MATCH_NOMATCH);
3627
}
3628
if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
3629
RRETURN(MATCH_NOMATCH);
3630
Feptr++;
3631
}
3632
break;
3633
3634
case OP_NOT_WORDCHAR:
3635
for (i = 1; i <= Lmin; i++)
3636
{
3637
if (Feptr >= mb->end_subject)
3638
{
3639
SCHECK_PARTIAL();
3640
RRETURN(MATCH_NOMATCH);
3641
}
3642
if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
3643
RRETURN(MATCH_NOMATCH);
3644
Feptr++;
3645
}
3646
break;
3647
3648
case OP_WORDCHAR:
3649
for (i = 1; i <= Lmin; i++)
3650
{
3651
if (Feptr >= mb->end_subject)
3652
{
3653
SCHECK_PARTIAL();
3654
RRETURN(MATCH_NOMATCH);
3655
}
3656
if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
3657
RRETURN(MATCH_NOMATCH);
3658
Feptr++;
3659
}
3660
break;
3661
3662
default:
3663
PCRE2_DEBUG_UNREACHABLE();
3664
return PCRE2_ERROR_INTERNAL;
3665
}
3666
}
3667
3668
/* If Lmin = Lmax we are done. Continue with the main loop. */
3669
3670
if (Lmin == Lmax) continue;
3671
3672
/* If minimizing, we have to test the rest of the pattern before each
3673
subsequent match. This means we cannot use a local "notmatch" variable as
3674
in the other cases. As all 4 temporary 32-bit values in the frame are
3675
already in use, just test the type each time. */
3676
3677
if (reptype == REPTYPE_MIN)
3678
{
3679
#ifdef SUPPORT_UNICODE
3680
if (proptype >= 0)
3681
{
3682
switch(proptype)
3683
{
3684
case PT_LAMP:
3685
for (;;)
3686
{
3687
int chartype;
3688
RMATCH(Fecode, RM208);
3689
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3690
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3691
if (Feptr >= mb->end_subject)
3692
{
3693
SCHECK_PARTIAL();
3694
RRETURN(MATCH_NOMATCH);
3695
}
3696
GETCHARINCTEST(fc, Feptr);
3697
chartype = UCD_CHARTYPE(fc);
3698
if ((chartype == ucp_Lu ||
3699
chartype == ucp_Ll ||
3700
chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
3701
RRETURN(MATCH_NOMATCH);
3702
}
3703
PCRE2_UNREACHABLE(); /* Control never reaches here */
3704
3705
case PT_GC:
3706
for (;;)
3707
{
3708
RMATCH(Fecode, RM209);
3709
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3710
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3711
if (Feptr >= mb->end_subject)
3712
{
3713
SCHECK_PARTIAL();
3714
RRETURN(MATCH_NOMATCH);
3715
}
3716
GETCHARINCTEST(fc, Feptr);
3717
if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3718
RRETURN(MATCH_NOMATCH);
3719
}
3720
PCRE2_UNREACHABLE(); /* Control never reaches here */
3721
3722
case PT_PC:
3723
for (;;)
3724
{
3725
RMATCH(Fecode, RM210);
3726
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3727
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3728
if (Feptr >= mb->end_subject)
3729
{
3730
SCHECK_PARTIAL();
3731
RRETURN(MATCH_NOMATCH);
3732
}
3733
GETCHARINCTEST(fc, Feptr);
3734
if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3735
RRETURN(MATCH_NOMATCH);
3736
}
3737
PCRE2_UNREACHABLE(); /* Control never reaches here */
3738
3739
case PT_SC:
3740
for (;;)
3741
{
3742
RMATCH(Fecode, RM211);
3743
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3744
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3745
if (Feptr >= mb->end_subject)
3746
{
3747
SCHECK_PARTIAL();
3748
RRETURN(MATCH_NOMATCH);
3749
}
3750
GETCHARINCTEST(fc, Feptr);
3751
if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3752
RRETURN(MATCH_NOMATCH);
3753
}
3754
PCRE2_UNREACHABLE(); /* Control never reaches here */
3755
3756
case PT_SCX:
3757
for (;;)
3758
{
3759
BOOL ok;
3760
const ucd_record *prop;
3761
RMATCH(Fecode, RM224);
3762
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3763
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3764
if (Feptr >= mb->end_subject)
3765
{
3766
SCHECK_PARTIAL();
3767
RRETURN(MATCH_NOMATCH);
3768
}
3769
GETCHARINCTEST(fc, Feptr);
3770
prop = GET_UCD(fc);
3771
ok = (prop->script == Lpropvalue
3772
|| MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
3773
if (ok == (Lctype == OP_NOTPROP))
3774
RRETURN(MATCH_NOMATCH);
3775
}
3776
PCRE2_UNREACHABLE(); /* Control never reaches here */
3777
3778
case PT_ALNUM:
3779
for (;;)
3780
{
3781
int category;
3782
RMATCH(Fecode, RM212);
3783
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3784
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3785
if (Feptr >= mb->end_subject)
3786
{
3787
SCHECK_PARTIAL();
3788
RRETURN(MATCH_NOMATCH);
3789
}
3790
GETCHARINCTEST(fc, Feptr);
3791
category = UCD_CATEGORY(fc);
3792
if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP))
3793
RRETURN(MATCH_NOMATCH);
3794
}
3795
PCRE2_UNREACHABLE(); /* Control never reaches here */
3796
3797
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
3798
which means that Perl space and POSIX space are now identical. PCRE
3799
was changed at release 8.34. */
3800
3801
case PT_SPACE: /* Perl space */
3802
case PT_PXSPACE: /* POSIX space */
3803
for (;;)
3804
{
3805
RMATCH(Fecode, RM213);
3806
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3807
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3808
if (Feptr >= mb->end_subject)
3809
{
3810
SCHECK_PARTIAL();
3811
RRETURN(MATCH_NOMATCH);
3812
}
3813
GETCHARINCTEST(fc, Feptr);
3814
switch(fc)
3815
{
3816
HSPACE_CASES:
3817
VSPACE_CASES:
3818
if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3819
break;
3820
3821
default:
3822
if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
3823
RRETURN(MATCH_NOMATCH);
3824
break;
3825
}
3826
}
3827
PCRE2_UNREACHABLE(); /* Control never reaches here */
3828
3829
case PT_WORD:
3830
for (;;)
3831
{
3832
int chartype, category;
3833
RMATCH(Fecode, RM214);
3834
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3835
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3836
if (Feptr >= mb->end_subject)
3837
{
3838
SCHECK_PARTIAL();
3839
RRETURN(MATCH_NOMATCH);
3840
}
3841
GETCHARINCTEST(fc, Feptr);
3842
chartype = UCD_CHARTYPE(fc);
3843
category = PRIV(ucp_gentype)[chartype];
3844
if ((category == ucp_L ||
3845
category == ucp_N ||
3846
chartype == ucp_Mn ||
3847
chartype == ucp_Pc) == (Lctype == OP_NOTPROP))
3848
RRETURN(MATCH_NOMATCH);
3849
}
3850
PCRE2_UNREACHABLE(); /* Control never reaches here */
3851
3852
case PT_CLIST:
3853
for (;;)
3854
{
3855
const uint32_t *cp;
3856
RMATCH(Fecode, RM215);
3857
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3858
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3859
if (Feptr >= mb->end_subject)
3860
{
3861
SCHECK_PARTIAL();
3862
RRETURN(MATCH_NOMATCH);
3863
}
3864
GETCHARINCTEST(fc, Feptr);
3865
#if PCRE2_CODE_UNIT_WIDTH == 32
3866
if (fc > MAX_UTF_CODE_POINT)
3867
{
3868
if (Lctype == OP_NOTPROP) continue;
3869
RRETURN(MATCH_NOMATCH);
3870
}
3871
#endif
3872
cp = PRIV(ucd_caseless_sets) + Lpropvalue;
3873
for (;;)
3874
{
3875
if (fc < *cp)
3876
{
3877
if (Lctype == OP_NOTPROP) break;
3878
RRETURN(MATCH_NOMATCH);
3879
}
3880
if (fc == *cp++)
3881
{
3882
if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3883
break;
3884
}
3885
}
3886
}
3887
PCRE2_UNREACHABLE(); /* Control never reaches here */
3888
3889
case PT_UCNC:
3890
for (;;)
3891
{
3892
RMATCH(Fecode, RM216);
3893
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3894
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3895
if (Feptr >= mb->end_subject)
3896
{
3897
SCHECK_PARTIAL();
3898
RRETURN(MATCH_NOMATCH);
3899
}
3900
GETCHARINCTEST(fc, Feptr);
3901
if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
3902
fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
3903
fc >= 0xe000) == (Lctype == OP_NOTPROP))
3904
RRETURN(MATCH_NOMATCH);
3905
}
3906
PCRE2_UNREACHABLE(); /* Control never reaches here */
3907
3908
case PT_BIDICL:
3909
for (;;)
3910
{
3911
RMATCH(Fecode, RM223);
3912
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3913
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3914
if (Feptr >= mb->end_subject)
3915
{
3916
SCHECK_PARTIAL();
3917
RRETURN(MATCH_NOMATCH);
3918
}
3919
GETCHARINCTEST(fc, Feptr);
3920
if ((UCD_BIDICLASS(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3921
RRETURN(MATCH_NOMATCH);
3922
}
3923
PCRE2_UNREACHABLE(); /* Control never reaches here */
3924
3925
case PT_BOOL:
3926
for (;;)
3927
{
3928
BOOL ok;
3929
const ucd_record *prop;
3930
RMATCH(Fecode, RM222);
3931
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3932
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3933
if (Feptr >= mb->end_subject)
3934
{
3935
SCHECK_PARTIAL();
3936
RRETURN(MATCH_NOMATCH);
3937
}
3938
GETCHARINCTEST(fc, Feptr);
3939
prop = GET_UCD(fc);
3940
ok = MAPBIT(PRIV(ucd_boolprop_sets) +
3941
UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
3942
if (ok == (Lctype == OP_NOTPROP))
3943
RRETURN(MATCH_NOMATCH);
3944
}
3945
PCRE2_UNREACHABLE(); /* Control never reaches here */
3946
3947
/* This should never occur */
3948
default:
3949
PCRE2_DEBUG_UNREACHABLE();
3950
return PCRE2_ERROR_INTERNAL;
3951
}
3952
}
3953
3954
/* Match extended Unicode sequences. We will get here only if the
3955
support is in the binary; otherwise a compile-time error occurs. */
3956
3957
else if (Lctype == OP_EXTUNI)
3958
{
3959
for (;;)
3960
{
3961
RMATCH(Fecode, RM217);
3962
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3963
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3964
if (Feptr >= mb->end_subject)
3965
{
3966
SCHECK_PARTIAL();
3967
RRETURN(MATCH_NOMATCH);
3968
}
3969
else
3970
{
3971
GETCHARINCTEST(fc, Feptr);
3972
Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
3973
utf, NULL);
3974
}
3975
CHECK_PARTIAL();
3976
}
3977
}
3978
else
3979
#endif /* SUPPORT_UNICODE */
3980
3981
/* UTF mode for non-property testing character types. */
3982
3983
#ifdef SUPPORT_UNICODE
3984
if (utf)
3985
{
3986
for (;;)
3987
{
3988
RMATCH(Fecode, RM218);
3989
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3990
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3991
if (Feptr >= mb->end_subject)
3992
{
3993
SCHECK_PARTIAL();
3994
RRETURN(MATCH_NOMATCH);
3995
}
3996
if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3997
GETCHARINC(fc, Feptr);
3998
switch(Lctype)
3999
{
4000
case OP_ANY: /* This is the non-NL case */
4001
if (mb->partial != 0 && /* Take care with CRLF partial */
4002
Feptr >= mb->end_subject &&
4003
NLBLOCK->nltype == NLTYPE_FIXED &&
4004
NLBLOCK->nllen == 2 &&
4005
fc == NLBLOCK->nl[0])
4006
{
4007
mb->hitend = TRUE;
4008
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4009
}
4010
break;
4011
4012
case OP_ALLANY:
4013
case OP_ANYBYTE:
4014
break;
4015
4016
case OP_ANYNL:
4017
switch(fc)
4018
{
4019
default: RRETURN(MATCH_NOMATCH);
4020
4021
case CHAR_CR:
4022
if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
4023
break;
4024
4025
case CHAR_LF:
4026
break;
4027
4028
case CHAR_VT:
4029
case CHAR_FF:
4030
case CHAR_NEL:
4031
#ifndef EBCDIC
4032
case 0x2028:
4033
case 0x2029:
4034
#endif /* Not EBCDIC */
4035
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
4036
RRETURN(MATCH_NOMATCH);
4037
break;
4038
}
4039
break;
4040
4041
case OP_NOT_HSPACE:
4042
switch(fc)
4043
{
4044
HSPACE_CASES: RRETURN(MATCH_NOMATCH);
4045
default: break;
4046
}
4047
break;
4048
4049
case OP_HSPACE:
4050
switch(fc)
4051
{
4052
HSPACE_CASES: break;
4053
default: RRETURN(MATCH_NOMATCH);
4054
}
4055
break;
4056
4057
case OP_NOT_VSPACE:
4058
switch(fc)
4059
{
4060
VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4061
default: break;
4062
}
4063
break;
4064
4065
case OP_VSPACE:
4066
switch(fc)
4067
{
4068
VSPACE_CASES: break;
4069
default: RRETURN(MATCH_NOMATCH);
4070
}
4071
break;
4072
4073
case OP_NOT_DIGIT:
4074
if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0)
4075
RRETURN(MATCH_NOMATCH);
4076
break;
4077
4078
case OP_DIGIT:
4079
if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0)
4080
RRETURN(MATCH_NOMATCH);
4081
break;
4082
4083
case OP_NOT_WHITESPACE:
4084
if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0)
4085
RRETURN(MATCH_NOMATCH);
4086
break;
4087
4088
case OP_WHITESPACE:
4089
if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0)
4090
RRETURN(MATCH_NOMATCH);
4091
break;
4092
4093
case OP_NOT_WORDCHAR:
4094
if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0)
4095
RRETURN(MATCH_NOMATCH);
4096
break;
4097
4098
case OP_WORDCHAR:
4099
if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0)
4100
RRETURN(MATCH_NOMATCH);
4101
break;
4102
4103
default:
4104
PCRE2_DEBUG_UNREACHABLE();
4105
return PCRE2_ERROR_INTERNAL;
4106
}
4107
}
4108
}
4109
else
4110
#endif /* SUPPORT_UNICODE */
4111
4112
/* Not UTF mode */
4113
{
4114
for (;;)
4115
{
4116
RMATCH(Fecode, RM33);
4117
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4118
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
4119
if (Feptr >= mb->end_subject)
4120
{
4121
SCHECK_PARTIAL();
4122
RRETURN(MATCH_NOMATCH);
4123
}
4124
if (Lctype == OP_ANY && IS_NEWLINE(Feptr))
4125
RRETURN(MATCH_NOMATCH);
4126
fc = *Feptr++;
4127
switch(Lctype)
4128
{
4129
case OP_ANY: /* This is the non-NL case */
4130
if (mb->partial != 0 && /* Take care with CRLF partial */
4131
Feptr >= mb->end_subject &&
4132
NLBLOCK->nltype == NLTYPE_FIXED &&
4133
NLBLOCK->nllen == 2 &&
4134
fc == NLBLOCK->nl[0])
4135
{
4136
mb->hitend = TRUE;
4137
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4138
}
4139
break;
4140
4141
case OP_ALLANY:
4142
case OP_ANYBYTE:
4143
break;
4144
4145
case OP_ANYNL:
4146
switch(fc)
4147
{
4148
default: RRETURN(MATCH_NOMATCH);
4149
4150
case CHAR_CR:
4151
if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
4152
break;
4153
4154
case CHAR_LF:
4155
break;
4156
4157
case CHAR_VT:
4158
case CHAR_FF:
4159
case CHAR_NEL:
4160
#if PCRE2_CODE_UNIT_WIDTH != 8
4161
case 0x2028:
4162
case 0x2029:
4163
#endif
4164
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
4165
RRETURN(MATCH_NOMATCH);
4166
break;
4167
}
4168
break;
4169
4170
case OP_NOT_HSPACE:
4171
switch(fc)
4172
{
4173
default: break;
4174
HSPACE_BYTE_CASES:
4175
#if PCRE2_CODE_UNIT_WIDTH != 8
4176
HSPACE_MULTIBYTE_CASES:
4177
#endif
4178
RRETURN(MATCH_NOMATCH);
4179
}
4180
break;
4181
4182
case OP_HSPACE:
4183
switch(fc)
4184
{
4185
default: RRETURN(MATCH_NOMATCH);
4186
HSPACE_BYTE_CASES:
4187
#if PCRE2_CODE_UNIT_WIDTH != 8
4188
HSPACE_MULTIBYTE_CASES:
4189
#endif
4190
break;
4191
}
4192
break;
4193
4194
case OP_NOT_VSPACE:
4195
switch(fc)
4196
{
4197
default: break;
4198
VSPACE_BYTE_CASES:
4199
#if PCRE2_CODE_UNIT_WIDTH != 8
4200
VSPACE_MULTIBYTE_CASES:
4201
#endif
4202
RRETURN(MATCH_NOMATCH);
4203
}
4204
break;
4205
4206
case OP_VSPACE:
4207
switch(fc)
4208
{
4209
default: RRETURN(MATCH_NOMATCH);
4210
VSPACE_BYTE_CASES:
4211
#if PCRE2_CODE_UNIT_WIDTH != 8
4212
VSPACE_MULTIBYTE_CASES:
4213
#endif
4214
break;
4215
}
4216
break;
4217
4218
case OP_NOT_DIGIT:
4219
if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
4220
RRETURN(MATCH_NOMATCH);
4221
break;
4222
4223
case OP_DIGIT:
4224
if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
4225
RRETURN(MATCH_NOMATCH);
4226
break;
4227
4228
case OP_NOT_WHITESPACE:
4229
if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
4230
RRETURN(MATCH_NOMATCH);
4231
break;
4232
4233
case OP_WHITESPACE:
4234
if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
4235
RRETURN(MATCH_NOMATCH);
4236
break;
4237
4238
case OP_NOT_WORDCHAR:
4239
if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
4240
RRETURN(MATCH_NOMATCH);
4241
break;
4242
4243
case OP_WORDCHAR:
4244
if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
4245
RRETURN(MATCH_NOMATCH);
4246
break;
4247
4248
default:
4249
PCRE2_DEBUG_UNREACHABLE();
4250
return PCRE2_ERROR_INTERNAL;
4251
}
4252
}
4253
}
4254
4255
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
4256
}
4257
4258
/* If maximizing, it is worth using inline code for speed, doing the type
4259
test once at the start (i.e. keep it out of the loops). Once again,
4260
"notmatch" can be an ordinary local variable because the loops do not call
4261
RMATCH. */
4262
4263
else
4264
{
4265
Lstart_eptr = Feptr; /* Remember where we started */
4266
4267
#ifdef SUPPORT_UNICODE
4268
if (proptype >= 0)
4269
{
4270
BOOL notmatch = Lctype == OP_NOTPROP;
4271
switch(proptype)
4272
{
4273
case PT_LAMP:
4274
for (i = Lmin; i < Lmax; i++)
4275
{
4276
int chartype;
4277
int len = 1;
4278
if (Feptr >= mb->end_subject)
4279
{
4280
SCHECK_PARTIAL();
4281
break;
4282
}
4283
GETCHARLENTEST(fc, Feptr, len);
4284
chartype = UCD_CHARTYPE(fc);
4285
if ((chartype == ucp_Lu ||
4286
chartype == ucp_Ll ||
4287
chartype == ucp_Lt) == notmatch)
4288
break;
4289
Feptr+= len;
4290
}
4291
break;
4292
4293
case PT_GC:
4294
for (i = Lmin; i < Lmax; i++)
4295
{
4296
int len = 1;
4297
if (Feptr >= mb->end_subject)
4298
{
4299
SCHECK_PARTIAL();
4300
break;
4301
}
4302
GETCHARLENTEST(fc, Feptr, len);
4303
if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) break;
4304
Feptr+= len;
4305
}
4306
break;
4307
4308
case PT_PC:
4309
for (i = Lmin; i < Lmax; i++)
4310
{
4311
int len = 1;
4312
if (Feptr >= mb->end_subject)
4313
{
4314
SCHECK_PARTIAL();
4315
break;
4316
}
4317
GETCHARLENTEST(fc, Feptr, len);
4318
if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) break;
4319
Feptr+= len;
4320
}
4321
break;
4322
4323
case PT_SC:
4324
for (i = Lmin; i < Lmax; i++)
4325
{
4326
int len = 1;
4327
if (Feptr >= mb->end_subject)
4328
{
4329
SCHECK_PARTIAL();
4330
break;
4331
}
4332
GETCHARLENTEST(fc, Feptr, len);
4333
if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) break;
4334
Feptr+= len;
4335
}
4336
break;
4337
4338
case PT_SCX:
4339
for (i = Lmin; i < Lmax; i++)
4340
{
4341
BOOL ok;
4342
const ucd_record *prop;
4343
int len = 1;
4344
if (Feptr >= mb->end_subject)
4345
{
4346
SCHECK_PARTIAL();
4347
break;
4348
}
4349
GETCHARLENTEST(fc, Feptr, len);
4350
prop = GET_UCD(fc);
4351
ok = (prop->script == Lpropvalue ||
4352
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
4353
if (ok == notmatch) break;
4354
Feptr+= len;
4355
}
4356
break;
4357
4358
case PT_ALNUM:
4359
for (i = Lmin; i < Lmax; i++)
4360
{
4361
int category;
4362
int len = 1;
4363
if (Feptr >= mb->end_subject)
4364
{
4365
SCHECK_PARTIAL();
4366
break;
4367
}
4368
GETCHARLENTEST(fc, Feptr, len);
4369
category = UCD_CATEGORY(fc);
4370
if ((category == ucp_L || category == ucp_N) == notmatch)
4371
break;
4372
Feptr+= len;
4373
}
4374
break;
4375
4376
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
4377
which means that Perl space and POSIX space are now identical. PCRE
4378
was changed at release 8.34. */
4379
4380
case PT_SPACE: /* Perl space */
4381
case PT_PXSPACE: /* POSIX space */
4382
for (i = Lmin; i < Lmax; i++)
4383
{
4384
int len = 1;
4385
if (Feptr >= mb->end_subject)
4386
{
4387
SCHECK_PARTIAL();
4388
break;
4389
}
4390
GETCHARLENTEST(fc, Feptr, len);
4391
switch(fc)
4392
{
4393
HSPACE_CASES:
4394
VSPACE_CASES:
4395
if (notmatch) goto ENDLOOP99; /* Break the loop */
4396
break;
4397
4398
default:
4399
if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)
4400
goto ENDLOOP99; /* Break the loop */
4401
break;
4402
}
4403
Feptr+= len;
4404
}
4405
ENDLOOP99:
4406
break;
4407
4408
case PT_WORD:
4409
for (i = Lmin; i < Lmax; i++)
4410
{
4411
int chartype, category;
4412
int len = 1;
4413
if (Feptr >= mb->end_subject)
4414
{
4415
SCHECK_PARTIAL();
4416
break;
4417
}
4418
GETCHARLENTEST(fc, Feptr, len);
4419
chartype = UCD_CHARTYPE(fc);
4420
category = PRIV(ucp_gentype)[chartype];
4421
if ((category == ucp_L ||
4422
category == ucp_N ||
4423
chartype == ucp_Mn ||
4424
chartype == ucp_Pc) == notmatch)
4425
break;
4426
Feptr+= len;
4427
}
4428
break;
4429
4430
case PT_CLIST:
4431
for (i = Lmin; i < Lmax; i++)
4432
{
4433
const uint32_t *cp;
4434
int len = 1;
4435
if (Feptr >= mb->end_subject)
4436
{
4437
SCHECK_PARTIAL();
4438
break;
4439
}
4440
GETCHARLENTEST(fc, Feptr, len);
4441
#if PCRE2_CODE_UNIT_WIDTH == 32
4442
if (fc > MAX_UTF_CODE_POINT)
4443
{
4444
if (!notmatch) goto GOT_MAX;
4445
}
4446
else
4447
#endif
4448
{
4449
cp = PRIV(ucd_caseless_sets) + Lpropvalue;
4450
for (;;)
4451
{
4452
if (fc < *cp)
4453
{ if (notmatch) break; else goto GOT_MAX; }
4454
if (fc == *cp++)
4455
{ if (notmatch) goto GOT_MAX; else break; }
4456
}
4457
}
4458
4459
Feptr += len;
4460
}
4461
GOT_MAX:
4462
break;
4463
4464
case PT_UCNC:
4465
for (i = Lmin; i < Lmax; i++)
4466
{
4467
int len = 1;
4468
if (Feptr >= mb->end_subject)
4469
{
4470
SCHECK_PARTIAL();
4471
break;
4472
}
4473
GETCHARLENTEST(fc, Feptr, len);
4474
if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
4475
fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
4476
fc >= 0xe000) == notmatch)
4477
break;
4478
Feptr += len;
4479
}
4480
break;
4481
4482
case PT_BIDICL:
4483
for (i = Lmin; i < Lmax; i++)
4484
{
4485
int len = 1;
4486
if (Feptr >= mb->end_subject)
4487
{
4488
SCHECK_PARTIAL();
4489
break;
4490
}
4491
GETCHARLENTEST(fc, Feptr, len);
4492
if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) break;
4493
Feptr+= len;
4494
}
4495
break;
4496
4497
case PT_BOOL:
4498
for (i = Lmin; i < Lmax; i++)
4499
{
4500
BOOL ok;
4501
const ucd_record *prop;
4502
int len = 1;
4503
if (Feptr >= mb->end_subject)
4504
{
4505
SCHECK_PARTIAL();
4506
break;
4507
}
4508
GETCHARLENTEST(fc, Feptr, len);
4509
prop = GET_UCD(fc);
4510
ok = MAPBIT(PRIV(ucd_boolprop_sets) +
4511
UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
4512
if (ok == notmatch) break;
4513
Feptr+= len;
4514
}
4515
break;
4516
4517
default:
4518
PCRE2_DEBUG_UNREACHABLE();
4519
return PCRE2_ERROR_INTERNAL;
4520
}
4521
4522
/* Feptr is now past the end of the maximum run */
4523
4524
if (reptype == REPTYPE_POS) continue; /* No backtracking */
4525
4526
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
4527
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
4528
go too far. */
4529
4530
for(;;)
4531
{
4532
if (Feptr <= Lstart_eptr) break;
4533
RMATCH(Fecode, RM221);
4534
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4535
Feptr--;
4536
if (utf) BACKCHAR(Feptr);
4537
}
4538
}
4539
4540
/* Match extended Unicode grapheme clusters. We will get here only if the
4541
support is in the binary; otherwise a compile-time error occurs. */
4542
4543
else if (Lctype == OP_EXTUNI)
4544
{
4545
for (i = Lmin; i < Lmax; i++)
4546
{
4547
if (Feptr >= mb->end_subject)
4548
{
4549
SCHECK_PARTIAL();
4550
break;
4551
}
4552
else
4553
{
4554
GETCHARINCTEST(fc, Feptr);
4555
Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
4556
utf, NULL);
4557
}
4558
CHECK_PARTIAL();
4559
}
4560
4561
/* Feptr is now past the end of the maximum run */
4562
4563
if (reptype == REPTYPE_POS) continue; /* No backtracking */
4564
4565
/* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start
4566
of the run while backtracking because the use of \C in UTF mode can
4567
cause BACKCHAR to move back past Lstart_eptr. This is just palliative;
4568
the use of \C in UTF mode is fraught with danger. */
4569
4570
for(;;)
4571
{
4572
int lgb, rgb;
4573
PCRE2_SPTR fptr;
4574
4575
if (Feptr <= Lstart_eptr) break; /* At start of char run */
4576
RMATCH(Fecode, RM219);
4577
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4578
4579
/* Backtracking over an extended grapheme cluster involves inspecting
4580
the previous two characters (if present) to see if a break is
4581
permitted between them. */
4582
4583
Feptr--;
4584
if (!utf) fc = *Feptr; else
4585
{
4586
BACKCHAR(Feptr);
4587
GETCHAR(fc, Feptr);
4588
}
4589
rgb = UCD_GRAPHBREAK(fc);
4590
4591
for (;;)
4592
{
4593
if (Feptr <= Lstart_eptr) break; /* At start of char run */
4594
fptr = Feptr - 1;
4595
if (!utf) fc = *fptr; else
4596
{
4597
BACKCHAR(fptr);
4598
GETCHAR(fc, fptr);
4599
}
4600
lgb = UCD_GRAPHBREAK(fc);
4601
if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
4602
Feptr = fptr;
4603
rgb = lgb;
4604
}
4605
}
4606
}
4607
4608
else
4609
#endif /* SUPPORT_UNICODE */
4610
4611
#ifdef SUPPORT_UNICODE
4612
if (utf)
4613
{
4614
switch(Lctype)
4615
{
4616
case OP_ANY:
4617
for (i = Lmin; i < Lmax; i++)
4618
{
4619
if (Feptr >= mb->end_subject)
4620
{
4621
SCHECK_PARTIAL();
4622
break;
4623
}
4624
if (IS_NEWLINE(Feptr)) break;
4625
if (mb->partial != 0 && /* Take care with CRLF partial */
4626
Feptr + 1 >= mb->end_subject &&
4627
NLBLOCK->nltype == NLTYPE_FIXED &&
4628
NLBLOCK->nllen == 2 &&
4629
UCHAR21(Feptr) == NLBLOCK->nl[0])
4630
{
4631
mb->hitend = TRUE;
4632
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4633
}
4634
Feptr++;
4635
ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4636
}
4637
break;
4638
4639
case OP_ALLANY:
4640
if (Lmax < UINT32_MAX)
4641
{
4642
for (i = Lmin; i < Lmax; i++)
4643
{
4644
if (Feptr >= mb->end_subject)
4645
{
4646
SCHECK_PARTIAL();
4647
break;
4648
}
4649
Feptr++;
4650
ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4651
}
4652
}
4653
else
4654
{
4655
Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */
4656
SCHECK_PARTIAL();
4657
}
4658
break;
4659
4660
/* The "byte" (i.e. "code unit") case is the same as non-UTF */
4661
4662
case OP_ANYBYTE:
4663
fc = Lmax - Lmin;
4664
if (fc > (uint32_t)(mb->end_subject - Feptr))
4665
{
4666
Feptr = mb->end_subject;
4667
SCHECK_PARTIAL();
4668
}
4669
else Feptr += fc;
4670
break;
4671
4672
case OP_ANYNL:
4673
for (i = Lmin; i < Lmax; i++)
4674
{
4675
int len = 1;
4676
if (Feptr >= mb->end_subject)
4677
{
4678
SCHECK_PARTIAL();
4679
break;
4680
}
4681
GETCHARLEN(fc, Feptr, len);
4682
if (fc == CHAR_CR)
4683
{
4684
if (++Feptr >= mb->end_subject) break;
4685
if (UCHAR21(Feptr) == CHAR_LF) Feptr++;
4686
}
4687
else
4688
{
4689
if (fc != CHAR_LF &&
4690
(mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4691
(fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4692
#ifndef EBCDIC
4693
&& fc != 0x2028 && fc != 0x2029
4694
#endif /* Not EBCDIC */
4695
)))
4696
break;
4697
Feptr += len;
4698
}
4699
}
4700
break;
4701
4702
case OP_NOT_HSPACE:
4703
case OP_HSPACE:
4704
for (i = Lmin; i < Lmax; i++)
4705
{
4706
BOOL gotspace;
4707
int len = 1;
4708
if (Feptr >= mb->end_subject)
4709
{
4710
SCHECK_PARTIAL();
4711
break;
4712
}
4713
GETCHARLEN(fc, Feptr, len);
4714
switch(fc)
4715
{
4716
HSPACE_CASES: gotspace = TRUE; break;
4717
default: gotspace = FALSE; break;
4718
}
4719
if (gotspace == (Lctype == OP_NOT_HSPACE)) break;
4720
Feptr += len;
4721
}
4722
break;
4723
4724
case OP_NOT_VSPACE:
4725
case OP_VSPACE:
4726
for (i = Lmin; i < Lmax; i++)
4727
{
4728
BOOL gotspace;
4729
int len = 1;
4730
if (Feptr >= mb->end_subject)
4731
{
4732
SCHECK_PARTIAL();
4733
break;
4734
}
4735
GETCHARLEN(fc, Feptr, len);
4736
switch(fc)
4737
{
4738
VSPACE_CASES: gotspace = TRUE; break;
4739
default: gotspace = FALSE; break;
4740
}
4741
if (gotspace == (Lctype == OP_NOT_VSPACE)) break;
4742
Feptr += len;
4743
}
4744
break;
4745
4746
case OP_NOT_DIGIT:
4747
for (i = Lmin; i < Lmax; i++)
4748
{
4749
int len = 1;
4750
if (Feptr >= mb->end_subject)
4751
{
4752
SCHECK_PARTIAL();
4753
break;
4754
}
4755
GETCHARLEN(fc, Feptr, len);
4756
if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break;
4757
Feptr+= len;
4758
}
4759
break;
4760
4761
case OP_DIGIT:
4762
for (i = Lmin; i < Lmax; i++)
4763
{
4764
int len = 1;
4765
if (Feptr >= mb->end_subject)
4766
{
4767
SCHECK_PARTIAL();
4768
break;
4769
}
4770
GETCHARLEN(fc, Feptr, len);
4771
if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break;
4772
Feptr+= len;
4773
}
4774
break;
4775
4776
case OP_NOT_WHITESPACE:
4777
for (i = Lmin; i < Lmax; i++)
4778
{
4779
int len = 1;
4780
if (Feptr >= mb->end_subject)
4781
{
4782
SCHECK_PARTIAL();
4783
break;
4784
}
4785
GETCHARLEN(fc, Feptr, len);
4786
if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break;
4787
Feptr+= len;
4788
}
4789
break;
4790
4791
case OP_WHITESPACE:
4792
for (i = Lmin; i < Lmax; i++)
4793
{
4794
int len = 1;
4795
if (Feptr >= mb->end_subject)
4796
{
4797
SCHECK_PARTIAL();
4798
break;
4799
}
4800
GETCHARLEN(fc, Feptr, len);
4801
if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break;
4802
Feptr+= len;
4803
}
4804
break;
4805
4806
case OP_NOT_WORDCHAR:
4807
for (i = Lmin; i < Lmax; i++)
4808
{
4809
int len = 1;
4810
if (Feptr >= mb->end_subject)
4811
{
4812
SCHECK_PARTIAL();
4813
break;
4814
}
4815
GETCHARLEN(fc, Feptr, len);
4816
if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break;
4817
Feptr+= len;
4818
}
4819
break;
4820
4821
case OP_WORDCHAR:
4822
for (i = Lmin; i < Lmax; i++)
4823
{
4824
int len = 1;
4825
if (Feptr >= mb->end_subject)
4826
{
4827
SCHECK_PARTIAL();
4828
break;
4829
}
4830
GETCHARLEN(fc, Feptr, len);
4831
if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break;
4832
Feptr+= len;
4833
}
4834
break;
4835
4836
default:
4837
PCRE2_DEBUG_UNREACHABLE();
4838
return PCRE2_ERROR_INTERNAL;
4839
}
4840
4841
if (reptype == REPTYPE_POS) continue; /* No backtracking */
4842
4843
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
4844
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go
4845
too far. */
4846
4847
for(;;)
4848
{
4849
if (Feptr <= Lstart_eptr) break;
4850
RMATCH(Fecode, RM220);
4851
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4852
Feptr--;
4853
BACKCHAR(Feptr);
4854
if (Lctype == OP_ANYNL && Feptr > Lstart_eptr &&
4855
UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR)
4856
Feptr--;
4857
}
4858
}
4859
else
4860
#endif /* SUPPORT_UNICODE */
4861
4862
/* Not UTF mode */
4863
{
4864
switch(Lctype)
4865
{
4866
case OP_ANY:
4867
for (i = Lmin; i < Lmax; i++)
4868
{
4869
if (Feptr >= mb->end_subject)
4870
{
4871
SCHECK_PARTIAL();
4872
break;
4873
}
4874
if (IS_NEWLINE(Feptr)) break;
4875
if (mb->partial != 0 && /* Take care with CRLF partial */
4876
Feptr + 1 >= mb->end_subject &&
4877
NLBLOCK->nltype == NLTYPE_FIXED &&
4878
NLBLOCK->nllen == 2 &&
4879
*Feptr == NLBLOCK->nl[0])
4880
{
4881
mb->hitend = TRUE;
4882
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4883
}
4884
Feptr++;
4885
}
4886
break;
4887
4888
case OP_ALLANY:
4889
case OP_ANYBYTE:
4890
fc = Lmax - Lmin;
4891
if (fc > (uint32_t)(mb->end_subject - Feptr))
4892
{
4893
Feptr = mb->end_subject;
4894
SCHECK_PARTIAL();
4895
}
4896
else Feptr += fc;
4897
break;
4898
4899
case OP_ANYNL:
4900
for (i = Lmin; i < Lmax; i++)
4901
{
4902
if (Feptr >= mb->end_subject)
4903
{
4904
SCHECK_PARTIAL();
4905
break;
4906
}
4907
fc = *Feptr;
4908
if (fc == CHAR_CR)
4909
{
4910
if (++Feptr >= mb->end_subject) break;
4911
if (*Feptr == CHAR_LF) Feptr++;
4912
}
4913
else
4914
{
4915
if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4916
(fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4917
#if PCRE2_CODE_UNIT_WIDTH != 8
4918
&& fc != 0x2028 && fc != 0x2029
4919
#endif
4920
))) break;
4921
Feptr++;
4922
}
4923
}
4924
break;
4925
4926
case OP_NOT_HSPACE:
4927
for (i = Lmin; i < Lmax; i++)
4928
{
4929
if (Feptr >= mb->end_subject)
4930
{
4931
SCHECK_PARTIAL();
4932
break;
4933
}
4934
switch(*Feptr)
4935
{
4936
default: Feptr++; break;
4937
HSPACE_BYTE_CASES:
4938
#if PCRE2_CODE_UNIT_WIDTH != 8
4939
HSPACE_MULTIBYTE_CASES:
4940
#endif
4941
goto ENDLOOP00;
4942
}
4943
}
4944
ENDLOOP00:
4945
break;
4946
4947
case OP_HSPACE:
4948
for (i = Lmin; i < Lmax; i++)
4949
{
4950
if (Feptr >= mb->end_subject)
4951
{
4952
SCHECK_PARTIAL();
4953
break;
4954
}
4955
switch(*Feptr)
4956
{
4957
default: goto ENDLOOP01;
4958
HSPACE_BYTE_CASES:
4959
#if PCRE2_CODE_UNIT_WIDTH != 8
4960
HSPACE_MULTIBYTE_CASES:
4961
#endif
4962
Feptr++; break;
4963
}
4964
}
4965
ENDLOOP01:
4966
break;
4967
4968
case OP_NOT_VSPACE:
4969
for (i = Lmin; i < Lmax; i++)
4970
{
4971
if (Feptr >= mb->end_subject)
4972
{
4973
SCHECK_PARTIAL();
4974
break;
4975
}
4976
switch(*Feptr)
4977
{
4978
default: Feptr++; break;
4979
VSPACE_BYTE_CASES:
4980
#if PCRE2_CODE_UNIT_WIDTH != 8
4981
VSPACE_MULTIBYTE_CASES:
4982
#endif
4983
goto ENDLOOP02;
4984
}
4985
}
4986
ENDLOOP02:
4987
break;
4988
4989
case OP_VSPACE:
4990
for (i = Lmin; i < Lmax; i++)
4991
{
4992
if (Feptr >= mb->end_subject)
4993
{
4994
SCHECK_PARTIAL();
4995
break;
4996
}
4997
switch(*Feptr)
4998
{
4999
default: goto ENDLOOP03;
5000
VSPACE_BYTE_CASES:
5001
#if PCRE2_CODE_UNIT_WIDTH != 8
5002
VSPACE_MULTIBYTE_CASES:
5003
#endif
5004
Feptr++; break;
5005
}
5006
}
5007
ENDLOOP03:
5008
break;
5009
5010
case OP_NOT_DIGIT:
5011
for (i = Lmin; i < Lmax; i++)
5012
{
5013
if (Feptr >= mb->end_subject)
5014
{
5015
SCHECK_PARTIAL();
5016
break;
5017
}
5018
if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
5019
break;
5020
Feptr++;
5021
}
5022
break;
5023
5024
case OP_DIGIT:
5025
for (i = Lmin; i < Lmax; i++)
5026
{
5027
if (Feptr >= mb->end_subject)
5028
{
5029
SCHECK_PARTIAL();
5030
break;
5031
}
5032
if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
5033
break;
5034
Feptr++;
5035
}
5036
break;
5037
5038
case OP_NOT_WHITESPACE:
5039
for (i = Lmin; i < Lmax; i++)
5040
{
5041
if (Feptr >= mb->end_subject)
5042
{
5043
SCHECK_PARTIAL();
5044
break;
5045
}
5046
if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
5047
break;
5048
Feptr++;
5049
}
5050
break;
5051
5052
case OP_WHITESPACE:
5053
for (i = Lmin; i < Lmax; i++)
5054
{
5055
if (Feptr >= mb->end_subject)
5056
{
5057
SCHECK_PARTIAL();
5058
break;
5059
}
5060
if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
5061
break;
5062
Feptr++;
5063
}
5064
break;
5065
5066
case OP_NOT_WORDCHAR:
5067
for (i = Lmin; i < Lmax; i++)
5068
{
5069
if (Feptr >= mb->end_subject)
5070
{
5071
SCHECK_PARTIAL();
5072
break;
5073
}
5074
if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
5075
break;
5076
Feptr++;
5077
}
5078
break;
5079
5080
case OP_WORDCHAR:
5081
for (i = Lmin; i < Lmax; i++)
5082
{
5083
if (Feptr >= mb->end_subject)
5084
{
5085
SCHECK_PARTIAL();
5086
break;
5087
}
5088
if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
5089
break;
5090
Feptr++;
5091
}
5092
break;
5093
5094
default:
5095
PCRE2_DEBUG_UNREACHABLE();
5096
return PCRE2_ERROR_INTERNAL;
5097
}
5098
5099
if (reptype == REPTYPE_POS) continue; /* No backtracking */
5100
5101
for (;;)
5102
{
5103
if (Feptr == Lstart_eptr) break;
5104
RMATCH(Fecode, RM34);
5105
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5106
Feptr--;
5107
if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF &&
5108
Feptr[-1] == CHAR_CR) Feptr--;
5109
}
5110
}
5111
}
5112
break; /* End of repeat character type processing */
5113
5114
#undef Lstart_eptr
5115
#undef Lmin
5116
#undef Lmax
5117
#undef Lctype
5118
#undef Lpropvalue
5119
5120
5121
/* ===================================================================== */
5122
/* Match a back reference, possibly repeatedly. Look past the end of the
5123
item to see if there is repeat information following. The OP_REF and
5124
OP_REFI opcodes are used for a reference to a numbered group or to a
5125
non-duplicated named group. For a duplicated named group, OP_DNREF and
5126
OP_DNREFI are used. In this case we must scan the list of groups to which
5127
the name refers, and use the first one that is set. */
5128
5129
#define Lmin F->temp_32[0]
5130
#define Lmax F->temp_32[1]
5131
#define Lcaseless F->temp_32[2]
5132
#define Lcaseopts F->temp_32[3]
5133
#define Lstart F->temp_sptr[0]
5134
#define Loffset F->temp_size
5135
5136
case OP_DNREF:
5137
case OP_DNREFI:
5138
Lcaseless = (Fop == OP_DNREFI);
5139
Lcaseopts = (Fop == OP_DNREFI)? Fecode[1 + 2*IMM2_SIZE] : 0;
5140
{
5141
int count = GET2(Fecode, 1+IMM2_SIZE);
5142
PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5143
Fecode += 1 + 2*IMM2_SIZE + (Fop == OP_DNREFI? 1 : 0);
5144
5145
while (count-- > 0)
5146
{
5147
Loffset = (GET2(slot, 0) << 1) - 2;
5148
if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break;
5149
slot += mb->name_entry_size;
5150
}
5151
}
5152
goto REF_REPEAT;
5153
5154
case OP_REF:
5155
case OP_REFI:
5156
Lcaseless = (Fop == OP_REFI);
5157
Lcaseopts = (Fop == OP_REFI)? Fecode[1 + IMM2_SIZE] : 0;
5158
Loffset = (GET2(Fecode, 1) << 1) - 2;
5159
Fecode += 1 + IMM2_SIZE + (Fop == OP_REFI? 1 : 0);
5160
5161
/* Set up for repetition, or handle the non-repeated case. The maximum and
5162
minimum must be in the heap frame, but as they are short-term values, we
5163
use temporary fields. */
5164
5165
REF_REPEAT:
5166
switch (*Fecode)
5167
{
5168
case OP_CRSTAR:
5169
case OP_CRMINSTAR:
5170
case OP_CRPLUS:
5171
case OP_CRMINPLUS:
5172
case OP_CRQUERY:
5173
case OP_CRMINQUERY:
5174
fc = *Fecode++ - OP_CRSTAR;
5175
Lmin = rep_min[fc];
5176
Lmax = rep_max[fc];
5177
reptype = rep_typ[fc];
5178
break;
5179
5180
case OP_CRRANGE:
5181
case OP_CRMINRANGE:
5182
Lmin = GET2(Fecode, 1);
5183
Lmax = GET2(Fecode, 1 + IMM2_SIZE);
5184
reptype = rep_typ[*Fecode - OP_CRSTAR];
5185
if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
5186
Fecode += 1 + 2 * IMM2_SIZE;
5187
break;
5188
5189
default: /* No repeat follows */
5190
{
5191
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &length);
5192
if (rrc != 0)
5193
{
5194
if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
5195
CHECK_PARTIAL();
5196
RRETURN(MATCH_NOMATCH);
5197
}
5198
}
5199
Feptr += length;
5200
continue; /* With the main loop */
5201
}
5202
5203
/* Handle repeated back references. If a set group has length zero, just
5204
continue with the main loop, because it matches however many times. For an
5205
unset reference, if the minimum is zero, we can also just continue. We can
5206
also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an unset
5207
group behave as a zero-length group. For any other unset cases, carrying
5208
on will result in NOMATCH. */
5209
5210
if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET)
5211
{
5212
if (Fovector[Loffset] == Fovector[Loffset + 1]) continue;
5213
}
5214
else /* Group is not set */
5215
{
5216
if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
5217
continue;
5218
}
5219
5220
/* First, ensure the minimum number of matches are present. */
5221
5222
for (i = 1; i <= Lmin; i++)
5223
{
5224
PCRE2_SIZE slength;
5225
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
5226
if (rrc != 0)
5227
{
5228
if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
5229
CHECK_PARTIAL();
5230
RRETURN(MATCH_NOMATCH);
5231
}
5232
Feptr += slength;
5233
}
5234
5235
/* If min = max, we are done. They are not both allowed to be zero. */
5236
5237
if (Lmin == Lmax) continue;
5238
5239
/* If minimizing, keep trying and advancing the pointer. */
5240
5241
if (reptype == REPTYPE_MIN)
5242
{
5243
for (;;)
5244
{
5245
PCRE2_SIZE slength;
5246
RMATCH(Fecode, RM20);
5247
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5248
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
5249
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
5250
if (rrc != 0)
5251
{
5252
if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
5253
CHECK_PARTIAL();
5254
RRETURN(MATCH_NOMATCH);
5255
}
5256
Feptr += slength;
5257
}
5258
5259
PCRE2_UNREACHABLE(); /* Control never reaches here */
5260
}
5261
5262
/* If maximizing, find the longest string and work backwards, as long as
5263
the matched lengths for each iteration are the same. */
5264
5265
else
5266
{
5267
BOOL samelengths = TRUE;
5268
Lstart = Feptr; /* Starting position */
5269
Flength = Fovector[Loffset+1] - Fovector[Loffset];
5270
5271
for (i = Lmin; i < Lmax; i++)
5272
{
5273
PCRE2_SIZE slength;
5274
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
5275
if (rrc != 0)
5276
{
5277
/* Can't use CHECK_PARTIAL because we don't want to update Feptr in
5278
the soft partial matching case. */
5279
5280
if (rrc > 0 && mb->partial != 0 &&
5281
mb->end_subject > mb->start_used_ptr)
5282
{
5283
mb->hitend = TRUE;
5284
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5285
}
5286
break;
5287
}
5288
5289
if (slength != Flength) samelengths = FALSE;
5290
Feptr += slength;
5291
}
5292
5293
/* If the length matched for each repetition is the same as the length of
5294
the captured group, we can easily work backwards. This is the normal
5295
case. In caseless UTF-8 mode, though, there are pairs of case-equivalent
5296
characters whose lengths (in terms of code units) differ. However, this
5297
is very rare, so we handle it by re-matching fewer and fewer times. */
5298
5299
if (samelengths)
5300
{
5301
while (Feptr >= Lstart)
5302
{
5303
RMATCH(Fecode, RM21);
5304
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5305
Feptr -= Flength;
5306
}
5307
}
5308
5309
/* The rare case of non-matching lengths. Re-scan the repetition for each
5310
iteration. We know that match_ref() will succeed every time. */
5311
5312
else
5313
{
5314
Lmax = i;
5315
for (;;)
5316
{
5317
RMATCH(Fecode, RM22);
5318
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5319
if (Feptr == Lstart) break; /* Failed after minimal repetition */
5320
Feptr = Lstart;
5321
Lmax--;
5322
for (i = Lmin; i < Lmax; i++)
5323
{
5324
PCRE2_SIZE slength;
5325
(void)match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
5326
Feptr += slength;
5327
}
5328
}
5329
}
5330
5331
RRETURN(MATCH_NOMATCH);
5332
}
5333
5334
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
5335
5336
#undef Lcaseless
5337
#undef Lmin
5338
#undef Lmax
5339
#undef Lstart
5340
#undef Loffset
5341
5342
5343
5344
/* ========================================================================= */
5345
/* Opcodes for the start of various parenthesized items */
5346
/* ========================================================================= */
5347
5348
/* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the
5349
(*THEN) is within the current branch by comparing the address of OP_THEN
5350
that is passed back with the end of the branch. If (*THEN) is within the
5351
current branch, and the branch is one of two or more alternatives (it
5352
either starts or ends with OP_ALT), we have reached the limit of THEN's
5353
action, so convert the return code to NOMATCH, which will cause normal
5354
backtracking to happen from now on. Otherwise, THEN is passed back to an
5355
outer alternative. This implements Perl's treatment of parenthesized
5356
groups, where a group not containing | does not affect the current
5357
alternative, that is, (X) is NOT the same as (X|(*F)). */
5358
5359
5360
/* ===================================================================== */
5361
/* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive
5362
bracket group, indicating that it may occur zero times. It may repeat
5363
infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in
5364
the pattern. Brackets with fixed upper repeat limits are compiled as a
5365
number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO.
5366
Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */
5367
5368
#define Lnext_ecode F->temp_sptr[0]
5369
5370
case OP_BRAZERO:
5371
Lnext_ecode = Fecode + 1;
5372
RMATCH(Lnext_ecode, RM9);
5373
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5374
do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
5375
Fecode = Lnext_ecode + 1 + LINK_SIZE;
5376
break;
5377
5378
case OP_BRAMINZERO:
5379
Lnext_ecode = Fecode + 1;
5380
do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
5381
RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10);
5382
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5383
Fecode++;
5384
break;
5385
5386
#undef Lnext_ecode
5387
5388
case OP_SKIPZERO:
5389
Fecode++;
5390
do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
5391
Fecode += 1 + LINK_SIZE;
5392
break;
5393
5394
5395
/* ===================================================================== */
5396
/* Handle possessive brackets with an unlimited repeat. The end of these
5397
brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without
5398
going further in the pattern. */
5399
5400
#define Lframe_type F->temp_32[0]
5401
#define Lmatched_once F->temp_32[1]
5402
#define Lzero_allowed F->temp_32[2]
5403
#define Lstart_eptr F->temp_sptr[0]
5404
#define Lstart_group F->temp_sptr[1]
5405
5406
case OP_BRAPOSZERO:
5407
Lzero_allowed = TRUE; /* Zero repeat is allowed */
5408
Fecode += 1;
5409
if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS)
5410
goto POSSESSIVE_CAPTURE;
5411
goto POSSESSIVE_NON_CAPTURE;
5412
5413
case OP_BRAPOS:
5414
case OP_SBRAPOS:
5415
Lzero_allowed = FALSE; /* Zero repeat not allowed */
5416
5417
POSSESSIVE_NON_CAPTURE:
5418
Lframe_type = GF_NOCAPTURE; /* Remembered frame type */
5419
goto POSSESSIVE_GROUP;
5420
5421
case OP_CBRAPOS:
5422
case OP_SCBRAPOS:
5423
Lzero_allowed = FALSE; /* Zero repeat not allowed */
5424
5425
POSSESSIVE_CAPTURE:
5426
number = GET2(Fecode, 1+LINK_SIZE);
5427
Lframe_type = GF_CAPTURE | number; /* Remembered frame type */
5428
5429
POSSESSIVE_GROUP:
5430
Lmatched_once = FALSE; /* Never matched */
5431
Lstart_group = Fecode; /* Start of this group */
5432
5433
for (;;)
5434
{
5435
Lstart_eptr = Feptr; /* Position at group start */
5436
group_frame_type = Lframe_type;
5437
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8);
5438
if (rrc == MATCH_KETRPOS)
5439
{
5440
Lmatched_once = TRUE; /* Matched at least once */
5441
if (Feptr == Lstart_eptr) /* Empty match; skip to end */
5442
{
5443
do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5444
break;
5445
}
5446
5447
Fecode = Lstart_group;
5448
continue;
5449
}
5450
5451
/* See comment above about handling THEN. */
5452
5453
if (rrc == MATCH_THEN)
5454
{
5455
PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5456
if (mb->verb_ecode_ptr < next_ecode &&
5457
(*Fecode == OP_ALT || *next_ecode == OP_ALT))
5458
rrc = MATCH_NOMATCH;
5459
}
5460
5461
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5462
Fecode += GET(Fecode, 1);
5463
if (*Fecode != OP_ALT) break;
5464
}
5465
5466
/* Success if matched something or zero repeat allowed */
5467
5468
if (Lmatched_once || Lzero_allowed)
5469
{
5470
Fecode += 1 + LINK_SIZE;
5471
break;
5472
}
5473
5474
RRETURN(MATCH_NOMATCH);
5475
5476
#undef Lmatched_once
5477
#undef Lzero_allowed
5478
#undef Lframe_type
5479
#undef Lstart_eptr
5480
#undef Lstart_group
5481
5482
5483
/* ===================================================================== */
5484
/* Handle non-capturing brackets that cannot match an empty string. When we
5485
get to the final alternative within the brackets, as long as there are no
5486
THEN's in the pattern, we can optimize by not recording a new backtracking
5487
point. (Ideally we should test for a THEN within this group, but we don't
5488
have that information.) Don't do this if we are at the very top level,
5489
however, because that would make handling assertions and once-only brackets
5490
messier when there is nothing to go back to. */
5491
5492
#define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */
5493
#define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */
5494
5495
case OP_BRA:
5496
if (mb->hasthen || Frdepth == 0)
5497
{
5498
Lframe_type = 0;
5499
goto GROUPLOOP;
5500
}
5501
5502
for (;;)
5503
{
5504
Lnext_branch = Fecode + GET(Fecode, 1);
5505
if (*Lnext_branch != OP_ALT) break;
5506
5507
/* This is never the final branch. We do not need to test for MATCH_THEN
5508
here because this code is not used when there is a THEN in the pattern. */
5509
5510
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1);
5511
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5512
Fecode = Lnext_branch;
5513
}
5514
5515
/* Hit the start of the final branch. Continue at this level. */
5516
5517
Fecode += PRIV(OP_lengths)[*Fecode];
5518
break;
5519
5520
#undef Lnext_branch
5521
5522
5523
/* ===================================================================== */
5524
/* Handle a capturing bracket, other than those that are possessive with an
5525
unlimited repeat. */
5526
5527
case OP_CBRA:
5528
case OP_SCBRA:
5529
Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE);
5530
goto GROUPLOOP;
5531
5532
5533
/* ===================================================================== */
5534
/* Atomic groups and non-capturing brackets that can match an empty string
5535
must record a backtracking point and also set up a chained frame. */
5536
5537
case OP_ONCE:
5538
case OP_SCRIPT_RUN:
5539
case OP_SBRA:
5540
Lframe_type = GF_NOCAPTURE | Fop;
5541
5542
GROUPLOOP:
5543
for (;;)
5544
{
5545
group_frame_type = Lframe_type;
5546
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2);
5547
if (rrc == MATCH_THEN)
5548
{
5549
PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5550
if (mb->verb_ecode_ptr < next_ecode &&
5551
(*Fecode == OP_ALT || *next_ecode == OP_ALT))
5552
rrc = MATCH_NOMATCH;
5553
}
5554
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5555
Fecode += GET(Fecode, 1);
5556
if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5557
}
5558
PCRE2_UNREACHABLE(); /* Control never reaches here */
5559
5560
#undef Lframe_type
5561
5562
5563
/* ===================================================================== */
5564
/* Pattern recursion either matches the current regex, or some
5565
subexpression. The offset data is the offset to the starting bracket from
5566
the start of the whole pattern. This is so that it works from duplicated
5567
subpatterns. For a whole-pattern recursion, we have to infer the number
5568
zero. */
5569
5570
#define Lframe_type F->temp_32[0]
5571
#define Lstart_branch F->temp_sptr[0]
5572
5573
case OP_RECURSE:
5574
bracode = mb->start_code + GET(Fecode, 1);
5575
number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE);
5576
5577
/* If we are already in a pattern recursion, check for repeating the same
5578
one without changing the subject pointer or the last referenced character
5579
in the subject. This should catch convoluted mutual recursions; some
5580
simple cases are caught at compile time. However, there are rare cases when
5581
this check needs to be turned off. In this case, actual recursion loops
5582
will be caught by the match or heap limits. */
5583
5584
if (Fcurrent_recurse != RECURSE_UNSET)
5585
{
5586
offset = Flast_group_offset;
5587
while (offset != PCRE2_UNSET)
5588
{
5589
N = (heapframe *)((char *)match_data->heapframes + offset);
5590
P = (heapframe *)((char *)N - frame_size);
5591
if (N->group_frame_type == (GF_RECURSE | number))
5592
{
5593
if (Feptr == P->eptr && mb->last_used_ptr == P->recurse_last_used &&
5594
(mb->moptions & PCRE2_DISABLE_RECURSELOOP_CHECK) == 0)
5595
return PCRE2_ERROR_RECURSELOOP;
5596
break;
5597
}
5598
offset = P->last_group_offset;
5599
}
5600
}
5601
5602
/* Remember the current last referenced character and then run the
5603
recursion branch by branch. */
5604
5605
F->recurse_last_used = mb->last_used_ptr;
5606
Lstart_branch = bracode;
5607
Lframe_type = GF_RECURSE | number;
5608
5609
for (;;)
5610
{
5611
PCRE2_SPTR next_ecode;
5612
5613
group_frame_type = Lframe_type;
5614
RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);
5615
next_ecode = Lstart_branch + GET(Lstart_branch,1);
5616
5617
/* Handle backtracking verbs, which are defined in a range that can
5618
easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to
5619
escape beyond a recursion; they cause a NOMATCH for the entire recursion.
5620
5621
When one of these verbs triggers, the current recursion group number is
5622
recorded. If it matches the recursion we are processing, the verb
5623
happened within the recursion and we must deal with it. Otherwise it must
5624
have happened after the recursion completed, and so has to be passed
5625
back. See comment above about handling THEN. */
5626
5627
if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&
5628
mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))
5629
{
5630
if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&
5631
(*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))
5632
rrc = MATCH_NOMATCH;
5633
else RRETURN(MATCH_NOMATCH);
5634
}
5635
5636
/* Note that carrying on after (*ACCEPT) in a recursion is handled in the
5637
OP_ACCEPT code. Nothing needs to be done here. */
5638
5639
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5640
Lstart_branch = next_ecode;
5641
if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);
5642
}
5643
PCRE2_UNREACHABLE(); /* Control never reaches here */
5644
5645
#undef Lframe_type
5646
#undef Lstart_branch
5647
5648
5649
/* ===================================================================== */
5650
/* Positive assertions are like other groups except that PCRE doesn't allow
5651
the effect of (*THEN) to escape beyond an assertion; it is therefore
5652
treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its
5653
captures and mark retained. Any other return is an error. */
5654
5655
#define Lframe_type F->temp_32[0]
5656
5657
case OP_ASSERT:
5658
case OP_ASSERTBACK:
5659
case OP_ASSERT_NA:
5660
case OP_ASSERTBACK_NA:
5661
Lframe_type = GF_NOCAPTURE | Fop;
5662
for (;;)
5663
{
5664
group_frame_type = Lframe_type;
5665
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3);
5666
if (rrc == MATCH_ACCEPT)
5667
{
5668
memcpy(Fovector,
5669
(char *)assert_accept_frame + offsetof(heapframe, ovector),
5670
assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5671
Foffset_top = assert_accept_frame->offset_top;
5672
Fmark = assert_accept_frame->mark;
5673
break;
5674
}
5675
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
5676
Fecode += GET(Fecode, 1);
5677
if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5678
}
5679
5680
do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5681
Fecode += 1 + LINK_SIZE;
5682
break;
5683
5684
#undef Lframe_type
5685
5686
5687
/* ===================================================================== */
5688
/* Handle negative assertions. Loop for each non-matching branch as for
5689
positive assertions. */
5690
5691
#define Lframe_type F->temp_32[0]
5692
5693
case OP_ASSERT_NOT:
5694
case OP_ASSERTBACK_NOT:
5695
Lframe_type = GF_NOCAPTURE | Fop;
5696
5697
for (;;)
5698
{
5699
group_frame_type = Lframe_type;
5700
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4);
5701
switch(rrc)
5702
{
5703
case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */
5704
case MATCH_MATCH:
5705
RRETURN (MATCH_NOMATCH);
5706
5707
case MATCH_NOMATCH: /* Branch failed, try next if present. */
5708
case MATCH_THEN:
5709
Fecode += GET(Fecode, 1);
5710
if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED;
5711
break;
5712
5713
case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */
5714
case MATCH_SKIP:
5715
case MATCH_PRUNE:
5716
do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5717
goto ASSERT_NOT_FAILED;
5718
5719
default: /* Pass back any other return */
5720
RRETURN(rrc);
5721
}
5722
}
5723
5724
/* None of the branches have matched or there was a backtrack to (*COMMIT),
5725
(*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a
5726
negative assertion, so carry on. */
5727
5728
ASSERT_NOT_FAILED:
5729
Fecode += 1 + LINK_SIZE;
5730
break;
5731
5732
#undef Lframe_type
5733
5734
/* ===================================================================== */
5735
/* Handle scan substring operation. */
5736
5737
#define Lframe_type F->temp_32[0]
5738
#define Lextra_size F->temp_32[1]
5739
#define Lsaved_moptions F->temp_32[2]
5740
#define Lsaved_end_subject F->temp_sptr[0]
5741
#define Lsaved_eptr F->temp_sptr[1]
5742
#define Ltrue_end_extra F->temp_size
5743
5744
case OP_ASSERT_SCS:
5745
{
5746
PCRE2_SPTR ecode = Fecode + 1 + LINK_SIZE;
5747
uint32_t extra_size = 0;
5748
int count;
5749
PCRE2_SPTR slot;
5750
5751
/* Disable compiler warning. */
5752
offset = 0;
5753
(void)offset;
5754
5755
for (;;)
5756
{
5757
if (*ecode == OP_CREF)
5758
{
5759
extra_size += 1+IMM2_SIZE;
5760
offset = (GET2(ecode, 1) << 1) - 2;
5761
ecode += 1+IMM2_SIZE;
5762
if (offset < Foffset_top && Fovector[offset] != PCRE2_UNSET)
5763
goto SCS_OFFSET_FOUND;
5764
continue;
5765
}
5766
5767
if (*ecode != OP_DNCREF) RRETURN(MATCH_NOMATCH);
5768
5769
count = GET2(ecode, 1 + IMM2_SIZE);
5770
slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size;
5771
extra_size += 1+2*IMM2_SIZE;
5772
ecode += 1+2*IMM2_SIZE;
5773
5774
while (count > 0)
5775
{
5776
offset = (GET2(slot, 0) << 1) - 2;
5777
if (offset < Foffset_top && Fovector[offset] != PCRE2_UNSET)
5778
goto SCS_OFFSET_FOUND;
5779
slot += mb->name_entry_size;
5780
count--;
5781
}
5782
}
5783
5784
SCS_OFFSET_FOUND:
5785
5786
/* Skip remaining options. */
5787
for (;;)
5788
{
5789
if (*ecode == OP_CREF)
5790
{
5791
extra_size += 1+IMM2_SIZE;
5792
ecode += 1+IMM2_SIZE;
5793
}
5794
else if (*ecode == OP_DNCREF)
5795
{
5796
extra_size += 1+2*IMM2_SIZE;
5797
ecode += 1+2*IMM2_SIZE;
5798
}
5799
else break;
5800
}
5801
5802
Lextra_size = extra_size;
5803
}
5804
5805
Lsaved_end_subject = mb->end_subject;
5806
Ltrue_end_extra = mb->true_end_subject - mb->end_subject;
5807
Lsaved_eptr = Feptr;
5808
Lsaved_moptions = mb->moptions;
5809
5810
Feptr = mb->start_subject + Fovector[offset];
5811
mb->true_end_subject = mb->end_subject =
5812
mb->start_subject + Fovector[offset + 1];
5813
mb->moptions &= ~PCRE2_NOTEOL;
5814
5815
Lframe_type = GF_NOCAPTURE | Fop;
5816
for (;;)
5817
{
5818
group_frame_type = Lframe_type;
5819
RMATCH(Fecode + 1 + LINK_SIZE + Lextra_size, RM38);
5820
if (rrc == MATCH_ACCEPT)
5821
{
5822
memcpy(Fovector,
5823
(char *)assert_accept_frame + offsetof(heapframe, ovector),
5824
assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5825
Foffset_top = assert_accept_frame->offset_top;
5826
Fmark = assert_accept_frame->mark;
5827
break;
5828
}
5829
5830
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
5831
{
5832
mb->end_subject = Lsaved_end_subject;
5833
mb->true_end_subject = mb->end_subject + Ltrue_end_extra;
5834
mb->moptions = Lsaved_moptions;
5835
RRETURN(rrc);
5836
}
5837
5838
Fecode += GET(Fecode, 1);
5839
if (*Fecode != OP_ALT)
5840
{
5841
mb->end_subject = Lsaved_end_subject;
5842
mb->true_end_subject = mb->end_subject + Ltrue_end_extra;
5843
mb->moptions = Lsaved_moptions;
5844
RRETURN(MATCH_NOMATCH);
5845
}
5846
Lextra_size = 0;
5847
}
5848
5849
do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5850
Fecode += 1 + LINK_SIZE;
5851
Feptr = Lsaved_eptr;
5852
break;
5853
5854
#undef Lframe_type
5855
#undef Lextra_size
5856
#undef Lsaved_end_subject
5857
#undef Lsaved_eptr
5858
#undef Ltrue_end_extra
5859
#undef Lsaved_moptions
5860
5861
/* ===================================================================== */
5862
/* The callout item calls an external function, if one is provided, passing
5863
details of the match so far. This is mainly for debugging, though the
5864
function is able to force a failure. */
5865
5866
case OP_CALLOUT:
5867
case OP_CALLOUT_STR:
5868
rrc = do_callout(F, mb, &length);
5869
if (rrc > 0) RRETURN(MATCH_NOMATCH);
5870
if (rrc < 0) RRETURN(rrc);
5871
Fecode += length;
5872
break;
5873
5874
5875
/* ===================================================================== */
5876
/* Conditional group: compilation checked that there are no more than two
5877
branches. If the condition is false, skipping the first branch takes us
5878
past the end of the item if there is only one branch, but that's exactly
5879
what we want. */
5880
5881
case OP_COND:
5882
case OP_SCOND:
5883
5884
/* The variable Flength will be added to Fecode when the condition is
5885
false, to get to the second branch. Setting it to the offset to the ALT or
5886
KET, then incrementing Fecode achieves this effect. However, if the second
5887
branch is non-existent, we must point to the KET so that the end of the
5888
group is correctly processed. We now have Fecode pointing to the condition
5889
or callout. */
5890
5891
Flength = GET(Fecode, 1); /* Offset to the second branch */
5892
if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE;
5893
Fecode += 1 + LINK_SIZE; /* From this opcode */
5894
5895
/* Because of the way auto-callout works during compile, a callout item is
5896
inserted between OP_COND and an assertion condition. Such a callout can
5897
also be inserted manually. */
5898
5899
if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR)
5900
{
5901
rrc = do_callout(F, mb, &length);
5902
if (rrc > 0) RRETURN(MATCH_NOMATCH);
5903
if (rrc < 0) RRETURN(rrc);
5904
5905
/* Advance Fecode past the callout, so it now points to the condition. We
5906
must adjust Flength so that the value of Fecode+Flength is unchanged. */
5907
5908
Fecode += length;
5909
Flength -= length;
5910
}
5911
5912
/* Test the various possible conditions */
5913
5914
condition = FALSE;
5915
switch(*Fecode)
5916
{
5917
case OP_RREF: /* Group recursion test */
5918
if (Fcurrent_recurse != RECURSE_UNSET)
5919
{
5920
number = GET2(Fecode, 1);
5921
condition = (number == RREF_ANY || number == Fcurrent_recurse);
5922
}
5923
break;
5924
5925
case OP_DNRREF: /* Duplicate named group recursion test */
5926
if (Fcurrent_recurse != RECURSE_UNSET)
5927
{
5928
int count = GET2(Fecode, 1 + IMM2_SIZE);
5929
PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5930
while (count-- > 0)
5931
{
5932
number = GET2(slot, 0);
5933
condition = number == Fcurrent_recurse;
5934
if (condition) break;
5935
slot += mb->name_entry_size;
5936
}
5937
}
5938
break;
5939
5940
case OP_CREF: /* Numbered group used test */
5941
offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */
5942
condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5943
break;
5944
5945
case OP_DNCREF: /* Duplicate named group used test */
5946
{
5947
int count = GET2(Fecode, 1 + IMM2_SIZE);
5948
PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5949
while (count-- > 0)
5950
{
5951
offset = (GET2(slot, 0) << 1) - 2;
5952
condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5953
if (condition) break;
5954
slot += mb->name_entry_size;
5955
}
5956
}
5957
break;
5958
5959
case OP_FALSE:
5960
case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
5961
break;
5962
5963
case OP_TRUE:
5964
condition = TRUE;
5965
break;
5966
5967
/* The condition is an assertion. Run code similar to the assertion code
5968
above. */
5969
5970
#define Lpositive F->temp_32[0]
5971
#define Lstart_branch F->temp_sptr[0]
5972
5973
default:
5974
Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK);
5975
Lstart_branch = Fecode;
5976
5977
for (;;)
5978
{
5979
group_frame_type = GF_CONDASSERT | *Fecode;
5980
RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5);
5981
5982
switch(rrc)
5983
{
5984
case MATCH_ACCEPT: /* Save captures */
5985
memcpy(Fovector,
5986
(char *)assert_accept_frame + offsetof(heapframe, ovector),
5987
assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5988
Foffset_top = assert_accept_frame->offset_top;
5989
5990
/* Fall through */
5991
/* In the case of a match, the captures have already been put into
5992
the current frame. */
5993
5994
case MATCH_MATCH:
5995
condition = Lpositive; /* TRUE for positive assertion */
5996
break;
5997
5998
/* PCRE doesn't allow the effect of (*THEN) to escape beyond an
5999
assertion; it is therefore always treated as NOMATCH. */
6000
6001
case MATCH_NOMATCH:
6002
case MATCH_THEN:
6003
Lstart_branch += GET(Lstart_branch, 1);
6004
if (*Lstart_branch == OP_ALT) continue; /* Try next branch */
6005
condition = !Lpositive; /* TRUE for negative assertion */
6006
break;
6007
6008
/* These force no match without checking other branches. */
6009
6010
case MATCH_COMMIT:
6011
case MATCH_SKIP:
6012
case MATCH_PRUNE:
6013
condition = !Lpositive;
6014
break;
6015
6016
default:
6017
RRETURN(rrc);
6018
}
6019
break; /* Out of the branch loop */
6020
}
6021
6022
/* If the condition is true, find the end of the assertion so that
6023
advancing past it gets us to the start of the first branch. */
6024
6025
if (condition)
6026
{
6027
do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
6028
}
6029
break; /* End of assertion condition */
6030
}
6031
6032
#undef Lpositive
6033
#undef Lstart_branch
6034
6035
/* Choose branch according to the condition. */
6036
6037
Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength;
6038
6039
/* If the opcode is OP_SCOND it means we are at a repeated conditional
6040
group that might match an empty string. We must therefore descend a level
6041
so that the start is remembered for checking. For OP_COND we can just
6042
continue at this level. */
6043
6044
if (Fop == OP_SCOND)
6045
{
6046
group_frame_type = GF_NOCAPTURE | Fop;
6047
RMATCH(Fecode, RM35);
6048
RRETURN(rrc);
6049
}
6050
break;
6051
6052
6053
6054
/* ========================================================================= */
6055
/* End of start of parenthesis opcodes */
6056
/* ========================================================================= */
6057
6058
6059
/* ===================================================================== */
6060
/* Move the subject pointer back by one fixed amount. This occurs at the
6061
start of each branch that has a fixed length in a lookbehind assertion. If
6062
we are too close to the start to move back, fail. When working with UTF-8
6063
we move back a number of characters, not bytes. */
6064
6065
case OP_REVERSE:
6066
number = GET2(Fecode, 1);
6067
#ifdef SUPPORT_UNICODE
6068
if (utf)
6069
{
6070
/* We used to do a simpler `while (number-- > 0)` but that triggers
6071
clang's unsigned integer overflow sanitizer. */
6072
while (number > 0)
6073
{
6074
--number;
6075
if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH);
6076
Feptr--;
6077
BACKCHAR(Feptr);
6078
}
6079
}
6080
else
6081
#endif
6082
6083
/* No UTF support, or not in UTF mode: count is code unit count */
6084
6085
{
6086
if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
6087
Feptr -= number;
6088
}
6089
6090
/* Save the earliest consulted character, then skip to next opcode */
6091
6092
if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;
6093
Fecode += 1 + IMM2_SIZE;
6094
break;
6095
6096
6097
/* ===================================================================== */
6098
/* Move the subject pointer back by a variable amount. This occurs at the
6099
start of each branch of a lookbehind assertion when the branch has a
6100
variable, but limited, length. A loop is needed to try matching the branch
6101
after moving back different numbers of characters. If we are too close to
6102
the start to move back even the minimum amount, fail. When working with
6103
UTF-8 we move back a number of characters, not bytes. */
6104
6105
#define Lmin F->temp_32[0]
6106
#define Lmax F->temp_32[1]
6107
#define Leptr F->temp_sptr[0]
6108
6109
case OP_VREVERSE:
6110
Lmin = GET2(Fecode, 1);
6111
Lmax = GET2(Fecode, 1 + IMM2_SIZE);
6112
Leptr = Feptr;
6113
6114
/* Move back by the maximum branch length and then work forwards. This
6115
ensures that items such as \d{3,5} get the maximum length, which is
6116
relevant for captures, and makes for Perl compatibility. */
6117
6118
#ifdef SUPPORT_UNICODE
6119
if (utf)
6120
{
6121
for (i = 0; i < Lmax; i++)
6122
{
6123
if (Feptr == mb->start_subject)
6124
{
6125
if (i < Lmin) RRETURN(MATCH_NOMATCH);
6126
Lmax = i;
6127
break;
6128
}
6129
Feptr--;
6130
BACKCHAR(Feptr);
6131
}
6132
}
6133
else
6134
#endif
6135
6136
/* No UTF support or not in UTF mode */
6137
6138
{
6139
ptrdiff_t diff = Feptr - mb->start_subject;
6140
uint32_t available = (diff > 65535)? 65535 : ((diff > 0)? (int)diff : 0);
6141
if (Lmin > available) RRETURN(MATCH_NOMATCH);
6142
if (Lmax > available) Lmax = available;
6143
Feptr -= Lmax;
6144
}
6145
6146
/* Now try matching, moving forward one character on failure, until we
6147
reach the minimum back length. */
6148
6149
for (;;)
6150
{
6151
RMATCH(Fecode + 1 + 2 * IMM2_SIZE, RM37);
6152
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6153
if (Lmax-- <= Lmin) RRETURN(MATCH_NOMATCH);
6154
Feptr++;
6155
#ifdef SUPPORT_UNICODE
6156
if (utf) { FORWARDCHARTEST(Feptr, mb->end_subject); }
6157
#endif
6158
}
6159
PCRE2_UNREACHABLE(); /* Control never reaches here */
6160
6161
#undef Lmin
6162
#undef Lmax
6163
#undef Leptr
6164
6165
/* ===================================================================== */
6166
/* An alternation is the end of a branch; scan along to find the end of the
6167
bracketed group. */
6168
6169
case OP_ALT:
6170
branch_end = Fecode;
6171
do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
6172
break;
6173
6174
6175
/* ===================================================================== */
6176
/* The end of a parenthesized group. For all but OP_BRA and OP_COND, the
6177
starting frame was added to the chained frames in order to remember the
6178
starting subject position for the group. (Not true for OP_BRA when it's a
6179
whole pattern recursion, but that is handled separately below.)*/
6180
6181
case OP_KET:
6182
case OP_KETRMIN:
6183
case OP_KETRMAX:
6184
case OP_KETRPOS:
6185
6186
bracode = Fecode - GET(Fecode, 1);
6187
6188
if (branch_end == NULL) branch_end = Fecode;
6189
branch_start = bracode;
6190
while (branch_start + GET(branch_start, 1) != branch_end)
6191
branch_start += GET(branch_start, 1);
6192
branch_end = NULL;
6193
6194
/* Point N to the frame at the start of the most recent group, and P to its
6195
predecessor. Remember the subject pointer at the start of the group. */
6196
6197
if (*bracode != OP_BRA && *bracode != OP_COND)
6198
{
6199
N = (heapframe *)((char *)match_data->heapframes + Flast_group_offset);
6200
P = (heapframe *)((char *)N - frame_size);
6201
Flast_group_offset = P->last_group_offset;
6202
6203
#ifdef DEBUG_SHOW_RMATCH
6204
fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n",
6205
N->rdepth, N->group_frame_type,
6206
(char *)P->eptr - (char *)mb->start_subject);
6207
#endif
6208
6209
/* If we are at the end of an assertion that is a condition, first check
6210
to see if we are at the end of a variable-length branch in a lookbehind.
6211
If this is the case and we have not landed on the current character,
6212
return no match. Compare code below for non-condition lookbehinds. In
6213
other cases, return a match, discarding any intermediate backtracking
6214
points. Copy back the mark setting and the captures into the frame before
6215
N so that they are set on return. Doing this for all assertions, both
6216
positive and negative, seems to match what Perl does. */
6217
6218
if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT)
6219
{
6220
if ((*bracode == OP_ASSERTBACK || *bracode == OP_ASSERTBACK_NOT) &&
6221
branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
6222
RRETURN(MATCH_NOMATCH);
6223
memcpy((char *)P + offsetof(heapframe, ovector), Fovector,
6224
Foffset_top * sizeof(PCRE2_SIZE));
6225
P->offset_top = Foffset_top;
6226
P->mark = Fmark;
6227
Fback_frame = (char *)F - (char *)P;
6228
RRETURN(MATCH_MATCH);
6229
}
6230
}
6231
else P = NULL; /* Indicates starting frame not recorded */
6232
6233
/* The group was not a conditional assertion. */
6234
6235
switch (*bracode)
6236
{
6237
/* Whole pattern recursion is handled as a recursion into group 0, but
6238
the entire pattern is wrapped in OP_BRA/OP_KET rather than a capturing
6239
group - a design mistake: it should perhaps have been capture group 0.
6240
Anyway, that means the end of such recursion must be handled here. It is
6241
detected by checking for an immediately following OP_END when we are
6242
recursing in group 0. If this is not the end of a whole-pattern
6243
recursion, there is nothing to be done. */
6244
6245
case OP_BRA:
6246
if (Fcurrent_recurse != 0 || Fecode[1+LINK_SIZE] != OP_END) break;
6247
6248
/* It is the end of whole-pattern recursion. */
6249
6250
offset = Flast_group_offset;
6251
6252
/* Corrupted heapframes? Trigger an assert and return an error. */
6253
PCRE2_ASSERT(offset != PCRE2_UNSET);
6254
if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
6255
6256
N = (heapframe *)((char *)match_data->heapframes + offset);
6257
P = (heapframe *)((char *)N - frame_size);
6258
Flast_group_offset = P->last_group_offset;
6259
6260
/* Reinstate the previous set of captures and then carry on after the
6261
recursion call. */
6262
6263
memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
6264
Foffset_top * sizeof(PCRE2_SIZE));
6265
Foffset_top = P->offset_top;
6266
Fcapture_last = P->capture_last;
6267
Fcurrent_recurse = P->current_recurse;
6268
Fecode = P->ecode + 1 + LINK_SIZE;
6269
continue; /* With next opcode */
6270
6271
case OP_COND: /* No need to do anything for these */
6272
case OP_SCOND:
6273
break;
6274
6275
/* Non-atomic positive assertions are like OP_BRA, except that the
6276
subject pointer must be put back to where it was at the start of the
6277
assertion. For a variable lookbehind, check its end point. */
6278
6279
case OP_ASSERTBACK_NA:
6280
if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
6281
RRETURN(MATCH_NOMATCH);
6282
/* Fall through */
6283
6284
case OP_ASSERT_NA:
6285
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
6286
Feptr = P->eptr;
6287
break;
6288
6289
/* Atomic positive assertions are like OP_ONCE, except that in addition
6290
the subject pointer must be put back to where it was at the start of the
6291
assertion. For a variable lookbehind, check its end point. */
6292
6293
case OP_ASSERTBACK:
6294
if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
6295
RRETURN(MATCH_NOMATCH);
6296
/* Fall through */
6297
6298
case OP_ASSERT:
6299
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
6300
Feptr = P->eptr;
6301
/* Fall through */
6302
6303
/* For an atomic group, discard internal backtracking points. We must
6304
also ensure that any remaining branches within the top-level of the group
6305
are not tried. Do this by adjusting the code pointer within the backtrack
6306
frame so that it points to the final branch. */
6307
6308
case OP_ONCE:
6309
Fback_frame = ((char *)F - (char *)P);
6310
for (;;)
6311
{
6312
uint32_t y = GET(P->ecode,1);
6313
if ((P->ecode)[y] != OP_ALT) break;
6314
P->ecode += y;
6315
}
6316
break;
6317
6318
/* A matching negative assertion returns MATCH, which is turned into
6319
NOMATCH at the assertion level. For a variable lookbehind, check its end
6320
point. */
6321
6322
case OP_ASSERTBACK_NOT:
6323
if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
6324
RRETURN(MATCH_NOMATCH);
6325
/* Fall through */
6326
6327
case OP_ASSERT_NOT:
6328
RRETURN(MATCH_MATCH);
6329
6330
/* A scan substring group must preserve the current end_subject,
6331
and restore it before the backtracking is performed into its sub
6332
pattern. */
6333
6334
case OP_ASSERT_SCS:
6335
F->temp_sptr[0] = mb->end_subject;
6336
mb->end_subject = P->temp_sptr[0];
6337
mb->true_end_subject = mb->end_subject + P->temp_size;
6338
Feptr = P->temp_sptr[1];
6339
6340
RMATCH(Fecode + 1 + LINK_SIZE, RM39);
6341
6342
mb->end_subject = F->temp_sptr[0];
6343
mb->true_end_subject = mb->end_subject;
6344
RRETURN(rrc);
6345
break;
6346
6347
/* At the end of a script run, apply the script-checking rules. This code
6348
will never be exercised if Unicode support is not compiled, because in
6349
that environment script runs cause an error at compile time. */
6350
6351
case OP_SCRIPT_RUN:
6352
if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
6353
break;
6354
6355
/* Whole-pattern recursion is coded as a recurse into group 0, and is
6356
handled with OP_BRA above. Other recursion is handled here. */
6357
6358
case OP_CBRA:
6359
case OP_CBRAPOS:
6360
case OP_SCBRA:
6361
case OP_SCBRAPOS:
6362
number = GET2(bracode, 1+LINK_SIZE);
6363
6364
/* Handle a recursively called group. We reinstate the previous set of
6365
captures and then carry on after the recursion call. */
6366
6367
if (Fcurrent_recurse == number)
6368
{
6369
P = (heapframe *)((char *)N - frame_size);
6370
memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
6371
Foffset_top * sizeof(PCRE2_SIZE));
6372
Foffset_top = P->offset_top;
6373
Fcapture_last = P->capture_last;
6374
Fcurrent_recurse = P->current_recurse;
6375
Fecode = P->ecode + 1 + LINK_SIZE;
6376
continue; /* With next opcode */
6377
}
6378
6379
/* Deal with actual capturing. */
6380
6381
offset = (number << 1) - 2;
6382
Fcapture_last = number;
6383
Fovector[offset] = P->eptr - mb->start_subject;
6384
Fovector[offset+1] = Feptr - mb->start_subject;
6385
if (offset >= Foffset_top) Foffset_top = offset + 2;
6386
break;
6387
} /* End actions relating to the starting opcode */
6388
6389
/* OP_KETRPOS is a possessive repeating ket. Remember the current position,
6390
and return the MATCH_KETRPOS. This makes it possible to do the repeats one
6391
at a time from the outer level. This must precede the empty string test -
6392
in this case that test is done at the outer level. */
6393
6394
if (*Fecode == OP_KETRPOS)
6395
{
6396
memcpy((char *)P + offsetof(heapframe, eptr),
6397
(char *)F + offsetof(heapframe, eptr),
6398
frame_copy_size);
6399
RRETURN(MATCH_KETRPOS);
6400
}
6401
6402
/* Handle the different kinds of closing brackets. A non-repeating ket
6403
needs no special action, just continuing at this level. This also happens
6404
for the repeating kets if the group matched no characters, in order to
6405
forcibly break infinite loops. Otherwise, the repeating kets try the rest
6406
of the pattern or restart from the preceding bracket, in the appropriate
6407
order. */
6408
6409
if (Fop != OP_KET && (P == NULL || Feptr != P->eptr))
6410
{
6411
if (Fop == OP_KETRMIN)
6412
{
6413
RMATCH(Fecode + 1 + LINK_SIZE, RM6);
6414
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6415
Fecode -= GET(Fecode, 1);
6416
break; /* End of ket processing */
6417
}
6418
6419
/* Repeat the maximum number of times (KETRMAX) */
6420
6421
RMATCH(bracode, RM7);
6422
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6423
}
6424
6425
/* Carry on at this level for a non-repeating ket, or after matching an
6426
empty string, or after repeating for a maximum number of times. */
6427
6428
Fecode += 1 + LINK_SIZE;
6429
break;
6430
6431
6432
/* ===================================================================== */
6433
/* Start and end of line assertions, not multiline mode. */
6434
6435
case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */
6436
if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0)
6437
RRETURN(MATCH_NOMATCH);
6438
Fecode++;
6439
break;
6440
6441
case OP_SOD: /* Unconditional start of subject */
6442
if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH);
6443
Fecode++;
6444
break;
6445
6446
/* When PCRE2_NOTEOL is unset, assert before the subject end, or a
6447
terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */
6448
6449
case OP_DOLL:
6450
if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
6451
if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
6452
6453
/* Fall through */
6454
/* Unconditional end of subject assertion (\z). */
6455
6456
case OP_EOD:
6457
if (Feptr < mb->true_end_subject) RRETURN(MATCH_NOMATCH);
6458
if (mb->partial != 0)
6459
{
6460
mb->hitend = TRUE;
6461
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
6462
}
6463
Fecode++;
6464
break;
6465
6466
/* End of subject or ending \n assertion (\Z) */
6467
6468
case OP_EODN:
6469
ASSERT_NL_OR_EOS:
6470
if (Feptr < mb->true_end_subject &&
6471
(!IS_NEWLINE(Feptr) || Feptr != mb->true_end_subject - mb->nllen))
6472
{
6473
if (mb->partial != 0 &&
6474
Feptr + 1 >= mb->end_subject &&
6475
NLBLOCK->nltype == NLTYPE_FIXED &&
6476
NLBLOCK->nllen == 2 &&
6477
UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
6478
{
6479
mb->hitend = TRUE;
6480
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
6481
}
6482
RRETURN(MATCH_NOMATCH);
6483
}
6484
6485
/* Either at end of string or \n before end. */
6486
6487
if (mb->partial != 0)
6488
{
6489
mb->hitend = TRUE;
6490
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
6491
}
6492
Fecode++;
6493
break;
6494
6495
6496
/* ===================================================================== */
6497
/* Start and end of line assertions, multiline mode. */
6498
6499
/* Start of subject unless notbol, or after any newline except for one at
6500
the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */
6501
6502
case OP_CIRCM:
6503
if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject)
6504
RRETURN(MATCH_NOMATCH);
6505
if (Feptr != mb->start_subject &&
6506
((Feptr == mb->end_subject &&
6507
(mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||
6508
!WAS_NEWLINE(Feptr)))
6509
RRETURN(MATCH_NOMATCH);
6510
Fecode++;
6511
break;
6512
6513
/* Assert before any newline, or before end of subject unless noteol is
6514
set. */
6515
6516
case OP_DOLLM:
6517
if (Feptr < mb->end_subject)
6518
{
6519
if (!IS_NEWLINE(Feptr))
6520
{
6521
if (mb->partial != 0 &&
6522
Feptr + 1 >= mb->end_subject &&
6523
NLBLOCK->nltype == NLTYPE_FIXED &&
6524
NLBLOCK->nllen == 2 &&
6525
UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
6526
{
6527
mb->hitend = TRUE;
6528
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
6529
}
6530
RRETURN(MATCH_NOMATCH);
6531
}
6532
}
6533
else
6534
{
6535
if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
6536
SCHECK_PARTIAL();
6537
}
6538
Fecode++;
6539
break;
6540
6541
6542
/* ===================================================================== */
6543
/* Start of match assertion */
6544
6545
case OP_SOM:
6546
if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH);
6547
Fecode++;
6548
break;
6549
6550
6551
/* ===================================================================== */
6552
/* Reset the start of match point */
6553
6554
case OP_SET_SOM:
6555
Fstart_match = Feptr;
6556
Fecode++;
6557
break;
6558
6559
6560
/* ===================================================================== */
6561
/* Word boundary assertions. Find out if the previous and current
6562
characters are "word" characters. It takes a bit more work in UTF mode.
6563
Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is
6564
not set. When it is set, use Unicode properties if available, even when not
6565
in UTF mode. Remember the earliest and latest consulted characters. */
6566
6567
case OP_NOT_WORD_BOUNDARY:
6568
case OP_WORD_BOUNDARY:
6569
case OP_NOT_UCP_WORD_BOUNDARY:
6570
case OP_UCP_WORD_BOUNDARY:
6571
if (Feptr == mb->check_subject) prev_is_word = FALSE; else
6572
{
6573
PCRE2_SPTR lastptr = Feptr - 1;
6574
#ifdef SUPPORT_UNICODE
6575
if (utf)
6576
{
6577
BACKCHAR(lastptr);
6578
GETCHAR(fc, lastptr);
6579
}
6580
else
6581
#endif /* SUPPORT_UNICODE */
6582
fc = *lastptr;
6583
if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
6584
#ifdef SUPPORT_UNICODE
6585
if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)
6586
{
6587
int chartype = UCD_CHARTYPE(fc);
6588
int category = PRIV(ucp_gentype)[chartype];
6589
prev_is_word = (category == ucp_L || category == ucp_N ||
6590
chartype == ucp_Mn || chartype == ucp_Pc);
6591
}
6592
else
6593
#endif /* SUPPORT_UNICODE */
6594
prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
6595
}
6596
6597
/* Get status of next character */
6598
6599
if (Feptr >= mb->end_subject)
6600
{
6601
SCHECK_PARTIAL();
6602
cur_is_word = FALSE;
6603
}
6604
else
6605
{
6606
PCRE2_SPTR nextptr = Feptr + 1;
6607
#ifdef SUPPORT_UNICODE
6608
if (utf)
6609
{
6610
FORWARDCHARTEST(nextptr, mb->end_subject);
6611
GETCHAR(fc, Feptr);
6612
}
6613
else
6614
#endif /* SUPPORT_UNICODE */
6615
fc = *Feptr;
6616
if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
6617
#ifdef SUPPORT_UNICODE
6618
if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)
6619
{
6620
int chartype = UCD_CHARTYPE(fc);
6621
int category = PRIV(ucp_gentype)[chartype];
6622
cur_is_word = (category == ucp_L || category == ucp_N ||
6623
chartype == ucp_Mn || chartype == ucp_Pc);
6624
}
6625
else
6626
#endif /* SUPPORT_UNICODE */
6627
cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
6628
}
6629
6630
/* Now see if the situation is what we want */
6631
6632
if ((*Fecode++ == OP_WORD_BOUNDARY || Fop == OP_UCP_WORD_BOUNDARY)?
6633
cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6634
RRETURN(MATCH_NOMATCH);
6635
break;
6636
6637
6638
/* ===================================================================== */
6639
/* Backtracking (*VERB)s, with and without arguments. Note that if the
6640
pattern is successfully matched, we do not come back from RMATCH. */
6641
6642
case OP_MARK:
6643
Fmark = mb->nomatch_mark = Fecode + 2;
6644
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12);
6645
6646
/* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
6647
argument, and we must check whether that argument matches this MARK's
6648
argument. It is passed back in mb->verb_skip_ptr. If it does match, we
6649
return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject
6650
position that corresponds to this mark. Otherwise, pass back the return
6651
code unaltered. */
6652
6653
if (rrc == MATCH_SKIP_ARG &&
6654
PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0)
6655
{
6656
mb->verb_skip_ptr = Feptr; /* Pass back current position */
6657
RRETURN(MATCH_SKIP);
6658
}
6659
RRETURN(rrc);
6660
6661
case OP_FAIL:
6662
RRETURN(MATCH_NOMATCH);
6663
6664
/* Record the current recursing group number in mb->verb_current_recurse
6665
when a backtracking return such as MATCH_COMMIT is given. This enables the
6666
recurse processing to catch verbs from within the recursion. */
6667
6668
case OP_COMMIT:
6669
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);
6670
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6671
mb->verb_current_recurse = Fcurrent_recurse;
6672
RRETURN(MATCH_COMMIT);
6673
6674
case OP_COMMIT_ARG:
6675
Fmark = mb->nomatch_mark = Fecode + 2;
6676
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36);
6677
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6678
mb->verb_current_recurse = Fcurrent_recurse;
6679
RRETURN(MATCH_COMMIT);
6680
6681
case OP_PRUNE:
6682
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);
6683
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6684
mb->verb_current_recurse = Fcurrent_recurse;
6685
RRETURN(MATCH_PRUNE);
6686
6687
case OP_PRUNE_ARG:
6688
Fmark = mb->nomatch_mark = Fecode + 2;
6689
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);
6690
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6691
mb->verb_current_recurse = Fcurrent_recurse;
6692
RRETURN(MATCH_PRUNE);
6693
6694
case OP_SKIP:
6695
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);
6696
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6697
mb->verb_skip_ptr = Feptr; /* Pass back current position */
6698
mb->verb_current_recurse = Fcurrent_recurse;
6699
RRETURN(MATCH_SKIP);
6700
6701
/* Note that, for Perl compatibility, SKIP with an argument does NOT set
6702
nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
6703
not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
6704
that failed and any that precede it (either they also failed, or were not
6705
triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
6706
SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg
6707
set to the count of the one that failed. */
6708
6709
case OP_SKIP_ARG:
6710
mb->skip_arg_count++;
6711
if (mb->skip_arg_count <= mb->ignore_skip_arg)
6712
{
6713
Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1];
6714
break;
6715
}
6716
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17);
6717
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6718
6719
/* Pass back the current skip name and return the special MATCH_SKIP_ARG
6720
return code. This will either be caught by a matching MARK, or get to the
6721
top, where it causes a rematch with mb->ignore_skip_arg set to the value of
6722
mb->skip_arg_count. */
6723
6724
mb->verb_skip_ptr = Fecode + 2;
6725
mb->verb_current_recurse = Fcurrent_recurse;
6726
RRETURN(MATCH_SKIP_ARG);
6727
6728
/* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
6729
the branch in which it occurs can be determined. */
6730
6731
case OP_THEN:
6732
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);
6733
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6734
mb->verb_ecode_ptr = Fecode;
6735
mb->verb_current_recurse = Fcurrent_recurse;
6736
RRETURN(MATCH_THEN);
6737
6738
case OP_THEN_ARG:
6739
Fmark = mb->nomatch_mark = Fecode + 2;
6740
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);
6741
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6742
mb->verb_ecode_ptr = Fecode;
6743
mb->verb_current_recurse = Fcurrent_recurse;
6744
RRETURN(MATCH_THEN);
6745
6746
6747
/* ===================================================================== */
6748
/* There's been some horrible disaster. Arrival here can only mean there is
6749
something seriously wrong in the code above or the OP_xxx definitions. */
6750
6751
default:
6752
PCRE2_DEBUG_UNREACHABLE();
6753
return PCRE2_ERROR_INTERNAL;
6754
}
6755
6756
/* Do not insert any code in here without much thought; it is assumed
6757
that "continue" in the code above comes out to here to repeat the main
6758
loop. */
6759
6760
} /* End of main loop */
6761
6762
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
6763
6764
/* ========================================================================= */
6765
/* The RRETURN() macro jumps here. The number that is saved in Freturn_id
6766
indicates which label we actually want to return to. The value in Frdepth is
6767
the index number of the frame in the vector. The return value has been placed
6768
in rrc. */
6769
6770
#define LBL(val) case val: goto L_RM##val;
6771
6772
RETURN_SWITCH:
6773
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
6774
if (Frdepth == 0) return rrc; /* Exit from the top level */
6775
F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */
6776
mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */
6777
6778
#ifdef DEBUG_SHOW_RMATCH
6779
fprintf(stderr, "++ RETURN %d to RM%d\n", rrc, Freturn_id);
6780
#endif
6781
6782
switch (Freturn_id)
6783
{
6784
LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6785
LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
6786
LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
6787
LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
6788
LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39)
6789
6790
#ifdef SUPPORT_WIDE_CHARS
6791
LBL(100) LBL(101) LBL(102) LBL(103)
6792
#endif
6793
6794
#ifdef SUPPORT_UNICODE
6795
LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206)
6796
LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213)
6797
LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220)
6798
LBL(221) LBL(222) LBL(223) LBL(224)
6799
#endif
6800
6801
default:
6802
PCRE2_DEBUG_UNREACHABLE();
6803
return PCRE2_ERROR_INTERNAL;
6804
}
6805
#undef LBL
6806
}
6807
6808
6809
/*************************************************
6810
* Match a Regular Expression *
6811
*************************************************/
6812
6813
/* This function applies a compiled pattern to a subject string and picks out
6814
portions of the string if it matches. Two elements in the vector are set for
6815
each substring: the offsets to the start and end of the substring.
6816
6817
Arguments:
6818
code points to the compiled expression
6819
subject points to the subject string
6820
length length of subject string (may contain binary zeros)
6821
start_offset where to start in the subject string
6822
options option bits
6823
match_data points to a match_data block
6824
mcontext points to a PCRE2 context
6825
6826
Returns: > 0 => success; value is the number of ovector pairs filled
6827
= 0 => success, but ovector is not big enough
6828
= -1 => failed to match (PCRE2_ERROR_NOMATCH)
6829
= -2 => partial match (PCRE2_ERROR_PARTIAL)
6830
< -2 => some kind of unexpected problem
6831
*/
6832
6833
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
6834
pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
6835
PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
6836
pcre2_match_context *mcontext)
6837
{
6838
int rc;
6839
int was_zero_terminated = 0;
6840
const uint8_t *start_bits = NULL;
6841
const pcre2_real_code *re = (const pcre2_real_code *)code;
6842
6843
BOOL anchored;
6844
BOOL firstline;
6845
BOOL has_first_cu = FALSE;
6846
BOOL has_req_cu = FALSE;
6847
BOOL startline;
6848
6849
#if PCRE2_CODE_UNIT_WIDTH == 8
6850
PCRE2_SPTR memchr_found_first_cu;
6851
PCRE2_SPTR memchr_found_first_cu2;
6852
#endif
6853
6854
PCRE2_UCHAR first_cu = 0;
6855
PCRE2_UCHAR first_cu2 = 0;
6856
PCRE2_UCHAR req_cu = 0;
6857
PCRE2_UCHAR req_cu2 = 0;
6858
6859
PCRE2_SPTR bumpalong_limit;
6860
PCRE2_SPTR end_subject;
6861
PCRE2_SPTR true_end_subject;
6862
PCRE2_SPTR start_match;
6863
PCRE2_SPTR req_cu_ptr;
6864
PCRE2_SPTR start_partial;
6865
PCRE2_SPTR match_partial;
6866
6867
#ifdef SUPPORT_JIT
6868
BOOL use_jit;
6869
#endif
6870
6871
/* This flag is needed even when Unicode is not supported for convenience
6872
(it is used by the IS_NEWLINE macro). */
6873
6874
BOOL utf = FALSE;
6875
6876
#ifdef SUPPORT_UNICODE
6877
BOOL ucp = FALSE;
6878
BOOL allow_invalid;
6879
uint32_t fragment_options = 0;
6880
#ifdef SUPPORT_JIT
6881
BOOL jit_checked_utf = FALSE;
6882
#endif
6883
#endif /* SUPPORT_UNICODE */
6884
6885
PCRE2_SIZE frame_size;
6886
PCRE2_SIZE heapframes_size;
6887
6888
/* We need to have mb as a pointer to a match block, because the IS_NEWLINE
6889
macro is used below, and it expects NLBLOCK to be defined as a pointer. */
6890
6891
pcre2_callout_block cb;
6892
match_block actual_match_block;
6893
match_block *mb = &actual_match_block;
6894
6895
/* Recognize NULL, length 0 as an empty string. */
6896
6897
if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
6898
6899
/* Plausibility checks */
6900
6901
if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
6902
if (code == NULL || subject == NULL || match_data == NULL)
6903
return PCRE2_ERROR_NULL;
6904
6905
start_match = subject + start_offset;
6906
req_cu_ptr = start_match - 1;
6907
if (length == PCRE2_ZERO_TERMINATED)
6908
{
6909
length = PRIV(strlen)(subject);
6910
was_zero_terminated = 1;
6911
}
6912
true_end_subject = end_subject = subject + length;
6913
6914
if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
6915
6916
/* Check that the first field in the block is the magic number. */
6917
6918
if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
6919
6920
/* Check the code unit width. */
6921
6922
if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
6923
return PCRE2_ERROR_BADMODE;
6924
6925
/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
6926
options variable for this function. Users of PCRE2 who are not calling the
6927
function directly would like to have a way of setting these flags, in the same
6928
way that they can set pcre2_compile() flags like PCRE2_NO_AUTO_POSSESS with
6929
constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
6930
(*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which we now
6931
transfer to the options for this function. The bits are guaranteed to be
6932
adjacent, but do not have the same values. This bit of Boolean trickery assumes
6933
that the match-time bits are not more significant than the flag bits. If by
6934
accident this is not the case, a compile-time division by zero error will
6935
occur. */
6936
6937
#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
6938
#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
6939
options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
6940
#undef FF
6941
#undef OO
6942
6943
/* If the pattern was successfully studied with JIT support, we will run the
6944
JIT executable instead of the rest of this function. Most options must be set
6945
at compile time for the JIT code to be usable. */
6946
6947
#ifdef SUPPORT_JIT
6948
use_jit = (re->executable_jit != NULL &&
6949
(options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
6950
#endif
6951
6952
/* Initialize UTF/UCP parameters. */
6953
6954
#ifdef SUPPORT_UNICODE
6955
utf = (re->overall_options & PCRE2_UTF) != 0;
6956
allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
6957
ucp = (re->overall_options & PCRE2_UCP) != 0;
6958
#endif /* SUPPORT_UNICODE */
6959
6960
/* Convert the partial matching flags into an integer. */
6961
6962
mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
6963
((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;
6964
6965
/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
6966
time. */
6967
6968
if (mb->partial != 0 &&
6969
((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
6970
return PCRE2_ERROR_BADOPTION;
6971
6972
/* It is an error to set an offset limit without setting the flag at compile
6973
time. */
6974
6975
if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET &&
6976
(re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
6977
return PCRE2_ERROR_BADOFFSETLIMIT;
6978
6979
/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
6980
free the memory that was obtained. Set the field to NULL for no match cases. */
6981
6982
if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
6983
{
6984
match_data->memctl.free((void *)match_data->subject,
6985
match_data->memctl.memory_data);
6986
match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
6987
}
6988
match_data->subject = NULL;
6989
6990
/* Zero the error offset in case the first code unit is invalid UTF. */
6991
6992
match_data->startchar = 0;
6993
6994
6995
/* ============================= JIT matching ============================== */
6996
6997
/* Prepare for JIT matching. Check a UTF string for validity unless no check is
6998
requested or invalid UTF can be handled. We check only the portion of the
6999
subject that might be inspected during matching - from the offset minus the
7000
maximum lookbehind to the given length. This saves time when a small part of a
7001
large subject is being matched by the use of a starting offset. Note that the
7002
maximum lookbehind is a number of characters, not code units. */
7003
7004
#ifdef SUPPORT_JIT
7005
if (use_jit)
7006
{
7007
#ifdef SUPPORT_UNICODE
7008
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid)
7009
{
7010
7011
/* For 8-bit and 16-bit UTF, check that the first code unit is a valid
7012
character start. */
7013
7014
#if PCRE2_CODE_UNIT_WIDTH != 32
7015
if (start_match < end_subject && NOT_FIRSTCU(*start_match))
7016
{
7017
if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;
7018
#if PCRE2_CODE_UNIT_WIDTH == 8
7019
return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */
7020
#else
7021
return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */
7022
#endif
7023
}
7024
#endif /* WIDTH != 32 */
7025
7026
/* Move back by the maximum lookbehind, just in case it happens at the very
7027
start of matching. */
7028
7029
#if PCRE2_CODE_UNIT_WIDTH != 32
7030
for (unsigned int i = re->max_lookbehind; i > 0 && start_match > subject; i--)
7031
{
7032
start_match--;
7033
while (start_match > subject &&
7034
#if PCRE2_CODE_UNIT_WIDTH == 8
7035
(*start_match & 0xc0) == 0x80)
7036
#else /* 16-bit */
7037
(*start_match & 0xfc00) == 0xdc00)
7038
#endif
7039
start_match--;
7040
}
7041
#else /* PCRE2_CODE_UNIT_WIDTH != 32 */
7042
7043
/* In the 32-bit library, one code unit equals one character. However,
7044
we cannot just subtract the lookbehind and then compare pointers, because
7045
a very large lookbehind could create an invalid pointer. */
7046
7047
if (start_offset >= re->max_lookbehind)
7048
start_match -= re->max_lookbehind;
7049
else
7050
start_match = subject;
7051
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
7052
7053
/* Validate the relevant portion of the subject. Adjust the offset of an
7054
invalid code point to be an absolute offset in the whole string. */
7055
7056
match_data->rc = PRIV(valid_utf)(start_match,
7057
length - (start_match - subject), &(match_data->startchar));
7058
if (match_data->rc != 0)
7059
{
7060
match_data->startchar += start_match - subject;
7061
return match_data->rc;
7062
}
7063
jit_checked_utf = TRUE;
7064
}
7065
#endif /* SUPPORT_UNICODE */
7066
7067
/* If JIT returns BADOPTION, which means that the selected complete or
7068
partial matching mode was not compiled, fall through to the interpreter. */
7069
7070
rc = pcre2_jit_match(code, subject, length, start_offset, options,
7071
match_data, mcontext);
7072
if (rc != PCRE2_ERROR_JIT_BADOPTION)
7073
{
7074
match_data->subject_length = length;
7075
if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
7076
{
7077
length = CU2BYTES(length + was_zero_terminated);
7078
match_data->subject = match_data->memctl.malloc(length,
7079
match_data->memctl.memory_data);
7080
if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
7081
memcpy((void *)match_data->subject, subject, length);
7082
match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
7083
}
7084
return rc;
7085
}
7086
}
7087
#endif /* SUPPORT_JIT */
7088
7089
/* ========================= End of JIT matching ========================== */
7090
7091
7092
/* Proceed with non-JIT matching. The default is to allow lookbehinds to the
7093
start of the subject. A UTF check when there is a non-zero offset may change
7094
this. */
7095
7096
mb->check_subject = subject;
7097
7098
/* If a UTF subject string was not checked for validity in the JIT code above,
7099
check it here, and handle support for invalid UTF strings. The check above
7100
happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset.
7101
If we get here in those circumstances, it means the subject string is valid,
7102
but for some reason JIT matching was not successful. There is no need to check
7103
the subject again.
7104
7105
We check only the portion of the subject that might be inspected during
7106
matching - from the offset minus the maximum lookbehind to the given length.
7107
This saves time when a small part of a large subject is being matched by the
7108
use of a starting offset. Note that the maximum lookbehind is a number of
7109
characters, not code units.
7110
7111
Note also that support for invalid UTF forces a check, overriding the setting
7112
of PCRE2_NO_UTF_CHECK. */
7113
7114
#ifdef SUPPORT_UNICODE
7115
if (utf &&
7116
#ifdef SUPPORT_JIT
7117
!jit_checked_utf &&
7118
#endif
7119
((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid))
7120
{
7121
#if PCRE2_CODE_UNIT_WIDTH != 32
7122
BOOL skipped_bad_start = FALSE;
7123
#endif
7124
7125
/* For 8-bit and 16-bit UTF, check that the first code unit is a valid
7126
character start. If we are handling invalid UTF, just skip over such code
7127
units. Otherwise, give an appropriate error. */
7128
7129
#if PCRE2_CODE_UNIT_WIDTH != 32
7130
if (allow_invalid)
7131
{
7132
while (start_match < end_subject && NOT_FIRSTCU(*start_match))
7133
{
7134
start_match++;
7135
skipped_bad_start = TRUE;
7136
}
7137
}
7138
else if (start_match < end_subject && NOT_FIRSTCU(*start_match))
7139
{
7140
if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;
7141
#if PCRE2_CODE_UNIT_WIDTH == 8
7142
return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */
7143
#else
7144
return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */
7145
#endif
7146
}
7147
#endif /* WIDTH != 32 */
7148
7149
/* The mb->check_subject field points to the start of UTF checking;
7150
lookbehinds can go back no further than this. */
7151
7152
mb->check_subject = start_match;
7153
7154
/* Move back by the maximum lookbehind, just in case it happens at the very
7155
start of matching, but don't do this if we skipped bad 8-bit or 16-bit code
7156
units above. */
7157
7158
#if PCRE2_CODE_UNIT_WIDTH != 32
7159
if (!skipped_bad_start)
7160
{
7161
unsigned int i;
7162
for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--)
7163
{
7164
mb->check_subject--;
7165
while (mb->check_subject > subject &&
7166
#if PCRE2_CODE_UNIT_WIDTH == 8
7167
(*mb->check_subject & 0xc0) == 0x80)
7168
#else /* 16-bit */
7169
(*mb->check_subject & 0xfc00) == 0xdc00)
7170
#endif
7171
mb->check_subject--;
7172
}
7173
}
7174
#else /* PCRE2_CODE_UNIT_WIDTH != 32 */
7175
7176
/* In the 32-bit library, one code unit equals one character. However,
7177
we cannot just subtract the lookbehind and then compare pointers, because
7178
a very large lookbehind could create an invalid pointer. */
7179
7180
if (start_offset >= re->max_lookbehind)
7181
mb->check_subject -= re->max_lookbehind;
7182
else
7183
mb->check_subject = subject;
7184
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
7185
7186
/* Validate the relevant portion of the subject. There's a loop in case we
7187
encounter bad UTF in the characters preceding start_match which we are
7188
scanning because of a lookbehind. */
7189
7190
for (;;)
7191
{
7192
match_data->rc = PRIV(valid_utf)(mb->check_subject,
7193
length - (mb->check_subject - subject), &(match_data->startchar));
7194
7195
if (match_data->rc == 0) break; /* Valid UTF string */
7196
7197
/* Invalid UTF string. Adjust the offset to be an absolute offset in the
7198
whole string. If we are handling invalid UTF strings, set end_subject to
7199
stop before the bad code unit, and set the options to "not end of line".
7200
Otherwise return the error. */
7201
7202
match_data->startchar += mb->check_subject - subject;
7203
if (!allow_invalid || match_data->rc > 0) return match_data->rc;
7204
end_subject = subject + match_data->startchar;
7205
7206
/* If the end precedes start_match, it means there is invalid UTF in the
7207
extra code units we reversed over because of a lookbehind. Advance past the
7208
first bad code unit, and then skip invalid character starting code units in
7209
8-bit and 16-bit modes, and try again with the original end point. */
7210
7211
if (end_subject < start_match)
7212
{
7213
mb->check_subject = end_subject + 1;
7214
#if PCRE2_CODE_UNIT_WIDTH != 32
7215
while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject))
7216
mb->check_subject++;
7217
#endif
7218
end_subject = true_end_subject;
7219
}
7220
7221
/* Otherwise, set the not end of line option, and do the match. */
7222
7223
else
7224
{
7225
fragment_options = PCRE2_NOTEOL;
7226
break;
7227
}
7228
}
7229
}
7230
#endif /* SUPPORT_UNICODE */
7231
7232
/* A NULL match context means "use a default context", but we take the memory
7233
control functions from the pattern. */
7234
7235
if (mcontext == NULL)
7236
{
7237
mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
7238
mb->memctl = re->memctl;
7239
}
7240
else mb->memctl = mcontext->memctl;
7241
7242
anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;
7243
firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
7244
startline = (re->flags & PCRE2_STARTLINE) != 0;
7245
bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
7246
true_end_subject : subject + mcontext->offset_limit;
7247
7248
/* Initialize and set up the fixed fields in the callout block, with a pointer
7249
in the match block. */
7250
7251
mb->cb = &cb;
7252
cb.version = 2;
7253
cb.subject = subject;
7254
cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
7255
cb.callout_flags = 0;
7256
7257
/* Fill in the remaining fields in the match block, except for moptions, which
7258
gets set later. */
7259
7260
mb->callout = mcontext->callout;
7261
mb->callout_data = mcontext->callout_data;
7262
7263
mb->start_subject = subject;
7264
mb->start_offset = start_offset;
7265
mb->end_subject = end_subject;
7266
mb->true_end_subject = true_end_subject;
7267
mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
7268
mb->allowemptypartial = (re->max_lookbehind > 0) ||
7269
(re->flags & PCRE2_MATCH_EMPTY) != 0;
7270
mb->poptions = re->overall_options; /* Pattern options */
7271
mb->ignore_skip_arg = 0;
7272
mb->mark = mb->nomatch_mark = NULL; /* In case never set */
7273
7274
/* The name table is needed for finding all the numbers associated with a
7275
given name, for condition testing. The code follows the name table. */
7276
7277
mb->name_table = (PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code));
7278
mb->name_count = re->name_count;
7279
mb->name_entry_size = re->name_entry_size;
7280
mb->start_code = (PCRE2_SPTR)((const uint8_t *)re + re->code_start);
7281
7282
/* Process the \R and newline settings. */
7283
7284
mb->bsr_convention = re->bsr_convention;
7285
mb->nltype = NLTYPE_FIXED;
7286
switch(re->newline_convention)
7287
{
7288
case PCRE2_NEWLINE_CR:
7289
mb->nllen = 1;
7290
mb->nl[0] = CHAR_CR;
7291
break;
7292
7293
case PCRE2_NEWLINE_LF:
7294
mb->nllen = 1;
7295
mb->nl[0] = CHAR_NL;
7296
break;
7297
7298
case PCRE2_NEWLINE_NUL:
7299
mb->nllen = 1;
7300
mb->nl[0] = CHAR_NUL;
7301
break;
7302
7303
case PCRE2_NEWLINE_CRLF:
7304
mb->nllen = 2;
7305
mb->nl[0] = CHAR_CR;
7306
mb->nl[1] = CHAR_NL;
7307
break;
7308
7309
case PCRE2_NEWLINE_ANY:
7310
mb->nltype = NLTYPE_ANY;
7311
break;
7312
7313
case PCRE2_NEWLINE_ANYCRLF:
7314
mb->nltype = NLTYPE_ANYCRLF;
7315
break;
7316
7317
default:
7318
PCRE2_DEBUG_UNREACHABLE();
7319
return PCRE2_ERROR_INTERNAL;
7320
}
7321
7322
/* The backtracking frames have fixed data at the front, and a PCRE2_SIZE
7323
vector at the end, whose size depends on the number of capturing parentheses in
7324
the pattern. It is not used at all if there are no capturing parentheses.
7325
7326
frame_size is the total size of each frame
7327
match_data->heapframes is the pointer to the frames vector
7328
match_data->heapframes_size is the allocated size of the vector
7329
7330
We must pad the frame_size for alignment to ensure subsequent frames are as
7331
aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE
7332
array, that does not guarantee it is suitably aligned for pointers, as some
7333
architectures have pointers that are larger than a size_t. */
7334
7335
frame_size = (offsetof(heapframe, ovector) +
7336
re->top_bracket * 2 * sizeof(PCRE2_SIZE) + HEAPFRAME_ALIGNMENT - 1) &
7337
~(HEAPFRAME_ALIGNMENT - 1);
7338
7339
/* Limits set in the pattern override the match context only if they are
7340
smaller. */
7341
7342
mb->heap_limit = ((mcontext->heap_limit < re->limit_heap)?
7343
mcontext->heap_limit : re->limit_heap);
7344
7345
mb->match_limit = (mcontext->match_limit < re->limit_match)?
7346
mcontext->match_limit : re->limit_match;
7347
7348
mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
7349
mcontext->depth_limit : re->limit_depth;
7350
7351
/* If a pattern has very many capturing parentheses, the frame size may be very
7352
large. Set the initial frame vector size to ensure that there are at least 10
7353
available frames, but enforce a minimum of START_FRAMES_SIZE. If this is
7354
greater than the heap limit, get as large a vector as possible. */
7355
7356
heapframes_size = frame_size * 10;
7357
if (heapframes_size < START_FRAMES_SIZE) heapframes_size = START_FRAMES_SIZE;
7358
if (heapframes_size / 1024 > mb->heap_limit)
7359
{
7360
PCRE2_SIZE max_size = 1024 * mb->heap_limit;
7361
if (max_size < frame_size) return PCRE2_ERROR_HEAPLIMIT;
7362
heapframes_size = max_size;
7363
}
7364
7365
/* If an existing frame vector in the match_data block is large enough, we can
7366
use it. Otherwise, free any pre-existing vector and get a new one. */
7367
7368
if (match_data->heapframes_size < heapframes_size)
7369
{
7370
match_data->memctl.free(match_data->heapframes,
7371
match_data->memctl.memory_data);
7372
match_data->heapframes = match_data->memctl.malloc(heapframes_size,
7373
match_data->memctl.memory_data);
7374
if (match_data->heapframes == NULL)
7375
{
7376
match_data->heapframes_size = 0;
7377
return PCRE2_ERROR_NOMEMORY;
7378
}
7379
match_data->heapframes_size = heapframes_size;
7380
}
7381
7382
/* Write to the ovector within the first frame to mark every capture unset and
7383
to avoid uninitialized memory read errors when it is copied to a new frame. */
7384
7385
memset((char *)(match_data->heapframes) + offsetof(heapframe, ovector), 0xff,
7386
frame_size - offsetof(heapframe, ovector));
7387
7388
/* Pointers to the individual character tables */
7389
7390
mb->lcc = re->tables + lcc_offset;
7391
mb->fcc = re->tables + fcc_offset;
7392
mb->ctypes = re->tables + ctypes_offset;
7393
7394
/* Set up the first code unit to match, if available. If there's no first code
7395
unit there may be a bitmap of possible first characters. */
7396
7397
if ((re->flags & PCRE2_FIRSTSET) != 0)
7398
{
7399
has_first_cu = TRUE;
7400
first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
7401
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
7402
{
7403
first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
7404
#ifdef SUPPORT_UNICODE
7405
#if PCRE2_CODE_UNIT_WIDTH == 8
7406
if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
7407
#else
7408
if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
7409
#endif
7410
#endif /* SUPPORT_UNICODE */
7411
}
7412
}
7413
else
7414
if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
7415
start_bits = re->start_bitmap;
7416
7417
/* There may also be a "last known required character" set. */
7418
7419
if ((re->flags & PCRE2_LASTSET) != 0)
7420
{
7421
has_req_cu = TRUE;
7422
req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
7423
if ((re->flags & PCRE2_LASTCASELESS) != 0)
7424
{
7425
req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
7426
#ifdef SUPPORT_UNICODE
7427
#if PCRE2_CODE_UNIT_WIDTH == 8
7428
if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
7429
#else
7430
if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
7431
#endif
7432
#endif /* SUPPORT_UNICODE */
7433
}
7434
}
7435
7436
7437
/* ==========================================================================*/
7438
7439
/* Loop for handling unanchored repeated matching attempts; for anchored regexs
7440
the loop runs just once. */
7441
7442
#ifdef SUPPORT_UNICODE
7443
FRAGMENT_RESTART:
7444
#endif
7445
7446
start_partial = match_partial = NULL;
7447
mb->hitend = FALSE;
7448
7449
#if PCRE2_CODE_UNIT_WIDTH == 8
7450
memchr_found_first_cu = NULL;
7451
memchr_found_first_cu2 = NULL;
7452
#endif
7453
7454
for(;;)
7455
{
7456
PCRE2_SPTR new_start_match;
7457
7458
/* ----------------- Start of match optimizations ---------------- */
7459
7460
/* There are some optimizations that avoid running the match if a known
7461
starting point is not found, or if a known later code unit is not present.
7462
However, there is an option (settable at compile time) that disables these,
7463
for testing and for ensuring that all callouts do actually occur. */
7464
7465
if ((re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)
7466
{
7467
/* If firstline is TRUE, the start of the match is constrained to the first
7468
line of a multiline string. That is, the match must be before or at the
7469
first newline following the start of matching. Temporarily adjust
7470
end_subject so that we stop the scans for a first code unit at a newline.
7471
If the match fails at the newline, later code breaks the loop. */
7472
7473
if (firstline)
7474
{
7475
PCRE2_SPTR t = start_match;
7476
#ifdef SUPPORT_UNICODE
7477
if (utf)
7478
{
7479
while (t < end_subject && !IS_NEWLINE(t))
7480
{
7481
t++;
7482
ACROSSCHAR(t < end_subject, t, t++);
7483
}
7484
}
7485
else
7486
#endif
7487
while (t < end_subject && !IS_NEWLINE(t)) t++;
7488
end_subject = t;
7489
}
7490
7491
/* Anchored: check the first code unit if one is recorded. This may seem
7492
pointless but it can help in detecting a no match case without scanning for
7493
the required code unit. */
7494
7495
if (anchored)
7496
{
7497
if (has_first_cu || start_bits != NULL)
7498
{
7499
BOOL ok = start_match < end_subject;
7500
if (ok)
7501
{
7502
PCRE2_UCHAR c = UCHAR21TEST(start_match);
7503
ok = has_first_cu && (c == first_cu || c == first_cu2);
7504
if (!ok && start_bits != NULL)
7505
{
7506
#if PCRE2_CODE_UNIT_WIDTH != 8
7507
if (c > 255) c = 255;
7508
#endif
7509
ok = (start_bits[c/8] & (1u << (c&7))) != 0;
7510
}
7511
}
7512
if (!ok)
7513
{
7514
rc = MATCH_NOMATCH;
7515
break;
7516
}
7517
}
7518
}
7519
7520
/* Not anchored. Advance to a unique first code unit if there is one. */
7521
7522
else
7523
{
7524
if (has_first_cu)
7525
{
7526
if (first_cu != first_cu2) /* Caseless */
7527
{
7528
/* In 16-bit and 32-bit modes we have to do our own search, so can
7529
look for both cases at once. */
7530
7531
#if PCRE2_CODE_UNIT_WIDTH != 8
7532
PCRE2_UCHAR smc;
7533
while (start_match < end_subject &&
7534
(smc = UCHAR21TEST(start_match)) != first_cu &&
7535
smc != first_cu2)
7536
start_match++;
7537
#else
7538
/* In 8-bit mode, the use of memchr() gives a big speed up, even
7539
though we have to call it twice in order to find the earliest
7540
occurrence of the code unit in either of its cases. Caching is used
7541
to remember the positions of previously found code units. This can
7542
make a huge difference when the strings are very long and only one
7543
case is actually present. */
7544
7545
PCRE2_SPTR pp1 = NULL;
7546
PCRE2_SPTR pp2 = NULL;
7547
PCRE2_SIZE searchlength = end_subject - start_match;
7548
7549
/* If we haven't got a previously found position for first_cu, or if
7550
the current starting position is later, we need to do a search. If
7551
the code unit is not found, set it to the end. */
7552
7553
if (memchr_found_first_cu == NULL ||
7554
start_match > memchr_found_first_cu)
7555
{
7556
pp1 = memchr(start_match, first_cu, searchlength);
7557
memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
7558
}
7559
7560
/* If the start is before a previously found position, use the
7561
previous position, or NULL if a previous search failed. */
7562
7563
else pp1 = (memchr_found_first_cu == end_subject)? NULL :
7564
memchr_found_first_cu;
7565
7566
/* Do the same thing for the other case. */
7567
7568
if (memchr_found_first_cu2 == NULL ||
7569
start_match > memchr_found_first_cu2)
7570
{
7571
pp2 = memchr(start_match, first_cu2, searchlength);
7572
memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
7573
}
7574
7575
else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
7576
memchr_found_first_cu2;
7577
7578
/* Set the start to the end of the subject if neither case was found.
7579
Otherwise, use the earlier found point. */
7580
7581
if (pp1 == NULL)
7582
start_match = (pp2 == NULL)? end_subject : pp2;
7583
else
7584
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
7585
7586
#endif /* 8-bit handling */
7587
}
7588
7589
/* The caseful case is much simpler. */
7590
7591
else
7592
{
7593
#if PCRE2_CODE_UNIT_WIDTH != 8
7594
while (start_match < end_subject && UCHAR21TEST(start_match) !=
7595
first_cu)
7596
start_match++;
7597
#else
7598
start_match = memchr(start_match, first_cu, end_subject - start_match);
7599
if (start_match == NULL) start_match = end_subject;
7600
#endif
7601
}
7602
7603
/* If we can't find the required first code unit, having reached the
7604
true end of the subject, break the bumpalong loop, to force a match
7605
failure, except when doing partial matching, when we let the next cycle
7606
run at the end of the subject. To see why, consider the pattern
7607
/(?<=abc)def/, which partially matches "abc", even though the string
7608
does not contain the starting character "d". If we have not reached the
7609
true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
7610
temporarily modified) we also let the cycle run, because the matching
7611
string is legitimately allowed to start with the first code unit of a
7612
newline. */
7613
7614
if (mb->partial == 0 && start_match >= mb->end_subject)
7615
{
7616
rc = MATCH_NOMATCH;
7617
break;
7618
}
7619
}
7620
7621
/* If there's no first code unit, advance to just after a linebreak for a
7622
multiline match if required. */
7623
7624
else if (startline)
7625
{
7626
if (start_match > mb->start_subject + start_offset)
7627
{
7628
#ifdef SUPPORT_UNICODE
7629
if (utf)
7630
{
7631
while (start_match < end_subject && !WAS_NEWLINE(start_match))
7632
{
7633
start_match++;
7634
ACROSSCHAR(start_match < end_subject, start_match, start_match++);
7635
}
7636
}
7637
else
7638
#endif
7639
while (start_match < end_subject && !WAS_NEWLINE(start_match))
7640
start_match++;
7641
7642
/* If we have just passed a CR and the newline option is ANY or
7643
ANYCRLF, and we are now at a LF, advance the match position by one
7644
more code unit. */
7645
7646
if (start_match[-1] == CHAR_CR &&
7647
(mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
7648
start_match < end_subject &&
7649
UCHAR21TEST(start_match) == CHAR_NL)
7650
start_match++;
7651
}
7652
}
7653
7654
/* If there's no first code unit or a requirement for a multiline line
7655
start, advance to a non-unique first code unit if any have been
7656
identified. The bitmap contains only 256 bits. When code units are 16 or
7657
32 bits wide, all code units greater than 254 set the 255 bit. */
7658
7659
else if (start_bits != NULL)
7660
{
7661
while (start_match < end_subject)
7662
{
7663
uint32_t c = UCHAR21TEST(start_match);
7664
#if PCRE2_CODE_UNIT_WIDTH != 8
7665
if (c > 255) c = 255;
7666
#endif
7667
if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
7668
start_match++;
7669
}
7670
7671
/* See comment above in first_cu checking about the next few lines. */
7672
7673
if (mb->partial == 0 && start_match >= mb->end_subject)
7674
{
7675
rc = MATCH_NOMATCH;
7676
break;
7677
}
7678
}
7679
} /* End first code unit handling */
7680
7681
/* Restore fudged end_subject */
7682
7683
end_subject = mb->end_subject;
7684
7685
/* The following two optimizations must be disabled for partial matching. */
7686
7687
if (mb->partial == 0)
7688
{
7689
PCRE2_SPTR p;
7690
7691
/* The minimum matching length is a lower bound; no string of that length
7692
may actually match the pattern. Although the value is, strictly, in
7693
characters, we treat it as code units to avoid spending too much time in
7694
this optimization. */
7695
7696
if (end_subject - start_match < re->minlength)
7697
{
7698
rc = MATCH_NOMATCH;
7699
break;
7700
}
7701
7702
/* If req_cu is set, we know that that code unit must appear in the
7703
subject for the (non-partial) match to succeed. If the first code unit is
7704
set, req_cu must be later in the subject; otherwise the test starts at
7705
the match point. This optimization can save a huge amount of backtracking
7706
in patterns with nested unlimited repeats that aren't going to match.
7707
Writing separate code for caseful/caseless versions makes it go faster,
7708
as does using an autoincrement and backing off on a match. As in the case
7709
of the first code unit, using memchr() in the 8-bit library gives a big
7710
speed up. Unlike the first_cu check above, we do not need to call
7711
memchr() twice in the caseless case because we only need to check for the
7712
presence of the character in either case, not find the first occurrence.
7713
7714
The search can be skipped if the code unit was found later than the
7715
current starting point in a previous iteration of the bumpalong loop.
7716
7717
HOWEVER: when the subject string is very, very long, searching to its end
7718
can take a long time, and give bad performance on quite ordinary
7719
anchored patterns. This showed up when somebody was matching something
7720
like /^\d+C/ on a 32-megabyte string... so we don't do this when the
7721
string is sufficiently long, but it's worth searching a lot more for
7722
unanchored patterns. */
7723
7724
p = start_match + (has_first_cu? 1:0);
7725
if (has_req_cu && p > req_cu_ptr)
7726
{
7727
PCRE2_SIZE check_length = end_subject - start_match;
7728
7729
if (check_length < REQ_CU_MAX ||
7730
(!anchored && check_length < REQ_CU_MAX * 1000))
7731
{
7732
if (req_cu != req_cu2) /* Caseless */
7733
{
7734
#if PCRE2_CODE_UNIT_WIDTH != 8
7735
while (p < end_subject)
7736
{
7737
uint32_t pp = UCHAR21INCTEST(p);
7738
if (pp == req_cu || pp == req_cu2) { p--; break; }
7739
}
7740
#else /* 8-bit code units */
7741
PCRE2_SPTR pp = p;
7742
p = memchr(pp, req_cu, end_subject - pp);
7743
if (p == NULL)
7744
{
7745
p = memchr(pp, req_cu2, end_subject - pp);
7746
if (p == NULL) p = end_subject;
7747
}
7748
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
7749
}
7750
7751
/* The caseful case */
7752
7753
else
7754
{
7755
#if PCRE2_CODE_UNIT_WIDTH != 8
7756
while (p < end_subject)
7757
{
7758
if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
7759
}
7760
7761
#else /* 8-bit code units */
7762
p = memchr(p, req_cu, end_subject - p);
7763
if (p == NULL) p = end_subject;
7764
#endif
7765
}
7766
7767
/* If we can't find the required code unit, break the bumpalong loop,
7768
forcing a match failure. */
7769
7770
if (p >= end_subject)
7771
{
7772
rc = MATCH_NOMATCH;
7773
break;
7774
}
7775
7776
/* If we have found the required code unit, save the point where we
7777
found it, so that we don't search again next time round the bumpalong
7778
loop if the start hasn't yet passed this code unit. */
7779
7780
req_cu_ptr = p;
7781
}
7782
}
7783
}
7784
}
7785
7786
/* ------------ End of start of match optimizations ------------ */
7787
7788
/* Give no match if we have passed the bumpalong limit. */
7789
7790
if (start_match > bumpalong_limit)
7791
{
7792
rc = MATCH_NOMATCH;
7793
break;
7794
}
7795
7796
/* OK, we can now run the match. If "hitend" is set afterwards, remember the
7797
first starting point for which a partial match was found. */
7798
7799
cb.start_match = (PCRE2_SIZE)(start_match - subject);
7800
cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
7801
7802
mb->start_used_ptr = start_match;
7803
mb->last_used_ptr = start_match;
7804
#ifdef SUPPORT_UNICODE
7805
mb->moptions = options | fragment_options;
7806
#else
7807
mb->moptions = options;
7808
#endif
7809
mb->match_call_count = 0;
7810
mb->end_offset_top = 0;
7811
mb->skip_arg_count = 0;
7812
7813
#ifdef DEBUG_SHOW_OPS
7814
fprintf(stderr, "++ Calling match()\n");
7815
#endif
7816
7817
rc = match(start_match, mb->start_code, re->top_bracket, frame_size,
7818
match_data, mb);
7819
7820
#ifdef DEBUG_SHOW_OPS
7821
fprintf(stderr, "++ match() returned %d\n\n", rc);
7822
#endif
7823
7824
if (mb->hitend && start_partial == NULL)
7825
{
7826
start_partial = mb->start_used_ptr;
7827
match_partial = start_match;
7828
}
7829
7830
switch(rc)
7831
{
7832
/* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
7833
the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
7834
entirely. The only way we can do that is to re-do the match at the same
7835
point, with a flag to force SKIP with an argument to be ignored. Just
7836
treating this case as NOMATCH does not work because it does not check other
7837
alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
7838
7839
case MATCH_SKIP_ARG:
7840
new_start_match = start_match;
7841
mb->ignore_skip_arg = mb->skip_arg_count;
7842
break;
7843
7844
/* SKIP passes back the next starting point explicitly, but if it is no
7845
greater than the match we have just done, treat it as NOMATCH. */
7846
7847
case MATCH_SKIP:
7848
if (mb->verb_skip_ptr > start_match)
7849
{
7850
new_start_match = mb->verb_skip_ptr;
7851
break;
7852
}
7853
/* Fall through */
7854
7855
/* NOMATCH and PRUNE advance by one character. THEN at this level acts
7856
exactly like PRUNE. Unset ignore SKIP-with-argument. */
7857
7858
case MATCH_NOMATCH:
7859
case MATCH_PRUNE:
7860
case MATCH_THEN:
7861
mb->ignore_skip_arg = 0;
7862
new_start_match = start_match + 1;
7863
#ifdef SUPPORT_UNICODE
7864
if (utf)
7865
ACROSSCHAR(new_start_match < end_subject, new_start_match,
7866
new_start_match++);
7867
#endif
7868
break;
7869
7870
/* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
7871
7872
case MATCH_COMMIT:
7873
rc = MATCH_NOMATCH;
7874
goto ENDLOOP;
7875
7876
/* Any other return is either a match, or some kind of error. */
7877
7878
default:
7879
goto ENDLOOP;
7880
}
7881
7882
/* Control reaches here for the various types of "no match at this point"
7883
result. Reset the code to MATCH_NOMATCH for subsequent checking. */
7884
7885
rc = MATCH_NOMATCH;
7886
7887
/* If PCRE2_FIRSTLINE is set, the match must happen before or at the first
7888
newline in the subject (though it may continue over the newline). Therefore,
7889
if we have just failed to match, starting at a newline, do not continue. */
7890
7891
if (firstline && IS_NEWLINE(start_match)) break;
7892
7893
/* Advance to new matching position */
7894
7895
start_match = new_start_match;
7896
7897
/* Break the loop if the pattern is anchored or if we have passed the end of
7898
the subject. */
7899
7900
if (anchored || start_match > end_subject) break;
7901
7902
/* If we have just passed a CR and we are now at a LF, and the pattern does
7903
not contain any explicit matches for \r or \n, and the newline option is CRLF
7904
or ANY or ANYCRLF, advance the match position by one more code unit. In
7905
normal matching start_match will always be greater than the first position at
7906
this stage, but a failed *SKIP can cause a return at the same point, which is
7907
why the first test exists. */
7908
7909
if (start_match > subject + start_offset &&
7910
start_match[-1] == CHAR_CR &&
7911
start_match < end_subject &&
7912
*start_match == CHAR_NL &&
7913
(re->flags & PCRE2_HASCRORLF) == 0 &&
7914
(mb->nltype == NLTYPE_ANY ||
7915
mb->nltype == NLTYPE_ANYCRLF ||
7916
mb->nllen == 2))
7917
start_match++;
7918
7919
mb->mark = NULL; /* Reset for start of next match attempt */
7920
} /* End of for(;;) "bumpalong" loop */
7921
7922
/* ==========================================================================*/
7923
7924
/* When we reach here, one of the following stopping conditions is true:
7925
7926
(1) The match succeeded, either completely, or partially;
7927
7928
(2) The pattern is anchored or the match was failed after (*COMMIT);
7929
7930
(3) We are past the end of the subject or the bumpalong limit;
7931
7932
(4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because
7933
this option requests that a match occur at or before the first newline in
7934
the subject.
7935
7936
(5) Some kind of error occurred.
7937
7938
*/
7939
7940
ENDLOOP:
7941
7942
/* If end_subject != true_end_subject, it means we are handling invalid UTF,
7943
and have just processed a non-terminal fragment. If this resulted in no match
7944
or a partial match we must carry on to the next fragment (a partial match is
7945
returned to the caller only at the very end of the subject). A loop is used to
7946
avoid trying to match against empty fragments; if the pattern can match an
7947
empty string it would have done so already. */
7948
7949
#ifdef SUPPORT_UNICODE
7950
if (utf && end_subject != true_end_subject &&
7951
(rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL))
7952
{
7953
for (;;)
7954
{
7955
/* Advance past the first bad code unit, and then skip invalid character
7956
starting code units in 8-bit and 16-bit modes. */
7957
7958
start_match = end_subject + 1;
7959
7960
#if PCRE2_CODE_UNIT_WIDTH != 32
7961
while (start_match < true_end_subject && NOT_FIRSTCU(*start_match))
7962
start_match++;
7963
#endif
7964
7965
/* If we have hit the end of the subject, there isn't another non-empty
7966
fragment, so give up. */
7967
7968
if (start_match >= true_end_subject)
7969
{
7970
rc = MATCH_NOMATCH; /* In case it was partial */
7971
match_partial = NULL;
7972
break;
7973
}
7974
7975
/* Check the rest of the subject */
7976
7977
mb->check_subject = start_match;
7978
rc = PRIV(valid_utf)(start_match, length - (start_match - subject),
7979
&(match_data->startchar));
7980
7981
/* The rest of the subject is valid UTF. */
7982
7983
if (rc == 0)
7984
{
7985
mb->end_subject = end_subject = true_end_subject;
7986
fragment_options = PCRE2_NOTBOL;
7987
goto FRAGMENT_RESTART;
7988
}
7989
7990
/* A subsequent UTF error has been found; if the next fragment is
7991
non-empty, set up to process it. Otherwise, let the loop advance. */
7992
7993
else if (rc < 0)
7994
{
7995
mb->end_subject = end_subject = start_match + match_data->startchar;
7996
if (end_subject > start_match)
7997
{
7998
fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL;
7999
goto FRAGMENT_RESTART;
8000
}
8001
}
8002
}
8003
}
8004
#endif /* SUPPORT_UNICODE */
8005
8006
/* Fill in fields that are always returned in the match data. */
8007
8008
match_data->code = re;
8009
match_data->mark = mb->mark;
8010
match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER;
8011
8012
/* Handle a fully successful match. Set the return code to the number of
8013
captured strings, or 0 if there were too many to fit into the ovector, and then
8014
set the remaining returned values before returning. Make a copy of the subject
8015
string if requested. */
8016
8017
if (rc == MATCH_MATCH)
8018
{
8019
match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)?
8020
0 : (int)mb->end_offset_top/2 + 1;
8021
match_data->subject_length = length;
8022
match_data->startchar = start_match - subject;
8023
match_data->leftchar = mb->start_used_ptr - subject;
8024
match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?
8025
mb->last_used_ptr : mb->end_match_ptr) - subject;
8026
if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
8027
{
8028
length = CU2BYTES(length + was_zero_terminated);
8029
match_data->subject = match_data->memctl.malloc(length,
8030
match_data->memctl.memory_data);
8031
if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
8032
memcpy((void *)match_data->subject, subject, length);
8033
match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
8034
}
8035
else match_data->subject = subject;
8036
8037
return match_data->rc;
8038
}
8039
8040
/* Control gets here if there has been a partial match, an error, or if the
8041
overall match attempt has failed at all permitted starting positions. Any mark
8042
data is in the nomatch_mark field. */
8043
8044
match_data->mark = mb->nomatch_mark;
8045
8046
/* For anything other than nomatch or partial match, just return the code. */
8047
8048
if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc;
8049
8050
/* Handle a partial match. If a "soft" partial match was requested, searching
8051
for a complete match will have continued, and the value of rc at this point
8052
will be MATCH_NOMATCH. For a "hard" partial match, it will already be
8053
PCRE2_ERROR_PARTIAL. */
8054
8055
else if (match_partial != NULL)
8056
{
8057
match_data->subject = subject;
8058
match_data->subject_length = length;
8059
match_data->ovector[0] = match_partial - subject;
8060
match_data->ovector[1] = end_subject - subject;
8061
match_data->startchar = match_partial - subject;
8062
match_data->leftchar = start_partial - subject;
8063
match_data->rightchar = end_subject - subject;
8064
match_data->rc = PCRE2_ERROR_PARTIAL;
8065
}
8066
8067
/* Else this is the classic nomatch case. */
8068
8069
else match_data->rc = PCRE2_ERROR_NOMATCH;
8070
8071
return match_data->rc;
8072
}
8073
8074
/* These #undefs are here to enable unity builds with CMake. In a unity build
several source files are compiled together as a single translation unit, so any
macro still defined at the end of this file would remain visible to (and could
clash with) the source file that follows it. The three names below are
presumably file-local helper macros defined earlier in this file - confirm
against the definitions near the top. */
8075
8076
#undef NLBLOCK /* Block containing newline information */
8077
#undef PSSTART /* Field containing processed string start */
8078
#undef PSEND /* Field containing processed string end */
8079
8080
/* End of pcre2_match.c */
8081
8082