/* Source: GitHub repository godotengine/godot
   Path: blob/master/thirdparty/pcre2/src/pcre2_dfa_match.c */
1
/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2024 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
40
41
42
/* This module contains the external function pcre2_dfa_match(), which is an
43
alternative matching function that uses a sort of DFA algorithm (not a true
44
FSM). This is NOT Perl-compatible, but it has advantages in certain
45
applications. */
46
47
48
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49
the performance of his patterns greatly. I could not use it as it stood, as it
50
was not thread safe, and made assumptions about pattern sizes. Also, it caused
51
test 7 to loop, and test 9 to crash with a segfault.
52
53
The issue is the check for duplicate states, which is done by a simple linear
54
search up the state list. (Grep for "duplicate" below to find the code.) For
55
many patterns, there will never be many states active at one time, so a simple
56
linear search is fine. In patterns that have many active states, it might be a
57
bottleneck. The suggested code used an indexing scheme to remember which states
58
had previously been used for each character, and avoided the linear search when
59
it knew there was no chance of a duplicate. This was implemented when adding
60
states to the state lists.
61
62
I wrote some thread-safe, not-limited code to try something similar at the time
63
of checking for duplicates (instead of when adding states), using index vectors
64
on the stack. It did give a 13% improvement with one specially constructed
65
pattern for certain subject strings, but on other strings and on many of the
66
simpler patterns in the test suite it did worse. The major problem, I think,
67
was the extra time to initialize the index. This had to be done for each call
68
of internal_dfa_match(). (The supplied patch used a static vector, initialized
69
only once - I suspect this was the cause of the problems with the tests.)
70
71
Overall, I concluded that the gains in some cases did not outweigh the losses
72
in others, so I abandoned this code. */
73
74
75
#ifdef HAVE_CONFIG_H
76
#include "config.h"
77
#endif
78
79
/* Local names for the match block and its subject-bounds fields. They are
defined ahead of the pcre2_internal.h include; presumably macros in that header
expand to these names - confirm against the header. */

#define NLBLOCK mb             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */
82
83
#include "pcre2_internal.h"
84
85
/* Bitwise OR of the twelve option bits listed below. Judging by the name, this
is the set of options a caller may pass to pcre2_dfa_match(); the code that
applies the mask is not in this part of the file. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART| \
   PCRE2_COPY_MATCHED_SUBJECT)
90
91
92
/*************************************************
93
* Code parameters and static tables *
94
*************************************************/
95
96
/* These are offsets that are added to the OP_TYPESTAR and friends opcodes to
turn them into other, synthetic opcodes under special conditions. Each block
starts 20 above the previous one, which should be enough room. The resulting
opcodes do not have to be less than 256 because they are never stored, so they
are pushed well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
106
107
108
/* This table identifies those opcodes that are followed immediately by a
109
character that is to be tested in some way. This makes it possible to
110
centralize the loading of these characters. In the case of Type * etc, the
111
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112
small value. Non-zero values in the table are the offsets from the opcode where
113
the character is to be found. ***NOTE*** If the start of this table is
114
modified, the three tables that follow must also be modified. */
115
116
static const uint8_t coptable[] = {
117
0, /* End */
118
0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119
0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120
0, 0, 0, /* Any, AllAny, Anybyte */
121
0, 0, /* \P, \p */
122
0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123
0, /* \X */
124
0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
125
1, /* Char */
126
1, /* Chari */
127
1, /* not */
128
1, /* noti */
129
/* Positive single-char repeats */
130
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131
1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
132
1+IMM2_SIZE, /* exact */
133
1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
134
1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
135
1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
136
1+IMM2_SIZE, /* exact I */
137
1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
138
/* Negative single-char repeats - only for chars < 256 */
139
1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
140
1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
141
1+IMM2_SIZE, /* NOT exact */
142
1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
143
1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
144
1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
145
1+IMM2_SIZE, /* NOT exact I */
146
1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
147
/* Positive type repeats */
148
1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
149
1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
150
1+IMM2_SIZE, /* Type exact */
151
1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
152
/* Character class & ref repeats */
153
0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
154
0, 0, /* CRRANGE, CRMINRANGE */
155
0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */
156
0, /* CLASS */
157
0, /* NCLASS */
158
0, /* XCLASS - variable length */
159
0, /* ECLASS - variable length */
160
0, /* REF */
161
0, /* REFI */
162
0, /* DNREF */
163
0, /* DNREFI */
164
0, /* RECURSE */
165
0, /* CALLOUT */
166
0, /* CALLOUT_STR */
167
0, /* Alt */
168
0, /* Ket */
169
0, /* KetRmax */
170
0, /* KetRmin */
171
0, /* KetRpos */
172
0, 0, /* Reverse, Vreverse */
173
0, /* Assert */
174
0, /* Assert not */
175
0, /* Assert behind */
176
0, /* Assert behind not */
177
0, /* NA assert */
178
0, /* NA assert behind */
179
0, /* Assert scan substring */
180
0, /* ONCE */
181
0, /* SCRIPT_RUN */
182
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
183
0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
184
0, 0, /* CREF, DNCREF */
185
0, 0, /* RREF, DNRREF */
186
0, 0, /* FALSE, TRUE */
187
0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
188
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
189
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
190
0, 0, /* COMMIT, COMMIT_ARG */
191
0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
192
0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
193
0, 0, /* \B and \b in UCP mode */
194
};
195
196
/* This table identifies those opcodes that inspect a character (1) as opposed
to those that do not (0). It is used to remember the fact that a character
could have been inspected when the end of the subject is reached. The table is
indexed by opcode number, one entry per opcode. ***NOTE*** If the start of this
table is modified, the two tables that follow must also be modified. */

static const uint8_t poptable[] = {
  0,                             /* End */
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w */
  1, 1, 1,                       /* Any, AllAny, Anybyte */
  1, 1,                          /* \P, \p */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v */
  1,                             /* \X */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M */
  1,                             /* Char */
  1,                             /* Chari */
  1,                             /* not */
  1,                             /* noti */
  /* Positive single-char repeats */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ?? */
  1, 1, 1,                       /* upto, minupto, exact */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+ */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I */
  1, 1, 1,                       /* upto I, minupto I, exact I */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I */
  /* Negative single-char repeats - only for chars < 256 */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ?? */
  1, 1, 1,                       /* NOT upto, minupto, exact */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+ */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I */
  /* Positive type repeats */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ?? */
  1, 1, 1,                       /* Type upto, minupto, exact */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+ */
  /* Character class & ref repeats */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ?? */
  1, 1,                          /* CRRANGE, CRMINRANGE */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE */
  1,                             /* CLASS */
  1,                             /* NCLASS */
  1,                             /* XCLASS - variable length */
  1,                             /* ECLASS - variable length */
  0,                             /* REF */
  0,                             /* REFI */
  0,                             /* DNREF */
  0,                             /* DNREFI */
  0,                             /* RECURSE */
  0,                             /* CALLOUT */
  0,                             /* CALLOUT_STR */
  0,                             /* Alt */
  0,                             /* Ket */
  0,                             /* KetRmax */
  0,                             /* KetRmin */
  0,                             /* KetRpos */
  0, 0,                          /* Reverse, Vreverse */
  0,                             /* Assert */
  0,                             /* Assert not */
  0,                             /* Assert behind */
  0,                             /* Assert behind not */
  0,                             /* NA assert */
  0,                             /* NA assert behind */
  0,                             /* Assert scan substring */
  0,                             /* ONCE */
  0,                             /* SCRIPT_RUN */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
  0, 0,                          /* CREF, DNCREF */
  0, 0,                          /* RREF, DNRREF */
  0, 0,                          /* FALSE, TRUE */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG */
  0, 0,                          /* COMMIT, COMMIT_ARG */
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT */
  0, 0, 0,                       /* CLOSE, SKIPZERO, DEFINE */
  1, 1,                          /* \B and \b in UCP mode */
};
275
276
/* Compile-time check that these tables have the correct size. */
277
STATIC_ASSERT(sizeof(coptable) == OP_TABLE_LENGTH, coptable);
278
STATIC_ASSERT(sizeof(poptable) == OP_TABLE_LENGTH, poptable);
279
280
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
281
and \w */
282
283
static const uint8_t toptable1[] = {
284
0, 0, 0, 0, 0, 0,
285
ctype_digit, ctype_digit,
286
ctype_space, ctype_space,
287
ctype_word, ctype_word,
288
0, 0 /* OP_ANY, OP_ALLANY */
289
};
290
291
static const uint8_t toptable2[] = {
292
0, 0, 0, 0, 0, 0,
293
ctype_digit, 0,
294
ctype_space, 0,
295
ctype_word, 0,
296
1, 1 /* OP_ANY, OP_ALLANY */
297
};
298
299
300
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode (-ve has meaning) */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock, as a signed value. */

#define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
312
313
314
/* Before version 10.32 the recursive calls of internal_dfa_match() were passed
local working space and output vectors that were created on the stack. This has
caused issues for some patterns, especially in small-stack environments such as
Windows. A new scheme is now in use which sets up a vector on the stack, but if
this is too small, heap memory is used, up to the heap_limit. The main
parameters are all numbers of ints because the workspace is a vector of ints.

The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
defined in pcre2_internal.h so as to be available to pcre2test when it is
finding the minimum heap requirement for a match. */

/* Number of ints needed to hold one PCRE2_SIZE value. */

#define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))

#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))  /* Stack vector */
#define RWS_RSIZE       1000                /* Work size for recursion */
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)    /* Ovector for recursion */
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)       /* Ovector in other cases */
331
332
/* This structure is at the start of each workspace block. Blocks are chained
through the next field; size and free are both counted in ints, not bytes. */

typedef struct RWS_anchor {
  struct RWS_anchor *next;        /* Next block in the chain, or NULL */
  uint32_t size;                  /* Number of ints */
  uint32_t free;                  /* Number of ints */
} RWS_anchor;

/* Number of ints taken up by the anchor itself. */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
341
342
343
344
/*************************************************
345
* Process a callout *
346
*************************************************/
347
348
/* This function is called to perform a callout.
349
350
Arguments:
351
code current code pointer
352
offsets points to current capture offsets
353
current_subject start of current subject match
354
ptr current position in subject
355
mb the match block
356
extracode extra code offset when called from condition
357
lengthptr where to return the callout length
358
359
Returns: the return from the callout
360
*/
361
362
static int
363
do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
364
PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
365
PCRE2_SIZE *lengthptr)
366
{
367
pcre2_callout_block *cb = mb->cb;
368
369
*lengthptr = (code[extracode] == OP_CALLOUT)?
370
(PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
371
(PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
372
373
if (mb->callout == NULL) return 0; /* No callout provided */
374
375
/* Fixed fields in the callout block are set once and for all at the start of
376
matching. */
377
378
cb->offset_vector = offsets;
379
cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject);
380
cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
381
cb->pattern_position = GET(code, 1 + extracode);
382
cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
383
384
if (code[extracode] == OP_CALLOUT)
385
{
386
cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
387
cb->callout_string_offset = 0;
388
cb->callout_string = NULL;
389
cb->callout_string_length = 0;
390
}
391
else
392
{
393
cb->callout_number = 0;
394
cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
395
cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
396
cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
397
}
398
399
return (mb->callout)(cb, mb->callout_data);
400
}
401
402
403
404
/*************************************************
405
* Expand local workspace memory *
406
*************************************************/
407
408
/* This function is called when internal_dfa_match() is about to be called
409
recursively and there is insufficient working space left in the current
410
workspace block. If there's an existing next block, use it; otherwise get a new
411
block unless the heap limit is reached.
412
413
Arguments:
414
rwsptr pointer to block pointer (updated)
415
ovecsize space needed for an ovector
416
mb the match block
417
418
Returns: 0 rwsptr has been updated
419
!0 an error code
420
*/
421
422
static int
423
more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
424
{
425
RWS_anchor *rws = *rwsptr;
426
RWS_anchor *new;
427
428
if (rws->next != NULL)
429
{
430
new = rws->next;
431
}
432
433
/* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
434
mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
435
overflow. */
436
437
else
438
{
439
uint32_t newsize = (rws->size >= UINT32_MAX/(sizeof(int)*2))? UINT32_MAX/sizeof(int) : rws->size * 2;
440
uint32_t newsizeK = newsize/(1024/sizeof(int));
441
442
if (newsizeK + mb->heap_used > mb->heap_limit)
443
newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
444
newsize = newsizeK*(1024/sizeof(int));
445
446
if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
447
return PCRE2_ERROR_HEAPLIMIT;
448
new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
449
if (new == NULL) return PCRE2_ERROR_NOMEMORY;
450
mb->heap_used += newsizeK;
451
new->next = NULL;
452
new->size = newsize;
453
rws->next = new;
454
}
455
456
new->free = new->size - RWS_ANCHOR_SIZE;
457
*rwsptr = new;
458
return 0;
459
}
460
461
462
463
/*************************************************
464
* Match a Regular Expression - DFA engine *
465
*************************************************/
466
467
/* This internal function applies a compiled pattern to a subject string,
starting at a given point, using a DFA engine. This function is called from the
external one, possibly multiple times if the pattern is not anchored. The
function calls itself recursively for some kinds of subpattern.

Arguments:
  mb                the match_data block with fixed information
  this_start_code   the opening bracket of this subexpression's code
  current_subject   where we currently are in the subject string
  start_offset      start offset in the subject string
  offsets           vector to contain the matching string offsets
  offsetcount       size of same
  workspace         vector of workspace
  wscount           size of same
  rlevel            function call recursion level
  RWS               declared as int *; not described in the original list
                    (presumably the workspace block used for recursive
                    calls - confirm)

Returns:            > 0 => number of match offset pairs placed in offsets
                    = 0 => offsets overflowed; longest matches are present
                     -1 => failed to match
                   < -1 => some kind of unexpected problem

The following macros are used for adding states to the two state vectors (one
for the current character, one for the following character). */
490
491
/* Each macro appends one state to a list if there is room (fewer than wscount
entries so far) and otherwise makes the enclosing function return
PCRE2_ERROR_DFA_WSSIZE. The ACTIVE forms use active_count/next_active_state and
the NEW forms use new_count/next_new_state; the _DATA forms also set the data
field, which the plain forms leave untouched. Note that the counter is
incremented even when the list is full. Each macro expands to a bare if/else
statement (not a do-while), so write the invocation with a trailing semicolon
and take care when using one as the body of an unbraced if. */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE
528
529
/* And now, here is the code */
530
531
static int
532
internal_dfa_match(
533
dfa_match_block *mb,
534
PCRE2_SPTR this_start_code,
535
PCRE2_SPTR current_subject,
536
PCRE2_SIZE start_offset,
537
PCRE2_SIZE *offsets,
538
uint32_t offsetcount,
539
int *workspace,
540
int wscount,
541
uint32_t rlevel,
542
int *RWS)
543
{
544
stateblock *active_states, *new_states, *temp_states;
545
stateblock *next_active_state, *next_new_state;
546
const uint8_t *ctypes, *lcc, *fcc;
547
PCRE2_SPTR ptr;
548
PCRE2_SPTR end_code;
549
dfa_recursion_info new_recursive;
550
int active_count, new_count, match_count;
551
552
/* Some fields in the mb block are frequently referenced, so we load them into
553
independent variables in the hope that this will perform better. */
554
555
PCRE2_SPTR start_subject = mb->start_subject;
556
PCRE2_SPTR end_subject = mb->end_subject;
557
PCRE2_SPTR start_code = mb->start_code;
558
559
#ifdef SUPPORT_UNICODE
560
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
561
BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
562
#else
563
BOOL utf = FALSE;
564
#endif
565
566
BOOL reset_could_continue = FALSE;
567
568
if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
569
if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
570
offsetcount &= (uint32_t)(-2); /* Round down */
571
572
wscount -= 2;
573
wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
574
(2 * INTS_PER_STATEBLOCK);
575
576
ctypes = mb->tables + ctypes_offset;
577
lcc = mb->tables + lcc_offset;
578
fcc = mb->tables + fcc_offset;
579
580
match_count = PCRE2_ERROR_NOMATCH; /* A negative number */
581
582
active_states = (stateblock *)(workspace + 2);
583
next_new_state = new_states = active_states + wscount;
584
new_count = 0;
585
586
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
matching internal ket rather than at the end.

If we are dealing with a backward assertion we have to find out the maximum
amount to move back, and set up each alternative appropriately. */
594
if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
595
{
596
size_t max_back = 0;
597
size_t gone_back;
598
599
end_code = this_start_code;
600
do
601
{
602
size_t back = (size_t)GET2(end_code, 2+LINK_SIZE);
603
if (back > max_back) max_back = back;
604
end_code += GET(end_code, 1);
605
}
606
while (*end_code == OP_ALT);
607
608
/* If we can't go back the amount required for the longest lookbehind
609
pattern, go back as far as we can; some alternatives may still be viable. */
610
611
#ifdef SUPPORT_UNICODE
612
/* In character mode we have to step back character by character */
613
614
if (utf)
615
{
616
for (gone_back = 0; gone_back < max_back; gone_back++)
617
{
618
if (current_subject <= start_subject) break;
619
current_subject--;
620
ACROSSCHAR(current_subject > start_subject, current_subject,
621
current_subject--);
622
}
623
}
624
else
625
#endif
626
627
/* In byte-mode we can do this quickly. */
628
629
{
630
size_t current_offset = (size_t)(current_subject - start_subject);
631
gone_back = (current_offset < max_back)? current_offset : max_back;
632
current_subject -= gone_back;
633
}
634
635
/* Save the earliest consulted character */
636
637
if (current_subject < mb->start_used_ptr)
638
mb->start_used_ptr = current_subject;
639
640
/* Now we can process the individual branches. There will be an OP_REVERSE at
641
the start of each branch, except when the length of the branch is zero. */
642
643
end_code = this_start_code;
644
do
645
{
646
uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + IMM2_SIZE : 0;
647
size_t back = (revlen == 0)? 0 : (size_t)GET2(end_code, 2+LINK_SIZE);
648
if (back <= gone_back)
649
{
650
int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
651
ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
652
}
653
end_code += GET(end_code, 1);
654
}
655
while (*end_code == OP_ALT);
656
}
657
658
/* This is the code for a "normal" subpattern (not a backward assertion). The
659
start of a whole pattern is always one of these. If we are at the top level,
660
we may be asked to restart matching from the same point that we reached for a
661
previous partial match. We still have to scan through the top-level branches to
662
find the end state. */
663
664
else
665
{
666
end_code = this_start_code;
667
668
/* Restarting */
669
670
if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
671
{
672
do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
673
new_count = workspace[1];
674
if (!workspace[0])
675
memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
676
}
677
678
/* Not restarting */
679
680
else
681
{
682
int length = 1 + LINK_SIZE +
683
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
684
*this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
685
? IMM2_SIZE:0);
686
do
687
{
688
ADD_NEW((int)(end_code - start_code + length), 0);
689
end_code += GET(end_code, 1);
690
length = 1 + LINK_SIZE;
691
}
692
while (*end_code == OP_ALT);
693
}
694
}
695
696
workspace[0] = 0; /* Bit indicating which vector is current */
697
698
/* Loop for scanning the subject */
699
700
ptr = current_subject;
701
for (;;)
702
{
703
int i, j;
704
int clen, dlen;
705
uint32_t c, d;
706
BOOL partial_newline = FALSE;
707
BOOL could_continue = reset_could_continue;
708
reset_could_continue = FALSE;
709
710
if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
711
712
/* Make the new state list into the active state list and empty the
713
new state list. */
714
715
temp_states = active_states;
716
active_states = new_states;
717
new_states = temp_states;
718
active_count = new_count;
719
new_count = 0;
720
721
workspace[0] ^= 1; /* Remember for the restarting feature */
722
workspace[1] = active_count;
723
724
/* Set the pointers for adding new states */
725
726
next_active_state = active_states + active_count;
727
next_new_state = new_states;
728
729
/* Load the current character from the subject outside the loop, as many
730
different states may want to look at it, and we assume that at least one
731
will. */
732
733
if (ptr < end_subject)
734
{
735
clen = 1; /* Number of data items in the character */
736
#ifdef SUPPORT_UNICODE
737
GETCHARLENTEST(c, ptr, clen);
738
#else
739
c = *ptr;
740
#endif /* SUPPORT_UNICODE */
741
}
742
else
743
{
744
clen = 0; /* This indicates the end of the subject */
745
c = NOTACHAR; /* This value should never actually be used */
746
}
747
748
/* Scan up the active states and act on each one. The result of an action
749
may be to add more states to the currently active list (e.g. on hitting a
750
parenthesis) or it may be to put states on the new list, for considering
751
when we move the character pointer on. */
752
753
for (i = 0; i < active_count; i++)
754
{
755
stateblock *current_state = active_states + i;
756
BOOL caseless = FALSE;
757
PCRE2_SPTR code;
758
uint32_t codevalue;
759
int state_offset = current_state->offset;
760
int rrc;
761
int count;
762
763
    /* A negative offset is a special case meaning "hold off going to this
    (negated) state until the number of characters in the data field have
    been skipped". If the could_continue flag was passed over from a previous
    state, arrange for it to be passed on. */
767
768
if (state_offset < 0)
769
{
770
if (current_state->data > 0)
771
{
772
ADD_NEW_DATA(state_offset, current_state->count,
773
current_state->data - 1);
774
if (could_continue) reset_could_continue = TRUE;
775
continue;
776
}
777
else
778
{
779
current_state->offset = state_offset = -state_offset;
780
}
781
}
782
783
/* Check for a duplicate state with the same count, and skip if found.
784
See the note at the head of this module about the possibility of improving
785
performance here. */
786
787
for (j = 0; j < i; j++)
788
{
789
if (active_states[j].offset == state_offset &&
790
active_states[j].count == current_state->count)
791
goto NEXT_ACTIVE_STATE;
792
}
793
794
/* The state offset is the offset to the opcode */
795
796
code = start_code + state_offset;
797
codevalue = *code;
798
799
/* If this opcode inspects a character, but we are at the end of the
800
subject, remember the fact for use when testing for a partial match. */
801
802
if (clen == 0 && poptable[codevalue] != 0)
803
could_continue = TRUE;
804
805
/* If this opcode is followed by an inline character, load it. It is
806
tempting to test for the presence of a subject character here, but that
807
is wrong, because sometimes zero repetitions of the subject are
808
permitted.
809
810
We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
811
argument that is not a data character - but is always one byte long because
812
the values are small. We have to take special action to deal with \P, \p,
813
\H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
814
these ones to new opcodes. */
815
816
if (coptable[codevalue] > 0)
817
{
818
dlen = 1;
819
#ifdef SUPPORT_UNICODE
820
if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
821
#endif /* SUPPORT_UNICODE */
822
d = code[coptable[codevalue]];
823
if (codevalue >= OP_TYPESTAR)
824
{
825
switch(d)
826
{
827
case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
828
case OP_NOTPROP:
829
case OP_PROP: codevalue += OP_PROP_EXTRA; break;
830
case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
831
case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
832
case OP_NOT_HSPACE:
833
case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
834
case OP_NOT_VSPACE:
835
case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
836
default: break;
837
}
838
}
839
}
840
else
841
{
842
dlen = 0; /* Not strictly necessary, but compilers moan */
843
d = NOTACHAR; /* if these variables are not set. */
844
}
845
846
847
/* Now process the individual opcodes */
848
849
switch (codevalue)
850
{
851
/* ========================================================================== */
852
/* Reached a closing bracket. If not at the end of the pattern, carry
853
on with the next opcode. For repeating opcodes, also add the repeat
854
state. Note that KETRPOS will always be encountered at the end of the
855
subpattern, because the possessive subpattern repeats are always handled
856
using recursive calls. Thus, it never adds any new states.
857
858
At the end of the (sub)pattern, unless we have an empty string and
859
PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
860
start of the subject, save the match data, shifting up all previous
861
matches so we always have the longest first. */
862
863
case OP_KET:
864
case OP_KETRMIN:
865
case OP_KETRMAX:
866
case OP_KETRPOS:
867
if (code != end_code)
868
{
869
ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
870
if (codevalue != OP_KET)
871
{
872
ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
873
}
874
}
875
else
876
{
877
if (ptr > current_subject ||
878
((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
879
((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
880
current_subject > start_subject + mb->start_offset)))
881
{
882
if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
883
else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
884
match_count = 0;
885
count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
886
if (count > 0) (void)memmove(offsets + 2, offsets,
887
(size_t)count * sizeof(PCRE2_SIZE));
888
if (offsetcount >= 2)
889
{
890
offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
891
offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
892
}
893
if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
894
}
895
}
896
break;
897
898
/* ========================================================================== */
899
/* These opcodes add to the current list of states without looking
900
at the current character. */
901
902
/*-----------------------------------------------------------------*/
903
case OP_ALT:
904
do { code += GET(code, 1); } while (*code == OP_ALT);
905
ADD_ACTIVE((int)(code - start_code), 0);
906
break;
907
908
/*-----------------------------------------------------------------*/
909
case OP_BRA:
910
case OP_SBRA:
911
do
912
{
913
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
914
code += GET(code, 1);
915
}
916
while (*code == OP_ALT);
917
break;
918
919
/*-----------------------------------------------------------------*/
920
case OP_CBRA:
921
case OP_SCBRA:
922
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
923
code += GET(code, 1);
924
while (*code == OP_ALT)
925
{
926
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
927
code += GET(code, 1);
928
}
929
break;
930
931
/*-----------------------------------------------------------------*/
932
case OP_BRAZERO:
933
case OP_BRAMINZERO:
934
ADD_ACTIVE(state_offset + 1, 0);
935
code += 1 + GET(code, 2);
936
while (*code == OP_ALT) code += GET(code, 1);
937
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
938
break;
939
940
/*-----------------------------------------------------------------*/
941
case OP_SKIPZERO:
942
code += 1 + GET(code, 2);
943
while (*code == OP_ALT) code += GET(code, 1);
944
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
945
break;
946
947
/*-----------------------------------------------------------------*/
948
case OP_CIRC:
949
if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
950
{ ADD_ACTIVE(state_offset + 1, 0); }
951
break;
952
953
/*-----------------------------------------------------------------*/
954
case OP_CIRCM:
955
if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
956
((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
957
&& WAS_NEWLINE(ptr)))
958
{ ADD_ACTIVE(state_offset + 1, 0); }
959
break;
960
961
/*-----------------------------------------------------------------*/
962
case OP_EOD:
963
if (ptr >= end_subject)
964
{
965
if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
966
return PCRE2_ERROR_PARTIAL;
967
else { ADD_ACTIVE(state_offset + 1, 0); }
968
}
969
break;
970
971
/*-----------------------------------------------------------------*/
972
case OP_SOD:
973
if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
974
break;
975
976
/*-----------------------------------------------------------------*/
977
case OP_SOM:
978
if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
979
break;
980
981
982
/* ========================================================================== */
983
/* These opcodes inspect the next subject character, and sometimes
984
the previous one as well, but do not have an argument. The variable
985
clen contains the length of the current character and is zero if we are
986
at the end of the subject. */
987
988
/*-----------------------------------------------------------------*/
989
case OP_ANY:
990
if (clen > 0 && !IS_NEWLINE(ptr))
991
{
992
if (ptr + 1 >= mb->end_subject &&
993
(mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
994
NLBLOCK->nltype == NLTYPE_FIXED &&
995
NLBLOCK->nllen == 2 &&
996
c == NLBLOCK->nl[0])
997
{
998
could_continue = partial_newline = TRUE;
999
}
1000
else
1001
{
1002
ADD_NEW(state_offset + 1, 0);
1003
}
1004
}
1005
break;
1006
1007
/*-----------------------------------------------------------------*/
1008
case OP_ALLANY:
1009
if (clen > 0)
1010
{ ADD_NEW(state_offset + 1, 0); }
1011
break;
1012
1013
/*-----------------------------------------------------------------*/
1014
case OP_EODN:
1015
if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1016
{
1017
if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1018
return PCRE2_ERROR_PARTIAL;
1019
ADD_ACTIVE(state_offset + 1, 0);
1020
}
1021
break;
1022
1023
/*-----------------------------------------------------------------*/
1024
case OP_DOLL:
1025
if ((mb->moptions & PCRE2_NOTEOL) == 0)
1026
{
1027
if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1028
could_continue = TRUE;
1029
else if (clen == 0 ||
1030
((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1031
(ptr == end_subject - mb->nllen)
1032
))
1033
{ ADD_ACTIVE(state_offset + 1, 0); }
1034
else if (ptr + 1 >= mb->end_subject &&
1035
(mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1036
NLBLOCK->nltype == NLTYPE_FIXED &&
1037
NLBLOCK->nllen == 2 &&
1038
c == NLBLOCK->nl[0])
1039
{
1040
if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1041
{
1042
reset_could_continue = TRUE;
1043
ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1044
}
1045
else could_continue = partial_newline = TRUE;
1046
}
1047
}
1048
break;
1049
1050
/*-----------------------------------------------------------------*/
1051
case OP_DOLLM:
1052
if ((mb->moptions & PCRE2_NOTEOL) == 0)
1053
{
1054
if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1055
could_continue = TRUE;
1056
else if (clen == 0 ||
1057
((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1058
{ ADD_ACTIVE(state_offset + 1, 0); }
1059
else if (ptr + 1 >= mb->end_subject &&
1060
(mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1061
NLBLOCK->nltype == NLTYPE_FIXED &&
1062
NLBLOCK->nllen == 2 &&
1063
c == NLBLOCK->nl[0])
1064
{
1065
if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1066
{
1067
reset_could_continue = TRUE;
1068
ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1069
}
1070
else could_continue = partial_newline = TRUE;
1071
}
1072
}
1073
else if (IS_NEWLINE(ptr))
1074
{ ADD_ACTIVE(state_offset + 1, 0); }
1075
break;
1076
1077
/*-----------------------------------------------------------------*/
1078
1079
case OP_DIGIT:
1080
case OP_WHITESPACE:
1081
case OP_WORDCHAR:
1082
if (clen > 0 && c < 256 &&
1083
((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1084
{ ADD_NEW(state_offset + 1, 0); }
1085
break;
1086
1087
/*-----------------------------------------------------------------*/
1088
case OP_NOT_DIGIT:
1089
case OP_NOT_WHITESPACE:
1090
case OP_NOT_WORDCHAR:
1091
if (clen > 0 && (c >= 256 ||
1092
((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1093
{ ADD_NEW(state_offset + 1, 0); }
1094
break;
1095
1096
/*-----------------------------------------------------------------*/
1097
case OP_WORD_BOUNDARY:
1098
case OP_NOT_WORD_BOUNDARY:
1099
case OP_NOT_UCP_WORD_BOUNDARY:
1100
case OP_UCP_WORD_BOUNDARY:
1101
{
1102
int left_word, right_word;
1103
1104
if (ptr > start_subject)
1105
{
1106
PCRE2_SPTR temp = ptr - 1;
1107
if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1108
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1109
if (utf) { BACKCHAR(temp); }
1110
#endif
1111
GETCHARTEST(d, temp);
1112
#ifdef SUPPORT_UNICODE
1113
if (codevalue == OP_UCP_WORD_BOUNDARY ||
1114
codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1115
{
1116
int chartype = UCD_CHARTYPE(d);
1117
int category = PRIV(ucp_gentype)[chartype];
1118
left_word = (category == ucp_L || category == ucp_N ||
1119
chartype == ucp_Mn || chartype == ucp_Pc);
1120
}
1121
else
1122
#endif
1123
left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1124
}
1125
else left_word = FALSE;
1126
1127
if (clen > 0)
1128
{
1129
if (ptr >= mb->last_used_ptr)
1130
{
1131
PCRE2_SPTR temp = ptr + 1;
1132
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1133
if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1134
#endif
1135
mb->last_used_ptr = temp;
1136
}
1137
#ifdef SUPPORT_UNICODE
1138
if (codevalue == OP_UCP_WORD_BOUNDARY ||
1139
codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1140
{
1141
int chartype = UCD_CHARTYPE(c);
1142
int category = PRIV(ucp_gentype)[chartype];
1143
right_word = (category == ucp_L || category == ucp_N ||
1144
chartype == ucp_Mn || chartype == ucp_Pc);
1145
}
1146
else
1147
#endif
1148
right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1149
}
1150
else right_word = FALSE;
1151
1152
if ((left_word == right_word) ==
1153
(codevalue == OP_NOT_WORD_BOUNDARY ||
1154
codevalue == OP_NOT_UCP_WORD_BOUNDARY))
1155
{ ADD_ACTIVE(state_offset + 1, 0); }
1156
}
1157
break;
1158
1159
1160
/*-----------------------------------------------------------------*/
1161
/* Check the next character by Unicode property. We will get here only
1162
if the support is in the binary; otherwise a compile-time error occurs.
1163
*/
1164
1165
#ifdef SUPPORT_UNICODE
1166
case OP_PROP:
1167
case OP_NOTPROP:
1168
if (clen > 0)
1169
{
1170
BOOL OK;
1171
int chartype;
1172
const uint32_t *cp;
1173
const ucd_record * prop = GET_UCD(c);
1174
switch(code[1])
1175
{
1176
case PT_LAMP:
1177
chartype = prop->chartype;
1178
OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1179
chartype == ucp_Lt;
1180
break;
1181
1182
case PT_GC:
1183
OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1184
break;
1185
1186
case PT_PC:
1187
OK = prop->chartype == code[2];
1188
break;
1189
1190
case PT_SC:
1191
OK = prop->script == code[2];
1192
break;
1193
1194
case PT_SCX:
1195
OK = (prop->script == code[2] ||
1196
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0);
1197
break;
1198
1199
/* These are specials for combination cases. */
1200
1201
case PT_ALNUM:
1202
chartype = prop->chartype;
1203
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1204
PRIV(ucp_gentype)[chartype] == ucp_N;
1205
break;
1206
1207
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
1208
which means that Perl space and POSIX space are now identical. PCRE
1209
was changed at release 8.34. */
1210
1211
case PT_SPACE: /* Perl space */
1212
case PT_PXSPACE: /* POSIX space */
1213
switch(c)
1214
{
1215
HSPACE_CASES:
1216
VSPACE_CASES:
1217
OK = TRUE;
1218
break;
1219
1220
default:
1221
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1222
break;
1223
}
1224
break;
1225
1226
case PT_WORD:
1227
chartype = prop->chartype;
1228
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1229
PRIV(ucp_gentype)[chartype] == ucp_N ||
1230
chartype == ucp_Mn || chartype == ucp_Pc;
1231
break;
1232
1233
case PT_CLIST:
1234
#if PCRE2_CODE_UNIT_WIDTH == 32
1235
if (c > MAX_UTF_CODE_POINT)
1236
{
1237
OK = FALSE;
1238
break;
1239
}
1240
#endif
1241
cp = PRIV(ucd_caseless_sets) + code[2];
1242
for (;;)
1243
{
1244
if (c < *cp) { OK = FALSE; break; }
1245
if (c == *cp++) { OK = TRUE; break; }
1246
}
1247
break;
1248
1249
case PT_UCNC:
1250
OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1251
c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1252
c >= 0xe000;
1253
break;
1254
1255
case PT_BIDICL:
1256
OK = UCD_BIDICLASS(c) == code[2];
1257
break;
1258
1259
case PT_BOOL:
1260
OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1261
UCD_BPROPS_PROP(prop), code[2]) != 0;
1262
break;
1263
1264
/* Should never occur, but keep compilers from grumbling. */
1265
1266
default:
1267
OK = codevalue != OP_PROP;
1268
break;
1269
}
1270
1271
if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1272
}
1273
break;
1274
#endif
1275
1276
1277
1278
/* ========================================================================== */
1279
/* These opcodes likewise inspect the subject character, but have an
1280
argument that is not a data character. It is one of these opcodes:
1281
OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1282
OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1283
1284
case OP_TYPEPLUS:
1285
case OP_TYPEMINPLUS:
1286
case OP_TYPEPOSPLUS:
1287
count = current_state->count; /* Already matched */
1288
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1289
if (clen > 0)
1290
{
1291
if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1292
(mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1293
NLBLOCK->nltype == NLTYPE_FIXED &&
1294
NLBLOCK->nllen == 2 &&
1295
c == NLBLOCK->nl[0])
1296
{
1297
could_continue = partial_newline = TRUE;
1298
}
1299
else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1300
(c < 256 &&
1301
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
1302
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1303
{
1304
if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1305
{
1306
active_count--; /* Remove non-match possibility */
1307
next_active_state--;
1308
}
1309
count++;
1310
ADD_NEW(state_offset, count);
1311
}
1312
}
1313
break;
1314
1315
/*-----------------------------------------------------------------*/
1316
case OP_TYPEQUERY:
1317
case OP_TYPEMINQUERY:
1318
case OP_TYPEPOSQUERY:
1319
ADD_ACTIVE(state_offset + 2, 0);
1320
if (clen > 0)
1321
{
1322
if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1323
(mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1324
NLBLOCK->nltype == NLTYPE_FIXED &&
1325
NLBLOCK->nllen == 2 &&
1326
c == NLBLOCK->nl[0])
1327
{
1328
could_continue = partial_newline = TRUE;
1329
}
1330
else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1331
(c < 256 &&
1332
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
1333
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1334
{
1335
if (codevalue == OP_TYPEPOSQUERY)
1336
{
1337
active_count--; /* Remove non-match possibility */
1338
next_active_state--;
1339
}
1340
ADD_NEW(state_offset + 2, 0);
1341
}
1342
}
1343
break;
1344
1345
/*-----------------------------------------------------------------*/
1346
case OP_TYPESTAR:
1347
case OP_TYPEMINSTAR:
1348
case OP_TYPEPOSSTAR:
1349
ADD_ACTIVE(state_offset + 2, 0);
1350
if (clen > 0)
1351
{
1352
if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1353
(mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1354
NLBLOCK->nltype == NLTYPE_FIXED &&
1355
NLBLOCK->nllen == 2 &&
1356
c == NLBLOCK->nl[0])
1357
{
1358
could_continue = partial_newline = TRUE;
1359
}
1360
else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1361
(c < 256 &&
1362
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
1363
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1364
{
1365
if (codevalue == OP_TYPEPOSSTAR)
1366
{
1367
active_count--; /* Remove non-match possibility */
1368
next_active_state--;
1369
}
1370
ADD_NEW(state_offset, 0);
1371
}
1372
}
1373
break;
1374
1375
/*-----------------------------------------------------------------*/
1376
case OP_TYPEEXACT:
1377
count = current_state->count; /* Number already matched */
1378
if (clen > 0)
1379
{
1380
if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1381
(mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1382
NLBLOCK->nltype == NLTYPE_FIXED &&
1383
NLBLOCK->nllen == 2 &&
1384
c == NLBLOCK->nl[0])
1385
{
1386
could_continue = partial_newline = TRUE;
1387
}
1388
else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1389
(c < 256 &&
1390
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
1391
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1392
{
1393
if (++count >= (int)GET2(code, 1))
1394
{ ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1395
else
1396
{ ADD_NEW(state_offset, count); }
1397
}
1398
}
1399
break;
1400
1401
/*-----------------------------------------------------------------*/
1402
case OP_TYPEUPTO:
1403
case OP_TYPEMINUPTO:
1404
case OP_TYPEPOSUPTO:
1405
ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1406
count = current_state->count; /* Number already matched */
1407
if (clen > 0)
1408
{
1409
if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1410
(mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1411
NLBLOCK->nltype == NLTYPE_FIXED &&
1412
NLBLOCK->nllen == 2 &&
1413
c == NLBLOCK->nl[0])
1414
{
1415
could_continue = partial_newline = TRUE;
1416
}
1417
else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1418
(c < 256 &&
1419
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
1420
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1421
{
1422
if (codevalue == OP_TYPEPOSUPTO)
1423
{
1424
active_count--; /* Remove non-match possibility */
1425
next_active_state--;
1426
}
1427
if (++count >= (int)GET2(code, 1))
1428
{ ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1429
else
1430
{ ADD_NEW(state_offset, count); }
1431
}
1432
}
1433
break;
1434
1435
/* ========================================================================== */
1436
/* These are virtual opcodes that are used when something like
1437
OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its
1438
argument. It keeps the code above fast for the other cases. The argument
1439
is in the d variable. */
1440
1441
#ifdef SUPPORT_UNICODE
1442
case OP_PROP_EXTRA + OP_TYPEPLUS:
1443
case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1444
case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1445
count = current_state->count; /* Already matched */
1446
if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1447
if (clen > 0)
1448
{
1449
BOOL OK;
1450
int chartype;
1451
const uint32_t *cp;
1452
const ucd_record * prop = GET_UCD(c);
1453
switch(code[2])
1454
{
1455
case PT_LAMP:
1456
chartype = prop->chartype;
1457
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1458
break;
1459
1460
case PT_GC:
1461
OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1462
break;
1463
1464
case PT_PC:
1465
OK = prop->chartype == code[3];
1466
break;
1467
1468
case PT_SC:
1469
OK = prop->script == code[3];
1470
break;
1471
1472
case PT_SCX:
1473
OK = (prop->script == code[3] ||
1474
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1475
break;
1476
1477
/* These are specials for combination cases. */
1478
1479
case PT_ALNUM:
1480
chartype = prop->chartype;
1481
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1482
PRIV(ucp_gentype)[chartype] == ucp_N;
1483
break;
1484
1485
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
1486
which means that Perl space and POSIX space are now identical. PCRE
1487
was changed at release 8.34. */
1488
1489
case PT_SPACE: /* Perl space */
1490
case PT_PXSPACE: /* POSIX space */
1491
switch(c)
1492
{
1493
HSPACE_CASES:
1494
VSPACE_CASES:
1495
OK = TRUE;
1496
break;
1497
1498
default:
1499
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1500
break;
1501
}
1502
break;
1503
1504
case PT_WORD:
1505
chartype = prop->chartype;
1506
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1507
PRIV(ucp_gentype)[chartype] == ucp_N ||
1508
chartype == ucp_Mn || chartype == ucp_Pc;
1509
break;
1510
1511
case PT_CLIST:
1512
#if PCRE2_CODE_UNIT_WIDTH == 32
1513
if (c > MAX_UTF_CODE_POINT)
1514
{
1515
OK = FALSE;
1516
break;
1517
}
1518
#endif
1519
cp = PRIV(ucd_caseless_sets) + code[3];
1520
for (;;)
1521
{
1522
if (c < *cp) { OK = FALSE; break; }
1523
if (c == *cp++) { OK = TRUE; break; }
1524
}
1525
break;
1526
1527
case PT_UCNC:
1528
OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1529
c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1530
c >= 0xe000;
1531
break;
1532
1533
case PT_BIDICL:
1534
OK = UCD_BIDICLASS(c) == code[3];
1535
break;
1536
1537
case PT_BOOL:
1538
OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1539
UCD_BPROPS_PROP(prop), code[3]) != 0;
1540
break;
1541
1542
/* Should never occur, but keep compilers from grumbling. */
1543
1544
default:
1545
OK = codevalue != OP_PROP;
1546
break;
1547
}
1548
1549
if (OK == (d == OP_PROP))
1550
{
1551
if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1552
{
1553
active_count--; /* Remove non-match possibility */
1554
next_active_state--;
1555
}
1556
count++;
1557
ADD_NEW(state_offset, count);
1558
}
1559
}
1560
break;
1561
1562
/*-----------------------------------------------------------------*/
1563
case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1564
case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1565
case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1566
count = current_state->count; /* Already matched */
1567
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1568
if (clen > 0)
1569
{
1570
int ncount = 0;
1571
if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1572
{
1573
active_count--; /* Remove non-match possibility */
1574
next_active_state--;
1575
}
1576
(void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1577
&ncount);
1578
count++;
1579
ADD_NEW_DATA(-state_offset, count, ncount);
1580
}
1581
break;
1582
#endif
1583
1584
/*-----------------------------------------------------------------*/
1585
case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1586
case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1587
case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1588
count = current_state->count; /* Already matched */
1589
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1590
if (clen > 0)
1591
{
1592
int ncount = 0;
1593
switch (c)
1594
{
1595
case CHAR_VT:
1596
case CHAR_FF:
1597
case CHAR_NEL:
1598
#ifndef EBCDIC
1599
case 0x2028:
1600
case 0x2029:
1601
#endif /* Not EBCDIC */
1602
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1603
goto ANYNL01;
1604
1605
case CHAR_CR:
1606
if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1607
/* Fall through */
1608
1609
ANYNL01:
1610
case CHAR_LF:
1611
if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1612
{
1613
active_count--; /* Remove non-match possibility */
1614
next_active_state--;
1615
}
1616
count++;
1617
ADD_NEW_DATA(-state_offset, count, ncount);
1618
break;
1619
1620
default:
1621
break;
1622
}
1623
}
1624
break;
1625
1626
/*-----------------------------------------------------------------*/
1627
case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1628
case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1629
case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1630
count = current_state->count; /* Already matched */
1631
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1632
if (clen > 0)
1633
{
1634
BOOL OK;
1635
switch (c)
1636
{
1637
VSPACE_CASES:
1638
OK = TRUE;
1639
break;
1640
1641
default:
1642
OK = FALSE;
1643
break;
1644
}
1645
1646
if (OK == (d == OP_VSPACE))
1647
{
1648
if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1649
{
1650
active_count--; /* Remove non-match possibility */
1651
next_active_state--;
1652
}
1653
count++;
1654
ADD_NEW_DATA(-state_offset, count, 0);
1655
}
1656
}
1657
break;
1658
1659
/*-----------------------------------------------------------------*/
1660
case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1661
case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1662
case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1663
count = current_state->count; /* Already matched */
1664
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1665
if (clen > 0)
1666
{
1667
BOOL OK;
1668
switch (c)
1669
{
1670
HSPACE_CASES:
1671
OK = TRUE;
1672
break;
1673
1674
default:
1675
OK = FALSE;
1676
break;
1677
}
1678
1679
if (OK == (d == OP_HSPACE))
1680
{
1681
if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1682
{
1683
active_count--; /* Remove non-match possibility */
1684
next_active_state--;
1685
}
1686
count++;
1687
ADD_NEW_DATA(-state_offset, count, 0);
1688
}
1689
}
1690
break;
1691
1692
/*-----------------------------------------------------------------*/
1693
#ifdef SUPPORT_UNICODE
1694
case OP_PROP_EXTRA + OP_TYPEQUERY:
1695
case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1696
case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1697
count = 4;
1698
goto QS1;
1699
1700
case OP_PROP_EXTRA + OP_TYPESTAR:
1701
case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1702
case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1703
count = 0;
1704
1705
QS1:
1706
1707
ADD_ACTIVE(state_offset + 4, 0);
1708
if (clen > 0)
1709
{
1710
BOOL OK;
1711
int chartype;
1712
const uint32_t *cp;
1713
const ucd_record * prop = GET_UCD(c);
1714
switch(code[2])
1715
{
1716
case PT_LAMP:
1717
chartype = prop->chartype;
1718
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1719
break;
1720
1721
case PT_GC:
1722
OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1723
break;
1724
1725
case PT_PC:
1726
OK = prop->chartype == code[3];
1727
break;
1728
1729
case PT_SC:
1730
OK = prop->script == code[3];
1731
break;
1732
1733
case PT_SCX:
1734
OK = (prop->script == code[3] ||
1735
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1736
break;
1737
1738
/* These are specials for combination cases. */
1739
1740
case PT_ALNUM:
1741
chartype = prop->chartype;
1742
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1743
PRIV(ucp_gentype)[chartype] == ucp_N;
1744
break;
1745
1746
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
1747
which means that Perl space and POSIX space are now identical. PCRE
1748
was changed at release 8.34. */
1749
1750
case PT_SPACE: /* Perl space */
1751
case PT_PXSPACE: /* POSIX space */
1752
switch(c)
1753
{
1754
HSPACE_CASES:
1755
VSPACE_CASES:
1756
OK = TRUE;
1757
break;
1758
1759
default:
1760
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1761
break;
1762
}
1763
break;
1764
1765
case PT_WORD:
1766
chartype = prop->chartype;
1767
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1768
PRIV(ucp_gentype)[chartype] == ucp_N ||
1769
chartype == ucp_Mn || chartype == ucp_Pc;
1770
break;
1771
1772
case PT_CLIST:
1773
#if PCRE2_CODE_UNIT_WIDTH == 32
1774
if (c > MAX_UTF_CODE_POINT)
1775
{
1776
OK = FALSE;
1777
break;
1778
}
1779
#endif
1780
cp = PRIV(ucd_caseless_sets) + code[3];
1781
for (;;)
1782
{
1783
if (c < *cp) { OK = FALSE; break; }
1784
if (c == *cp++) { OK = TRUE; break; }
1785
}
1786
break;
1787
1788
case PT_UCNC:
1789
OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1790
c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1791
c >= 0xe000;
1792
break;
1793
1794
case PT_BIDICL:
1795
OK = UCD_BIDICLASS(c) == code[3];
1796
break;
1797
1798
case PT_BOOL:
1799
OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1800
UCD_BPROPS_PROP(prop), code[3]) != 0;
1801
break;
1802
1803
/* Should never occur, but keep compilers from grumbling. */
1804
1805
default:
1806
OK = codevalue != OP_PROP;
1807
break;
1808
}
1809
1810
if (OK == (d == OP_PROP))
1811
{
1812
if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1813
codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1814
{
1815
active_count--; /* Remove non-match possibility */
1816
next_active_state--;
1817
}
1818
ADD_NEW(state_offset + count, 0);
1819
}
1820
}
1821
break;
1822
1823
/*-----------------------------------------------------------------*/
1824
case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1825
case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1826
case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1827
count = 2;
1828
goto QS2;
1829
1830
case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1831
case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1832
case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1833
count = 0;
1834
1835
QS2:
1836
1837
ADD_ACTIVE(state_offset + 2, 0);
1838
if (clen > 0)
1839
{
1840
int ncount = 0;
1841
if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1842
codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1843
{
1844
active_count--; /* Remove non-match possibility */
1845
next_active_state--;
1846
}
1847
(void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1848
&ncount);
1849
ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1850
}
1851
break;
1852
#endif
1853
1854
/*-----------------------------------------------------------------*/
1855
case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1856
case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1857
case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1858
count = 2;
1859
goto QS3;
1860
1861
case OP_ANYNL_EXTRA + OP_TYPESTAR:
1862
case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1863
case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1864
count = 0;
1865
1866
QS3:
1867
ADD_ACTIVE(state_offset + 2, 0);
1868
if (clen > 0)
1869
{
1870
int ncount = 0;
1871
switch (c)
1872
{
1873
case CHAR_VT:
1874
case CHAR_FF:
1875
case CHAR_NEL:
1876
#ifndef EBCDIC
1877
case 0x2028:
1878
case 0x2029:
1879
#endif /* Not EBCDIC */
1880
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1881
goto ANYNL02;
1882
1883
case CHAR_CR:
1884
if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1885
/* Fall through */
1886
1887
ANYNL02:
1888
case CHAR_LF:
1889
if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1890
codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1891
{
1892
active_count--; /* Remove non-match possibility */
1893
next_active_state--;
1894
}
1895
ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1896
break;
1897
1898
default:
1899
break;
1900
}
1901
}
1902
break;
1903
1904
/*-----------------------------------------------------------------*/
1905
case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1906
case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1907
case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1908
count = 2;
1909
goto QS4;
1910
1911
case OP_VSPACE_EXTRA + OP_TYPESTAR:
1912
case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1913
case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1914
count = 0;
1915
1916
QS4:
1917
ADD_ACTIVE(state_offset + 2, 0);
1918
if (clen > 0)
1919
{
1920
BOOL OK;
1921
switch (c)
1922
{
1923
VSPACE_CASES:
1924
OK = TRUE;
1925
break;
1926
1927
default:
1928
OK = FALSE;
1929
break;
1930
}
1931
if (OK == (d == OP_VSPACE))
1932
{
1933
if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1934
codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1935
{
1936
active_count--; /* Remove non-match possibility */
1937
next_active_state--;
1938
}
1939
ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1940
}
1941
}
1942
break;
1943
1944
/*-----------------------------------------------------------------*/
1945
case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1946
case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1947
case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1948
count = 2;
1949
goto QS5;
1950
1951
case OP_HSPACE_EXTRA + OP_TYPESTAR:
1952
case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1953
case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1954
count = 0;
1955
1956
QS5:
1957
ADD_ACTIVE(state_offset + 2, 0);
1958
if (clen > 0)
1959
{
1960
BOOL OK;
1961
switch (c)
1962
{
1963
HSPACE_CASES:
1964
OK = TRUE;
1965
break;
1966
1967
default:
1968
OK = FALSE;
1969
break;
1970
}
1971
1972
if (OK == (d == OP_HSPACE))
1973
{
1974
if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1975
codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1976
{
1977
active_count--; /* Remove non-match possibility */
1978
next_active_state--;
1979
}
1980
ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1981
}
1982
}
1983
break;
1984
1985
/*-----------------------------------------------------------------*/
1986
#ifdef SUPPORT_UNICODE
1987
case OP_PROP_EXTRA + OP_TYPEEXACT:
1988
case OP_PROP_EXTRA + OP_TYPEUPTO:
1989
case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1990
case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1991
if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1992
{ ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1993
count = current_state->count; /* Number already matched */
1994
if (clen > 0)
1995
{
1996
BOOL OK;
1997
int chartype;
1998
const uint32_t *cp;
1999
const ucd_record * prop = GET_UCD(c);
2000
switch(code[1 + IMM2_SIZE + 1])
2001
{
2002
case PT_LAMP:
2003
chartype = prop->chartype;
2004
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
2005
break;
2006
2007
case PT_GC:
2008
OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
2009
break;
2010
2011
case PT_PC:
2012
OK = prop->chartype == code[1 + IMM2_SIZE + 2];
2013
break;
2014
2015
case PT_SC:
2016
OK = prop->script == code[1 + IMM2_SIZE + 2];
2017
break;
2018
2019
case PT_SCX:
2020
OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
2021
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
2022
code[1 + IMM2_SIZE + 2]) != 0);
2023
break;
2024
2025
/* These are specials for combination cases. */
2026
2027
case PT_ALNUM:
2028
chartype = prop->chartype;
2029
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2030
PRIV(ucp_gentype)[chartype] == ucp_N;
2031
break;
2032
2033
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
2034
which means that Perl space and POSIX space are now identical. PCRE
2035
was changed at release 8.34. */
2036
2037
case PT_SPACE: /* Perl space */
2038
case PT_PXSPACE: /* POSIX space */
2039
switch(c)
2040
{
2041
HSPACE_CASES:
2042
VSPACE_CASES:
2043
OK = TRUE;
2044
break;
2045
2046
default:
2047
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
2048
break;
2049
}
2050
break;
2051
2052
case PT_WORD:
2053
chartype = prop->chartype;
2054
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2055
PRIV(ucp_gentype)[chartype] == ucp_N ||
2056
chartype == ucp_Mn || chartype == ucp_Pc;
2057
break;
2058
2059
case PT_CLIST:
2060
#if PCRE2_CODE_UNIT_WIDTH == 32
2061
if (c > MAX_UTF_CODE_POINT)
2062
{
2063
OK = FALSE;
2064
break;
2065
}
2066
#endif
2067
cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
2068
for (;;)
2069
{
2070
if (c < *cp) { OK = FALSE; break; }
2071
if (c == *cp++) { OK = TRUE; break; }
2072
}
2073
break;
2074
2075
case PT_UCNC:
2076
OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2077
c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2078
c >= 0xe000;
2079
break;
2080
2081
case PT_BIDICL:
2082
OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2];
2083
break;
2084
2085
case PT_BOOL:
2086
OK = MAPBIT(PRIV(ucd_boolprop_sets) +
2087
UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0;
2088
break;
2089
2090
/* Should never occur, but keep compilers from grumbling. */
2091
2092
default:
2093
OK = codevalue != OP_PROP;
2094
break;
2095
}
2096
2097
if (OK == (d == OP_PROP))
2098
{
2099
if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2100
{
2101
active_count--; /* Remove non-match possibility */
2102
next_active_state--;
2103
}
2104
if (++count >= (int)GET2(code, 1))
2105
{ ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2106
else
2107
{ ADD_NEW(state_offset, count); }
2108
}
2109
}
2110
break;
2111
2112
/*-----------------------------------------------------------------*/
2113
case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2114
case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2115
case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2116
case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2117
if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2118
{ ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2119
count = current_state->count; /* Number already matched */
2120
if (clen > 0)
2121
{
2122
PCRE2_SPTR nptr;
2123
int ncount = 0;
2124
if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2125
{
2126
active_count--; /* Remove non-match possibility */
2127
next_active_state--;
2128
}
2129
nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2130
&ncount);
2131
if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2132
reset_could_continue = TRUE;
2133
if (++count >= (int)GET2(code, 1))
2134
{ ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2135
else
2136
{ ADD_NEW_DATA(-state_offset, count, ncount); }
2137
}
2138
break;
2139
#endif
2140
2141
/*-----------------------------------------------------------------*/
2142
case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2143
case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2144
case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2145
case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2146
if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2147
{ ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2148
count = current_state->count; /* Number already matched */
2149
if (clen > 0)
2150
{
2151
int ncount = 0;
2152
switch (c)
2153
{
2154
case CHAR_VT:
2155
case CHAR_FF:
2156
case CHAR_NEL:
2157
#ifndef EBCDIC
2158
case 0x2028:
2159
case 0x2029:
2160
#endif /* Not EBCDIC */
2161
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2162
goto ANYNL03;
2163
2164
case CHAR_CR:
2165
if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2166
/* Fall through */
2167
2168
ANYNL03:
2169
case CHAR_LF:
2170
if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2171
{
2172
active_count--; /* Remove non-match possibility */
2173
next_active_state--;
2174
}
2175
if (++count >= (int)GET2(code, 1))
2176
{ ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2177
else
2178
{ ADD_NEW_DATA(-state_offset, count, ncount); }
2179
break;
2180
2181
default:
2182
break;
2183
}
2184
}
2185
break;
2186
2187
/*-----------------------------------------------------------------*/
2188
case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2189
case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2190
case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2191
case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2192
if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2193
{ ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2194
count = current_state->count; /* Number already matched */
2195
if (clen > 0)
2196
{
2197
BOOL OK;
2198
switch (c)
2199
{
2200
VSPACE_CASES:
2201
OK = TRUE;
2202
break;
2203
2204
default:
2205
OK = FALSE;
2206
}
2207
2208
if (OK == (d == OP_VSPACE))
2209
{
2210
if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2211
{
2212
active_count--; /* Remove non-match possibility */
2213
next_active_state--;
2214
}
2215
if (++count >= (int)GET2(code, 1))
2216
{ ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2217
else
2218
{ ADD_NEW_DATA(-state_offset, count, 0); }
2219
}
2220
}
2221
break;
2222
2223
/*-----------------------------------------------------------------*/
2224
case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2225
case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2226
case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2227
case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2228
if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2229
{ ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2230
count = current_state->count; /* Number already matched */
2231
if (clen > 0)
2232
{
2233
BOOL OK;
2234
switch (c)
2235
{
2236
HSPACE_CASES:
2237
OK = TRUE;
2238
break;
2239
2240
default:
2241
OK = FALSE;
2242
break;
2243
}
2244
2245
if (OK == (d == OP_HSPACE))
2246
{
2247
if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2248
{
2249
active_count--; /* Remove non-match possibility */
2250
next_active_state--;
2251
}
2252
if (++count >= (int)GET2(code, 1))
2253
{ ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2254
else
2255
{ ADD_NEW_DATA(-state_offset, count, 0); }
2256
}
2257
}
2258
break;
2259
2260
/* ========================================================================== */
2261
/* These opcodes are followed by a character that is usually compared
2262
to the current subject character; it is loaded into d. We still get
2263
here even if there is no subject character, because in some cases zero
2264
repetitions are permitted. */
2265
2266
/*-----------------------------------------------------------------*/
2267
case OP_CHAR:
2268
if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2269
break;
2270
2271
/*-----------------------------------------------------------------*/
2272
case OP_CHARI:
2273
if (clen == 0) break;
2274
2275
#ifdef SUPPORT_UNICODE
2276
if (utf_or_ucp)
2277
{
2278
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2279
{
2280
unsigned int othercase;
2281
if (c < 128)
2282
othercase = fcc[c];
2283
else
2284
othercase = UCD_OTHERCASE(c);
2285
if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2286
}
2287
}
2288
else
2289
#endif /* SUPPORT_UNICODE */
2290
/* Not UTF or UCP mode */
2291
{
2292
if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2293
{ ADD_NEW(state_offset + 2, 0); }
2294
}
2295
break;
2296
2297
2298
#ifdef SUPPORT_UNICODE
2299
/*-----------------------------------------------------------------*/
2300
/* This is a tricky one because it can match more than one character.
2301
Find out how many characters to skip, and then set up a negative state
2302
to wait for them to pass before continuing. */
2303
2304
case OP_EXTUNI:
2305
if (clen > 0)
2306
{
2307
int ncount = 0;
2308
PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2309
end_subject, utf, &ncount);
2310
if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2311
reset_could_continue = TRUE;
2312
ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2313
}
2314
break;
2315
#endif
2316
2317
/*-----------------------------------------------------------------*/
2318
/* This is a tricky one like EXTUNI because it too can match more than one
2319
character (when CR is followed by LF). In this case, set up a negative
2320
state to wait for one character to pass before continuing. */
2321
2322
case OP_ANYNL:
2323
if (clen > 0) switch(c)
2324
{
2325
case CHAR_VT:
2326
case CHAR_FF:
2327
case CHAR_NEL:
2328
#ifndef EBCDIC
2329
case 0x2028:
2330
case 0x2029:
2331
#endif /* Not EBCDIC */
2332
if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2333
/* Fall through */
2334
2335
case CHAR_LF:
2336
ADD_NEW(state_offset + 1, 0);
2337
break;
2338
2339
case CHAR_CR:
2340
if (ptr + 1 >= end_subject)
2341
{
2342
ADD_NEW(state_offset + 1, 0);
2343
if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2344
reset_could_continue = TRUE;
2345
}
2346
else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2347
{
2348
ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2349
}
2350
else
2351
{
2352
ADD_NEW(state_offset + 1, 0);
2353
}
2354
break;
2355
}
2356
break;
2357
2358
/*-----------------------------------------------------------------*/
2359
case OP_NOT_VSPACE:
2360
if (clen > 0) switch(c)
2361
{
2362
VSPACE_CASES:
2363
break;
2364
2365
default:
2366
ADD_NEW(state_offset + 1, 0);
2367
break;
2368
}
2369
break;
2370
2371
/*-----------------------------------------------------------------*/
2372
case OP_VSPACE:
2373
if (clen > 0) switch(c)
2374
{
2375
VSPACE_CASES:
2376
ADD_NEW(state_offset + 1, 0);
2377
break;
2378
2379
default:
2380
break;
2381
}
2382
break;
2383
2384
/*-----------------------------------------------------------------*/
2385
case OP_NOT_HSPACE:
2386
if (clen > 0) switch(c)
2387
{
2388
HSPACE_CASES:
2389
break;
2390
2391
default:
2392
ADD_NEW(state_offset + 1, 0);
2393
break;
2394
}
2395
break;
2396
2397
/*-----------------------------------------------------------------*/
2398
case OP_HSPACE:
2399
if (clen > 0) switch(c)
2400
{
2401
HSPACE_CASES:
2402
ADD_NEW(state_offset + 1, 0);
2403
break;
2404
2405
default:
2406
break;
2407
}
2408
break;
2409
2410
/*-----------------------------------------------------------------*/
2411
/* Match a negated single character casefully. */
2412
2413
case OP_NOT:
2414
if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2415
break;
2416
2417
/*-----------------------------------------------------------------*/
2418
/* Match a negated single character caselessly. */
2419
2420
case OP_NOTI:
2421
if (clen > 0)
2422
{
2423
uint32_t otherd;
2424
#ifdef SUPPORT_UNICODE
2425
if (utf_or_ucp && d >= 128)
2426
otherd = UCD_OTHERCASE(d);
2427
else
2428
#endif /* SUPPORT_UNICODE */
2429
otherd = TABLE_GET(d, fcc, d);
2430
if (c != d && c != otherd)
2431
{ ADD_NEW(state_offset + dlen + 1, 0); }
2432
}
2433
break;
2434
2435
/*-----------------------------------------------------------------*/
2436
case OP_PLUSI:
2437
case OP_MINPLUSI:
2438
case OP_POSPLUSI:
2439
case OP_NOTPLUSI:
2440
case OP_NOTMINPLUSI:
2441
case OP_NOTPOSPLUSI:
2442
caseless = TRUE;
2443
codevalue -= OP_STARI - OP_STAR;
2444
2445
/* Fall through */
2446
case OP_PLUS:
2447
case OP_MINPLUS:
2448
case OP_POSPLUS:
2449
case OP_NOTPLUS:
2450
case OP_NOTMINPLUS:
2451
case OP_NOTPOSPLUS:
2452
count = current_state->count; /* Already matched */
2453
if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2454
if (clen > 0)
2455
{
2456
uint32_t otherd = NOTACHAR;
2457
if (caseless)
2458
{
2459
#ifdef SUPPORT_UNICODE
2460
if (utf_or_ucp && d >= 128)
2461
otherd = UCD_OTHERCASE(d);
2462
else
2463
#endif /* SUPPORT_UNICODE */
2464
otherd = TABLE_GET(d, fcc, d);
2465
}
2466
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2467
{
2468
if (count > 0 &&
2469
(codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2470
{
2471
active_count--; /* Remove non-match possibility */
2472
next_active_state--;
2473
}
2474
count++;
2475
ADD_NEW(state_offset, count);
2476
}
2477
}
2478
break;
2479
2480
/*-----------------------------------------------------------------*/
2481
case OP_QUERYI:
2482
case OP_MINQUERYI:
2483
case OP_POSQUERYI:
2484
case OP_NOTQUERYI:
2485
case OP_NOTMINQUERYI:
2486
case OP_NOTPOSQUERYI:
2487
caseless = TRUE;
2488
codevalue -= OP_STARI - OP_STAR;
2489
/* Fall through */
2490
case OP_QUERY:
2491
case OP_MINQUERY:
2492
case OP_POSQUERY:
2493
case OP_NOTQUERY:
2494
case OP_NOTMINQUERY:
2495
case OP_NOTPOSQUERY:
2496
ADD_ACTIVE(state_offset + dlen + 1, 0);
2497
if (clen > 0)
2498
{
2499
uint32_t otherd = NOTACHAR;
2500
if (caseless)
2501
{
2502
#ifdef SUPPORT_UNICODE
2503
if (utf_or_ucp && d >= 128)
2504
otherd = UCD_OTHERCASE(d);
2505
else
2506
#endif /* SUPPORT_UNICODE */
2507
otherd = TABLE_GET(d, fcc, d);
2508
}
2509
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2510
{
2511
if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2512
{
2513
active_count--; /* Remove non-match possibility */
2514
next_active_state--;
2515
}
2516
ADD_NEW(state_offset + dlen + 1, 0);
2517
}
2518
}
2519
break;
2520
2521
/*-----------------------------------------------------------------*/
2522
case OP_STARI:
2523
case OP_MINSTARI:
2524
case OP_POSSTARI:
2525
case OP_NOTSTARI:
2526
case OP_NOTMINSTARI:
2527
case OP_NOTPOSSTARI:
2528
caseless = TRUE;
2529
codevalue -= OP_STARI - OP_STAR;
2530
/* Fall through */
2531
case OP_STAR:
2532
case OP_MINSTAR:
2533
case OP_POSSTAR:
2534
case OP_NOTSTAR:
2535
case OP_NOTMINSTAR:
2536
case OP_NOTPOSSTAR:
2537
ADD_ACTIVE(state_offset + dlen + 1, 0);
2538
if (clen > 0)
2539
{
2540
uint32_t otherd = NOTACHAR;
2541
if (caseless)
2542
{
2543
#ifdef SUPPORT_UNICODE
2544
if (utf_or_ucp && d >= 128)
2545
otherd = UCD_OTHERCASE(d);
2546
else
2547
#endif /* SUPPORT_UNICODE */
2548
otherd = TABLE_GET(d, fcc, d);
2549
}
2550
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2551
{
2552
if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2553
{
2554
active_count--; /* Remove non-match possibility */
2555
next_active_state--;
2556
}
2557
ADD_NEW(state_offset, 0);
2558
}
2559
}
2560
break;
2561
2562
/*-----------------------------------------------------------------*/
2563
case OP_EXACTI:
2564
case OP_NOTEXACTI:
2565
caseless = TRUE;
2566
codevalue -= OP_STARI - OP_STAR;
2567
/* Fall through */
2568
case OP_EXACT:
2569
case OP_NOTEXACT:
2570
count = current_state->count; /* Number already matched */
2571
if (clen > 0)
2572
{
2573
uint32_t otherd = NOTACHAR;
2574
if (caseless)
2575
{
2576
#ifdef SUPPORT_UNICODE
2577
if (utf_or_ucp && d >= 128)
2578
otherd = UCD_OTHERCASE(d);
2579
else
2580
#endif /* SUPPORT_UNICODE */
2581
otherd = TABLE_GET(d, fcc, d);
2582
}
2583
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2584
{
2585
if (++count >= (int)GET2(code, 1))
2586
{ ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2587
else
2588
{ ADD_NEW(state_offset, count); }
2589
}
2590
}
2591
break;
2592
2593
/*-----------------------------------------------------------------*/
2594
case OP_UPTOI:
2595
case OP_MINUPTOI:
2596
case OP_POSUPTOI:
2597
case OP_NOTUPTOI:
2598
case OP_NOTMINUPTOI:
2599
case OP_NOTPOSUPTOI:
2600
caseless = TRUE;
2601
codevalue -= OP_STARI - OP_STAR;
2602
/* Fall through */
2603
case OP_UPTO:
2604
case OP_MINUPTO:
2605
case OP_POSUPTO:
2606
case OP_NOTUPTO:
2607
case OP_NOTMINUPTO:
2608
case OP_NOTPOSUPTO:
2609
ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2610
count = current_state->count; /* Number already matched */
2611
if (clen > 0)
2612
{
2613
uint32_t otherd = NOTACHAR;
2614
if (caseless)
2615
{
2616
#ifdef SUPPORT_UNICODE
2617
if (utf_or_ucp && d >= 128)
2618
otherd = UCD_OTHERCASE(d);
2619
else
2620
#endif /* SUPPORT_UNICODE */
2621
otherd = TABLE_GET(d, fcc, d);
2622
}
2623
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2624
{
2625
if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2626
{
2627
active_count--; /* Remove non-match possibility */
2628
next_active_state--;
2629
}
2630
if (++count >= (int)GET2(code, 1))
2631
{ ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2632
else
2633
{ ADD_NEW(state_offset, count); }
2634
}
2635
}
2636
break;
2637
2638
2639
/* ========================================================================== */
2640
/* These are the class-handling opcodes */
2641
2642
case OP_CLASS:
2643
case OP_NCLASS:
2644
#ifdef SUPPORT_WIDE_CHARS
2645
case OP_XCLASS:
2646
case OP_ECLASS:
2647
#endif
2648
{
2649
BOOL isinclass = FALSE;
2650
int next_state_offset;
2651
PCRE2_SPTR ecode;
2652
2653
#ifdef SUPPORT_WIDE_CHARS
2654
/* An extended class may have a table or a list of single characters,
2655
ranges, or both, and it may be positive or negative. There's a
2656
function that sorts all this out. */
2657
2658
if (codevalue == OP_XCLASS)
2659
{
2660
ecode = code + GET(code, 1);
2661
if (clen > 0)
2662
isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE,
2663
(const uint8_t*)mb->start_code, utf);
2664
}
2665
2666
/* A nested set-based class has internal opcodes for performing
2667
set operations. */
2668
2669
else if (codevalue == OP_ECLASS)
2670
{
2671
ecode = code + GET(code, 1);
2672
if (clen > 0)
2673
isinclass = PRIV(eclass)(c, code + 1 + LINK_SIZE, ecode,
2674
(const uint8_t*)mb->start_code, utf);
2675
}
2676
2677
else
2678
#endif /* SUPPORT_WIDE_CHARS */
2679
2680
/* For a simple class, there is always just a 32-byte table, and we
2681
can set isinclass from it. */
2682
2683
{
2684
ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2685
if (clen > 0)
2686
{
2687
isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2688
((((const uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2689
}
2690
}
2691
2692
/* At this point, isinclass is set for all kinds of class, and ecode
2693
points to the byte after the end of the class. If there is a
2694
quantifier, this is where it will be. */
2695
2696
next_state_offset = (int)(ecode - start_code);
2697
2698
switch (*ecode)
2699
{
2700
case OP_CRSTAR:
2701
case OP_CRMINSTAR:
2702
case OP_CRPOSSTAR:
2703
ADD_ACTIVE(next_state_offset + 1, 0);
2704
if (isinclass)
2705
{
2706
if (*ecode == OP_CRPOSSTAR)
2707
{
2708
active_count--; /* Remove non-match possibility */
2709
next_active_state--;
2710
}
2711
ADD_NEW(state_offset, 0);
2712
}
2713
break;
2714
2715
case OP_CRPLUS:
2716
case OP_CRMINPLUS:
2717
case OP_CRPOSPLUS:
2718
count = current_state->count; /* Already matched */
2719
if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2720
if (isinclass)
2721
{
2722
if (count > 0 && *ecode == OP_CRPOSPLUS)
2723
{
2724
active_count--; /* Remove non-match possibility */
2725
next_active_state--;
2726
}
2727
count++;
2728
ADD_NEW(state_offset, count);
2729
}
2730
break;
2731
2732
case OP_CRQUERY:
2733
case OP_CRMINQUERY:
2734
case OP_CRPOSQUERY:
2735
ADD_ACTIVE(next_state_offset + 1, 0);
2736
if (isinclass)
2737
{
2738
if (*ecode == OP_CRPOSQUERY)
2739
{
2740
active_count--; /* Remove non-match possibility */
2741
next_active_state--;
2742
}
2743
ADD_NEW(next_state_offset + 1, 0);
2744
}
2745
break;
2746
2747
case OP_CRRANGE:
2748
case OP_CRMINRANGE:
2749
case OP_CRPOSRANGE:
2750
count = current_state->count; /* Already matched */
2751
if (count >= (int)GET2(ecode, 1))
2752
{ ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2753
if (isinclass)
2754
{
2755
int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2756
2757
if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2758
{
2759
active_count--; /* Remove non-match possibility */
2760
next_active_state--;
2761
}
2762
2763
if (++count >= max && max != 0) /* Max 0 => no limit */
2764
{ ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2765
else
2766
{ ADD_NEW(state_offset, count); }
2767
}
2768
break;
2769
2770
default:
2771
if (isinclass) { ADD_NEW(next_state_offset, 0); }
2772
break;
2773
}
2774
}
2775
break;
2776
2777
/* ========================================================================== */
2778
/* These are the opcodes for fancy brackets of various kinds. We have
2779
to use recursion in order to handle them. The "always failing" assertion
2780
(?!) is optimised to OP_FAIL when compiling, so we have to support that,
2781
though the other "backtracking verbs" are not supported. */
2782
2783
case OP_FAIL:
2784
break;
2785
2786
case OP_ASSERT:
2787
case OP_ASSERT_NOT:
2788
case OP_ASSERTBACK:
2789
case OP_ASSERTBACK_NOT:
2790
{
2791
int rc;
2792
int *local_workspace;
2793
PCRE2_SIZE *local_offsets;
2794
PCRE2_SPTR endasscode = code + GET(code, 1);
2795
RWS_anchor *rws = (RWS_anchor *)RWS;
2796
2797
if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2798
{
2799
rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2800
if (rc != 0) return rc;
2801
RWS = (int *)rws;
2802
}
2803
2804
local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2805
local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2806
rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2807
2808
while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2809
2810
rc = internal_dfa_match(
2811
mb, /* static match data */
2812
code, /* this subexpression's code */
2813
ptr, /* where we currently are */
2814
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
2815
local_offsets, /* offset vector */
2816
RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2817
local_workspace, /* workspace vector */
2818
RWS_RSIZE, /* size of same */
2819
rlevel, /* function recursion level */
2820
RWS); /* recursion workspace */
2821
2822
rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2823
2824
if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2825
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2826
{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2827
}
2828
break;
2829
2830
/*-----------------------------------------------------------------*/
2831
case OP_COND:
2832
case OP_SCOND:
2833
{
2834
int codelink = (int)GET(code, 1);
2835
PCRE2_UCHAR condcode;
2836
2837
/* Because of the way auto-callout works during compile, a callout item
2838
is inserted between OP_COND and an assertion condition. This does not
2839
happen for the other conditions. */
2840
2841
if (code[LINK_SIZE + 1] == OP_CALLOUT
2842
|| code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2843
{
2844
PCRE2_SIZE callout_length;
2845
rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
2846
1 + LINK_SIZE, &callout_length);
2847
if (rrc < 0) return rrc; /* Abandon */
2848
if (rrc > 0) break; /* Fail this thread */
2849
code += callout_length; /* Skip callout data */
2850
}
2851
2852
condcode = code[LINK_SIZE+1];
2853
2854
/* Back reference conditions and duplicate named recursion conditions
2855
are not supported */
2856
2857
if (condcode == OP_CREF || condcode == OP_DNCREF ||
2858
condcode == OP_DNRREF)
2859
return PCRE2_ERROR_DFA_UCOND;
2860
2861
/* The DEFINE condition is always false, and the assertion (?!) is
2862
converted to OP_FAIL. */
2863
2864
if (condcode == OP_FALSE || condcode == OP_FAIL)
2865
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2866
2867
/* There is also an always-true condition */
2868
2869
else if (condcode == OP_TRUE)
2870
{ ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2871
2872
/* The only supported version of OP_RREF is for the value RREF_ANY,
2873
which means "test if in any recursion". We can't test for specifically
2874
recursed groups. */
2875
2876
else if (condcode == OP_RREF)
2877
{
2878
unsigned int value = GET2(code, LINK_SIZE + 2);
2879
if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2880
if (mb->recursive != NULL)
2881
{ ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2882
else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2883
}
2884
2885
/* Otherwise, the condition is an assertion */
2886
2887
else
2888
{
2889
int rc;
2890
int *local_workspace;
2891
PCRE2_SIZE *local_offsets;
2892
PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2893
PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2894
RWS_anchor *rws = (RWS_anchor *)RWS;
2895
2896
if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2897
{
2898
rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2899
if (rc != 0) return rc;
2900
RWS = (int *)rws;
2901
}
2902
2903
local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2904
local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2905
rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2906
2907
while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2908
2909
rc = internal_dfa_match(
2910
mb, /* fixed match data */
2911
asscode, /* this subexpression's code */
2912
ptr, /* where we currently are */
2913
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
2914
local_offsets, /* offset vector */
2915
RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2916
local_workspace, /* workspace vector */
2917
RWS_RSIZE, /* size of same */
2918
rlevel, /* function recursion level */
2919
RWS); /* recursion workspace */
2920
2921
rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2922
2923
if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2924
if ((rc >= 0) ==
2925
(condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2926
{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2927
else
2928
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2929
}
2930
}
2931
break;
2932
2933
/*-----------------------------------------------------------------*/
2934
case OP_RECURSE:
2935
{
2936
int rc;
2937
int *local_workspace;
2938
PCRE2_SIZE *local_offsets;
2939
RWS_anchor *rws = (RWS_anchor *)RWS;
2940
PCRE2_SPTR callpat = start_code + GET(code, 1);
2941
uint32_t recno = (callpat == mb->start_code)? 0 :
2942
GET2(callpat, 1 + LINK_SIZE);
2943
2944
if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2945
{
2946
rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2947
if (rc != 0) return rc;
2948
RWS = (int *)rws;
2949
}
2950
2951
local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2952
local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2953
rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2954
2955
/* Check for repeating a recursion without advancing the subject
2956
pointer or last used character. This should catch convoluted mutual
2957
recursions. (Some simple cases are caught at compile time.) */
2958
2959
for (dfa_recursion_info *ri = mb->recursive;
2960
ri != NULL;
2961
ri = ri->prevrec)
2962
{
2963
if (recno == ri->group_num && ptr == ri->subject_position &&
2964
mb->last_used_ptr == ri->last_used_ptr)
2965
return PCRE2_ERROR_RECURSELOOP;
2966
}
2967
2968
/* Remember this recursion and where we started it so as to
2969
catch infinite loops. */
2970
2971
new_recursive.group_num = recno;
2972
new_recursive.subject_position = ptr;
2973
new_recursive.last_used_ptr = mb->last_used_ptr;
2974
new_recursive.prevrec = mb->recursive;
2975
mb->recursive = &new_recursive;
2976
2977
rc = internal_dfa_match(
2978
mb, /* fixed match data */
2979
callpat, /* this subexpression's code */
2980
ptr, /* where we currently are */
2981
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
2982
local_offsets, /* offset vector */
2983
RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */
2984
local_workspace, /* workspace vector */
2985
RWS_RSIZE, /* size of same */
2986
rlevel, /* function recursion level */
2987
RWS); /* recursion workspace */
2988
2989
rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2990
mb->recursive = new_recursive.prevrec; /* Done this recursion */
2991
2992
/* Ran out of internal offsets */
2993
2994
if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2995
2996
/* For each successful matched substring, set up the next state with a
2997
count of characters to skip before trying it. Note that the count is in
2998
characters, not bytes. */
2999
3000
if (rc > 0)
3001
{
3002
for (rc = rc*2 - 2; rc >= 0; rc -= 2)
3003
{
3004
PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
3005
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3006
if (utf)
3007
{
3008
PCRE2_SPTR p = start_subject + local_offsets[rc];
3009
PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
3010
while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3011
}
3012
#endif
3013
if (charcount > 0)
3014
{
3015
ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
3016
(int)(charcount - 1));
3017
}
3018
else
3019
{
3020
ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
3021
}
3022
}
3023
}
3024
else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3025
}
3026
break;
3027
3028
/*-----------------------------------------------------------------*/
3029
case OP_BRAPOS:
3030
case OP_SBRAPOS:
3031
case OP_CBRAPOS:
3032
case OP_SCBRAPOS:
3033
case OP_BRAPOSZERO:
3034
{
3035
int rc;
3036
int *local_workspace;
3037
PCRE2_SIZE *local_offsets;
3038
PCRE2_SIZE charcount, matched_count;
3039
PCRE2_SPTR local_ptr = ptr;
3040
RWS_anchor *rws = (RWS_anchor *)RWS;
3041
BOOL allow_zero;
3042
3043
if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3044
{
3045
rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3046
if (rc != 0) return rc;
3047
RWS = (int *)rws;
3048
}
3049
3050
local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3051
local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3052
rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3053
3054
if (codevalue == OP_BRAPOSZERO)
3055
{
3056
allow_zero = TRUE;
3057
++code; /* The following opcode will be one of the above BRAs */
3058
}
3059
else allow_zero = FALSE;
3060
3061
/* Loop to match the subpattern as many times as possible as if it were
3062
a complete pattern. */
3063
3064
for (matched_count = 0;; matched_count++)
3065
{
3066
rc = internal_dfa_match(
3067
mb, /* fixed match data */
3068
code, /* this subexpression's code */
3069
local_ptr, /* where we currently are */
3070
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
3071
local_offsets, /* offset vector */
3072
RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3073
local_workspace, /* workspace vector */
3074
RWS_RSIZE, /* size of same */
3075
rlevel, /* function recursion level */
3076
RWS); /* recursion workspace */
3077
3078
/* Failed to match */
3079
3080
if (rc < 0)
3081
{
3082
if (rc != PCRE2_ERROR_NOMATCH) return rc;
3083
break;
3084
}
3085
3086
/* Matched: break the loop if zero characters matched. */
3087
3088
charcount = local_offsets[1] - local_offsets[0];
3089
if (charcount == 0) break;
3090
local_ptr += charcount; /* Advance temporary position ptr */
3091
}
3092
3093
rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3094
3095
/* At this point we have matched the subpattern matched_count
3096
times, and local_ptr is pointing to the character after the end of the
3097
last match. */
3098
3099
if (matched_count > 0 || allow_zero)
3100
{
3101
PCRE2_SPTR end_subpattern = code;
3102
int next_state_offset;
3103
3104
do { end_subpattern += GET(end_subpattern, 1); }
3105
while (*end_subpattern == OP_ALT);
3106
next_state_offset =
3107
(int)(end_subpattern - start_code + LINK_SIZE + 1);
3108
3109
/* Optimization: if there are no more active states, and there
3110
are no new states yet set up, then skip over the subject string
3111
right here, to save looping. Otherwise, set up the new state to swing
3112
into action when the end of the matched substring is reached. */
3113
3114
if (i + 1 >= active_count && new_count == 0)
3115
{
3116
ptr = local_ptr;
3117
clen = 0;
3118
ADD_NEW(next_state_offset, 0);
3119
}
3120
else
3121
{
3122
PCRE2_SPTR p = ptr;
3123
PCRE2_SPTR pp = local_ptr;
3124
charcount = (PCRE2_SIZE)(pp - p);
3125
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3126
if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3127
#endif
3128
ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3129
}
3130
}
3131
}
3132
break;
3133
3134
/*-----------------------------------------------------------------*/
3135
case OP_ONCE:
3136
{
3137
int rc;
3138
int *local_workspace;
3139
PCRE2_SIZE *local_offsets;
3140
RWS_anchor *rws = (RWS_anchor *)RWS;
3141
3142
if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3143
{
3144
rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3145
if (rc != 0) return rc;
3146
RWS = (int *)rws;
3147
}
3148
3149
local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3150
local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3151
rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3152
3153
rc = internal_dfa_match(
3154
mb, /* fixed match data */
3155
code, /* this subexpression's code */
3156
ptr, /* where we currently are */
3157
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
3158
local_offsets, /* offset vector */
3159
RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3160
local_workspace, /* workspace vector */
3161
RWS_RSIZE, /* size of same */
3162
rlevel, /* function recursion level */
3163
RWS); /* recursion workspace */
3164
3165
rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3166
3167
if (rc >= 0)
3168
{
3169
PCRE2_SPTR end_subpattern = code;
3170
PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3171
int next_state_offset, repeat_state_offset;
3172
3173
do { end_subpattern += GET(end_subpattern, 1); }
3174
while (*end_subpattern == OP_ALT);
3175
next_state_offset =
3176
(int)(end_subpattern - start_code + LINK_SIZE + 1);
3177
3178
/* If the end of this subpattern is KETRMAX or KETRMIN, we must
3179
arrange for the repeat state also to be added to the relevant list.
3180
Calculate the offset, or set -1 for no repeat. */
3181
3182
repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3183
*end_subpattern == OP_KETRMIN)?
3184
(int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3185
3186
/* If we have matched an empty string, add the next state at the
3187
current character pointer. This is important so that the duplicate
3188
checking kicks in, which is what breaks infinite loops that match an
3189
empty string. */
3190
3191
if (charcount == 0)
3192
{
3193
ADD_ACTIVE(next_state_offset, 0);
3194
}
3195
3196
/* Optimization: if there are no more active states, and there
3197
are no new states yet set up, then skip over the subject string
3198
right here, to save looping. Otherwise, set up the new state to swing
3199
into action when the end of the matched substring is reached. */
3200
3201
else if (i + 1 >= active_count && new_count == 0)
3202
{
3203
ptr += charcount;
3204
clen = 0;
3205
ADD_NEW(next_state_offset, 0);
3206
3207
/* If we are adding a repeat state at the new character position,
3208
we must fudge things so that it is the only current state.
3209
Otherwise, it might be a duplicate of one we processed before, and
3210
that would cause it to be skipped. */
3211
3212
if (repeat_state_offset >= 0)
3213
{
3214
next_active_state = active_states;
3215
active_count = 0;
3216
i = -1;
3217
ADD_ACTIVE(repeat_state_offset, 0);
3218
}
3219
}
3220
else
3221
{
3222
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3223
if (utf)
3224
{
3225
PCRE2_SPTR p = start_subject + local_offsets[0];
3226
PCRE2_SPTR pp = start_subject + local_offsets[1];
3227
while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3228
}
3229
#endif
3230
ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3231
if (repeat_state_offset >= 0)
3232
{ ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3233
}
3234
}
3235
else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3236
}
3237
break;
3238
3239
3240
/* ========================================================================== */
3241
/* Handle callouts */
3242
3243
case OP_CALLOUT:
3244
case OP_CALLOUT_STR:
3245
{
3246
PCRE2_SIZE callout_length;
3247
rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,
3248
&callout_length);
3249
if (rrc < 0) return rrc; /* Abandon */
3250
if (rrc == 0)
3251
{ ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3252
}
3253
break;
3254
3255
3256
/* ========================================================================== */
3257
default: /* Unsupported opcode */
3258
return PCRE2_ERROR_DFA_UITEM;
3259
}
3260
3261
NEXT_ACTIVE_STATE: continue;
3262
3263
} /* End of loop scanning active states */
3264
3265
/* We have finished the processing at the current subject character. If no
3266
new states have been set for the next character, we have found all the
3267
matches that we are going to find. If partial matching has been requested,
3268
check for appropriate conditions.
3269
3270
The "could_continue" variable is true if a state could have continued but
3271
for the fact that the end of the subject was reached. */
3272
3273
if (new_count <= 0)
3274
{
3275
if (could_continue && /* Some could go on, and */
3276
( /* either... */
3277
(mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */
3278
|| /* or... */
3279
((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */
3280
match_count < 0) /* no matches */
3281
) && /* And... */
3282
(
3283
partial_newline || /* Either partial NL */
3284
( /* or ... */
3285
ptr >= end_subject && /* End of subject and */
3286
( /* either */
3287
ptr > mb->start_used_ptr || /* Inspected non-empty string */
3288
mb->allowemptypartial /* or pattern has lookbehind */
3289
) /* or could match empty */
3290
)
3291
))
3292
match_count = PCRE2_ERROR_PARTIAL;
3293
break; /* Exit from loop along the subject string */
3294
}
3295
3296
/* One or more states are active for the next character. */
3297
3298
ptr += clen; /* Advance to next subject character */
3299
} /* Loop to move along the subject string */
3300
3301
/* Control gets here from "break" a few lines above. If we have a match and
3302
PCRE2_ENDANCHORED is set, the match fails. */
3303
3304
if (match_count >= 0 &&
3305
((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3306
ptr < end_subject)
3307
match_count = PCRE2_ERROR_NOMATCH;
3308
3309
return match_count;
3310
}
3311
3312
3313
3314
/*************************************************
3315
* Match a pattern using the DFA algorithm *
3316
*************************************************/
3317
3318
/* This function matches a compiled pattern to a subject string, using the
3319
alternate matching algorithm that finds all matches at once.
3320
3321
Arguments:
3322
code points to the compiled pattern
3323
subject subject string
3324
length length of subject string
3325
startoffset where to start matching in the subject
3326
options option bits
3327
match_data points to a match data structure
3328
gcontext points to a match context
3329
workspace pointer to workspace
3330
wscount size of workspace
3331
3332
Returns: > 0 => number of match offset pairs placed in offsets
3333
= 0 => offsets overflowed; longest matches are present
3334
-1 => failed to match
3335
< -1 => some kind of unexpected problem
3336
*/
3337
3338
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
3339
pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3340
PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3341
pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3342
{
3343
int rc;
3344
int was_zero_terminated = 0;
3345
3346
const pcre2_real_code *re = (const pcre2_real_code *)code;
3347
3348
PCRE2_SPTR start_match;
3349
PCRE2_SPTR end_subject;
3350
PCRE2_SPTR bumpalong_limit;
3351
PCRE2_SPTR req_cu_ptr;
3352
3353
BOOL utf, anchored, startline, firstline;
3354
BOOL has_first_cu = FALSE;
3355
BOOL has_req_cu = FALSE;
3356
3357
#if PCRE2_CODE_UNIT_WIDTH == 8
3358
PCRE2_SPTR memchr_found_first_cu = NULL;
3359
PCRE2_SPTR memchr_found_first_cu2 = NULL;
3360
#endif
3361
3362
PCRE2_UCHAR first_cu = 0;
3363
PCRE2_UCHAR first_cu2 = 0;
3364
PCRE2_UCHAR req_cu = 0;
3365
PCRE2_UCHAR req_cu2 = 0;
3366
3367
const uint8_t *start_bits = NULL;
3368
3369
/* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3370
is used below, and it expects NLBLOCK to be defined as a pointer. */
3371
3372
pcre2_callout_block cb;
3373
dfa_match_block actual_match_block;
3374
dfa_match_block *mb = &actual_match_block;
3375
3376
/* Set up a starting block of memory for use during recursive calls to
3377
internal_dfa_match(). By putting this on the stack, it minimizes resource use
3378
in the case when it is not needed. If this is too small, more memory is
3379
obtained from the heap. At the start of each block is an anchor structure.*/
3380
3381
int base_recursion_workspace[RWS_BASE_SIZE];
3382
RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3383
rws->next = NULL;
3384
rws->size = RWS_BASE_SIZE;
3385
rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3386
3387
/* Recognize NULL, length 0 as an empty string. */
3388
3389
if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
3390
3391
/* Plausibility checks */
3392
3393
if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3394
if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3395
return PCRE2_ERROR_NULL;
3396
3397
if (length == PCRE2_ZERO_TERMINATED)
3398
{
3399
length = PRIV(strlen)(subject);
3400
was_zero_terminated = 1;
3401
}
3402
3403
if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3404
if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3405
3406
/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3407
time. */
3408
3409
if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3410
((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3411
return PCRE2_ERROR_BADOPTION;
3412
3413
/* Invalid UTF support is not available for DFA matching. */
3414
3415
if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
3416
return PCRE2_ERROR_DFA_UINVALID_UTF;
3417
3418
/* Check that the first field in the block is the magic number. If it is not,
3419
return with PCRE2_ERROR_BADMAGIC. */
3420
3421
if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3422
3423
/* Check the code unit width. */
3424
3425
if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3426
return PCRE2_ERROR_BADMODE;
3427
3428
/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3429
options variable for this function. Users of PCRE2 who are not calling the
3430
function directly would like to have a way of setting these flags, in the same
3431
way that they can set pcre2_compile() flags like PCRE2_NO_AUTO_POSSESS with
3432
constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3433
(*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3434
transferred to the options for this function. The bits are guaranteed to be
3435
adjacent, but do not have the same values. This bit of Boolean trickery assumes
3436
that the match-time bits are not more significant than the flag bits. If by
3437
accident this is not the case, a compile-time division by zero error will
3438
occur. */
3439
3440
#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3441
#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3442
options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3443
#undef FF
3444
#undef OO
3445
3446
/* If restarting after a partial match, do some sanity checks on the contents
3447
of the workspace. */
3448
3449
if ((options & PCRE2_DFA_RESTART) != 0)
3450
{
3451
if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3452
workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3453
return PCRE2_ERROR_DFA_BADRESTART;
3454
}
3455
3456
/* Set some local values */
3457
3458
utf = (re->overall_options & PCRE2_UTF) != 0;
3459
start_match = subject + start_offset;
3460
end_subject = subject + length;
3461
req_cu_ptr = start_match - 1;
3462
anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3463
(re->overall_options & PCRE2_ANCHORED) != 0;
3464
3465
/* The "must be at the start of a line" flags are used in a loop when finding
3466
where to start. */
3467
3468
startline = (re->flags & PCRE2_STARTLINE) != 0;
3469
firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
3470
bumpalong_limit = end_subject;
3471
3472
/* Initialize and set up the fixed fields in the callout block, with a pointer
3473
in the match block. */
3474
3475
mb->cb = &cb;
3476
cb.version = 2;
3477
cb.subject = subject;
3478
cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3479
cb.callout_flags = 0;
3480
cb.capture_top = 1; /* No capture support */
3481
cb.capture_last = 0;
3482
cb.mark = NULL; /* No (*MARK) support */
3483
3484
/* Get data from the match context, if present, and fill in the remaining
3485
fields in the match block. It is an error to set an offset limit without
3486
setting the flag at compile time. */
3487
3488
if (mcontext == NULL)
3489
{
3490
mb->callout = NULL;
3491
mb->memctl = re->memctl;
3492
mb->match_limit = PRIV(default_match_context).match_limit;
3493
mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3494
mb->heap_limit = PRIV(default_match_context).heap_limit;
3495
}
3496
else
3497
{
3498
if (mcontext->offset_limit != PCRE2_UNSET)
3499
{
3500
if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3501
return PCRE2_ERROR_BADOFFSETLIMIT;
3502
bumpalong_limit = subject + mcontext->offset_limit;
3503
}
3504
mb->callout = mcontext->callout;
3505
mb->callout_data = mcontext->callout_data;
3506
mb->memctl = mcontext->memctl;
3507
mb->match_limit = mcontext->match_limit;
3508
mb->match_limit_depth = mcontext->depth_limit;
3509
mb->heap_limit = mcontext->heap_limit;
3510
}
3511
3512
if (mb->match_limit > re->limit_match)
3513
mb->match_limit = re->limit_match;
3514
3515
if (mb->match_limit_depth > re->limit_depth)
3516
mb->match_limit_depth = re->limit_depth;
3517
3518
if (mb->heap_limit > re->limit_heap)
3519
mb->heap_limit = re->limit_heap;
3520
3521
mb->start_code = (PCRE2_SPTR)((const uint8_t *)re + re->code_start);
3522
mb->tables = re->tables;
3523
mb->start_subject = subject;
3524
mb->end_subject = end_subject;
3525
mb->start_offset = start_offset;
3526
mb->allowemptypartial = (re->max_lookbehind > 0) ||
3527
(re->flags & PCRE2_MATCH_EMPTY) != 0;
3528
mb->moptions = options;
3529
mb->poptions = re->overall_options;
3530
mb->match_call_count = 0;
3531
mb->heap_used = 0;
3532
3533
/* Process the \R and newline settings. */
3534
3535
mb->bsr_convention = re->bsr_convention;
3536
mb->nltype = NLTYPE_FIXED;
3537
switch(re->newline_convention)
3538
{
3539
case PCRE2_NEWLINE_CR:
3540
mb->nllen = 1;
3541
mb->nl[0] = CHAR_CR;
3542
break;
3543
3544
case PCRE2_NEWLINE_LF:
3545
mb->nllen = 1;
3546
mb->nl[0] = CHAR_NL;
3547
break;
3548
3549
case PCRE2_NEWLINE_NUL:
3550
mb->nllen = 1;
3551
mb->nl[0] = CHAR_NUL;
3552
break;
3553
3554
case PCRE2_NEWLINE_CRLF:
3555
mb->nllen = 2;
3556
mb->nl[0] = CHAR_CR;
3557
mb->nl[1] = CHAR_NL;
3558
break;
3559
3560
case PCRE2_NEWLINE_ANY:
3561
mb->nltype = NLTYPE_ANY;
3562
break;
3563
3564
case PCRE2_NEWLINE_ANYCRLF:
3565
mb->nltype = NLTYPE_ANYCRLF;
3566
break;
3567
3568
default:
3569
PCRE2_DEBUG_UNREACHABLE();
3570
return PCRE2_ERROR_INTERNAL;
3571
}
3572
3573
/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3574
we must also check that a starting offset does not point into the middle of a
3575
multiunit character. We check only the portion of the subject that is going to
3576
be inspected during matching - from the offset minus the maximum back reference
3577
to the given length. This saves time when a small part of a large subject is
3578
being matched by the use of a starting offset. Note that the maximum lookbehind
3579
is a number of characters, not code units. */
3580
3581
#ifdef SUPPORT_UNICODE
3582
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3583
{
3584
PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
3585
3586
if (start_offset > 0)
3587
{
3588
#if PCRE2_CODE_UNIT_WIDTH != 32
3589
unsigned int i;
3590
if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3591
return PCRE2_ERROR_BADUTFOFFSET;
3592
for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3593
{
3594
check_subject--;
3595
while (check_subject > subject &&
3596
#if PCRE2_CODE_UNIT_WIDTH == 8
3597
(*check_subject & 0xc0) == 0x80)
3598
#else /* 16-bit */
3599
(*check_subject & 0xfc00) == 0xdc00)
3600
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3601
check_subject--;
3602
}
3603
#else /* In the 32-bit library, one code unit equals one character. */
3604
check_subject -= re->max_lookbehind;
3605
if (check_subject < subject) check_subject = subject;
3606
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
3607
}
3608
3609
/* Validate the relevant portion of the subject. After an error, adjust the
3610
offset to be an absolute offset in the whole string. */
3611
3612
match_data->rc = PRIV(valid_utf)(check_subject,
3613
length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3614
if (match_data->rc != 0)
3615
{
3616
match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3617
return match_data->rc;
3618
}
3619
}
3620
#endif /* SUPPORT_UNICODE */
3621
3622
/* Set up the first code unit to match, if available. If there's no first code
3623
unit there may be a bitmap of possible first characters. */
3624
3625
if ((re->flags & PCRE2_FIRSTSET) != 0)
3626
{
3627
has_first_cu = TRUE;
3628
first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3629
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3630
{
3631
first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3632
#ifdef SUPPORT_UNICODE
3633
#if PCRE2_CODE_UNIT_WIDTH == 8
3634
if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3635
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3636
#else
3637
if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3638
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3639
#endif
3640
#endif /* SUPPORT_UNICODE */
3641
}
3642
}
3643
else
3644
if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3645
start_bits = re->start_bitmap;
3646
3647
/* There may be a "last known required code unit" set. */
3648
3649
if ((re->flags & PCRE2_LASTSET) != 0)
3650
{
3651
has_req_cu = TRUE;
3652
req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3653
if ((re->flags & PCRE2_LASTCASELESS) != 0)
3654
{
3655
req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3656
#ifdef SUPPORT_UNICODE
3657
#if PCRE2_CODE_UNIT_WIDTH == 8
3658
if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3659
req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3660
#else
3661
if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3662
req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3663
#endif
3664
#endif /* SUPPORT_UNICODE */
3665
}
3666
}
3667
3668
/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3669
free the memory that was obtained. */
3670
3671
if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3672
{
3673
match_data->memctl.free((void *)match_data->subject,
3674
match_data->memctl.memory_data);
3675
match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3676
}
3677
3678
/* Fill in fields that are always returned in the match data. */
3679
3680
match_data->code = re;
3681
match_data->subject = NULL; /* Default for no match */
3682
match_data->mark = NULL;
3683
match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3684
3685
/* Call the main matching function, looping for a non-anchored regex after a
3686
failed match. If not restarting, perform certain optimizations at the start of
3687
a match. */
3688
3689
for (;;)
3690
{
3691
/* ----------------- Start of match optimizations ---------------- */
3692
3693
/* There are some optimizations that avoid running the match if a known
3694
starting point is not found, or if a known later code unit is not present.
3695
However, there is an option (settable at compile time) that disables
3696
these, for testing and for ensuring that all callouts do actually occur.
3697
The optimizations must also be avoided when restarting a DFA match. */
3698
3699
if ((re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0 &&
3700
(options & PCRE2_DFA_RESTART) == 0)
3701
{
3702
/* If firstline is TRUE, the start of the match is constrained to the first
3703
line of a multiline string. That is, the match must be before or at the
3704
first newline following the start of matching. Temporarily adjust
3705
end_subject so that we stop the optimization scans for a first code unit
3706
immediately after the first character of a newline (the first code unit can
3707
legitimately be a newline). If the match fails at the newline, later code
3708
breaks this loop. */
3709
3710
if (firstline)
3711
{
3712
PCRE2_SPTR t = start_match;
3713
#ifdef SUPPORT_UNICODE
3714
if (utf)
3715
{
3716
while (t < end_subject && !IS_NEWLINE(t))
3717
{
3718
t++;
3719
ACROSSCHAR(t < end_subject, t, t++);
3720
}
3721
}
3722
else
3723
#endif
3724
while (t < end_subject && !IS_NEWLINE(t)) t++;
3725
end_subject = t;
3726
}
3727
3728
/* Anchored: check the first code unit if one is recorded. This may seem
3729
pointless but it can help in detecting a no match case without scanning for
3730
the required code unit. */
3731
3732
if (anchored)
3733
{
3734
if (has_first_cu || start_bits != NULL)
3735
{
3736
BOOL ok = start_match < end_subject;
3737
if (ok)
3738
{
3739
PCRE2_UCHAR c = UCHAR21TEST(start_match);
3740
ok = has_first_cu && (c == first_cu || c == first_cu2);
3741
if (!ok && start_bits != NULL)
3742
{
3743
#if PCRE2_CODE_UNIT_WIDTH != 8
3744
if (c > 255) c = 255;
3745
#endif
3746
ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3747
}
3748
}
3749
if (!ok) break;
3750
}
3751
}
3752
3753
/* Not anchored. Advance to a unique first code unit if there is one. */
3754
3755
else
3756
{
3757
if (has_first_cu)
3758
{
3759
if (first_cu != first_cu2) /* Caseless */
3760
{
3761
/* In 16-bit and 32_bit modes we have to do our own search, so can
3762
look for both cases at once. */
3763
3764
#if PCRE2_CODE_UNIT_WIDTH != 8
3765
PCRE2_UCHAR smc;
3766
while (start_match < end_subject &&
3767
(smc = UCHAR21TEST(start_match)) != first_cu &&
3768
smc != first_cu2)
3769
start_match++;
3770
#else
3771
/* In 8-bit mode, the use of memchr() gives a big speed up, even
3772
though we have to call it twice in order to find the earliest
3773
occurrence of the code unit in either of its cases. Caching is used
3774
to remember the positions of previously found code units. This can
3775
make a huge difference when the strings are very long and only one
3776
case is actually present. */
3777
3778
PCRE2_SPTR pp1 = NULL;
3779
PCRE2_SPTR pp2 = NULL;
3780
PCRE2_SIZE searchlength = end_subject - start_match;
3781
3782
/* If we haven't got a previously found position for first_cu, or if
3783
the current starting position is later, we need to do a search. If
3784
the code unit is not found, set it to the end. */
3785
3786
if (memchr_found_first_cu == NULL ||
3787
start_match > memchr_found_first_cu)
3788
{
3789
pp1 = memchr(start_match, first_cu, searchlength);
3790
memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
3791
}
3792
3793
/* If the start is before a previously found position, use the
3794
previous position, or NULL if a previous search failed. */
3795
3796
else pp1 = (memchr_found_first_cu == end_subject)? NULL :
3797
memchr_found_first_cu;
3798
3799
/* Do the same thing for the other case. */
3800
3801
if (memchr_found_first_cu2 == NULL ||
3802
start_match > memchr_found_first_cu2)
3803
{
3804
pp2 = memchr(start_match, first_cu2, searchlength);
3805
memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
3806
}
3807
3808
else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
3809
memchr_found_first_cu2;
3810
3811
/* Set the start to the end of the subject if neither case was found.
3812
Otherwise, use the earlier found point. */
3813
3814
if (pp1 == NULL)
3815
start_match = (pp2 == NULL)? end_subject : pp2;
3816
else
3817
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3818
3819
#endif /* 8-bit handling */
3820
}
3821
3822
/* The caseful case is much simpler. */
3823
3824
else
3825
{
3826
#if PCRE2_CODE_UNIT_WIDTH != 8
3827
while (start_match < end_subject && UCHAR21TEST(start_match) !=
3828
first_cu)
3829
start_match++;
3830
#else /* 8-bit code units */
3831
start_match = memchr(start_match, first_cu, end_subject - start_match);
3832
if (start_match == NULL) start_match = end_subject;
3833
#endif
3834
}
3835
3836
/* If we can't find the required code unit, having reached the true end
3837
of the subject, break the bumpalong loop, to force a match failure,
3838
except when doing partial matching, when we let the next cycle run at
3839
the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3840
which partially matches "abc", even though the string does not contain
3841
the starting character "d". If we have not reached the true end of the
3842
subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3843
we also let the cycle run, because the matching string is legitimately
3844
allowed to start with the first code unit of a newline. */
3845
3846
if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3847
start_match >= mb->end_subject)
3848
break;
3849
}
3850
3851
/* If there's no first code unit, advance to just after a linebreak for a
3852
multiline match if required. */
3853
3854
else if (startline)
3855
{
3856
if (start_match > mb->start_subject + start_offset)
3857
{
3858
#ifdef SUPPORT_UNICODE
3859
if (utf)
3860
{
3861
while (start_match < end_subject && !WAS_NEWLINE(start_match))
3862
{
3863
start_match++;
3864
ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3865
}
3866
}
3867
else
3868
#endif
3869
while (start_match < end_subject && !WAS_NEWLINE(start_match))
3870
start_match++;
3871
3872
/* If we have just passed a CR and the newline option is ANY or
3873
ANYCRLF, and we are now at a LF, advance the match position by one
3874
more code unit. */
3875
3876
if (start_match[-1] == CHAR_CR &&
3877
(mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3878
start_match < end_subject &&
3879
UCHAR21TEST(start_match) == CHAR_NL)
3880
start_match++;
3881
}
3882
}
3883
3884
/* If there's no first code unit or a requirement for a multiline line
3885
start, advance to a non-unique first code unit if any have been
3886
identified. The bitmap contains only 256 bits. When code units are 16 or
3887
32 bits wide, all code units greater than 254 set the 255 bit. */
3888
3889
else if (start_bits != NULL)
3890
{
3891
while (start_match < end_subject)
3892
{
3893
uint32_t c = UCHAR21TEST(start_match);
3894
#if PCRE2_CODE_UNIT_WIDTH != 8
3895
if (c > 255) c = 255;
3896
#endif
3897
if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3898
start_match++;
3899
}
3900
3901
/* See comment above in first_cu checking about the next line. */
3902
3903
if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3904
start_match >= mb->end_subject)
3905
break;
3906
}
3907
} /* End of first code unit handling */
3908
3909
/* Restore fudged end_subject */
3910
3911
end_subject = mb->end_subject;
3912
3913
/* The following two optimizations are disabled for partial matching. */
3914
3915
if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3916
{
3917
PCRE2_SPTR p;
3918
3919
/* The minimum matching length is a lower bound; no actual string of that
3920
length may actually match the pattern. Although the value is, strictly,
3921
in characters, we treat it as code units to avoid spending too much time
3922
in this optimization. */
3923
3924
if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3925
3926
/* If req_cu is set, we know that that code unit must appear in the
3927
subject for the match to succeed. If the first code unit is set, req_cu
3928
must be later in the subject; otherwise the test starts at the match
3929
point. This optimization can save a huge amount of backtracking in
3930
patterns with nested unlimited repeats that aren't going to match.
3931
Writing separate code for cased/caseless versions makes it go faster, as
3932
does using an autoincrement and backing off on a match. As in the case of
3933
the first code unit, using memchr() in the 8-bit library gives a big
3934
speed up. Unlike the first_cu check above, we do not need to call
3935
memchr() twice in the caseless case because we only need to check for the
3936
presence of the character in either case, not find the first occurrence.
3937
3938
The search can be skipped if the code unit was found later than the
3939
current starting point in a previous iteration of the bumpalong loop.
3940
3941
HOWEVER: when the subject string is very, very long, searching to its end
3942
can take a long time, and give bad performance on quite ordinary
3943
patterns. This showed up when somebody was matching something like
3944
/^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3945
sufficiently long, but it's worth searching a lot more for unanchored
3946
patterns. */
3947
3948
p = start_match + (has_first_cu? 1:0);
3949
if (has_req_cu && p > req_cu_ptr)
3950
{
3951
PCRE2_SIZE check_length = end_subject - start_match;
3952
3953
if (check_length < REQ_CU_MAX ||
3954
(!anchored && check_length < REQ_CU_MAX * 1000))
3955
{
3956
if (req_cu != req_cu2) /* Caseless */
3957
{
3958
#if PCRE2_CODE_UNIT_WIDTH != 8
3959
while (p < end_subject)
3960
{
3961
uint32_t pp = UCHAR21INCTEST(p);
3962
if (pp == req_cu || pp == req_cu2) { p--; break; }
3963
}
3964
#else /* 8-bit code units */
3965
PCRE2_SPTR pp = p;
3966
p = memchr(pp, req_cu, end_subject - pp);
3967
if (p == NULL)
3968
{
3969
p = memchr(pp, req_cu2, end_subject - pp);
3970
if (p == NULL) p = end_subject;
3971
}
3972
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3973
}
3974
3975
/* The caseful case */
3976
3977
else
3978
{
3979
#if PCRE2_CODE_UNIT_WIDTH != 8
3980
while (p < end_subject)
3981
{
3982
if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3983
}
3984
3985
#else /* 8-bit code units */
3986
p = memchr(p, req_cu, end_subject - p);
3987
if (p == NULL) p = end_subject;
3988
#endif
3989
}
3990
3991
/* If we can't find the required code unit, break the matching loop,
3992
forcing a match failure. */
3993
3994
if (p >= end_subject) break;
3995
3996
/* If we have found the required code unit, save the point where we
3997
found it, so that we don't search again next time round the loop if
3998
the start hasn't passed this code unit yet. */
3999
4000
req_cu_ptr = p;
4001
}
4002
}
4003
}
4004
}
4005
4006
/* ------------ End of start of match optimizations ------------ */
4007
4008
/* Give no match if we have passed the bumpalong limit. */
4009
4010
if (start_match > bumpalong_limit) break;
4011
4012
/* OK, now we can do the business */
4013
4014
mb->start_used_ptr = start_match;
4015
mb->last_used_ptr = start_match;
4016
mb->recursive = NULL;
4017
4018
rc = internal_dfa_match(
4019
mb, /* fixed match data */
4020
mb->start_code, /* this subexpression's code */
4021
start_match, /* where we currently are */
4022
start_offset, /* start offset in subject */
4023
match_data->ovector, /* offset vector */
4024
(uint32_t)match_data->oveccount * 2, /* actual size of same */
4025
workspace, /* workspace vector */
4026
(int)wscount, /* size of same */
4027
0, /* function recurse level */
4028
base_recursion_workspace); /* initial workspace for recursion */
4029
4030
/* Anything other than "no match" means we are done, always; otherwise, carry
4031
on only if not anchored. */
4032
4033
if (rc != PCRE2_ERROR_NOMATCH || anchored)
4034
{
4035
if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
4036
{
4037
match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
4038
match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
4039
}
4040
match_data->subject_length = length;
4041
match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
4042
match_data->rightchar = (PCRE2_SIZE)(mb->last_used_ptr - subject);
4043
match_data->startchar = (PCRE2_SIZE)(start_match - subject);
4044
match_data->rc = rc;
4045
4046
if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
4047
{
4048
length = CU2BYTES(length + was_zero_terminated);
4049
match_data->subject = match_data->memctl.malloc(length,
4050
match_data->memctl.memory_data);
4051
if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
4052
memcpy((void *)match_data->subject, subject, length);
4053
match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
4054
}
4055
else
4056
{
4057
if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
4058
}
4059
goto EXIT;
4060
}
4061
4062
/* Advance to the next subject character unless we are at the end of a line
4063
and firstline is set. */
4064
4065
if (firstline && IS_NEWLINE(start_match)) break;
4066
start_match++;
4067
#ifdef SUPPORT_UNICODE
4068
if (utf)
4069
{
4070
ACROSSCHAR(start_match < end_subject, start_match, start_match++);
4071
}
4072
#endif
4073
if (start_match > end_subject) break;
4074
4075
/* If we have just passed a CR and we are now at a LF, and the pattern does
4076
not contain any explicit matches for \r or \n, and the newline option is CRLF
4077
or ANY or ANYCRLF, advance the match position by one more character. */
4078
4079
if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
4080
start_match < end_subject &&
4081
UCHAR21TEST(start_match) == CHAR_NL &&
4082
(re->flags & PCRE2_HASCRORLF) == 0 &&
4083
(mb->nltype == NLTYPE_ANY ||
4084
mb->nltype == NLTYPE_ANYCRLF ||
4085
mb->nllen == 2))
4086
start_match++;
4087
4088
} /* "Bumpalong" loop */
4089
4090
NOMATCH_EXIT:
4091
rc = PCRE2_ERROR_NOMATCH;
4092
4093
EXIT:
4094
while (rws->next != NULL)
4095
{
4096
RWS_anchor *next = rws->next;
4097
rws->next = next->next;
4098
mb->memctl.free(next, mb->memctl.memory_data);
4099
}
4100
4101
return rc;
4102
}
4103
4104
/* These #undefs are here to enable unity builds with CMake. */
4105
4106
#undef NLBLOCK /* Block containing newline information */
4107
#undef PSSTART /* Field containing processed string start */
4108
#undef PSEND /* Field containing processed string end */
4109
4110
/* End of pcre2_dfa_match.c */
4111
4112