CoCalc -- pcre2_dfa

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_dfa_match.c
⁹⁸⁹⁸ views
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4

5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7

8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11

12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15

16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18

19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22

23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26

27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40

41

42
/* This module contains the external function pcre2_dfa_match(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */
46

47

48
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
the performance of his patterns greatly. I could not use it as it stood, as it
was not thread safe, and made assumptions about pattern sizes. Also, it caused
test 7 to loop, and test 9 to crash with a segfault.

The issue is the check for duplicate states, which is done by a simple linear
search up the state list. (Grep for "duplicate" below to find the code.) For
many patterns, there will never be many states active at one time, so a simple
linear search is fine. In patterns that have many active states, it might be a
bottleneck. The suggested code used an indexing scheme to remember which states
had previously been used for each character, and avoided the linear search when
it knew there was no chance of a duplicate. This was implemented when adding
states to the state lists.

I wrote some thread-safe, not-limited code to try something similar at the time
of checking for duplicates (instead of when adding states), using index vectors
on the stack. It did give a 13% improvement with one specially constructed
pattern for certain subject strings, but on other strings and on many of the
simpler patterns in the test suite it did worse. The major problem, I think,
was the extra time to initialize the index. This had to be done for each call
of internal_dfa_match(). (The supplied patch used a static vector, initialized
only once - I suspect this was the cause of the problems with the tests.)

Overall, I concluded that the gains in some cases did not outweigh the losses
in others, so I abandoned this code. */
73

74

75
#ifdef HAVE_CONFIG_H
76
#include "config.h"
77
#endif
78

79
/* NOTE(review): these are defined ahead of the pcre2_internal.h include,
presumably because macros in that header refer to them - confirm there. */

#define NLBLOCK mb             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */
82

83
#include "pcre2_internal.h"
84

85
/* Mask built by OR-ing together a fixed set of PCRE2 option bits. Each
continuation backslash must be the last character on its line, otherwise the
macro body is cut short. NOTE(review): the name suggests this is the set of
options accepted by pcre2_dfa_match() - confirm at the point of use. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART| \
   PCRE2_COPY_MATCHED_SUBJECT)
90

91

92
/*************************************************
93
*      Code parameters and static tables         *
94
*************************************************/
95

96
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
106

107

108
/* This table identifies those opcodes that are followed immediately by a
109
character that is to be tested in some way. This makes it possible to
110
centralize the loading of these characters. In the case of Type * etc, the
111
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112
small value. Non-zero values in the table are the offsets from the opcode where
113
the character is to be found. ***NOTE*** If the start of this table is
114
modified, the three tables that follow must also be modified. */
115

116
static const uint8_t coptable[] = {
117
  0,                             /* End                                    */
118
  0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119
  0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120
  0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121
  0, 0,                          /* \P, \p                                 */
122
  0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123
  0,                             /* \X                                     */
124
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
125
  1,                             /* Char                                   */
126
  1,                             /* Chari                                  */
127
  1,                             /* not                                    */
128
  1,                             /* noti                                   */
129
  /* Positive single-char repeats                                          */
130
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
131
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
132
  1+IMM2_SIZE,                   /* exact                                  */
133
  1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
134
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
135
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
136
  1+IMM2_SIZE,                   /* exact I                                */
137
  1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
138
  /* Negative single-char repeats - only for chars < 256                   */
139
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
140
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
141
  1+IMM2_SIZE,                   /* NOT exact                              */
142
  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
143
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
144
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
145
  1+IMM2_SIZE,                   /* NOT exact I                            */
146
  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
147
  /* Positive type repeats                                                 */
148
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
149
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
150
  1+IMM2_SIZE,                   /* Type exact                             */
151
  1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
152
  /* Character class & ref repeats                                         */
153
  0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
154
  0, 0,                          /* CRRANGE, CRMINRANGE                    */
155
  0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
156
  0,                             /* CLASS                                  */
157
  0,                             /* NCLASS                                 */
158
  0,                             /* XCLASS - variable length               */
159
  0,                             /* ECLASS - variable length               */
160
  0,                             /* REF                                    */
161
  0,                             /* REFI                                   */
162
  0,                             /* DNREF                                  */
163
  0,                             /* DNREFI                                 */
164
  0,                             /* RECURSE                                */
165
  0,                             /* CALLOUT                                */
166
  0,                             /* CALLOUT_STR                            */
167
  0,                             /* Alt                                    */
168
  0,                             /* Ket                                    */
169
  0,                             /* KetRmax                                */
170
  0,                             /* KetRmin                                */
171
  0,                             /* KetRpos                                */
172
  0, 0,                          /* Reverse, Vreverse                      */
173
  0,                             /* Assert                                 */
174
  0,                             /* Assert not                             */
175
  0,                             /* Assert behind                          */
176
  0,                             /* Assert behind not                      */
177
  0,                             /* NA assert                              */
178
  0,                             /* NA assert behind                       */
179
  0,                             /* Assert scan substring                  */
180
  0,                             /* ONCE                                   */
181
  0,                             /* SCRIPT_RUN                             */
182
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
183
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
184
  0, 0,                          /* CREF, DNCREF                           */
185
  0, 0,                          /* RREF, DNRREF                           */
186
  0, 0,                          /* FALSE, TRUE                            */
187
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
188
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
189
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
190
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
191
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
192
  0, 0, 0,                       /* CLOSE, SKIPZERO, DEFINE                */
193
  0, 0,                          /* \B and \b in UCP mode                  */
194
};
195

196
/* This table identifies those opcodes that inspect a character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached. ***NOTE*** If the start of this table is modified, the
two tables that follow must also be modified.

There is one entry per opcode, in opcode order: 1 means the opcode inspects a
character, 0 means it does not. */

static const uint8_t poptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  1, 1,                          /* \P, \p                                 */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  1,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1,                             /* Char                                   */
  1,                             /* Chari                                  */
  1,                             /* not                                    */
  1,                             /* noti                                   */
  /* Positive single-char repeats                                          */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1, 1,                       /* upto, minupto, exact                   */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1, 1, 1,                       /* upto I, minupto I, exact I             */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
  /* Negative single-char repeats - only for chars < 256                   */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1, 1, 1,                       /* NOT upto, minupto, exact               */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
  /* Positive type repeats                                                 */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1, 1, 1,                       /* Type upto, minupto, exact              */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
  /* Character class & ref repeats                                         */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1,                          /* CRRANGE, CRMINRANGE                    */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  1,                             /* CLASS                                  */
  1,                             /* NCLASS                                 */
  1,                             /* XCLASS - variable length               */
  1,                             /* ECLASS - variable length               */
  0,                             /* REF                                    */
  0,                             /* REFI                                   */
  0,                             /* DNREF                                  */
  0,                             /* DNREFI                                 */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* CALLOUT_STR                            */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* KetRpos                                */
  0, 0,                          /* Reverse, Vreverse                      */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0,                             /* NA assert                              */
  0,                             /* NA assert behind                       */
  0,                             /* Assert scan substring                  */
  0,                             /* ONCE                                   */
  0,                             /* SCRIPT_RUN                             */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0, 0,                          /* FALSE, TRUE                            */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
  0, 0, 0,                       /* CLOSE, SKIPZERO, DEFINE                */
  1, 1,                          /* \B and \b in UCP mode                  */
};
275

276
/* Compile-time check that these tables have the correct size. */
277
STATIC_ASSERT(sizeof(coptable) == OP_TABLE_LENGTH, coptable);
278
STATIC_ASSERT(sizeof(poptable) == OP_TABLE_LENGTH, poptable);
279

280
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
281
and \w */
282

283
static const uint8_t toptable1[] = {
284
  0, 0, 0, 0, 0, 0,
285
  ctype_digit, ctype_digit,
286
  ctype_space, ctype_space,
287
  ctype_word,  ctype_word,
288
  0, 0                            /* OP_ANY, OP_ALLANY */
289
};
290

291
static const uint8_t toptable2[] = {
292
  0, 0, 0, 0, 0, 0,
293
  ctype_digit, 0,
294
  ctype_space, 0,
295
  ctype_word,  0,
296
  1, 1                            /* OP_ANY, OP_ALLANY */
297
};
298

299

300
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode (-ve has meaning) */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock in the workspace vector. */

#define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
312

313

314
/* Before version 10.32 the recursive calls of internal_dfa_match() were passed
local working space and output vectors that were created on the stack. This has
caused issues for some patterns, especially in small-stack environments such as
Windows. A new scheme is now in use which sets up a vector on the stack, but if
this is too small, heap memory is used, up to the heap_limit. The main
parameters are all numbers of ints because the workspace is a vector of ints.

The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
defined in pcre2_internal.h so as to be available to pcre2test when it is
finding the minimum heap requirement for a match. */

#define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))  /* Ints per PCRE2_SIZE */

#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))  /* Stack vector */
#define RWS_RSIZE       1000                    /* Work size for recursion */
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)        /* Ovector for recursion */
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)           /* Ovector in other cases */
331

332
/* This structure is at the start of each workspace block. Blocks are chained
through the next field; size and free are counted in ints, not bytes. */

typedef struct RWS_anchor {
  struct RWS_anchor *next;
  uint32_t size;  /* Number of ints */
  uint32_t free;  /* Number of ints */
} RWS_anchor;

/* Number of ints taken up by the anchor itself at the front of a block. */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
341

342

343

344
/*************************************************
345
*               Process a callout                *
346
*************************************************/
347

348
/* This function is called to perform a callout.
349

350
Arguments:
351
  code              current code pointer
352
  offsets           points to current capture offsets
353
  current_subject   start of current subject match
354
  ptr               current position in subject
355
  mb                the match block
356
  extracode         extra code offset when called from condition
357
  lengthptr         where to return the callout length
358

359
Returns:            the return from the callout
360
*/
361

362
static int
363
do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
364
  PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
365
  PCRE2_SIZE *lengthptr)
366
{
367
pcre2_callout_block *cb = mb->cb;
368

369
*lengthptr = (code[extracode] == OP_CALLOUT)?
370
  (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
371
  (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
372

373
if (mb->callout == NULL) return 0;    /* No callout provided */
374

375
/* Fixed fields in the callout block are set once and for all at the start of
376
matching. */
377

378
cb->offset_vector    = offsets;
379
cb->start_match      = (PCRE2_SIZE)(current_subject - mb->start_subject);
380
cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
381
cb->pattern_position = GET(code, 1 + extracode);
382
cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
383

384
if (code[extracode] == OP_CALLOUT)
385
  {
386
  cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
387
  cb->callout_string_offset = 0;
388
  cb->callout_string = NULL;
389
  cb->callout_string_length = 0;
390
  }
391
else
392
  {
393
  cb->callout_number = 0;
394
  cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
395
  cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
396
  cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
397
  }
398

399
return (mb->callout)(cb, mb->callout_data);
400
}
401

402

403

404
/*************************************************
405
*         Expand local workspace memory          *
406
*************************************************/
407

408
/* This function is called when internal_dfa_match() is about to be called
409
recursively and there is insufficient working space left in the current
410
workspace block. If there's an existing next block, use it; otherwise get a new
411
block unless the heap limit is reached.
412

413
Arguments:
414
  rwsptr     pointer to block pointer (updated)
415
  ovecsize   space needed for an ovector
416
  mb         the match block
417

418
Returns:     0 rwsptr has been updated
419
            !0 an error code
420
*/
421

422
static int
423
more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
424
{
425
RWS_anchor *rws = *rwsptr;
426
RWS_anchor *new;
427

428
if (rws->next != NULL)
429
  {
430
  new = rws->next;
431
  }
432

433
/* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
434
mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
435
overflow. */
436

437
else
438
  {
439
  uint32_t newsize = (rws->size >= UINT32_MAX/(sizeof(int)*2))? UINT32_MAX/sizeof(int) : rws->size * 2;
440
  uint32_t newsizeK = newsize/(1024/sizeof(int));
441

442
  if (newsizeK + mb->heap_used > mb->heap_limit)
443
    newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
444
  newsize = newsizeK*(1024/sizeof(int));
445

446
  if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
447
    return PCRE2_ERROR_HEAPLIMIT;
448
  new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
449
  if (new == NULL) return PCRE2_ERROR_NOMEMORY;
450
  mb->heap_used += newsizeK;
451
  new->next = NULL;
452
  new->size = newsize;
453
  rws->next = new;
454
  }
455

456
new->free = new->size - RWS_ANCHOR_SIZE;
457
*rwsptr = new;
458
return 0;
459
}
460

461

462

463
/*************************************************
*     Match a Regular Expression - DFA engine    *
*************************************************/

/* This internal function applies a compiled pattern to a subject string,
starting at a given point, using a DFA engine. This function is called from the
external one, possibly multiple times if the pattern is not anchored. The
function calls itself recursively for some kinds of subpattern.

Arguments:
  mb                the match_data block with fixed information
  this_start_code   the opening bracket of this subexpression's code
  current_subject   where we currently are in the subject string
  start_offset      start offset in the subject string
  offsets           vector to contain the matching string offsets
  offsetcount       size of same
  workspace         vector of workspace
  wscount           size of same
  rlevel            function call recursion level

Returns:            > 0 => number of match offset pairs placed in offsets
                    = 0 => offsets overflowed; longest matches are present
                     -1 => failed to match
                   < -1 => some kind of unexpected problem

The following macros are used for adding states to the two state vectors (one
for the current character, one for the following character). */
490

491
/* Append a state (offset x, count y) to the active list, or return
PCRE2_ERROR_DFA_WSSIZE from the enclosing function when the list is full.
The expansion ends in a dangling "else return ..."; the invoking statement
supplies the terminating semicolon, so an unbraced use inside an outer
if/else is unsafe. */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE
499

500
/* Append a state (offset x, count y, extra data z) to the active list, or
return PCRE2_ERROR_DFA_WSSIZE from the enclosing function when the list is
full. The invoking statement supplies the terminating semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE
509

510
/* Append a state (offset x, count y) to the new-state list, or return
PCRE2_ERROR_DFA_WSSIZE from the enclosing function when the list is full.
The invoking statement supplies the terminating semicolon. */

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE
518

519
/* Append a state (offset x, count y, extra data z) to the new-state list, or
return PCRE2_ERROR_DFA_WSSIZE from the enclosing function when the list is
full. The invoking statement supplies the terminating semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE
528

529
/* And now, here is the code */
530

531
static int
532
internal_dfa_match(
533
  dfa_match_block *mb,
534
  PCRE2_SPTR this_start_code,
535
  PCRE2_SPTR current_subject,
536
  PCRE2_SIZE start_offset,
537
  PCRE2_SIZE *offsets,
538
  uint32_t offsetcount,
539
  int *workspace,
540
  int wscount,
541
  uint32_t rlevel,
542
  int *RWS)
543
{
544
stateblock *active_states, *new_states, *temp_states;
545
stateblock *next_active_state, *next_new_state;
546
const uint8_t *ctypes, *lcc, *fcc;
547
PCRE2_SPTR ptr;
548
PCRE2_SPTR end_code;
549
dfa_recursion_info new_recursive;
550
int active_count, new_count, match_count;
551

552
/* Some fields in the mb block are frequently referenced, so we load them into
553
independent variables in the hope that this will perform better. */
554

555
PCRE2_SPTR start_subject = mb->start_subject;
556
PCRE2_SPTR end_subject = mb->end_subject;
557
PCRE2_SPTR start_code = mb->start_code;
558

559
#ifdef SUPPORT_UNICODE
560
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
561
BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
562
#else
563
BOOL utf = FALSE;
564
#endif
565

566
BOOL reset_could_continue = FALSE;
567

568
if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
569
if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
570
offsetcount &= (uint32_t)(-2);  /* Round down */
571

572
wscount -= 2;
573
wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
574
          (2 * INTS_PER_STATEBLOCK);
575

576
ctypes = mb->tables + ctypes_offset;
577
lcc = mb->tables + lcc_offset;
578
fcc = mb->tables + fcc_offset;
579

580
match_count = PCRE2_ERROR_NOMATCH;   /* A negative number */
581

582
active_states = (stateblock *)(workspace + 2);
583
next_new_state = new_states = active_states + wscount;
584
new_count = 0;
585

586
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
587
the alternative states onto the list, and find out where the end is. This
588
makes it possible to use this function recursively, when we want to stop at a
589
matching internal ket rather than at the end.
590

591
If we are dealing with a backward assertion we have to find out the maximum
592
amount to move back, and set up each alternative appropriately. */
593

594
if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
595
  {
596
  size_t max_back = 0;
597
  size_t gone_back;
598

599
  end_code = this_start_code;
600
  do
601
    {
602
    size_t back = (size_t)GET2(end_code, 2+LINK_SIZE);
603
    if (back > max_back) max_back = back;
604
    end_code += GET(end_code, 1);
605
    }
606
  while (*end_code == OP_ALT);
607

608
  /* If we can't go back the amount required for the longest lookbehind
609
  pattern, go back as far as we can; some alternatives may still be viable. */
610

611
#ifdef SUPPORT_UNICODE
612
  /* In character mode we have to step back character by character */
613

614
  if (utf)
615
    {
616
    for (gone_back = 0; gone_back < max_back; gone_back++)
617
      {
618
      if (current_subject <= start_subject) break;
619
      current_subject--;
620
      ACROSSCHAR(current_subject > start_subject, current_subject,
621
        current_subject--);
622
      }
623
    }
624
  else
625
#endif
626

627
  /* In byte-mode we can do this quickly. */
628

629
    {
630
    size_t current_offset = (size_t)(current_subject - start_subject);
631
    gone_back = (current_offset < max_back)? current_offset : max_back;
632
    current_subject -= gone_back;
633
    }
634

635
  /* Save the earliest consulted character */
636

637
  if (current_subject < mb->start_used_ptr)
638
    mb->start_used_ptr = current_subject;
639

640
  /* Now we can process the individual branches. There will be an OP_REVERSE at
641
  the start of each branch, except when the length of the branch is zero. */
642

643
  end_code = this_start_code;
644
  do
645
    {
646
    uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + IMM2_SIZE : 0;
647
    size_t back = (revlen == 0)? 0 : (size_t)GET2(end_code, 2+LINK_SIZE);
648
    if (back <= gone_back)
649
      {
650
      int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
651
      ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
652
      }
653
    end_code += GET(end_code, 1);
654
    }
655
  while (*end_code == OP_ALT);
656
 }
657

658
/* This is the code for a "normal" subpattern (not a backward assertion). The
659
start of a whole pattern is always one of these. If we are at the top level,
660
we may be asked to restart matching from the same point that we reached for a
661
previous partial match. We still have to scan through the top-level branches to
662
find the end state. */
663

664
else
665
  {
666
  end_code = this_start_code;
667

668
  /* Restarting */
669

670
  if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
671
    {
672
    do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
673
    new_count = workspace[1];
674
    if (!workspace[0])
675
      memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
676
    }
677

678
  /* Not restarting */
679

680
  else
681
    {
682
    int length = 1 + LINK_SIZE +
683
      ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
684
        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
685
        ? IMM2_SIZE:0);
686
    do
687
      {
688
      ADD_NEW((int)(end_code - start_code + length), 0);
689
      end_code += GET(end_code, 1);
690
      length = 1 + LINK_SIZE;
691
      }
692
    while (*end_code == OP_ALT);
693
    }
694
  }
695

696
workspace[0] = 0;    /* Bit indicating which vector is current */
697

698
/* Loop for scanning the subject */
699

700
ptr = current_subject;
701
for (;;)
702
  {
703
  int i, j;
704
  int clen, dlen;
705
  uint32_t c, d;
706
  BOOL partial_newline = FALSE;
707
  BOOL could_continue = reset_could_continue;
708
  reset_could_continue = FALSE;
709

710
  if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
711

712
  /* Make the new state list into the active state list and empty the
713
  new state list. */
714

715
  temp_states = active_states;
716
  active_states = new_states;
717
  new_states = temp_states;
718
  active_count = new_count;
719
  new_count = 0;
720

721
  workspace[0] ^= 1;              /* Remember for the restarting feature */
722
  workspace[1] = active_count;
723

724
  /* Set the pointers for adding new states */
725

726
  next_active_state = active_states + active_count;
727
  next_new_state = new_states;
728

729
  /* Load the current character from the subject outside the loop, as many
730
  different states may want to look at it, and we assume that at least one
731
  will. */
732

733
  if (ptr < end_subject)
734
    {
735
    clen = 1;        /* Number of data items in the character */
736
#ifdef SUPPORT_UNICODE
737
    GETCHARLENTEST(c, ptr, clen);
738
#else
739
    c = *ptr;
740
#endif  /* SUPPORT_UNICODE */
741
    }
742
  else
743
    {
744
    clen = 0;        /* This indicates the end of the subject */
745
    c = NOTACHAR;    /* This value should never actually be used */
746
    }
747

748
  /* Scan up the active states and act on each one. The result of an action
749
  may be to add more states to the currently active list (e.g. on hitting a
750
  parenthesis) or it may be to put states on the new list, for considering
751
  when we move the character pointer on. */
752

753
  for (i = 0; i < active_count; i++)
754
    {
755
    stateblock *current_state = active_states + i;
756
    BOOL caseless = FALSE;
757
    PCRE2_SPTR code;
758
    uint32_t codevalue;
759
    int state_offset = current_state->offset;
760
    int rrc;
761
    int count;
762

763
    /* A negative offset is a special case meaning "hold off going to this
764
    (negated) state until the number of characters in the data field have
765
    been skipped". If the could_continue flag was passed over from a previous
766
    state, arrange for it to be passed on. */
767

768
    if (state_offset < 0)
769
      {
770
      if (current_state->data > 0)
771
        {
772
        ADD_NEW_DATA(state_offset, current_state->count,
773
          current_state->data - 1);
774
        if (could_continue) reset_could_continue = TRUE;
775
        continue;
776
        }
777
      else
778
        {
779
        current_state->offset = state_offset = -state_offset;
780
        }
781
      }
782

783
    /* Check for a duplicate state with the same count, and skip if found.
784
    See the note at the head of this module about the possibility of improving
785
    performance here. */
786

787
    for (j = 0; j < i; j++)
788
      {
789
      if (active_states[j].offset == state_offset &&
790
          active_states[j].count == current_state->count)
791
        goto NEXT_ACTIVE_STATE;
792
      }
793

794
    /* The state offset is the offset to the opcode */
795

796
    code = start_code + state_offset;
797
    codevalue = *code;
798

799
    /* If this opcode inspects a character, but we are at the end of the
800
    subject, remember the fact for use when testing for a partial match. */
801

802
    if (clen == 0 && poptable[codevalue] != 0)
803
      could_continue = TRUE;
804

805
    /* If this opcode is followed by an inline character, load it. It is
806
    tempting to test for the presence of a subject character here, but that
807
    is wrong, because sometimes zero repetitions of the subject are
808
    permitted.
809

810
    We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
811
    argument that is not a data character - but is always one byte long because
812
    the values are small. We have to take special action to deal with  \P, \p,
813
    \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
814
    these ones to new opcodes. */
815

816
    if (coptable[codevalue] > 0)
817
      {
818
      dlen = 1;
819
#ifdef SUPPORT_UNICODE
820
      if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
821
#endif  /* SUPPORT_UNICODE */
822
      d = code[coptable[codevalue]];
823
      if (codevalue >= OP_TYPESTAR)
824
        {
825
        switch(d)
826
          {
827
          case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
828
          case OP_NOTPROP:
829
          case OP_PROP: codevalue += OP_PROP_EXTRA; break;
830
          case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
831
          case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
832
          case OP_NOT_HSPACE:
833
          case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
834
          case OP_NOT_VSPACE:
835
          case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
836
          default: break;
837
          }
838
        }
839
      }
840
    else
841
      {
842
      dlen = 0;         /* Not strictly necessary, but compilers moan */
843
      d = NOTACHAR;     /* if these variables are not set. */
844
      }
845

846

847
    /* Now process the individual opcodes */
848

849
    switch (codevalue)
850
      {
851
/* ========================================================================== */
852
      /* Reached a closing bracket. If not at the end of the pattern, carry
853
      on with the next opcode. For repeating opcodes, also add the repeat
854
      state. Note that KETRPOS will always be encountered at the end of the
855
      subpattern, because the possessive subpattern repeats are always handled
856
      using recursive calls. Thus, it never adds any new states.
857

858
      At the end of the (sub)pattern, unless we have an empty string and
859
      PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
860
      start of the subject, save the match data, shifting up all previous
861
      matches so we always have the longest first. */
862

863
      case OP_KET:
864
      case OP_KETRMIN:
865
      case OP_KETRMAX:
866
      case OP_KETRPOS:
867
      if (code != end_code)
868
        {
869
        ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
870
        if (codevalue != OP_KET)
871
          {
872
          ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
873
          }
874
        }
875
      else
876
        {
877
        if (ptr > current_subject ||
878
            ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
879
              ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
880
                current_subject > start_subject + mb->start_offset)))
881
          {
882
          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
883
            else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
884
              match_count = 0;
885
          count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
886
          if (count > 0) (void)memmove(offsets + 2, offsets,
887
            (size_t)count * sizeof(PCRE2_SIZE));
888
          if (offsetcount >= 2)
889
            {
890
            offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
891
            offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
892
            }
893
          if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
894
          }
895
        }
896
      break;
897

898
/* ========================================================================== */
899
      /* These opcodes add to the current list of states without looking
900
      at the current character. */
901

902
      /*-----------------------------------------------------------------*/
903
      case OP_ALT:
904
      do { code += GET(code, 1); } while (*code == OP_ALT);
905
      ADD_ACTIVE((int)(code - start_code), 0);
906
      break;
907

908
      /*-----------------------------------------------------------------*/
909
      case OP_BRA:
910
      case OP_SBRA:
911
      do
912
        {
913
        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
914
        code += GET(code, 1);
915
        }
916
      while (*code == OP_ALT);
917
      break;
918

919
      /*-----------------------------------------------------------------*/
920
      case OP_CBRA:
921
      case OP_SCBRA:
922
      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
923
      code += GET(code, 1);
924
      while (*code == OP_ALT)
925
        {
926
        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
927
        code += GET(code, 1);
928
        }
929
      break;
930

931
      /*-----------------------------------------------------------------*/
932
      case OP_BRAZERO:
933
      case OP_BRAMINZERO:
934
      ADD_ACTIVE(state_offset + 1, 0);
935
      code += 1 + GET(code, 2);
936
      while (*code == OP_ALT) code += GET(code, 1);
937
      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
938
      break;
939

940
      /*-----------------------------------------------------------------*/
941
      case OP_SKIPZERO:
942
      code += 1 + GET(code, 2);
943
      while (*code == OP_ALT) code += GET(code, 1);
944
      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
945
      break;
946

947
      /*-----------------------------------------------------------------*/
948
      case OP_CIRC:
949
      if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
950
        { ADD_ACTIVE(state_offset + 1, 0); }
951
      break;
952

953
      /*-----------------------------------------------------------------*/
954
      case OP_CIRCM:
955
      if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
956
          ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
957
            && WAS_NEWLINE(ptr)))
958
        { ADD_ACTIVE(state_offset + 1, 0); }
959
      break;
960

961
      /*-----------------------------------------------------------------*/
962
      case OP_EOD:
963
      if (ptr >= end_subject)
964
        {
965
        if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
966
          return PCRE2_ERROR_PARTIAL;
967
        else { ADD_ACTIVE(state_offset + 1, 0); }
968
        }
969
      break;
970

971
      /*-----------------------------------------------------------------*/
972
      case OP_SOD:
973
      if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
974
      break;
975

976
      /*-----------------------------------------------------------------*/
977
      case OP_SOM:
978
      if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
979
      break;
980

981

982
/* ========================================================================== */
983
      /* These opcodes inspect the next subject character, and sometimes
984
      the previous one as well, but do not have an argument. The variable
985
      clen contains the length of the current character and is zero if we are
986
      at the end of the subject. */
987

988
      /*-----------------------------------------------------------------*/
989
      case OP_ANY:
990
      if (clen > 0 && !IS_NEWLINE(ptr))
991
        {
992
        if (ptr + 1 >= mb->end_subject &&
993
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
994
            NLBLOCK->nltype == NLTYPE_FIXED &&
995
            NLBLOCK->nllen == 2 &&
996
            c == NLBLOCK->nl[0])
997
          {
998
          could_continue = partial_newline = TRUE;
999
          }
1000
        else
1001
          {
1002
          ADD_NEW(state_offset + 1, 0);
1003
          }
1004
        }
1005
      break;
1006

1007
      /*-----------------------------------------------------------------*/
1008
      case OP_ALLANY:
1009
      if (clen > 0)
1010
        { ADD_NEW(state_offset + 1, 0); }
1011
      break;
1012

1013
      /*-----------------------------------------------------------------*/
1014
      case OP_EODN:
1015
      if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1016
        {
1017
        if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1018
          return PCRE2_ERROR_PARTIAL;
1019
        ADD_ACTIVE(state_offset + 1, 0);
1020
        }
1021
      break;
1022

1023
      /*-----------------------------------------------------------------*/
1024
      case OP_DOLL:
1025
      if ((mb->moptions & PCRE2_NOTEOL) == 0)
1026
        {
1027
        if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1028
          could_continue = TRUE;
1029
        else if (clen == 0 ||
1030
            ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1031
               (ptr == end_subject - mb->nllen)
1032
            ))
1033
          { ADD_ACTIVE(state_offset + 1, 0); }
1034
        else if (ptr + 1 >= mb->end_subject &&
1035
                 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1036
                 NLBLOCK->nltype == NLTYPE_FIXED &&
1037
                 NLBLOCK->nllen == 2 &&
1038
                 c == NLBLOCK->nl[0])
1039
          {
1040
          if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1041
            {
1042
            reset_could_continue = TRUE;
1043
            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1044
            }
1045
          else could_continue = partial_newline = TRUE;
1046
          }
1047
        }
1048
      break;
1049

1050
      /*-----------------------------------------------------------------*/
1051
      case OP_DOLLM:
1052
      if ((mb->moptions & PCRE2_NOTEOL) == 0)
1053
        {
1054
        if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1055
          could_continue = TRUE;
1056
        else if (clen == 0 ||
1057
            ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1058
          { ADD_ACTIVE(state_offset + 1, 0); }
1059
        else if (ptr + 1 >= mb->end_subject &&
1060
                 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1061
                 NLBLOCK->nltype == NLTYPE_FIXED &&
1062
                 NLBLOCK->nllen == 2 &&
1063
                 c == NLBLOCK->nl[0])
1064
          {
1065
          if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1066
            {
1067
            reset_could_continue = TRUE;
1068
            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1069
            }
1070
          else could_continue = partial_newline = TRUE;
1071
          }
1072
        }
1073
      else if (IS_NEWLINE(ptr))
1074
        { ADD_ACTIVE(state_offset + 1, 0); }
1075
      break;
1076

1077
      /*-----------------------------------------------------------------*/
1078

1079
      case OP_DIGIT:
1080
      case OP_WHITESPACE:
1081
      case OP_WORDCHAR:
1082
      if (clen > 0 && c < 256 &&
1083
            ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1084
        { ADD_NEW(state_offset + 1, 0); }
1085
      break;
1086

1087
      /*-----------------------------------------------------------------*/
1088
      case OP_NOT_DIGIT:
1089
      case OP_NOT_WHITESPACE:
1090
      case OP_NOT_WORDCHAR:
1091
      if (clen > 0 && (c >= 256 ||
1092
            ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1093
        { ADD_NEW(state_offset + 1, 0); }
1094
      break;
1095

1096
      /*-----------------------------------------------------------------*/
1097
      case OP_WORD_BOUNDARY:
1098
      case OP_NOT_WORD_BOUNDARY:
1099
      case OP_NOT_UCP_WORD_BOUNDARY:
1100
      case OP_UCP_WORD_BOUNDARY:
1101
        {
1102
        int left_word, right_word;
1103

1104
        if (ptr > start_subject)
1105
          {
1106
          PCRE2_SPTR temp = ptr - 1;
1107
          if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1108
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1109
          if (utf) { BACKCHAR(temp); }
1110
#endif
1111
          GETCHARTEST(d, temp);
1112
#ifdef SUPPORT_UNICODE
1113
          if (codevalue == OP_UCP_WORD_BOUNDARY ||
1114
              codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1115
            {
1116
            int chartype = UCD_CHARTYPE(d);
1117
            int category = PRIV(ucp_gentype)[chartype];
1118
            left_word = (category == ucp_L || category == ucp_N ||
1119
              chartype == ucp_Mn || chartype == ucp_Pc);
1120
            }
1121
          else
1122
#endif
1123
          left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1124
          }
1125
        else left_word = FALSE;
1126

1127
        if (clen > 0)
1128
          {
1129
          if (ptr >= mb->last_used_ptr)
1130
            {
1131
            PCRE2_SPTR temp = ptr + 1;
1132
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1133
            if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1134
#endif
1135
            mb->last_used_ptr = temp;
1136
            }
1137
#ifdef SUPPORT_UNICODE
1138
          if (codevalue == OP_UCP_WORD_BOUNDARY ||
1139
              codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1140
            {
1141
            int chartype = UCD_CHARTYPE(c);
1142
            int category = PRIV(ucp_gentype)[chartype];
1143
            right_word = (category == ucp_L || category == ucp_N ||
1144
              chartype == ucp_Mn || chartype == ucp_Pc);
1145
            }
1146
          else
1147
#endif
1148
          right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1149
          }
1150
        else right_word = FALSE;
1151

1152
        if ((left_word == right_word) ==
1153
            (codevalue == OP_NOT_WORD_BOUNDARY ||
1154
             codevalue == OP_NOT_UCP_WORD_BOUNDARY))
1155
          { ADD_ACTIVE(state_offset + 1, 0); }
1156
        }
1157
      break;
1158

1159

1160
      /*-----------------------------------------------------------------*/
1161
      /* Check the next character by Unicode property. We will get here only
1162
      if the support is in the binary; otherwise a compile-time error occurs.
1163
      */
1164

1165
#ifdef SUPPORT_UNICODE
1166
      case OP_PROP:
1167
      case OP_NOTPROP:
1168
      if (clen > 0)
1169
        {
1170
        BOOL OK;
1171
        int chartype;
1172
        const uint32_t *cp;
1173
        const ucd_record * prop = GET_UCD(c);
1174
        switch(code[1])
1175
          {
1176
          case PT_LAMP:
1177
          chartype = prop->chartype;
1178
          OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1179
               chartype == ucp_Lt;
1180
          break;
1181

1182
          case PT_GC:
1183
          OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1184
          break;
1185

1186
          case PT_PC:
1187
          OK = prop->chartype == code[2];
1188
          break;
1189

1190
          case PT_SC:
1191
          OK = prop->script == code[2];
1192
          break;
1193

1194
          case PT_SCX:
1195
          OK = (prop->script == code[2] ||
1196
                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0);
1197
          break;
1198

1199
          /* These are specials for combination cases. */
1200

1201
          case PT_ALNUM:
1202
          chartype = prop->chartype;
1203
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1204
               PRIV(ucp_gentype)[chartype] == ucp_N;
1205
          break;
1206

1207
          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1208
          which means that Perl space and POSIX space are now identical. PCRE
1209
          was changed at release 8.34. */
1210

1211
          case PT_SPACE:    /* Perl space */
1212
          case PT_PXSPACE:  /* POSIX space */
1213
          switch(c)
1214
            {
1215
            HSPACE_CASES:
1216
            VSPACE_CASES:
1217
            OK = TRUE;
1218
            break;
1219

1220
            default:
1221
            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1222
            break;
1223
            }
1224
          break;
1225

1226
          case PT_WORD:
1227
          chartype = prop->chartype;
1228
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1229
               PRIV(ucp_gentype)[chartype] == ucp_N ||
1230
               chartype == ucp_Mn || chartype == ucp_Pc;
1231
          break;
1232

1233
          case PT_CLIST:
1234
#if PCRE2_CODE_UNIT_WIDTH == 32
1235
          if (c > MAX_UTF_CODE_POINT)
1236
            {
1237
            OK = FALSE;
1238
            break;
1239
            }
1240
#endif
1241
          cp = PRIV(ucd_caseless_sets) + code[2];
1242
          for (;;)
1243
            {
1244
            if (c < *cp) { OK = FALSE; break; }
1245
            if (c == *cp++) { OK = TRUE; break; }
1246
            }
1247
          break;
1248

1249
          case PT_UCNC:
1250
          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1251
               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1252
               c >= 0xe000;
1253
          break;
1254

1255
          case PT_BIDICL:
1256
          OK = UCD_BIDICLASS(c) == code[2];
1257
          break;
1258

1259
          case PT_BOOL:
1260
          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1261
            UCD_BPROPS_PROP(prop), code[2]) != 0;
1262
          break;
1263

1264
          /* Should never occur, but keep compilers from grumbling. */
1265

1266
          default:
1267
          OK = codevalue != OP_PROP;
1268
          break;
1269
          }
1270

1271
        if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1272
        }
1273
      break;
1274
#endif
1275

1276

1277

1278
/* ========================================================================== */
1279
      /* These opcodes likewise inspect the subject character, but have an
1280
      argument that is not a data character. It is one of these opcodes:
1281
      OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1282
      OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1283

1284
      case OP_TYPEPLUS:
1285
      case OP_TYPEMINPLUS:
1286
      case OP_TYPEPOSPLUS:
1287
      count = current_state->count;  /* Already matched */
1288
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1289
      if (clen > 0)
1290
        {
1291
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1292
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1293
            NLBLOCK->nltype == NLTYPE_FIXED &&
1294
            NLBLOCK->nllen == 2 &&
1295
            c == NLBLOCK->nl[0])
1296
          {
1297
          could_continue = partial_newline = TRUE;
1298
          }
1299
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1300
            (c < 256 &&
1301
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1302
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1303
          {
1304
          if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1305
            {
1306
            active_count--;            /* Remove non-match possibility */
1307
            next_active_state--;
1308
            }
1309
          count++;
1310
          ADD_NEW(state_offset, count);
1311
          }
1312
        }
1313
      break;
1314

1315
      /*-----------------------------------------------------------------*/
1316
      case OP_TYPEQUERY:
1317
      case OP_TYPEMINQUERY:
1318
      case OP_TYPEPOSQUERY:
1319
      ADD_ACTIVE(state_offset + 2, 0);
1320
      if (clen > 0)
1321
        {
1322
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1323
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1324
            NLBLOCK->nltype == NLTYPE_FIXED &&
1325
            NLBLOCK->nllen == 2 &&
1326
            c == NLBLOCK->nl[0])
1327
          {
1328
          could_continue = partial_newline = TRUE;
1329
          }
1330
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1331
            (c < 256 &&
1332
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1333
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1334
          {
1335
          if (codevalue == OP_TYPEPOSQUERY)
1336
            {
1337
            active_count--;            /* Remove non-match possibility */
1338
            next_active_state--;
1339
            }
1340
          ADD_NEW(state_offset + 2, 0);
1341
          }
1342
        }
1343
      break;
1344

1345
      /*-----------------------------------------------------------------*/
1346
      case OP_TYPESTAR:
1347
      case OP_TYPEMINSTAR:
1348
      case OP_TYPEPOSSTAR:
1349
      ADD_ACTIVE(state_offset + 2, 0);
1350
      if (clen > 0)
1351
        {
1352
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1353
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1354
            NLBLOCK->nltype == NLTYPE_FIXED &&
1355
            NLBLOCK->nllen == 2 &&
1356
            c == NLBLOCK->nl[0])
1357
          {
1358
          could_continue = partial_newline = TRUE;
1359
          }
1360
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1361
            (c < 256 &&
1362
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1363
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1364
          {
1365
          if (codevalue == OP_TYPEPOSSTAR)
1366
            {
1367
            active_count--;            /* Remove non-match possibility */
1368
            next_active_state--;
1369
            }
1370
          ADD_NEW(state_offset, 0);
1371
          }
1372
        }
1373
      break;
1374

1375
      /*-----------------------------------------------------------------*/
1376
      case OP_TYPEEXACT:
1377
      count = current_state->count;  /* Number already matched */
1378
      if (clen > 0)
1379
        {
1380
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1381
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1382
            NLBLOCK->nltype == NLTYPE_FIXED &&
1383
            NLBLOCK->nllen == 2 &&
1384
            c == NLBLOCK->nl[0])
1385
          {
1386
          could_continue = partial_newline = TRUE;
1387
          }
1388
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1389
            (c < 256 &&
1390
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1391
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1392
          {
1393
          if (++count >= (int)GET2(code, 1))
1394
            { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1395
          else
1396
            { ADD_NEW(state_offset, count); }
1397
          }
1398
        }
1399
      break;
1400

1401
      /*-----------------------------------------------------------------*/
1402
      case OP_TYPEUPTO:
1403
      case OP_TYPEMINUPTO:
1404
      case OP_TYPEPOSUPTO:
1405
      ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1406
      count = current_state->count;  /* Number already matched */
1407
      if (clen > 0)
1408
        {
1409
        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1410
            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1411
            NLBLOCK->nltype == NLTYPE_FIXED &&
1412
            NLBLOCK->nllen == 2 &&
1413
            c == NLBLOCK->nl[0])
1414
          {
1415
          could_continue = partial_newline = TRUE;
1416
          }
1417
        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1418
            (c < 256 &&
1419
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1420
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1421
          {
1422
          if (codevalue == OP_TYPEPOSUPTO)
1423
            {
1424
            active_count--;           /* Remove non-match possibility */
1425
            next_active_state--;
1426
            }
1427
          if (++count >= (int)GET2(code, 1))
1428
            { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1429
          else
1430
            { ADD_NEW(state_offset, count); }
1431
          }
1432
        }
1433
      break;
1434

1435
/* ========================================================================== */
1436
      /* These are virtual opcodes that are used when something like
1437
      OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1438
      argument. It keeps the code above fast for the other cases. The argument
1439
      is in the d variable. */
1440

1441
#ifdef SUPPORT_UNICODE
1442
      case OP_PROP_EXTRA + OP_TYPEPLUS:
1443
      case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1444
      case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1445
      count = current_state->count;           /* Already matched */
1446
      if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1447
      if (clen > 0)
1448
        {
1449
        BOOL OK;
1450
        int chartype;
1451
        const uint32_t *cp;
1452
        const ucd_record * prop = GET_UCD(c);
1453
        switch(code[2])
1454
          {
1455
          case PT_LAMP:
1456
          chartype = prop->chartype;
1457
          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1458
          break;
1459

1460
          case PT_GC:
1461
          OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1462
          break;
1463

1464
          case PT_PC:
1465
          OK = prop->chartype == code[3];
1466
          break;
1467

1468
          case PT_SC:
1469
          OK = prop->script == code[3];
1470
          break;
1471

1472
          case PT_SCX:
1473
          OK = (prop->script == code[3] ||
1474
                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1475
          break;
1476

1477
          /* These are specials for combination cases. */
1478

1479
          case PT_ALNUM:
1480
          chartype = prop->chartype;
1481
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1482
               PRIV(ucp_gentype)[chartype] == ucp_N;
1483
          break;
1484

1485
          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1486
          which means that Perl space and POSIX space are now identical. PCRE
1487
          was changed at release 8.34. */
1488

1489
          case PT_SPACE:    /* Perl space */
1490
          case PT_PXSPACE:  /* POSIX space */
1491
          switch(c)
1492
            {
1493
            HSPACE_CASES:
1494
            VSPACE_CASES:
1495
            OK = TRUE;
1496
            break;
1497

1498
            default:
1499
            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1500
            break;
1501
            }
1502
          break;
1503

1504
          case PT_WORD:
1505
          chartype = prop->chartype;
1506
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1507
               PRIV(ucp_gentype)[chartype] == ucp_N ||
1508
               chartype == ucp_Mn || chartype == ucp_Pc;
1509
          break;
1510

1511
          case PT_CLIST:
1512
#if PCRE2_CODE_UNIT_WIDTH == 32
1513
          if (c > MAX_UTF_CODE_POINT)
1514
            {
1515
            OK = FALSE;
1516
            break;
1517
            }
1518
#endif
1519
          cp = PRIV(ucd_caseless_sets) + code[3];
1520
          for (;;)
1521
            {
1522
            if (c < *cp) { OK = FALSE; break; }
1523
            if (c == *cp++) { OK = TRUE; break; }
1524
            }
1525
          break;
1526

1527
          case PT_UCNC:
1528
          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1529
               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1530
               c >= 0xe000;
1531
          break;
1532

1533
          case PT_BIDICL:
1534
          OK = UCD_BIDICLASS(c) == code[3];
1535
          break;
1536

1537
          case PT_BOOL:
1538
          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1539
            UCD_BPROPS_PROP(prop), code[3]) != 0;
1540
          break;
1541

1542
          /* Should never occur, but keep compilers from grumbling. */
1543

1544
          default:
1545
          OK = codevalue != OP_PROP;
1546
          break;
1547
          }
1548

1549
        if (OK == (d == OP_PROP))
1550
          {
1551
          if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1552
            {
1553
            active_count--;           /* Remove non-match possibility */
1554
            next_active_state--;
1555
            }
1556
          count++;
1557
          ADD_NEW(state_offset, count);
1558
          }
1559
        }
1560
      break;
1561

1562
      /*-----------------------------------------------------------------*/
1563
      case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1564
      case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1565
      case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1566
      count = current_state->count;  /* Already matched */
1567
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1568
      if (clen > 0)
1569
        {
1570
        int ncount = 0;
1571
        if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1572
          {
1573
          active_count--;           /* Remove non-match possibility */
1574
          next_active_state--;
1575
          }
1576
        (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1577
          &ncount);
1578
        count++;
1579
        ADD_NEW_DATA(-state_offset, count, ncount);
1580
        }
1581
      break;
1582
#endif
1583

1584
      /*-----------------------------------------------------------------*/
1585
      case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1586
      case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1587
      case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1588
      count = current_state->count;  /* Already matched */
1589
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1590
      if (clen > 0)
1591
        {
1592
        int ncount = 0;
1593
        switch (c)
1594
          {
1595
          case CHAR_VT:
1596
          case CHAR_FF:
1597
          case CHAR_NEL:
1598
#ifndef EBCDIC
1599
          case 0x2028:
1600
          case 0x2029:
1601
#endif  /* Not EBCDIC */
1602
          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1603
          goto ANYNL01;
1604

1605
          case CHAR_CR:
1606
          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1607
          /* Fall through */
1608

1609
          ANYNL01:
1610
          case CHAR_LF:
1611
          if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1612
            {
1613
            active_count--;           /* Remove non-match possibility */
1614
            next_active_state--;
1615
            }
1616
          count++;
1617
          ADD_NEW_DATA(-state_offset, count, ncount);
1618
          break;
1619

1620
          default:
1621
          break;
1622
          }
1623
        }
1624
      break;
1625

1626
      /*-----------------------------------------------------------------*/
1627
      case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1628
      case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1629
      case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1630
      count = current_state->count;  /* Already matched */
1631
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1632
      if (clen > 0)
1633
        {
1634
        BOOL OK;
1635
        switch (c)
1636
          {
1637
          VSPACE_CASES:
1638
          OK = TRUE;
1639
          break;
1640

1641
          default:
1642
          OK = FALSE;
1643
          break;
1644
          }
1645

1646
        if (OK == (d == OP_VSPACE))
1647
          {
1648
          if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1649
            {
1650
            active_count--;           /* Remove non-match possibility */
1651
            next_active_state--;
1652
            }
1653
          count++;
1654
          ADD_NEW_DATA(-state_offset, count, 0);
1655
          }
1656
        }
1657
      break;
1658

1659
      /*-----------------------------------------------------------------*/
1660
      case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1661
      case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1662
      case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1663
      count = current_state->count;  /* Already matched */
1664
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1665
      if (clen > 0)
1666
        {
1667
        BOOL OK;
1668
        switch (c)
1669
          {
1670
          HSPACE_CASES:
1671
          OK = TRUE;
1672
          break;
1673

1674
          default:
1675
          OK = FALSE;
1676
          break;
1677
          }
1678

1679
        if (OK == (d == OP_HSPACE))
1680
          {
1681
          if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1682
            {
1683
            active_count--;           /* Remove non-match possibility */
1684
            next_active_state--;
1685
            }
1686
          count++;
1687
          ADD_NEW_DATA(-state_offset, count, 0);
1688
          }
1689
        }
1690
      break;
1691

1692
      /*-----------------------------------------------------------------*/
1693
#ifdef SUPPORT_UNICODE
1694
      case OP_PROP_EXTRA + OP_TYPEQUERY:
1695
      case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1696
      case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1697
      count = 4;
1698
      goto QS1;
1699

1700
      case OP_PROP_EXTRA + OP_TYPESTAR:
1701
      case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1702
      case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1703
      count = 0;
1704

1705
      QS1:
1706

1707
      ADD_ACTIVE(state_offset + 4, 0);
1708
      if (clen > 0)
1709
        {
1710
        BOOL OK;
1711
        int chartype;
1712
        const uint32_t *cp;
1713
        const ucd_record * prop = GET_UCD(c);
1714
        switch(code[2])
1715
          {
1716
          case PT_LAMP:
1717
          chartype = prop->chartype;
1718
          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1719
          break;
1720

1721
          case PT_GC:
1722
          OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1723
          break;
1724

1725
          case PT_PC:
1726
          OK = prop->chartype == code[3];
1727
          break;
1728

1729
          case PT_SC:
1730
          OK = prop->script == code[3];
1731
          break;
1732

1733
          case PT_SCX:
1734
          OK = (prop->script == code[3] ||
1735
                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1736
          break;
1737

1738
          /* These are specials for combination cases. */
1739

1740
          case PT_ALNUM:
1741
          chartype = prop->chartype;
1742
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1743
               PRIV(ucp_gentype)[chartype] == ucp_N;
1744
          break;
1745

1746
          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1747
          which means that Perl space and POSIX space are now identical. PCRE
1748
          was changed at release 8.34. */
1749

1750
          case PT_SPACE:    /* Perl space */
1751
          case PT_PXSPACE:  /* POSIX space */
1752
          switch(c)
1753
            {
1754
            HSPACE_CASES:
1755
            VSPACE_CASES:
1756
            OK = TRUE;
1757
            break;
1758

1759
            default:
1760
            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1761
            break;
1762
            }
1763
          break;
1764

1765
          case PT_WORD:
1766
          chartype = prop->chartype;
1767
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1768
               PRIV(ucp_gentype)[chartype] == ucp_N ||
1769
               chartype == ucp_Mn || chartype == ucp_Pc;
1770
          break;
1771

1772
          case PT_CLIST:
1773
#if PCRE2_CODE_UNIT_WIDTH == 32
1774
          if (c > MAX_UTF_CODE_POINT)
1775
            {
1776
            OK = FALSE;
1777
            break;
1778
            }
1779
#endif
1780
          cp = PRIV(ucd_caseless_sets) + code[3];
1781
          for (;;)
1782
            {
1783
            if (c < *cp) { OK = FALSE; break; }
1784
            if (c == *cp++) { OK = TRUE; break; }
1785
            }
1786
          break;
1787

1788
          case PT_UCNC:
1789
          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1790
               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1791
               c >= 0xe000;
1792
          break;
1793

1794
          case PT_BIDICL:
1795
          OK = UCD_BIDICLASS(c) == code[3];
1796
          break;
1797

1798
          case PT_BOOL:
1799
          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1800
            UCD_BPROPS_PROP(prop), code[3]) != 0;
1801
          break;
1802

1803
          /* Should never occur, but keep compilers from grumbling. */
1804

1805
          default:
1806
          OK = codevalue != OP_PROP;
1807
          break;
1808
          }
1809

1810
        if (OK == (d == OP_PROP))
1811
          {
1812
          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1813
              codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1814
            {
1815
            active_count--;           /* Remove non-match possibility */
1816
            next_active_state--;
1817
            }
1818
          ADD_NEW(state_offset + count, 0);
1819
          }
1820
        }
1821
      break;
1822

1823
      /*-----------------------------------------------------------------*/
1824
      case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1825
      case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1826
      case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1827
      count = 2;
1828
      goto QS2;
1829

1830
      case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1831
      case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1832
      case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1833
      count = 0;
1834

1835
      QS2:
1836

1837
      ADD_ACTIVE(state_offset + 2, 0);
1838
      if (clen > 0)
1839
        {
1840
        int ncount = 0;
1841
        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1842
            codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1843
          {
1844
          active_count--;           /* Remove non-match possibility */
1845
          next_active_state--;
1846
          }
1847
        (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1848
          &ncount);
1849
        ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1850
        }
1851
      break;
1852
#endif
1853

1854
      /*-----------------------------------------------------------------*/
1855
      case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1856
      case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1857
      case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1858
      count = 2;
1859
      goto QS3;
1860

1861
      case OP_ANYNL_EXTRA + OP_TYPESTAR:
1862
      case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1863
      case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1864
      count = 0;
1865

1866
      QS3:
1867
      ADD_ACTIVE(state_offset + 2, 0);
1868
      if (clen > 0)
1869
        {
1870
        int ncount = 0;
1871
        switch (c)
1872
          {
1873
          case CHAR_VT:
1874
          case CHAR_FF:
1875
          case CHAR_NEL:
1876
#ifndef EBCDIC
1877
          case 0x2028:
1878
          case 0x2029:
1879
#endif  /* Not EBCDIC */
1880
          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1881
          goto ANYNL02;
1882

1883
          case CHAR_CR:
1884
          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1885
          /* Fall through */
1886

1887
          ANYNL02:
1888
          case CHAR_LF:
1889
          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1890
              codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1891
            {
1892
            active_count--;           /* Remove non-match possibility */
1893
            next_active_state--;
1894
            }
1895
          ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1896
          break;
1897

1898
          default:
1899
          break;
1900
          }
1901
        }
1902
      break;
1903

1904
      /*-----------------------------------------------------------------*/
1905
      case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1906
      case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1907
      case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1908
      count = 2;
1909
      goto QS4;
1910

1911
      case OP_VSPACE_EXTRA + OP_TYPESTAR:
1912
      case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1913
      case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1914
      count = 0;
1915

1916
      QS4:
1917
      ADD_ACTIVE(state_offset + 2, 0);
1918
      if (clen > 0)
1919
        {
1920
        BOOL OK;
1921
        switch (c)
1922
          {
1923
          VSPACE_CASES:
1924
          OK = TRUE;
1925
          break;
1926

1927
          default:
1928
          OK = FALSE;
1929
          break;
1930
          }
1931
        if (OK == (d == OP_VSPACE))
1932
          {
1933
          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1934
              codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1935
            {
1936
            active_count--;           /* Remove non-match possibility */
1937
            next_active_state--;
1938
            }
1939
          ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1940
          }
1941
        }
1942
      break;
1943

1944
      /*-----------------------------------------------------------------*/
1945
      case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1946
      case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1947
      case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1948
      count = 2;
1949
      goto QS5;
1950

1951
      case OP_HSPACE_EXTRA + OP_TYPESTAR:
1952
      case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1953
      case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1954
      count = 0;
1955

1956
      QS5:
1957
      ADD_ACTIVE(state_offset + 2, 0);
1958
      if (clen > 0)
1959
        {
1960
        BOOL OK;
1961
        switch (c)
1962
          {
1963
          HSPACE_CASES:
1964
          OK = TRUE;
1965
          break;
1966

1967
          default:
1968
          OK = FALSE;
1969
          break;
1970
          }
1971

1972
        if (OK == (d == OP_HSPACE))
1973
          {
1974
          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1975
              codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1976
            {
1977
            active_count--;           /* Remove non-match possibility */
1978
            next_active_state--;
1979
            }
1980
          ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1981
          }
1982
        }
1983
      break;
1984

1985
      /*-----------------------------------------------------------------*/
1986
#ifdef SUPPORT_UNICODE
1987
      case OP_PROP_EXTRA + OP_TYPEEXACT:
1988
      case OP_PROP_EXTRA + OP_TYPEUPTO:
1989
      case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1990
      case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1991
      if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1992
        { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1993
      count = current_state->count;  /* Number already matched */
1994
      if (clen > 0)
1995
        {
1996
        BOOL OK;
1997
        int chartype;
1998
        const uint32_t *cp;
1999
        const ucd_record * prop = GET_UCD(c);
2000
        switch(code[1 + IMM2_SIZE + 1])
2001
          {
2002
          case PT_LAMP:
2003
          chartype = prop->chartype;
2004
          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
2005
          break;
2006

2007
          case PT_GC:
2008
          OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
2009
          break;
2010

2011
          case PT_PC:
2012
          OK = prop->chartype == code[1 + IMM2_SIZE + 2];
2013
          break;
2014

2015
          case PT_SC:
2016
          OK = prop->script == code[1 + IMM2_SIZE + 2];
2017
          break;
2018

2019
          case PT_SCX:
2020
          OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
2021
                MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
2022
                  code[1 + IMM2_SIZE + 2]) != 0);
2023
          break;
2024

2025
          /* These are specials for combination cases. */
2026

2027
          case PT_ALNUM:
2028
          chartype = prop->chartype;
2029
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2030
               PRIV(ucp_gentype)[chartype] == ucp_N;
2031
          break;
2032

2033
          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2034
          which means that Perl space and POSIX space are now identical. PCRE
2035
          was changed at release 8.34. */
2036

2037
          case PT_SPACE:    /* Perl space */
2038
          case PT_PXSPACE:  /* POSIX space */
2039
          switch(c)
2040
            {
2041
            HSPACE_CASES:
2042
            VSPACE_CASES:
2043
            OK = TRUE;
2044
            break;
2045

2046
            default:
2047
            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
2048
            break;
2049
            }
2050
          break;
2051

2052
          case PT_WORD:
2053
          chartype = prop->chartype;
2054
          OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2055
               PRIV(ucp_gentype)[chartype] == ucp_N ||
2056
               chartype == ucp_Mn || chartype == ucp_Pc;
2057
          break;
2058

2059
          case PT_CLIST:
2060
#if PCRE2_CODE_UNIT_WIDTH == 32
2061
          if (c > MAX_UTF_CODE_POINT)
2062
            {
2063
            OK = FALSE;
2064
            break;
2065
            }
2066
#endif
2067
          cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
2068
          for (;;)
2069
            {
2070
            if (c < *cp) { OK = FALSE; break; }
2071
            if (c == *cp++) { OK = TRUE; break; }
2072
            }
2073
          break;
2074

2075
          case PT_UCNC:
2076
          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2077
               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2078
               c >= 0xe000;
2079
          break;
2080

2081
          case PT_BIDICL:
2082
          OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2];
2083
          break;
2084

2085
          case PT_BOOL:
2086
          OK = MAPBIT(PRIV(ucd_boolprop_sets) +
2087
            UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0;
2088
          break;
2089

2090
          /* Should never occur, but keep compilers from grumbling. */
2091

2092
          default:
2093
          OK = codevalue != OP_PROP;
2094
          break;
2095
          }
2096

2097
        if (OK == (d == OP_PROP))
2098
          {
2099
          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2100
            {
2101
            active_count--;           /* Remove non-match possibility */
2102
            next_active_state--;
2103
            }
2104
          if (++count >= (int)GET2(code, 1))
2105
            { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2106
          else
2107
            { ADD_NEW(state_offset, count); }
2108
          }
2109
        }
2110
      break;
2111

2112
      /*-----------------------------------------------------------------*/
2113
      case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2114
      case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2115
      case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2116
      case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2117
      if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2118
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2119
      count = current_state->count;  /* Number already matched */
2120
      if (clen > 0)
2121
        {
2122
        PCRE2_SPTR nptr;
2123
        int ncount = 0;
2124
        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2125
          {
2126
          active_count--;           /* Remove non-match possibility */
2127
          next_active_state--;
2128
          }
2129
        nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2130
          &ncount);
2131
        if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2132
            reset_could_continue = TRUE;
2133
        if (++count >= (int)GET2(code, 1))
2134
          { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2135
        else
2136
          { ADD_NEW_DATA(-state_offset, count, ncount); }
2137
        }
2138
      break;
2139
#endif
2140

2141
      /*-----------------------------------------------------------------*/
2142
      case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2143
      case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2144
      case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2145
      case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2146
      if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2147
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2148
      count = current_state->count;  /* Number already matched */
2149
      if (clen > 0)
2150
        {
2151
        int ncount = 0;
2152
        switch (c)
2153
          {
2154
          case CHAR_VT:
2155
          case CHAR_FF:
2156
          case CHAR_NEL:
2157
#ifndef EBCDIC
2158
          case 0x2028:
2159
          case 0x2029:
2160
#endif  /* Not EBCDIC */
2161
          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2162
          goto ANYNL03;
2163

2164
          case CHAR_CR:
2165
          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2166
          /* Fall through */
2167

2168
          ANYNL03:
2169
          case CHAR_LF:
2170
          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2171
            {
2172
            active_count--;           /* Remove non-match possibility */
2173
            next_active_state--;
2174
            }
2175
          if (++count >= (int)GET2(code, 1))
2176
            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2177
          else
2178
            { ADD_NEW_DATA(-state_offset, count, ncount); }
2179
          break;
2180

2181
          default:
2182
          break;
2183
          }
2184
        }
2185
      break;
2186

2187
      /*-----------------------------------------------------------------*/
2188
      case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2189
      case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2190
      case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2191
      case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2192
      if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2193
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2194
      count = current_state->count;  /* Number already matched */
2195
      if (clen > 0)
2196
        {
2197
        BOOL OK;
2198
        switch (c)
2199
          {
2200
          VSPACE_CASES:
2201
          OK = TRUE;
2202
          break;
2203

2204
          default:
2205
          OK = FALSE;
2206
          }
2207

2208
        if (OK == (d == OP_VSPACE))
2209
          {
2210
          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2211
            {
2212
            active_count--;           /* Remove non-match possibility */
2213
            next_active_state--;
2214
            }
2215
          if (++count >= (int)GET2(code, 1))
2216
            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2217
          else
2218
            { ADD_NEW_DATA(-state_offset, count, 0); }
2219
          }
2220
        }
2221
      break;
2222

2223
      /*-----------------------------------------------------------------*/
2224
      case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2225
      case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2226
      case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2227
      case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2228
      if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2229
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2230
      count = current_state->count;  /* Number already matched */
2231
      if (clen > 0)
2232
        {
2233
        BOOL OK;
2234
        switch (c)
2235
          {
2236
          HSPACE_CASES:
2237
          OK = TRUE;
2238
          break;
2239

2240
          default:
2241
          OK = FALSE;
2242
          break;
2243
          }
2244

2245
        if (OK == (d == OP_HSPACE))
2246
          {
2247
          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2248
            {
2249
            active_count--;           /* Remove non-match possibility */
2250
            next_active_state--;
2251
            }
2252
          if (++count >= (int)GET2(code, 1))
2253
            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2254
          else
2255
            { ADD_NEW_DATA(-state_offset, count, 0); }
2256
          }
2257
        }
2258
      break;
2259

2260
/* ========================================================================== */
2261
      /* These opcodes are followed by a character that is usually compared
2262
      to the current subject character; it is loaded into d. We still get
2263
      here even if there is no subject character, because in some cases zero
2264
      repetitions are permitted. */
2265

2266
      /*-----------------------------------------------------------------*/
2267
      case OP_CHAR:
2268
      if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2269
      break;
2270

2271
      /*-----------------------------------------------------------------*/
2272
      case OP_CHARI:
2273
      if (clen == 0) break;
2274

2275
#ifdef SUPPORT_UNICODE
2276
      if (utf_or_ucp)
2277
        {
2278
        if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2279
          {
2280
          unsigned int othercase;
2281
          if (c < 128)
2282
            othercase = fcc[c];
2283
          else
2284
            othercase = UCD_OTHERCASE(c);
2285
          if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2286
          }
2287
        }
2288
      else
2289
#endif  /* SUPPORT_UNICODE */
2290
      /* Not UTF or UCP mode */
2291
        {
2292
        if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2293
          { ADD_NEW(state_offset + 2, 0); }
2294
        }
2295
      break;
2296

2297

2298
#ifdef SUPPORT_UNICODE
2299
      /*-----------------------------------------------------------------*/
2300
      /* This is a tricky one because it can match more than one character.
2301
      Find out how many characters to skip, and then set up a negative state
2302
      to wait for them to pass before continuing. */
2303

2304
      case OP_EXTUNI:
2305
      if (clen > 0)
2306
        {
2307
        int ncount = 0;
2308
        PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2309
          end_subject, utf, &ncount);
2310
        if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2311
            reset_could_continue = TRUE;
2312
        ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2313
        }
2314
      break;
2315
#endif
2316

2317
      /*-----------------------------------------------------------------*/
2318
      /* This is tricky like EXTUNI because it too can match more than one
2319
      character (when CR is followed by LF). In this case, set up a negative
2320
      state to wait for one character to pass before continuing. */
2321

2322
      case OP_ANYNL:
2323
      if (clen > 0) switch(c)
2324
        {
2325
        case CHAR_VT:
2326
        case CHAR_FF:
2327
        case CHAR_NEL:
2328
#ifndef EBCDIC
2329
        case 0x2028:
2330
        case 0x2029:
2331
#endif  /* Not EBCDIC */
2332
        if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2333
        /* Fall through */
2334

2335
        case CHAR_LF:
2336
        ADD_NEW(state_offset + 1, 0);
2337
        break;
2338

2339
        case CHAR_CR:
2340
        if (ptr + 1 >= end_subject)
2341
          {
2342
          ADD_NEW(state_offset + 1, 0);
2343
          if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2344
            reset_could_continue = TRUE;
2345
          }
2346
        else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2347
          {
2348
          ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2349
          }
2350
        else
2351
          {
2352
          ADD_NEW(state_offset + 1, 0);
2353
          }
2354
        break;
2355
        }
2356
      break;
2357

2358
      /*-----------------------------------------------------------------*/
2359
      case OP_NOT_VSPACE:
2360
      if (clen > 0) switch(c)
2361
        {
2362
        VSPACE_CASES:
2363
        break;
2364

2365
        default:
2366
        ADD_NEW(state_offset + 1, 0);
2367
        break;
2368
        }
2369
      break;
2370

2371
      /*-----------------------------------------------------------------*/
2372
      case OP_VSPACE:
2373
      if (clen > 0) switch(c)
2374
        {
2375
        VSPACE_CASES:
2376
        ADD_NEW(state_offset + 1, 0);
2377
        break;
2378

2379
        default:
2380
        break;
2381
        }
2382
      break;
2383

2384
      /*-----------------------------------------------------------------*/
2385
      case OP_NOT_HSPACE:
2386
      if (clen > 0) switch(c)
2387
        {
2388
        HSPACE_CASES:
2389
        break;
2390

2391
        default:
2392
        ADD_NEW(state_offset + 1, 0);
2393
        break;
2394
        }
2395
      break;
2396

2397
      /*-----------------------------------------------------------------*/
2398
      case OP_HSPACE:
2399
      if (clen > 0) switch(c)
2400
        {
2401
        HSPACE_CASES:
2402
        ADD_NEW(state_offset + 1, 0);
2403
        break;
2404

2405
        default:
2406
        break;
2407
        }
2408
      break;
2409

2410
      /*-----------------------------------------------------------------*/
2411
      /* Match a negated single character casefully. */
2412

2413
      case OP_NOT:
2414
      if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2415
      break;
2416

2417
      /*-----------------------------------------------------------------*/
2418
      /* Match a negated single character caselessly. */
2419

2420
      case OP_NOTI:
2421
      if (clen > 0)
2422
        {
2423
        uint32_t otherd;
2424
#ifdef SUPPORT_UNICODE
2425
        if (utf_or_ucp && d >= 128)
2426
          otherd = UCD_OTHERCASE(d);
2427
        else
2428
#endif  /* SUPPORT_UNICODE */
2429
        otherd = TABLE_GET(d, fcc, d);
2430
        if (c != d && c != otherd)
2431
          { ADD_NEW(state_offset + dlen + 1, 0); }
2432
        }
2433
      break;
2434

2435
      /*-----------------------------------------------------------------*/
2436
      case OP_PLUSI:
2437
      case OP_MINPLUSI:
2438
      case OP_POSPLUSI:
2439
      case OP_NOTPLUSI:
2440
      case OP_NOTMINPLUSI:
2441
      case OP_NOTPOSPLUSI:
2442
      caseless = TRUE;
2443
      codevalue -= OP_STARI - OP_STAR;
2444

2445
      /* Fall through */
2446
      case OP_PLUS:
2447
      case OP_MINPLUS:
2448
      case OP_POSPLUS:
2449
      case OP_NOTPLUS:
2450
      case OP_NOTMINPLUS:
2451
      case OP_NOTPOSPLUS:
2452
      count = current_state->count;  /* Already matched */
2453
      if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2454
      if (clen > 0)
2455
        {
2456
        uint32_t otherd = NOTACHAR;
2457
        if (caseless)
2458
          {
2459
#ifdef SUPPORT_UNICODE
2460
          if (utf_or_ucp && d >= 128)
2461
            otherd = UCD_OTHERCASE(d);
2462
          else
2463
#endif  /* SUPPORT_UNICODE */
2464
          otherd = TABLE_GET(d, fcc, d);
2465
          }
2466
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2467
          {
2468
          if (count > 0 &&
2469
              (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2470
            {
2471
            active_count--;             /* Remove non-match possibility */
2472
            next_active_state--;
2473
            }
2474
          count++;
2475
          ADD_NEW(state_offset, count);
2476
          }
2477
        }
2478
      break;
2479

2480
      /*-----------------------------------------------------------------*/
2481
      case OP_QUERYI:
2482
      case OP_MINQUERYI:
2483
      case OP_POSQUERYI:
2484
      case OP_NOTQUERYI:
2485
      case OP_NOTMINQUERYI:
2486
      case OP_NOTPOSQUERYI:
2487
      caseless = TRUE;
2488
      codevalue -= OP_STARI - OP_STAR;
2489
      /* Fall through */
2490
      case OP_QUERY:
2491
      case OP_MINQUERY:
2492
      case OP_POSQUERY:
2493
      case OP_NOTQUERY:
2494
      case OP_NOTMINQUERY:
2495
      case OP_NOTPOSQUERY:
2496
      ADD_ACTIVE(state_offset + dlen + 1, 0);
2497
      if (clen > 0)
2498
        {
2499
        uint32_t otherd = NOTACHAR;
2500
        if (caseless)
2501
          {
2502
#ifdef SUPPORT_UNICODE
2503
          if (utf_or_ucp && d >= 128)
2504
            otherd = UCD_OTHERCASE(d);
2505
          else
2506
#endif  /* SUPPORT_UNICODE */
2507
          otherd = TABLE_GET(d, fcc, d);
2508
          }
2509
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2510
          {
2511
          if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2512
            {
2513
            active_count--;            /* Remove non-match possibility */
2514
            next_active_state--;
2515
            }
2516
          ADD_NEW(state_offset + dlen + 1, 0);
2517
          }
2518
        }
2519
      break;
2520

2521
      /*-----------------------------------------------------------------*/
2522
      case OP_STARI:
2523
      case OP_MINSTARI:
2524
      case OP_POSSTARI:
2525
      case OP_NOTSTARI:
2526
      case OP_NOTMINSTARI:
2527
      case OP_NOTPOSSTARI:
2528
      caseless = TRUE;
2529
      codevalue -= OP_STARI - OP_STAR;
2530
      /* Fall through */
2531
      case OP_STAR:
2532
      case OP_MINSTAR:
2533
      case OP_POSSTAR:
2534
      case OP_NOTSTAR:
2535
      case OP_NOTMINSTAR:
2536
      case OP_NOTPOSSTAR:
2537
      ADD_ACTIVE(state_offset + dlen + 1, 0);
2538
      if (clen > 0)
2539
        {
2540
        uint32_t otherd = NOTACHAR;
2541
        if (caseless)
2542
          {
2543
#ifdef SUPPORT_UNICODE
2544
          if (utf_or_ucp && d >= 128)
2545
            otherd = UCD_OTHERCASE(d);
2546
          else
2547
#endif  /* SUPPORT_UNICODE */
2548
          otherd = TABLE_GET(d, fcc, d);
2549
          }
2550
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2551
          {
2552
          if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2553
            {
2554
            active_count--;            /* Remove non-match possibility */
2555
            next_active_state--;
2556
            }
2557
          ADD_NEW(state_offset, 0);
2558
          }
2559
        }
2560
      break;
2561

2562
      /*-----------------------------------------------------------------*/
2563
      case OP_EXACTI:
2564
      case OP_NOTEXACTI:
2565
      caseless = TRUE;
2566
      codevalue -= OP_STARI - OP_STAR;
2567
      /* Fall through */
2568
      case OP_EXACT:
2569
      case OP_NOTEXACT:
2570
      count = current_state->count;  /* Number already matched */
2571
      if (clen > 0)
2572
        {
2573
        uint32_t otherd = NOTACHAR;
2574
        if (caseless)
2575
          {
2576
#ifdef SUPPORT_UNICODE
2577
          if (utf_or_ucp && d >= 128)
2578
            otherd = UCD_OTHERCASE(d);
2579
          else
2580
#endif  /* SUPPORT_UNICODE */
2581
          otherd = TABLE_GET(d, fcc, d);
2582
          }
2583
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2584
          {
2585
          if (++count >= (int)GET2(code, 1))
2586
            { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2587
          else
2588
            { ADD_NEW(state_offset, count); }
2589
          }
2590
        }
2591
      break;
2592

2593
      /*-----------------------------------------------------------------*/
2594
      case OP_UPTOI:
2595
      case OP_MINUPTOI:
2596
      case OP_POSUPTOI:
2597
      case OP_NOTUPTOI:
2598
      case OP_NOTMINUPTOI:
2599
      case OP_NOTPOSUPTOI:
2600
      caseless = TRUE;
2601
      codevalue -= OP_STARI - OP_STAR;
2602
      /* Fall through */
2603
      case OP_UPTO:
2604
      case OP_MINUPTO:
2605
      case OP_POSUPTO:
2606
      case OP_NOTUPTO:
2607
      case OP_NOTMINUPTO:
2608
      case OP_NOTPOSUPTO:
2609
      ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2610
      count = current_state->count;  /* Number already matched */
2611
      if (clen > 0)
2612
        {
2613
        uint32_t otherd = NOTACHAR;
2614
        if (caseless)
2615
          {
2616
#ifdef SUPPORT_UNICODE
2617
          if (utf_or_ucp && d >= 128)
2618
            otherd = UCD_OTHERCASE(d);
2619
          else
2620
#endif  /* SUPPORT_UNICODE */
2621
          otherd = TABLE_GET(d, fcc, d);
2622
          }
2623
        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2624
          {
2625
          if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2626
            {
2627
            active_count--;             /* Remove non-match possibility */
2628
            next_active_state--;
2629
            }
2630
          if (++count >= (int)GET2(code, 1))
2631
            { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2632
          else
2633
            { ADD_NEW(state_offset, count); }
2634
          }
2635
        }
2636
      break;
2637

2638

2639
/* ========================================================================== */
2640
      /* These are the class-handling opcodes */
2641

2642
      case OP_CLASS:
2643
      case OP_NCLASS:
2644
#ifdef SUPPORT_WIDE_CHARS
2645
      case OP_XCLASS:
2646
      case OP_ECLASS:
2647
#endif
2648
        {
2649
        BOOL isinclass = FALSE;
2650
        int next_state_offset;
2651
        PCRE2_SPTR ecode;
2652

2653
#ifdef SUPPORT_WIDE_CHARS
2654
        /* An extended class may have a table or a list of single characters,
2655
        ranges, or both, and it may be positive or negative. There's a
2656
        function that sorts all this out. */
2657

2658
        if (codevalue == OP_XCLASS)
2659
         {
2660
         ecode = code + GET(code, 1);
2661
         if (clen > 0)
2662
           isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE,
2663
             (const uint8_t*)mb->start_code, utf);
2664
         }
2665

2666
        /* A nested set-based class has internal opcodes for performing
2667
        set operations. */
2668

2669
        else if (codevalue == OP_ECLASS)
2670
         {
2671
         ecode = code + GET(code, 1);
2672
         if (clen > 0)
2673
           isinclass = PRIV(eclass)(c, code + 1 + LINK_SIZE, ecode,
2674
             (const uint8_t*)mb->start_code, utf);
2675
         }
2676

2677
        else
2678
#endif /* SUPPORT_WIDE_CHARS */
2679

2680
        /* For a simple class, there is always just a 32-byte table, and we
2681
        can set isinclass from it. */
2682

2683
          {
2684
          ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2685
          if (clen > 0)
2686
            {
2687
            isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2688
              ((((const uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2689
            }
2690
          }
2691

2692
        /* At this point, isinclass is set for all kinds of class, and ecode
2693
        points to the byte after the end of the class. If there is a
2694
        quantifier, this is where it will be. */
2695

2696
        next_state_offset = (int)(ecode - start_code);
2697

2698
        switch (*ecode)
2699
          {
2700
          case OP_CRSTAR:
2701
          case OP_CRMINSTAR:
2702
          case OP_CRPOSSTAR:
2703
          ADD_ACTIVE(next_state_offset + 1, 0);
2704
          if (isinclass)
2705
            {
2706
            if (*ecode == OP_CRPOSSTAR)
2707
              {
2708
              active_count--;           /* Remove non-match possibility */
2709
              next_active_state--;
2710
              }
2711
            ADD_NEW(state_offset, 0);
2712
            }
2713
          break;
2714

2715
          case OP_CRPLUS:
2716
          case OP_CRMINPLUS:
2717
          case OP_CRPOSPLUS:
2718
          count = current_state->count;  /* Already matched */
2719
          if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2720
          if (isinclass)
2721
            {
2722
            if (count > 0 && *ecode == OP_CRPOSPLUS)
2723
              {
2724
              active_count--;           /* Remove non-match possibility */
2725
              next_active_state--;
2726
              }
2727
            count++;
2728
            ADD_NEW(state_offset, count);
2729
            }
2730
          break;
2731

2732
          case OP_CRQUERY:
2733
          case OP_CRMINQUERY:
2734
          case OP_CRPOSQUERY:
2735
          ADD_ACTIVE(next_state_offset + 1, 0);
2736
          if (isinclass)
2737
            {
2738
            if (*ecode == OP_CRPOSQUERY)
2739
              {
2740
              active_count--;           /* Remove non-match possibility */
2741
              next_active_state--;
2742
              }
2743
            ADD_NEW(next_state_offset + 1, 0);
2744
            }
2745
          break;
2746

2747
          case OP_CRRANGE:
2748
          case OP_CRMINRANGE:
2749
          case OP_CRPOSRANGE:
2750
          count = current_state->count;  /* Already matched */
2751
          if (count >= (int)GET2(ecode, 1))
2752
            { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2753
          if (isinclass)
2754
            {
2755
            int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2756

2757
            if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2758
              {
2759
              active_count--;           /* Remove non-match possibility */
2760
              next_active_state--;
2761
              }
2762

2763
            if (++count >= max && max != 0)   /* Max 0 => no limit */
2764
              { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2765
            else
2766
              { ADD_NEW(state_offset, count); }
2767
            }
2768
          break;
2769

2770
          default:
2771
          if (isinclass) { ADD_NEW(next_state_offset, 0); }
2772
          break;
2773
          }
2774
        }
2775
      break;
2776

2777
/* ========================================================================== */
2778
      /* These are the opcodes for fancy brackets of various kinds. We have
2779
      to use recursion in order to handle them. The "always failing" assertion
2780
      (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2781
      though the other "backtracking verbs" are not supported. */
2782

2783
      case OP_FAIL:
2784
      break;
2785

2786
      case OP_ASSERT:
2787
      case OP_ASSERT_NOT:
2788
      case OP_ASSERTBACK:
2789
      case OP_ASSERTBACK_NOT:
2790
        {
2791
        int rc;
2792
        int *local_workspace;
2793
        PCRE2_SIZE *local_offsets;
2794
        PCRE2_SPTR endasscode = code + GET(code, 1);
2795
        RWS_anchor *rws = (RWS_anchor *)RWS;
2796

2797
        if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2798
          {
2799
          rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2800
          if (rc != 0) return rc;
2801
          RWS = (int *)rws;
2802
          }
2803

2804
        local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2805
        local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2806
        rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2807

2808
        while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2809

2810
        rc = internal_dfa_match(
2811
          mb,                                   /* static match data */
2812
          code,                                 /* this subexpression's code */
2813
          ptr,                                  /* where we currently are */
2814
          (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2815
          local_offsets,                        /* offset vector */
2816
          RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2817
          local_workspace,                      /* workspace vector */
2818
          RWS_RSIZE,                            /* size of same */
2819
          rlevel,                               /* function recursion level */
2820
          RWS);                                 /* recursion workspace */
2821

2822
        rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2823

2824
        if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2825
        if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2826
            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2827
        }
2828
      break;
2829

2830
      /*-----------------------------------------------------------------*/
2831
      case OP_COND:
2832
      case OP_SCOND:
2833
        {
2834
        int codelink = (int)GET(code, 1);
2835
        PCRE2_UCHAR condcode;
2836

2837
        /* Because of the way auto-callout works during compile, a callout item
2838
        is inserted between OP_COND and an assertion condition. This does not
2839
        happen for the other conditions. */
2840

2841
        if (code[LINK_SIZE + 1] == OP_CALLOUT
2842
            || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2843
          {
2844
          PCRE2_SIZE callout_length;
2845
          rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
2846
            1 + LINK_SIZE, &callout_length);
2847
          if (rrc < 0) return rrc;                 /* Abandon */
2848
          if (rrc > 0) break;                      /* Fail this thread */
2849
          code += callout_length;                  /* Skip callout data */
2850
          }
2851

2852
        condcode = code[LINK_SIZE+1];
2853

2854
        /* Back reference conditions and duplicate named recursion conditions
2855
        are not supported */
2856

2857
        if (condcode == OP_CREF || condcode == OP_DNCREF ||
2858
            condcode == OP_DNRREF)
2859
          return PCRE2_ERROR_DFA_UCOND;
2860

2861
        /* The DEFINE condition is always false, and the assertion (?!) is
2862
        converted to OP_FAIL. */
2863

2864
        if (condcode == OP_FALSE || condcode == OP_FAIL)
2865
          { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2866

2867
        /* There is also an always-true condition */
2868

2869
        else if (condcode == OP_TRUE)
2870
          { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2871

2872
        /* The only supported version of OP_RREF is for the value RREF_ANY,
2873
        which means "test if in any recursion". We can't test for specifically
2874
        recursed groups. */
2875

2876
        else if (condcode == OP_RREF)
2877
          {
2878
          unsigned int value = GET2(code, LINK_SIZE + 2);
2879
          if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2880
          if (mb->recursive != NULL)
2881
            { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2882
          else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2883
          }
2884

2885
        /* Otherwise, the condition is an assertion */
2886

2887
        else
2888
          {
2889
          int rc;
2890
          int *local_workspace;
2891
          PCRE2_SIZE *local_offsets;
2892
          PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2893
          PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2894
          RWS_anchor *rws = (RWS_anchor *)RWS;
2895

2896
          if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2897
            {
2898
            rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2899
            if (rc != 0) return rc;
2900
            RWS = (int *)rws;
2901
            }
2902

2903
          local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2904
          local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2905
          rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2906

2907
          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2908

2909
          rc = internal_dfa_match(
2910
            mb,                                   /* fixed match data */
2911
            asscode,                              /* this subexpression's code */
2912
            ptr,                                  /* where we currently are */
2913
            (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2914
            local_offsets,                        /* offset vector */
2915
            RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2916
            local_workspace,                      /* workspace vector */
2917
            RWS_RSIZE,                            /* size of same */
2918
            rlevel,                               /* function recursion level */
2919
            RWS);                                 /* recursion workspace */
2920

2921
          rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2922

2923
          if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2924
          if ((rc >= 0) ==
2925
                (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2926
            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2927
          else
2928
            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2929
          }
2930
        }
2931
      break;
2932

2933
      /*-----------------------------------------------------------------*/
2934
      case OP_RECURSE:
2935
        {
2936
        int rc;
2937
        int *local_workspace;
2938
        PCRE2_SIZE *local_offsets;
2939
        RWS_anchor *rws = (RWS_anchor *)RWS;
2940
        PCRE2_SPTR callpat = start_code + GET(code, 1);
2941
        uint32_t recno = (callpat == mb->start_code)? 0 :
2942
          GET2(callpat, 1 + LINK_SIZE);
2943

2944
        if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2945
          {
2946
          rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2947
          if (rc != 0) return rc;
2948
          RWS = (int *)rws;
2949
          }
2950

2951
        local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2952
        local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2953
        rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2954

2955
        /* Check for repeating a recursion without advancing the subject
2956
        pointer or last used character. This should catch convoluted mutual
2957
        recursions. (Some simple cases are caught at compile time.) */
2958

2959
        for (dfa_recursion_info *ri = mb->recursive;
2960
             ri != NULL;
2961
             ri = ri->prevrec)
2962
          {
2963
          if (recno == ri->group_num && ptr == ri->subject_position &&
2964
              mb->last_used_ptr == ri->last_used_ptr)
2965
            return PCRE2_ERROR_RECURSELOOP;
2966
          }
2967

2968
        /* Remember this recursion and where we started it so as to
2969
        catch infinite loops. */
2970

2971
        new_recursive.group_num = recno;
2972
        new_recursive.subject_position = ptr;
2973
        new_recursive.last_used_ptr = mb->last_used_ptr;
2974
        new_recursive.prevrec = mb->recursive;
2975
        mb->recursive = &new_recursive;
2976

2977
        rc = internal_dfa_match(
2978
          mb,                                   /* fixed match data */
2979
          callpat,                              /* this subexpression's code */
2980
          ptr,                                  /* where we currently are */
2981
          (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2982
          local_offsets,                        /* offset vector */
2983
          RWS_OVEC_RSIZE/OVEC_UNIT,             /* size of same */
2984
          local_workspace,                      /* workspace vector */
2985
          RWS_RSIZE,                            /* size of same */
2986
          rlevel,                               /* function recursion level */
2987
          RWS);                                 /* recursion workspace */
2988

2989
        rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2990
        mb->recursive = new_recursive.prevrec;  /* Done this recursion */
2991

2992
        /* Ran out of internal offsets */
2993

2994
        if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2995

2996
        /* For each successfully matched substring, set up the next state with a
2997
        count of characters to skip before trying it. Note that the count is in
2998
        characters, not bytes. */
2999

3000
        if (rc > 0)
3001
          {
3002
          for (rc = rc*2 - 2; rc >= 0; rc -= 2)
3003
            {
3004
            PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
3005
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3006
            if (utf)
3007
              {
3008
              PCRE2_SPTR p = start_subject + local_offsets[rc];
3009
              PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
3010
              while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3011
              }
3012
#endif
3013
            if (charcount > 0)
3014
              {
3015
              ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
3016
                (int)(charcount - 1));
3017
              }
3018
            else
3019
              {
3020
              ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
3021
              }
3022
            }
3023
          }
3024
        else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3025
        }
3026
      break;
3027

3028
      /*-----------------------------------------------------------------*/
3029
      case OP_BRAPOS:
3030
      case OP_SBRAPOS:
3031
      case OP_CBRAPOS:
3032
      case OP_SCBRAPOS:
3033
      case OP_BRAPOSZERO:
3034
        {
3035
        int rc;
3036
        int *local_workspace;
3037
        PCRE2_SIZE *local_offsets;
3038
        PCRE2_SIZE charcount, matched_count;
3039
        PCRE2_SPTR local_ptr = ptr;
3040
        RWS_anchor *rws = (RWS_anchor *)RWS;
3041
        BOOL allow_zero;
3042

3043
        if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3044
          {
3045
          rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3046
          if (rc != 0) return rc;
3047
          RWS = (int *)rws;
3048
          }
3049

3050
        local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3051
        local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3052
        rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3053

3054
        if (codevalue == OP_BRAPOSZERO)
3055
          {
3056
          allow_zero = TRUE;
3057
          ++code;  /* The following opcode will be one of the above BRAs */
3058
          }
3059
        else allow_zero = FALSE;
3060

3061
        /* Loop to match the subpattern as many times as possible as if it were
3062
        a complete pattern. */
3063

3064
        for (matched_count = 0;; matched_count++)
3065
          {
3066
          rc = internal_dfa_match(
3067
            mb,                                   /* fixed match data */
3068
            code,                                 /* this subexpression's code */
3069
            local_ptr,                            /* where we currently are */
3070
            (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3071
            local_offsets,                        /* offset vector */
3072
            RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3073
            local_workspace,                      /* workspace vector */
3074
            RWS_RSIZE,                            /* size of same */
3075
            rlevel,                               /* function recursion level */
3076
            RWS);                                 /* recursion workspace */
3077

3078
          /* Failed to match */
3079

3080
          if (rc < 0)
3081
            {
3082
            if (rc != PCRE2_ERROR_NOMATCH) return rc;
3083
            break;
3084
            }
3085

3086
          /* Matched: break the loop if zero characters matched. */
3087

3088
          charcount = local_offsets[1] - local_offsets[0];
3089
          if (charcount == 0) break;
3090
          local_ptr += charcount;    /* Advance temporary position ptr */
3091
          }
3092

3093
        rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3094

3095
        /* At this point we have matched the subpattern matched_count
3096
        times, and local_ptr is pointing to the character after the end of the
3097
        last match. */
3098

3099
        if (matched_count > 0 || allow_zero)
3100
          {
3101
          PCRE2_SPTR end_subpattern = code;
3102
          int next_state_offset;
3103

3104
          do { end_subpattern += GET(end_subpattern, 1); }
3105
            while (*end_subpattern == OP_ALT);
3106
          next_state_offset =
3107
            (int)(end_subpattern - start_code + LINK_SIZE + 1);
3108

3109
          /* Optimization: if there are no more active states, and there
3110
          are no new states yet set up, then skip over the subject string
3111
          right here, to save looping. Otherwise, set up the new state to swing
3112
          into action when the end of the matched substring is reached. */
3113

3114
          if (i + 1 >= active_count && new_count == 0)
3115
            {
3116
            ptr = local_ptr;
3117
            clen = 0;
3118
            ADD_NEW(next_state_offset, 0);
3119
            }
3120
          else
3121
            {
3122
            PCRE2_SPTR p = ptr;
3123
            PCRE2_SPTR pp = local_ptr;
3124
            charcount = (PCRE2_SIZE)(pp - p);
3125
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3126
            if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3127
#endif
3128
            ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3129
            }
3130
          }
3131
        }
3132
      break;
3133

3134
      /*-----------------------------------------------------------------*/
3135
      case OP_ONCE:
3136
        {
3137
        int rc;
3138
        int *local_workspace;
3139
        PCRE2_SIZE *local_offsets;
3140
        RWS_anchor *rws = (RWS_anchor *)RWS;
3141

3142
        if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3143
          {
3144
          rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3145
          if (rc != 0) return rc;
3146
          RWS = (int *)rws;
3147
          }
3148

3149
        local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3150
        local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3151
        rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3152

3153
        rc = internal_dfa_match(
3154
          mb,                                   /* fixed match data */
3155
          code,                                 /* this subexpression's code */
3156
          ptr,                                  /* where we currently are */
3157
          (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3158
          local_offsets,                        /* offset vector */
3159
          RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3160
          local_workspace,                      /* workspace vector */
3161
          RWS_RSIZE,                            /* size of same */
3162
          rlevel,                               /* function recursion level */
3163
          RWS);                                 /* recursion workspace */
3164

3165
        rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3166

3167
        if (rc >= 0)
3168
          {
3169
          PCRE2_SPTR end_subpattern = code;
3170
          PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3171
          int next_state_offset, repeat_state_offset;
3172

3173
          do { end_subpattern += GET(end_subpattern, 1); }
3174
            while (*end_subpattern == OP_ALT);
3175
          next_state_offset =
3176
            (int)(end_subpattern - start_code + LINK_SIZE + 1);
3177

3178
          /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3179
          arrange for the repeat state also to be added to the relevant list.
3180
          Calculate the offset, or set -1 for no repeat. */
3181

3182
          repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3183
                                 *end_subpattern == OP_KETRMIN)?
3184
            (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3185

3186
          /* If we have matched an empty string, add the next state at the
3187
          current character pointer. This is important so that the duplicate
3188
          checking kicks in, which is what breaks infinite loops that match an
3189
          empty string. */
3190

3191
          if (charcount == 0)
3192
            {
3193
            ADD_ACTIVE(next_state_offset, 0);
3194
            }
3195

3196
          /* Optimization: if there are no more active states, and there
3197
          are no new states yet set up, then skip over the subject string
3198
          right here, to save looping. Otherwise, set up the new state to swing
3199
          into action when the end of the matched substring is reached. */
3200

3201
          else if (i + 1 >= active_count && new_count == 0)
3202
            {
3203
            ptr += charcount;
3204
            clen = 0;
3205
            ADD_NEW(next_state_offset, 0);
3206

3207
            /* If we are adding a repeat state at the new character position,
3208
            we must fudge things so that it is the only current state.
3209
            Otherwise, it might be a duplicate of one we processed before, and
3210
            that would cause it to be skipped. */
3211

3212
            if (repeat_state_offset >= 0)
3213
              {
3214
              next_active_state = active_states;
3215
              active_count = 0;
3216
              i = -1;
3217
              ADD_ACTIVE(repeat_state_offset, 0);
3218
              }
3219
            }
3220
          else
3221
            {
3222
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3223
            if (utf)
3224
              {
3225
              PCRE2_SPTR p = start_subject + local_offsets[0];
3226
              PCRE2_SPTR pp = start_subject + local_offsets[1];
3227
              while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3228
              }
3229
#endif
3230
            ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3231
            if (repeat_state_offset >= 0)
3232
              { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3233
            }
3234
          }
3235
        else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3236
        }
3237
      break;
3238

3239

3240
/* ========================================================================== */
3241
      /* Handle callouts */
3242

3243
      case OP_CALLOUT:
3244
      case OP_CALLOUT_STR:
3245
        {
3246
        PCRE2_SIZE callout_length;
3247
        rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,
3248
          &callout_length);
3249
        if (rrc < 0) return rrc;   /* Abandon */
3250
        if (rrc == 0)
3251
          { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3252
        }
3253
      break;
3254

3255

3256
/* ========================================================================== */
3257
      default:        /* Unsupported opcode */
3258
      return PCRE2_ERROR_DFA_UITEM;
3259
      }
3260

3261
    NEXT_ACTIVE_STATE: continue;
3262

3263
    }      /* End of loop scanning active states */
3264

3265
  /* We have finished the processing at the current subject character. If no
3266
  new states have been set for the next character, we have found all the
3267
  matches that we are going to find. If partial matching has been requested,
3268
  check for appropriate conditions.
3269

3270
  The "could_continue" variable is true if a state could have continued but
3271
  for the fact that the end of the subject was reached. */
3272

3273
  if (new_count <= 0)
3274
    {
3275
    if (could_continue &&                            /* Some could go on, and */
3276
        (                                            /* either... */
3277
        (mb->moptions & PCRE2_PARTIAL_HARD) != 0      /* Hard partial */
3278
        ||                                           /* or... */
3279
        ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3280
         match_count < 0)                             /* no matches */
3281
        ) &&                                         /* And... */
3282
        (
3283
        partial_newline ||                   /* Either partial NL */
3284
          (                                  /* or ... */
3285
          ptr >= end_subject &&              /* End of subject and */
3286
            (                                  /* either */
3287
            ptr > mb->start_used_ptr ||        /* Inspected non-empty string */
3288
            mb->allowemptypartial              /* or pattern has lookbehind */
3289
            )                                  /* or could match empty */
3290
          )
3291
        ))
3292
      match_count = PCRE2_ERROR_PARTIAL;
3293
    break;  /* Exit from loop along the subject string */
3294
    }
3295

3296
  /* One or more states are active for the next character. */
3297

3298
  ptr += clen;    /* Advance to next subject character */
3299
  }               /* Loop to move along the subject string */
3300

3301
/* Control gets here from "break" a few lines above. If we have a match and
3302
PCRE2_ENDANCHORED is set, the match fails. */
3303

3304
if (match_count >= 0 &&
3305
    ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3306
    ptr < end_subject)
3307
  match_count = PCRE2_ERROR_NOMATCH;
3308

3309
return match_count;
3310
}
3311

3312

3313

3314
/*************************************************
3315
*     Match a pattern using the DFA algorithm    *
3316
*************************************************/
3317

3318
/* This function matches a compiled pattern to a subject string, using the
3319
alternate matching algorithm that finds all matches at once.
3320

3321
Arguments:
3322
  code          points to the compiled pattern
3323
  subject       subject string
3324
  length        length of subject string
3325
  start_offset  where to start matching in the subject
3326
  options       option bits
3327
  match_data    points to a match data structure
3328
  mcontext      points to a match context
3329
  workspace     pointer to workspace
3330
  wscount       size of workspace
3331

3332
Returns:        > 0 => number of match offset pairs placed in offsets
3333
                = 0 => offsets overflowed; longest matches are present
3334
                 -1 => failed to match
3335
               < -1 => some kind of unexpected problem
3336
*/
3337

3338
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
3339
pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3340
  PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3341
  pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3342
{
3343
int rc;
3344
int was_zero_terminated = 0;
3345

3346
const pcre2_real_code *re = (const pcre2_real_code *)code;
3347

3348
PCRE2_SPTR start_match;
3349
PCRE2_SPTR end_subject;
3350
PCRE2_SPTR bumpalong_limit;
3351
PCRE2_SPTR req_cu_ptr;
3352

3353
BOOL utf, anchored, startline, firstline;
3354
BOOL has_first_cu = FALSE;
3355
BOOL has_req_cu = FALSE;
3356

3357
#if PCRE2_CODE_UNIT_WIDTH == 8
3358
PCRE2_SPTR memchr_found_first_cu = NULL;
3359
PCRE2_SPTR memchr_found_first_cu2 = NULL;
3360
#endif
3361

3362
PCRE2_UCHAR first_cu = 0;
3363
PCRE2_UCHAR first_cu2 = 0;
3364
PCRE2_UCHAR req_cu = 0;
3365
PCRE2_UCHAR req_cu2 = 0;
3366

3367
const uint8_t *start_bits = NULL;
3368

3369
/* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3370
is used below, and it expects NLBLOCK to be defined as a pointer. */
3371

3372
pcre2_callout_block cb;
3373
dfa_match_block actual_match_block;
3374
dfa_match_block *mb = &actual_match_block;
3375

3376
/* Set up a starting block of memory for use during recursive calls to
3377
internal_dfa_match(). By putting this on the stack, it minimizes resource use
3378
in the case when it is not needed. If this is too small, more memory is
3379
obtained from the heap. At the start of each block is an anchor structure.*/
3380

3381
int base_recursion_workspace[RWS_BASE_SIZE];
3382
RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3383
rws->next = NULL;
3384
rws->size = RWS_BASE_SIZE;
3385
rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3386

3387
/* Recognize NULL, length 0 as an empty string. */
3388

3389
if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
3390

3391
/* Plausibility checks */
3392

3393
if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3394
if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3395
  return PCRE2_ERROR_NULL;
3396

3397
if (length == PCRE2_ZERO_TERMINATED)
3398
  {
3399
  length = PRIV(strlen)(subject);
3400
  was_zero_terminated = 1;
3401
  }
3402

3403
if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3404
if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3405

3406
/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3407
time. */
3408

3409
if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3410
   ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3411
  return PCRE2_ERROR_BADOPTION;
3412

3413
/* Invalid UTF support is not available for DFA matching. */
3414

3415
if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
3416
  return PCRE2_ERROR_DFA_UINVALID_UTF;
3417

3418
/* Check that the first field in the block is the magic number. If it is not,
3419
return with PCRE2_ERROR_BADMAGIC. */
3420

3421
if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3422

3423
/* Check the code unit width. */
3424

3425
if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3426
  return PCRE2_ERROR_BADMODE;
3427

3428
/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3429
options variable for this function. Users of PCRE2 who are not calling the
3430
function directly would like to have a way of setting these flags, in the same
3431
way that they can set pcre2_compile() flags like PCRE2_NO_AUTO_POSSESS with
3432
constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3433
(*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3434
transferred to the options for this function. The bits are guaranteed to be
3435
adjacent, but do not have the same values. This bit of Boolean trickery assumes
3436
that the match-time bits are not more significant than the flag bits. If by
3437
accident this is not the case, a compile-time division by zero error will
3438
occur. */
3439

3440
#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3441
#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3442
options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3443
#undef FF
3444
#undef OO
3445

3446
/* If restarting after a partial match, do some sanity checks on the contents
3447
of the workspace. */
3448

3449
if ((options & PCRE2_DFA_RESTART) != 0)
3450
  {
3451
  if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3452
    workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3453
      return PCRE2_ERROR_DFA_BADRESTART;
3454
  }
3455

3456
/* Set some local values */
3457

3458
utf = (re->overall_options & PCRE2_UTF) != 0;
3459
start_match = subject + start_offset;
3460
end_subject = subject + length;
3461
req_cu_ptr = start_match - 1;
3462
anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3463
  (re->overall_options & PCRE2_ANCHORED) != 0;
3464

3465
/* The "must be at the start of a line" flags are used in a loop when finding
3466
where to start. */
3467

3468
startline = (re->flags & PCRE2_STARTLINE) != 0;
3469
firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
3470
bumpalong_limit = end_subject;
3471

3472
/* Initialize and set up the fixed fields in the callout block, with a pointer
3473
in the match block. */
3474

3475
mb->cb = &cb;
3476
cb.version = 2;
3477
cb.subject = subject;
3478
cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3479
cb.callout_flags = 0;
3480
cb.capture_top      = 1;      /* No capture support */
3481
cb.capture_last     = 0;
3482
cb.mark             = NULL;   /* No (*MARK) support */
3483

3484
/* Get data from the match context, if present, and fill in the remaining
3485
fields in the match block. It is an error to set an offset limit without
3486
setting the flag at compile time. */
3487

3488
if (mcontext == NULL)
3489
  {
3490
  mb->callout = NULL;
3491
  mb->memctl = re->memctl;
3492
  mb->match_limit = PRIV(default_match_context).match_limit;
3493
  mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3494
  mb->heap_limit = PRIV(default_match_context).heap_limit;
3495
  }
3496
else
3497
  {
3498
  if (mcontext->offset_limit != PCRE2_UNSET)
3499
    {
3500
    if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3501
      return PCRE2_ERROR_BADOFFSETLIMIT;
3502
    bumpalong_limit = subject + mcontext->offset_limit;
3503
    }
3504
  mb->callout = mcontext->callout;
3505
  mb->callout_data = mcontext->callout_data;
3506
  mb->memctl = mcontext->memctl;
3507
  mb->match_limit = mcontext->match_limit;
3508
  mb->match_limit_depth = mcontext->depth_limit;
3509
  mb->heap_limit = mcontext->heap_limit;
3510
  }
3511

3512
if (mb->match_limit > re->limit_match)
3513
  mb->match_limit = re->limit_match;
3514

3515
if (mb->match_limit_depth > re->limit_depth)
3516
  mb->match_limit_depth = re->limit_depth;
3517

3518
if (mb->heap_limit > re->limit_heap)
3519
  mb->heap_limit = re->limit_heap;
3520

3521
mb->start_code = (PCRE2_SPTR)((const uint8_t *)re + re->code_start);
3522
mb->tables = re->tables;
3523
mb->start_subject = subject;
3524
mb->end_subject = end_subject;
3525
mb->start_offset = start_offset;
3526
mb->allowemptypartial = (re->max_lookbehind > 0) ||
3527
  (re->flags & PCRE2_MATCH_EMPTY) != 0;
3528
mb->moptions = options;
3529
mb->poptions = re->overall_options;
3530
mb->match_call_count = 0;
3531
mb->heap_used = 0;
3532

3533
/* Process the \R and newline settings. */
3534

3535
mb->bsr_convention = re->bsr_convention;
3536
mb->nltype = NLTYPE_FIXED;
3537
switch(re->newline_convention)
3538
  {
3539
  case PCRE2_NEWLINE_CR:
3540
  mb->nllen = 1;
3541
  mb->nl[0] = CHAR_CR;
3542
  break;
3543

3544
  case PCRE2_NEWLINE_LF:
3545
  mb->nllen = 1;
3546
  mb->nl[0] = CHAR_NL;
3547
  break;
3548

3549
  case PCRE2_NEWLINE_NUL:
3550
  mb->nllen = 1;
3551
  mb->nl[0] = CHAR_NUL;
3552
  break;
3553

3554
  case PCRE2_NEWLINE_CRLF:
3555
  mb->nllen = 2;
3556
  mb->nl[0] = CHAR_CR;
3557
  mb->nl[1] = CHAR_NL;
3558
  break;
3559

3560
  case PCRE2_NEWLINE_ANY:
3561
  mb->nltype = NLTYPE_ANY;
3562
  break;
3563

3564
  case PCRE2_NEWLINE_ANYCRLF:
3565
  mb->nltype = NLTYPE_ANYCRLF;
3566
  break;
3567

3568
  default:
3569
  PCRE2_DEBUG_UNREACHABLE();
3570
  return PCRE2_ERROR_INTERNAL;
3571
  }
3572

3573
/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3574
we must also check that a starting offset does not point into the middle of a
3575
multiunit character. We check only the portion of the subject that is going to
3576
be inspected during matching - from the offset minus the maximum back reference
3577
to the given length. This saves time when a small part of a large subject is
3578
being matched by the use of a starting offset. Note that the maximum lookbehind
3579
is a number of characters, not code units. */
3580

3581
#ifdef SUPPORT_UNICODE
3582
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3583
  {
3584
  PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
3585

3586
  if (start_offset > 0)
3587
    {
3588
#if PCRE2_CODE_UNIT_WIDTH != 32
3589
    unsigned int i;
3590
    if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3591
      return PCRE2_ERROR_BADUTFOFFSET;
3592
    for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3593
      {
3594
      check_subject--;
3595
      while (check_subject > subject &&
3596
#if PCRE2_CODE_UNIT_WIDTH == 8
3597
      (*check_subject & 0xc0) == 0x80)
3598
#else  /* 16-bit */
3599
      (*check_subject & 0xfc00) == 0xdc00)
3600
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3601
        check_subject--;
3602
      }
3603
#else   /* In the 32-bit library, one code unit equals one character. */
3604
    check_subject -= re->max_lookbehind;
3605
    if (check_subject < subject) check_subject = subject;
3606
#endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
3607
    }
3608

3609
  /* Validate the relevant portion of the subject. After an error, adjust the
3610
  offset to be an absolute offset in the whole string. */
3611

3612
  match_data->rc = PRIV(valid_utf)(check_subject,
3613
    length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3614
  if (match_data->rc != 0)
3615
    {
3616
    match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3617
    return match_data->rc;
3618
    }
3619
  }
3620
#endif  /* SUPPORT_UNICODE */
3621

3622
/* Set up the first code unit to match, if available. If there's no first code
3623
unit there may be a bitmap of possible first characters. */
3624

3625
if ((re->flags & PCRE2_FIRSTSET) != 0)
3626
  {
3627
  has_first_cu = TRUE;
3628
  first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3629
  if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3630
    {
3631
    first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3632
#ifdef SUPPORT_UNICODE
3633
#if PCRE2_CODE_UNIT_WIDTH == 8
3634
    if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3635
      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3636
#else
3637
    if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3638
      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3639
#endif
3640
#endif  /* SUPPORT_UNICODE */
3641
    }
3642
  }
3643
else
3644
  if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3645
    start_bits = re->start_bitmap;
3646

3647
/* There may be a "last known required code unit" set. */
3648

3649
if ((re->flags & PCRE2_LASTSET) != 0)
3650
  {
3651
  has_req_cu = TRUE;
3652
  req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3653
  if ((re->flags & PCRE2_LASTCASELESS) != 0)
3654
    {
3655
    req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3656
#ifdef SUPPORT_UNICODE
3657
#if PCRE2_CODE_UNIT_WIDTH == 8
3658
    if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3659
      req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3660
#else
3661
    if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3662
      req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3663
#endif
3664
#endif  /* SUPPORT_UNICODE */
3665
    }
3666
  }
3667

3668
/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3669
free the memory that was obtained. */
3670

3671
if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3672
  {
3673
  match_data->memctl.free((void *)match_data->subject,
3674
    match_data->memctl.memory_data);
3675
  match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3676
  }
3677

3678
/* Fill in fields that are always returned in the match data. */
3679

3680
match_data->code = re;
3681
match_data->subject = NULL;  /* Default for no match */
3682
match_data->mark = NULL;
3683
match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3684

3685
/* Call the main matching function, looping for a non-anchored regex after a
3686
failed match. If not restarting, perform certain optimizations at the start of
3687
a match. */
3688

3689
for (;;)
3690
  {
3691
  /* ----------------- Start of match optimizations ---------------- */
3692

3693
  /* There are some optimizations that avoid running the match if a known
3694
  starting point is not found, or if a known later code unit is not present.
3695
  However, there is an option (settable at compile time) that disables
3696
  these, for testing and for ensuring that all callouts do actually occur.
3697
  The optimizations must also be avoided when restarting a DFA match. */
3698

3699
  if ((re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0 &&
3700
      (options & PCRE2_DFA_RESTART) == 0)
3701
    {
3702
    /* If firstline is TRUE, the start of the match is constrained to the first
3703
    line of a multiline string. That is, the match must be before or at the
3704
    first newline following the start of matching. Temporarily adjust
3705
    end_subject so that we stop the optimization scans for a first code unit
3706
    immediately after the first character of a newline (the first code unit can
3707
    legitimately be a newline). If the match fails at the newline, later code
3708
    breaks this loop. */
3709

3710
    if (firstline)
3711
      {
3712
      PCRE2_SPTR t = start_match;
3713
#ifdef SUPPORT_UNICODE
3714
      if (utf)
3715
        {
3716
        while (t < end_subject && !IS_NEWLINE(t))
3717
          {
3718
          t++;
3719
          ACROSSCHAR(t < end_subject, t, t++);
3720
          }
3721
        }
3722
      else
3723
#endif
3724
      while (t < end_subject && !IS_NEWLINE(t)) t++;
3725
      end_subject = t;
3726
      }
3727

3728
    /* Anchored: check the first code unit if one is recorded. This may seem
3729
    pointless but it can help in detecting a no match case without scanning for
3730
    the required code unit. */
3731

3732
    if (anchored)
3733
      {
3734
      if (has_first_cu || start_bits != NULL)
3735
        {
3736
        BOOL ok = start_match < end_subject;
3737
        if (ok)
3738
          {
3739
          PCRE2_UCHAR c = UCHAR21TEST(start_match);
3740
          ok = has_first_cu && (c == first_cu || c == first_cu2);
3741
          if (!ok && start_bits != NULL)
3742
            {
3743
#if PCRE2_CODE_UNIT_WIDTH != 8
3744
            if (c > 255) c = 255;
3745
#endif
3746
            ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3747
            }
3748
          }
3749
        if (!ok) break;
3750
        }
3751
      }
3752

3753
    /* Not anchored. Advance to a unique first code unit if there is one. */
3754

3755
    else
3756
      {
3757
      if (has_first_cu)
3758
        {
3759
        if (first_cu != first_cu2)  /* Caseless */
3760
          {
3761
          /* In 16-bit and 32_bit modes we have to do our own search, so can
3762
          look for both cases at once. */
3763

3764
#if PCRE2_CODE_UNIT_WIDTH != 8
3765
          PCRE2_UCHAR smc;
3766
          while (start_match < end_subject &&
3767
                (smc = UCHAR21TEST(start_match)) != first_cu &&
3768
                 smc != first_cu2)
3769
            start_match++;
3770
#else
3771
          /* In 8-bit mode, the use of memchr() gives a big speed up, even
3772
          though we have to call it twice in order to find the earliest
3773
          occurrence of the code unit in either of its cases. Caching is used
3774
          to remember the positions of previously found code units. This can
3775
          make a huge difference when the strings are very long and only one
3776
          case is actually present. */
3777

3778
          PCRE2_SPTR pp1 = NULL;
3779
          PCRE2_SPTR pp2 = NULL;
3780
          PCRE2_SIZE searchlength = end_subject - start_match;
3781

3782
          /* If we haven't got a previously found position for first_cu, or if
3783
          the current starting position is later, we need to do a search. If
3784
          the code unit is not found, set it to the end. */
3785

3786
          if (memchr_found_first_cu == NULL ||
3787
              start_match > memchr_found_first_cu)
3788
            {
3789
            pp1 = memchr(start_match, first_cu, searchlength);
3790
            memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
3791
            }
3792

3793
          /* If the start is before a previously found position, use the
3794
          previous position, or NULL if a previous search failed. */
3795

3796
          else pp1 = (memchr_found_first_cu == end_subject)? NULL :
3797
            memchr_found_first_cu;
3798

3799
          /* Do the same thing for the other case. */
3800

3801
          if (memchr_found_first_cu2 == NULL ||
3802
              start_match > memchr_found_first_cu2)
3803
            {
3804
            pp2 = memchr(start_match, first_cu2, searchlength);
3805
            memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
3806
            }
3807

3808
          else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
3809
            memchr_found_first_cu2;
3810

3811
          /* Set the start to the end of the subject if neither case was found.
3812
          Otherwise, use the earlier found point. */
3813

3814
          if (pp1 == NULL)
3815
            start_match = (pp2 == NULL)? end_subject : pp2;
3816
          else
3817
            start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3818

3819
#endif  /* 8-bit handling */
3820
          }
3821

3822
        /* The caseful case is much simpler. */
3823

3824
        else
3825
          {
3826
#if PCRE2_CODE_UNIT_WIDTH != 8
3827
          while (start_match < end_subject && UCHAR21TEST(start_match) !=
3828
                 first_cu)
3829
            start_match++;
3830
#else  /* 8-bit code units */
3831
          start_match = memchr(start_match, first_cu, end_subject - start_match);
3832
          if (start_match == NULL) start_match = end_subject;
3833
#endif
3834
          }
3835

3836
        /* If we can't find the required code unit, having reached the true end
3837
        of the subject, break the bumpalong loop, to force a match failure,
3838
        except when doing partial matching, when we let the next cycle run at
3839
        the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3840
        which partially matches "abc", even though the string does not contain
3841
        the starting character "d". If we have not reached the true end of the
3842
        subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3843
        we also let the cycle run, because the matching string is legitimately
3844
        allowed to start with the first code unit of a newline. */
3845

3846
        if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3847
            start_match >= mb->end_subject)
3848
          break;
3849
        }
3850

3851
      /* If there's no first code unit, advance to just after a linebreak for a
3852
      multiline match if required. */
3853

3854
      else if (startline)
3855
        {
3856
        if (start_match > mb->start_subject + start_offset)
3857
          {
3858
#ifdef SUPPORT_UNICODE
3859
          if (utf)
3860
            {
3861
            while (start_match < end_subject && !WAS_NEWLINE(start_match))
3862
              {
3863
              start_match++;
3864
              ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3865
              }
3866
            }
3867
          else
3868
#endif
3869
          while (start_match < end_subject && !WAS_NEWLINE(start_match))
3870
            start_match++;
3871

3872
          /* If we have just passed a CR and the newline option is ANY or
3873
          ANYCRLF, and we are now at a LF, advance the match position by one
3874
          more code unit. */
3875

3876
          if (start_match[-1] == CHAR_CR &&
3877
               (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3878
               start_match < end_subject &&
3879
               UCHAR21TEST(start_match) == CHAR_NL)
3880
            start_match++;
3881
          }
3882
        }
3883

3884
      /* If there's no first code unit or a requirement for a multiline line
3885
      start, advance to a non-unique first code unit if any have been
3886
      identified. The bitmap contains only 256 bits. When code units are 16 or
3887
      32 bits wide, all code units greater than 254 set the 255 bit. */
3888

3889
      else if (start_bits != NULL)
3890
        {
3891
        while (start_match < end_subject)
3892
          {
3893
          uint32_t c = UCHAR21TEST(start_match);
3894
#if PCRE2_CODE_UNIT_WIDTH != 8
3895
          if (c > 255) c = 255;
3896
#endif
3897
          if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3898
          start_match++;
3899
          }
3900

3901
        /* See comment above in first_cu checking about the next line. */
3902

3903
        if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3904
            start_match >= mb->end_subject)
3905
          break;
3906
        }
3907
      }  /* End of first code unit handling */
3908

3909
    /* Restore fudged end_subject */
3910

3911
    end_subject = mb->end_subject;
3912

3913
    /* The following two optimizations are disabled for partial matching. */
3914

3915
    if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3916
      {
3917
      PCRE2_SPTR p;
3918

3919
      /* The minimum matching length is only a lower bound: it is possible that
3920
      no string of exactly that length matches the pattern. Although the value
3921
      is, strictly, in characters, we treat it as code units to avoid spending
3922
      too much time in this optimization. */
3923

3924
      if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3925

3926
      /* If req_cu is set, we know that that code unit must appear in the
3927
      subject for the match to succeed. If the first code unit is set, req_cu
3928
      must be later in the subject; otherwise the test starts at the match
3929
      point. This optimization can save a huge amount of backtracking in
3930
      patterns with nested unlimited repeats that aren't going to match.
3931
      Writing separate code for cased/caseless versions makes it go faster, as
3932
      does using an autoincrement and backing off on a match. As in the case of
3933
      the first code unit, using memchr() in the 8-bit library gives a big
3934
      speed up. Unlike the first_cu check above, the caseless case here calls
3935
      memchr() a second time only when the first call finds nothing: we only
3936
      need to know that the character is present in either case, not where it
      first occurs.
3937

3938
      The search can be skipped if the code unit was found later than the
3939
      current starting point in a previous iteration of the bumpalong loop.
3940

3941
      HOWEVER: when the subject string is very, very long, searching to its end
3942
      can take a long time, and give bad performance on quite ordinary
3943
      patterns. This showed up when somebody was matching something like
3944
      /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3945
      sufficiently long, but it's worth searching a lot more for unanchored
3946
      patterns. */
3947

3948
      p = start_match + (has_first_cu? 1:0);
3949
      if (has_req_cu && p > req_cu_ptr)
3950
        {
3951
        PCRE2_SIZE check_length = end_subject - start_match;
3952

3953
        if (check_length < REQ_CU_MAX ||
3954
              (!anchored && check_length < REQ_CU_MAX * 1000))
3955
          {
3956
          if (req_cu != req_cu2)  /* Caseless */
3957
            {
3958
#if PCRE2_CODE_UNIT_WIDTH != 8
3959
            while (p < end_subject)
3960
              {
3961
              uint32_t pp = UCHAR21INCTEST(p);
3962
              if (pp == req_cu || pp == req_cu2) { p--; break; }
3963
              }
3964
#else  /* 8-bit code units */
3965
            PCRE2_SPTR pp = p;
3966
            p = memchr(pp, req_cu, end_subject - pp);
3967
            if (p == NULL)
3968
              {
3969
              p = memchr(pp, req_cu2, end_subject - pp);
3970
              if (p == NULL) p = end_subject;
3971
              }
3972
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3973
            }
3974

3975
          /* The caseful case */
3976

3977
          else
3978
            {
3979
#if PCRE2_CODE_UNIT_WIDTH != 8
3980
            while (p < end_subject)
3981
              {
3982
              if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3983
              }
3984

3985
#else  /* 8-bit code units */
3986
            p = memchr(p, req_cu, end_subject - p);
3987
            if (p == NULL) p = end_subject;
3988
#endif
3989
            }
3990

3991
          /* If we can't find the required code unit, break the matching loop,
3992
          forcing a match failure. */
3993

3994
          if (p >= end_subject) break;
3995

3996
          /* If we have found the required code unit, save the point where we
3997
          found it, so that we don't search again next time round the loop if
3998
          the start hasn't passed this code unit yet. */
3999

4000
          req_cu_ptr = p;
4001
          }
4002
        }
4003
      }
4004
    }
4005

4006
  /* ------------ End of start of match optimizations ------------ */
4007

4008
  /* Give no match if we have passed the bumpalong limit. */
4009

4010
  if (start_match > bumpalong_limit) break;
4011

4012
  /* OK, now we can do the business */
4013

4014
  mb->start_used_ptr = start_match;
4015
  mb->last_used_ptr = start_match;
4016
  mb->recursive = NULL;
4017

4018
  rc = internal_dfa_match(
4019
    mb,                           /* fixed match data */
4020
    mb->start_code,               /* this subexpression's code */
4021
    start_match,                  /* where we currently are */
4022
    start_offset,                 /* start offset in subject */
4023
    match_data->ovector,          /* offset vector */
4024
    (uint32_t)match_data->oveccount * 2,  /* actual size of same */
4025
    workspace,                    /* workspace vector */
4026
    (int)wscount,                 /* size of same */
4027
    0,                            /* function recurse level */
4028
    base_recursion_workspace);    /* initial workspace for recursion */
4029

4030
  /* Anything other than "no match" means we are done, always; otherwise, carry
4031
  on only if not anchored. */
4032

4033
  if (rc != PCRE2_ERROR_NOMATCH || anchored)
4034
    {
4035
    if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
4036
      {
4037
      match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
4038
      match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
4039
      }
4040
    match_data->subject_length = length;
4041
    match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
4042
    match_data->rightchar = (PCRE2_SIZE)(mb->last_used_ptr - subject);
4043
    match_data->startchar = (PCRE2_SIZE)(start_match - subject);
4044
    match_data->rc = rc;
4045

4046
    if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
4047
      {
4048
      length = CU2BYTES(length + was_zero_terminated);
4049
      match_data->subject = match_data->memctl.malloc(length,
4050
        match_data->memctl.memory_data);
4051
      if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
4052
      memcpy((void *)match_data->subject, subject, length);
4053
      match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
4054
      }
4055
    else
4056
      {
4057
      if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
4058
      }
4059
    goto EXIT;
4060
    }
4061

4062
  /* Advance to the next subject character unless we are at the end of a line
4063
  and firstline is set. */
4064

4065
  if (firstline && IS_NEWLINE(start_match)) break;
4066
  start_match++;
4067
#ifdef SUPPORT_UNICODE
4068
  if (utf)
4069
    {
4070
    ACROSSCHAR(start_match < end_subject, start_match, start_match++);
4071
    }
4072
#endif
4073
  if (start_match > end_subject) break;
4074

4075
  /* If we have just passed a CR and we are now at a LF, and the pattern does
4076
  not contain any explicit matches for \r or \n, and the newline option is CRLF
4077
  or ANY or ANYCRLF, advance the match position by one more character. */
4078

4079
  if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
4080
      start_match < end_subject &&
4081
      UCHAR21TEST(start_match) == CHAR_NL &&
4082
      (re->flags & PCRE2_HASCRORLF) == 0 &&
4083
        (mb->nltype == NLTYPE_ANY ||
4084
         mb->nltype == NLTYPE_ANYCRLF ||
4085
         mb->nllen == 2))
4086
    start_match++;
4087

4088
  }   /* "Bumpalong" loop */
4089

4090
NOMATCH_EXIT:
4091
rc = PCRE2_ERROR_NOMATCH;
4092

4093
EXIT:
4094
while (rws->next != NULL)
4095
  {
4096
  RWS_anchor *next = rws->next;
4097
  rws->next = next->next;
4098
  mb->memctl.free(next, mb->memctl.memory_data);
4099
  }
4100

4101
return rc;
4102
}
4103

4104
/* These #undefs are here to enable unity builds with CMake. */
4105

4106
#undef NLBLOCK /* Block containing newline information */
4107
#undef PSSTART /* Field containing processed string start */
4108
#undef PSEND   /* Field containing processed string end */
4109

4110
/* End of pcre2_dfa_match.c */
4111

4112
Product

Resources

Company