CoCalc -- pcre2_intmodedep.h

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_intmodedep.h
⁹⁸⁹⁸ views
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4

5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7

8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11

12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15

16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18

19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22

23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26

27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40

41

42
/* This module contains mode-dependent macro and structure definitions. The
43
file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
44
These mode-dependent items are kept in a separate file so that they can also be
45
#included multiple times for different code unit widths by pcre2test in order
46
to have access to the hidden structures at all supported widths.
47

48
Some of the mode-dependent macros are required at different widths for
49
different parts of the pcre2test code (in particular, the included
50
pcre2_printint.c file). We undefine them here so that they can be re-defined for
51
multiple inclusions. Not all of these are used in pcre2test, but it's easier
52
just to undefine them all. */
53

54
/* Drop any previous definitions of the mode-dependent macros so that this file
can be included again for a different code unit width. */

#undef ACROSSCHAR
#undef BACKCHAR
#undef BYTES2CU
#undef CHMAX_255
#undef CU2BYTES
#undef FORWARDCHAR
#undef FORWARDCHARTEST
#undef GET
#undef GET2
#undef GETCHAR
#undef GETCHARINC
#undef GETCHARINCTEST
#undef GETCHARLEN
#undef GETCHARLENTEST
#undef GETCHARTEST
#undef GET_EXTRALEN
#undef HAS_EXTRALEN
#undef IMM2_SIZE
#undef MAX_255
#undef MAX_MARK
#undef MAX_PATTERN_SIZE
#undef MAX_UTF_SINGLE_CU
#undef NOT_FIRSTCU
#undef PUT
#undef PUT2
#undef PUT2INC
#undef PUTCHAR
#undef PUTINC
#undef TABLE_GET
83

84

85

86
/* -------------------------- MACROS ----------------------------- */
87

88
/* PCRE keeps offsets in its compiled code as at least 16-bit quantities
89
(always stored in big-endian order in 8-bit mode) by default. These are used,
90
for example, to link from the start of a subpattern to its alternatives and its
91
end. The use of 16 bits per offset limits the size of an 8-bit compiled regex
92
to around 64K, which is big enough for almost everybody. However, I received a
93
request for an even bigger limit. For this reason, and also to make the code
94
easier to maintain, the storing and loading of offsets from the compiled code
95
unit string is now handled by the macros that are defined here.
96

97
The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
98
values of 3 or 4 are also supported. */
99

100
/* ------------------- 8-bit support  ------------------ */
101

102
/* Offset (link) load/store macros, selected by the code unit width and by
LINK_SIZE. PUT(a,n,d) stores the offset d starting at a[n], most significant
part first; GET(a,n) loads it back as an unsigned value. In 16-bit and 32-bit
modes LINK_SIZE is redefined as the number of code units occupied by one
offset.

The arguments may be evaluated more than once, so they must be free of side
effects. Each code unit is converted to unsigned int before it is shifted so
that a set top bit can never overflow a signed int. */

#if PCRE2_CODE_UNIT_WIDTH == 8

#if LINK_SIZE == 2
#define PUT(a,n,d)   \
  (((a)[n] = (PCRE2_UCHAR)((d) >> 8)), \
   ((a)[(n)+1] = (PCRE2_UCHAR)((d) & 255)))
#define GET(a,n) \
  ((unsigned int)(((unsigned int)(a)[n] << 8) | (a)[(n)+1]))
#define MAX_PATTERN_SIZE (1 << 16)

#elif LINK_SIZE == 3
#define PUT(a,n,d)       \
  (((a)[n] = (PCRE2_UCHAR)((d) >> 16)),    \
   ((a)[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \
   ((a)[(n)+2] = (PCRE2_UCHAR)((d) & 255)))
#define GET(a,n) \
  ((unsigned int)(((unsigned int)(a)[n] << 16) | \
                  ((unsigned int)(a)[(n)+1] << 8) | (a)[(n)+2]))
#define MAX_PATTERN_SIZE (1 << 24)

#elif LINK_SIZE == 4
#define PUT(a,n,d)        \
  (((a)[n] = (PCRE2_UCHAR)((d) >> 24)),     \
   ((a)[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \
   ((a)[(n)+2] = (PCRE2_UCHAR)((d) >> 8)),  \
   ((a)[(n)+3] = (PCRE2_UCHAR)((d) & 255)))
#define GET(a,n) \
  ((unsigned int)(((unsigned int)(a)[n] << 24) | \
                  ((unsigned int)(a)[(n)+1] << 16) | \
                  ((unsigned int)(a)[(n)+2] << 8) | (a)[(n)+3]))
#define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */

#else
#error LINK_SIZE must be 2, 3, or 4
#endif


/* ------------------- 16-bit support  ------------------ */

#elif PCRE2_CODE_UNIT_WIDTH == 16

#if LINK_SIZE == 2
#undef LINK_SIZE
#define LINK_SIZE 1
#define PUT(a,n,d)   \
  ((a)[n] = (PCRE2_UCHAR)(d))
#define GET(a,n) \
  ((a)[n])
#define MAX_PATTERN_SIZE (1 << 16)

#elif LINK_SIZE == 3 || LINK_SIZE == 4
#undef LINK_SIZE
#define LINK_SIZE 2
#define PUT(a,n,d)   \
  (((a)[n] = (PCRE2_UCHAR)((d) >> 16)), \
   ((a)[(n)+1] = (PCRE2_UCHAR)((d) & 65535)))
#define GET(a,n) \
  ((unsigned int)(((unsigned int)(a)[n] << 16) | (a)[(n)+1]))
#define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */

#else
#error LINK_SIZE must be 2, 3, or 4
#endif


/* ------------------- 32-bit support  ------------------ */

#elif PCRE2_CODE_UNIT_WIDTH == 32
#undef LINK_SIZE
#define LINK_SIZE 1
#define PUT(a,n,d)   \
  ((a)[n] = (d))
#define GET(a,n) \
  ((a)[n])
#define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */

#else
#error Unsupported compiling mode
#endif
178

179

180
/* --------------- Other mode-specific macros ----------------- */
181

182
/* PCRE uses some other (at least) 16-bit quantities that do not change when
183
the size of offsets changes. These are used for repeat counts and for other
184
things such as capturing parenthesis numbers in back references.
185

186
Define the number of code units required to hold a 16-bit count/offset, and
187
macros to load and store such a value. For reasons that I do not understand,
188
the expression in the 8-bit GET2 macro is treated by gcc as a signed
189
expression, even when a is declared as unsigned. It seems that any kind of
190
arithmetic results in a signed value. Hence the cast. */
191

192
/* IMM2_SIZE is the number of code units needed to hold a 16-bit value;
GET2(a,n) loads such a value from a[n] onwards and PUT2(a,n,d) stores d there.
In 8-bit mode the value is split over two code units, high byte first. The
arguments may be evaluated more than once, so they must be free of side
effects. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define IMM2_SIZE 2
#define GET2(a,n) ((unsigned int)(((a)[n] << 8) | (a)[(n)+1]))
#define PUT2(a,n,d) ((a)[n] = (d) >> 8, (a)[(n)+1] = (d) & 255)

#else  /* Code units are 16 or 32 bits */
#define IMM2_SIZE 1
#define GET2(a,n) ((a)[n])
#define PUT2(a,n,d) ((a)[n] = (d))
#endif
202

203
/* Other macros that are different for 8-bit mode. The MAX_255 macro checks
204
whether its argument, which is assumed to be one code unit, is less than 256.
205
The CHMAX_255 macro does not assume one code unit. The maximum length of a MARK
206
name must fit in one code unit; currently it is set to 255 or 65535. The
207
TABLE_GET macro is used to access elements of tables containing exactly 256
208
items. Its argument is a code unit. When code points can be greater than 255, a
209
check is needed before accessing these tables. */
210

211
/* MAX_255(c) tests a single code unit and CHMAX_255(c) tests a character for
being less than 256. MAX_MARK is the largest MARK name length that fits in one
code unit. TABLE_GET(c, table, default) reads a 256-entry table, yielding
"default" when c cannot be used as an index. SUPPORT_WIDE_CHARS is defined
whenever a character value can exceed 255. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define MAX_255(c) TRUE
#define MAX_MARK ((1u << 8) - 1)
#define TABLE_GET(c, table, default) ((table)[c])
#ifdef SUPPORT_UNICODE
#define SUPPORT_WIDE_CHARS
#define CHMAX_255(c) ((c) <= 255u)
#else
#define CHMAX_255(c) TRUE
#endif  /* SUPPORT_UNICODE */

#else  /* Code units are 16 or 32 bits */
#define CHMAX_255(c) ((c) <= 255u)
#define MAX_255(c) ((c) <= 255u)
#define MAX_MARK ((1u << 16) - 1)
#define SUPPORT_WIDE_CHARS
#define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default))
#endif
229

230

231
/* ----------------- Character-handling macros ----------------- */
232

233
/* There is a proposed future special "UTF-21" mode, in which only the lowest
234
21 bits of a 32-bit character are interpreted as UTF, with the remaining 11
235
high-order bits available to the application for other uses. In preparation for
236
the future implementation of this mode, there are macros that load a data item
237
and, if in this special mode, mask it to 21 bits. These macros all have names
238
starting with UCHAR21. In all other modes, including the normal 32-bit
239
library, the macros all have the same simple definitions. When the new mode is
240
implemented, it is expected that these definitions will be varied appropriately
241
using #ifdef when compiling the library that supports the special mode. */
242

243
/* Load one code unit through eptr. The INC forms also post-increment the
pointer. All four currently have the same plain definition in every mode. */

#define UCHAR21(eptr)        (*(eptr))
#define UCHAR21TEST(eptr)    (*(eptr))
#define UCHAR21INC(eptr)     (*(eptr)++)
#define UCHAR21INCTEST(eptr) (*(eptr)++)
247

248
/* When UTF encoding is being used, a character is no longer just a single
249
byte in 8-bit mode or a single short in 16-bit mode. The macros for character
250
handling generate simple sequences when used in the basic mode, and more
251
complicated ones for UTF characters. GETCHARLENTEST and other macros are not
252
used when UTF is not supported. To make sure they can never even appear when
253
UTF support is omitted, we don't even define them. */
254

255
/* Character fetch/store macros for each code unit width.

The GETCHAR family expand to one or more complete statements (each definition
carries its own trailing semicolon), so they cannot be used inside an
expression, and the multi-statement forms must not be used as the sole body of
an unbraced if/else/loop. Pointer arguments may be evaluated more than once
and must be free of side effects. */

#ifndef SUPPORT_UNICODE

/* Without Unicode support every character is a single code unit. The
following UTF-only macros are deliberately left undefined so that they can
never be used by accident:

   MAX_UTF_SINGLE_CU, HAS_EXTRALEN(c), GET_EXTRALEN(c), NOT_FIRSTCU(c),
   GETCHARLENTEST(c, eptr, len), BACKCHAR(eptr), FORWARDCHAR(eptr),
   FORWARDCHARTEST(eptr,end), ACROSSCHAR(condition, eptr, action) */

#define GETCHAR(c, eptr) c = *(eptr);
#define GETCHARTEST(c, eptr) c = *(eptr);
#define GETCHARINC(c, eptr) c = *(eptr)++;
#define GETCHARINCTEST(c, eptr) c = *(eptr)++;
#define GETCHARLEN(c, eptr, len) c = *(eptr);
#define PUTCHAR(c, p) (*(p) = (c), 1)

#else   /* SUPPORT_UNICODE */

/* ------------------- 8-bit support  ------------------ */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

/* The largest UTF code point that can be encoded as a single code unit. */

#define MAX_UTF_SINGLE_CU 127

/* Tests whether the code point needs extra characters to decode. */

#define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c)

/* Returns with the additional number of characters if HAS_EXTRALEN(c) is TRUE.
Otherwise it has an undefined behaviour. */

#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu])

/* Returns TRUE, if the given value is not the first code unit of a UTF
sequence. */

#define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u)

/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */

#define GETCHAR(c, eptr) \
  c = *(eptr); \
  if (c >= 0xc0u) GETUTF8(c, eptr);

/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr); \
  if (utf && c >= 0xc0u) GETUTF8(c, eptr);

/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. */

#define GETCHARINC(c, eptr) \
  c = *(eptr)++; \
  if (c >= 0xc0u) GETUTF8INC(c, eptr);

/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-8 mode. */

#define GETCHARINCTEST(c, eptr) \
  c = *(eptr)++; \
  if (utf && c >= 0xc0u) GETUTF8INC(c, eptr);

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode. */

#define GETCHARLEN(c, eptr, len) \
  c = *(eptr); \
  if (c >= 0xc0u) GETUTF8LEN(c, eptr, len);

/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
do not know if we are in UTF-8 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  c = *(eptr); \
  if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len);

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
because almost all calls are already within a block of UTF-8 only code. */

#define BACKCHAR(eptr) while((*(eptr) & 0xc0u) == 0x80u) (eptr)--

/* Same as above, just in the other direction. */
#define FORWARDCHAR(eptr) while((*(eptr) & 0xc0u) == 0x80u) (eptr)++
#define FORWARDCHARTEST(eptr,end) \
  while((eptr) < (end) && (*(eptr) & 0xc0u) == 0x80u) (eptr)++

/* Same as above, but it allows a fully customizable form. */
#define ACROSSCHAR(condition, eptr, action) \
  while((condition) && ((*(eptr)) & 0xc0u) == 0x80u) action

/* Deposit a character into memory, returning the number of code units. */

#define PUTCHAR(c, p) ((utf && (c) > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)(c,p) : (*(p) = (c), 1))


/* ------------------- 16-bit support  ------------------ */

#elif PCRE2_CODE_UNIT_WIDTH == 16
#define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

/* The largest UTF code point that can be encoded as a single code unit. */

#define MAX_UTF_SINGLE_CU 65535

/* Tests whether the code point needs extra characters to decode. */

#define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u)

/* Returns with the additional number of characters if HAS_EXTRALEN(c) is TRUE.
Otherwise it has an undefined behaviour. */

#define GET_EXTRALEN(c) 1

/* Returns TRUE, if the given value is not the first code unit of a UTF
sequence. */

#define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u)

/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer. */

#define GETUTF16(c, eptr) \
   { c = (((c & 0x3ffu) << 10) | ((eptr)[1] & 0x3ffu)) + 0x10000u; }

/* Get the next UTF-16 character, not advancing the pointer. This is called when
we know we are in UTF-16 mode. */

#define GETCHAR(c, eptr) \
  c = *(eptr); \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr); \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

/* Base macro to pick up the low surrogate of a UTF-16 character, advancing
the pointer. */

#define GETUTF16INC(c, eptr) \
   { c = (((c & 0x3ffu) << 10) | (*(eptr)++ & 0x3ffu)) + 0x10000u; }

/* Get the next UTF-16 character, advancing the pointer. This is called when we
know we are in UTF-16 mode. */

#define GETCHARINC(c, eptr) \
  c = *(eptr)++; \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

/* Get the next character, testing for UTF-16 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-16 mode. */

#define GETCHARINCTEST(c, eptr) \
  c = *(eptr)++; \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer, incrementing the length. */

#define GETUTF16LEN(c, eptr, len) \
   { c = (((c & 0x3ffu) << 10) | ((eptr)[1] & 0x3ffu)) + 0x10000u; len++; }

/* Get the next UTF-16 character, not advancing the pointer, incrementing
length if there is a low surrogate. This is called when we know we are in
UTF-16 mode. */

#define GETCHARLEN(c, eptr, len) \
  c = *(eptr); \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);

/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
pointer, incrementing length if there is a low surrogate. This is called when
we do not know if we are in UTF-16 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  c = *(eptr); \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-16 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-16 only
code. NOTE: this and the three macros below expand to a bare "if" statement,
so a following "else" at the call site would bind to it. */

#define BACKCHAR(eptr) if ((*(eptr) & 0xfc00u) == 0xdc00u) (eptr)--

/* Same as above, just in the other direction. */
#define FORWARDCHAR(eptr) if ((*(eptr) & 0xfc00u) == 0xdc00u) (eptr)++
#define FORWARDCHARTEST(eptr,end) \
  if ((eptr) < (end) && (*(eptr) & 0xfc00u) == 0xdc00u) (eptr)++

/* Same as above, but it allows a fully customizable form. */
#define ACROSSCHAR(condition, eptr, action) \
  if ((condition) && ((*(eptr)) & 0xfc00u) == 0xdc00u) action

/* Deposit a character into memory, returning the number of code units. */

#define PUTCHAR(c, p) ((utf && (c) > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)(c,p) : (*(p) = (c), 1))


/* ------------------- 32-bit support  ------------------ */

#else

/* These are trivial for the 32-bit library, since all UTF-32 characters fit
into one PCRE2_UCHAR unit. */

#define MAX_UTF_SINGLE_CU (0x10ffffu)
#define HAS_EXTRALEN(c) (0)
#define GET_EXTRALEN(c) (0)
#define NOT_FIRSTCU(c) (0)

/* Get the next UTF-32 character, not advancing the pointer. This is called when
we know we are in UTF-32 mode. */

#define GETCHAR(c, eptr) \
  c = *(eptr);

/* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr);

/* Get the next UTF-32 character, advancing the pointer. This is called when we
know we are in UTF-32 mode. */

#define GETCHARINC(c, eptr) \
  c = *((eptr)++);

/* Get the next character, testing for UTF-32 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-32 mode. */

#define GETCHARINCTEST(c, eptr) \
  c = *((eptr)++);

/* Get the next UTF-32 character, not advancing the pointer, not incrementing
length (since all UTF-32 is of length 1). This is called when we know we are in
UTF-32 mode. */

#define GETCHARLEN(c, eptr, len) \
  GETCHAR(c, eptr)

/* Get the next UTF-32 character, testing for UTF-32 mode, not advancing the
pointer, not incrementing the length (since all UTF-32 is of length 1).
This is called when we do not know if we are in UTF-32 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  GETCHARTEST(c, eptr)

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-32 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-32 only
code.

These are all no-ops since all UTF-32 characters fit into one PCRE2_UCHAR. */

#define BACKCHAR(eptr) do { } while (0)

/* Same as above, just in the other direction. */

#define FORWARDCHAR(eptr) do { } while (0)
#define FORWARDCHARTEST(eptr,end) do { } while (0)

/* Same as above, but it allows a fully customizable form. */

#define ACROSSCHAR(condition, eptr, action) do { } while (0)

/* Deposit a character into memory, returning the number of code units. */

#define PUTCHAR(c, p) (*(p) = (c), 1)

#endif  /* UTF-32 character handling */
#endif  /* SUPPORT_UNICODE */
541

542

543
/* Mode-dependent macros that have the same definition in all modes. */
544

545
/* CU2BYTES/BYTES2CU convert a count between code units and bytes for the
current code unit width. PUTINC/PUT2INC store a value with PUT/PUT2 and then
advance the pointer a by the number of code units just written; a must
therefore be a modifiable lvalue and is evaluated more than once. */

#define CU2BYTES(x)     ((x)*((PCRE2_CODE_UNIT_WIDTH/8)))
#define BYTES2CU(x)     ((x)/((PCRE2_CODE_UNIT_WIDTH/8)))
#define PUTINC(a,n,d)   (PUT(a,n,d), (a) += LINK_SIZE)
#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += IMM2_SIZE)
549

550

551
/* ----------------------- HIDDEN STRUCTURES ----------------------------- */
552

553
/* NOTE: All these structures *must* start with a pcre2_memctl structure. The
554
code that uses them is simpler because it assumes this. */
555

556
/* The real general context structure. At present it holds only data for custom
557
memory control. */
558

559
/* WARNING: if this is ever changed, code in pcre2_substitute.c will have to be
560
changed because it builds a general context "by hand" in order to avoid the
561
malloc() call in pcre2_general_context_create(). There is also code in
562
pcre2_match.c that makes the same assumption. */
563

564
typedef struct pcre2_real_general_context {
565
  pcre2_memctl memctl;
566
} pcre2_real_general_context;
567

568
/* The real compile context structure */
569

570
typedef struct pcre2_real_compile_context {
571
  pcre2_memctl memctl;
572
  int (*stack_guard)(uint32_t, void *);
573
  void *stack_guard_data;
574
  const uint8_t *tables;
575
  PCRE2_SIZE max_pattern_length;
576
  PCRE2_SIZE max_pattern_compiled_length;
577
  uint16_t bsr_convention;
578
  uint16_t newline_convention;
579
  uint32_t parens_nest_limit;
580
  uint32_t extra_options;
581
  uint32_t max_varlookbehind;
582
  uint32_t optimization_flags;
583
} pcre2_real_compile_context;
584

585
/* The real match context structure. */
586

587
typedef struct pcre2_real_match_context {
588
  pcre2_memctl memctl;
589
#ifdef SUPPORT_JIT
590
  pcre2_jit_callback jit_callback;
591
  void *jit_callback_data;
592
#endif
593
  int        (*callout)(pcre2_callout_block *, void *);
594
  void        *callout_data;
595
  int        (*substitute_callout)(pcre2_substitute_callout_block *, void *);
596
  void        *substitute_callout_data;
597
  PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,
598
                                        PCRE2_SIZE, int, void *);
599
  void        *substitute_case_callout_data;
600
  PCRE2_SIZE offset_limit;
601
  uint32_t heap_limit;
602
  uint32_t match_limit;
603
  uint32_t depth_limit;
604
} pcre2_real_match_context;
605

606
/* The real convert context structure. */
607

608
typedef struct pcre2_real_convert_context {
609
  pcre2_memctl memctl;
610
  uint32_t glob_separator;
611
  uint32_t glob_escape;
612
} pcre2_real_convert_context;
613

614
/* The real compiled code structure. The type for the blocksize field is
615
defined specially because it is required in pcre2_serialize_decode() when
616
copying the size from possibly unaligned memory into a variable of the same
617
type. Use a macro rather than a typedef to avoid compiler warnings when this
618
file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
619
largest lookbehind that is supported. (OP_REVERSE and OP_VREVERSE in a pattern
620
have 16-bit arguments in 8-bit and 16-bit modes, so we need no more than a
621
16-bit field here.) */
622

623
/* Both are macros (not typedefs/constants) and are undefined first so that
this file can be included more than once without redefinition warnings. */

#undef  CODE_BLOCKSIZE_TYPE
#define CODE_BLOCKSIZE_TYPE PCRE2_SIZE

#undef  LOOKBEHIND_MAX
#define LOOKBEHIND_MAX UINT16_MAX
628

629
typedef struct pcre2_real_code {
630
  pcre2_memctl memctl;            /* Memory control fields */
631
  const uint8_t *tables;          /* The character tables */
632
  void    *executable_jit;        /* Pointer to JIT code */
633
  uint8_t  start_bitmap[32];      /* Bitmap for starting code unit < 256 */
634
  CODE_BLOCKSIZE_TYPE blocksize;  /* Total (bytes) that was malloc-ed */
635
  CODE_BLOCKSIZE_TYPE code_start; /* Byte code start offset */
636
  uint32_t magic_number;          /* Paranoid and endianness check */
637
  uint32_t compile_options;       /* Options passed to pcre2_compile() */
638
  uint32_t overall_options;       /* Options after processing the pattern */
639
  uint32_t extra_options;         /* Taken from compile_context */
640
  uint32_t flags;                 /* Various state flags */
641
  uint32_t limit_heap;            /* Limit set in the pattern */
642
  uint32_t limit_match;           /* Limit set in the pattern */
643
  uint32_t limit_depth;           /* Limit set in the pattern */
644
  uint32_t first_codeunit;        /* Starting code unit */
645
  uint32_t last_codeunit;         /* This codeunit must be seen */
646
  uint16_t bsr_convention;        /* What \R matches */
647
  uint16_t newline_convention;    /* What is a newline? */
648
  uint16_t max_lookbehind;        /* Longest lookbehind (characters) */
649
  uint16_t minlength;             /* Minimum length of match */
650
  uint16_t top_bracket;           /* Highest numbered group */
651
  uint16_t top_backref;           /* Highest numbered back reference */
652
  uint16_t name_entry_size;       /* Size (code units) of table entries */
653
  uint16_t name_count;            /* Number of name entries in the table */
654
  uint32_t optimization_flags;    /* Optimizations enabled at compile time */
655
} pcre2_real_code;
656

657
/* The real match data structure. Define ovector as large as it can ever
658
actually be so that array bound checkers don't grumble. Memory for this
659
structure is obtained by calling pcre2_match_data_create(), which sets the size
660
as the offset of ovector plus a pair of elements for each capturable string, so
661
the size varies from call to call. As the maximum number of capturing
662
subpatterns is 65535 we must allow for 65536 strings to include the overall
663
match. (See also the heapframe structure below.) */
664

665
struct heapframe;  /* Forward reference */
666

667
typedef struct pcre2_real_match_data {
668
  pcre2_memctl     memctl;           /* Memory control fields */
669
  const pcre2_real_code *code;       /* The pattern used for the match */
670
  PCRE2_SPTR       subject;          /* The subject that was matched */
671
  PCRE2_SPTR       mark;             /* Pointer to last mark */
672
  struct heapframe *heapframes;      /* Backtracking frames heap memory */
673
  PCRE2_SIZE       heapframes_size;  /* Malloc-ed size */
674
  PCRE2_SIZE       subject_length;   /* Subject length */
675
  PCRE2_SIZE       leftchar;         /* Offset to leftmost code unit */
676
  PCRE2_SIZE       rightchar;        /* Offset to rightmost code unit */
677
  PCRE2_SIZE       startchar;        /* Offset to starting code unit */
678
  uint8_t          matchedby;        /* Type of match (normal, JIT, DFA) */
679
  uint8_t          flags;            /* Various flags */
680
  uint16_t         oveccount;        /* Number of pairs */
681
  int              rc;               /* The return code from the match */
682
  PCRE2_SIZE       ovector[131072];  /* Must be last in the structure */
683
} pcre2_real_match_data;
684

685

686
/* ----------------------- PRIVATE STRUCTURES ----------------------------- */
687

688
/* These structures are not needed for pcre2test. */
689

690
#ifndef PCRE2_PCRE2TEST
691

692
/* Structures for checking for mutual function recursion when scanning compiled
693
or parsed code. */
694

695
typedef struct recurse_check {
696
  struct recurse_check *prev;
697
  PCRE2_SPTR group;
698
} recurse_check;
699

700
/* One link in a chain of groups, used when checking parsed code for mutual
recursion. */
typedef struct parsed_recurse_check {
  struct parsed_recurse_check *prev;   /* Previous link in the chain */
  uint32_t *groupptr;                  /* Group recorded by this link */
} parsed_recurse_check;
704

705
/* Structure for building a cache when filling in pattern recursion offsets. */
706

707
typedef struct recurse_cache {
708
  PCRE2_SPTR group;
709
  int groupnumber;
710
} recurse_cache;
711

712
/* Structure for maintaining a chain of pointers to the currently incomplete
713
branches, for testing for left recursion while compiling. */
714

715
typedef struct branch_chain {
716
  struct branch_chain *outer;
717
  PCRE2_UCHAR *current_branch;
718
} branch_chain;
719

720
/* Structure for building a list of named groups during the first pass of
721
compiling. */
722

723
typedef struct named_group {
724
  PCRE2_SPTR   name;          /* Points to the name in the pattern */
725
  uint32_t     number;        /* Group number */
726
  uint16_t     length;        /* Length of the name */
727
  uint16_t     isdup;         /* TRUE if a duplicate */
728
} named_group;
729

730
/* Structure for caching sorted ranges. This improves the performance
731
of translating META code to byte code. */
732

733
/* Header of one cached set of sorted class ranges; the headers form a singly
linked list and the range data follows each header in memory. */
typedef struct class_ranges {
  struct class_ranges *next;       /* Next class ranges */
  size_t char_lists_size;          /* Total size of encoded char lists */
  size_t char_lists_start;         /* Start offset of encoded char lists */
  uint16_t range_list_size;        /* Size of ranges array */
  uint16_t char_lists_types;       /* The XCL_LIST header of char lists */
  /* Followed by the list of ranges (start/end pairs) */
} class_ranges;
741

742
/* The same 32 bytes (256 bits) viewed either as bytes or as 32-bit words. */
typedef union class_bits_storage {
  uint8_t classbits[32];    /* Byte view */
  uint32_t classwords[8];   /* Word view of the same storage */
} class_bits_storage;
746

747
/* Structure for passing "static" information around between the functions
748
doing the compiling, so that they are thread-safe. */
749

750
typedef struct compile_block {
751
  pcre2_real_compile_context *cx;  /* Points to the compile context */
752
  const uint8_t *lcc;              /* Points to lower casing table */
753
  const uint8_t *fcc;              /* Points to case-flipping table */
754
  const uint8_t *cbits;            /* Points to character type table */
755
  const uint8_t *ctypes;           /* Points to table of type maps */
756
  PCRE2_UCHAR *start_workspace;    /* The start of working space */
757
  PCRE2_UCHAR *start_code;         /* The start of the compiled code */
758
  PCRE2_SPTR start_pattern;        /* The start of the pattern */
759
  PCRE2_SPTR end_pattern;          /* The end of the pattern */
760
  PCRE2_UCHAR *name_table;         /* The name/number table */
761
  PCRE2_SIZE workspace_size;       /* Size of workspace */
762
  PCRE2_SIZE small_ref_offset[10]; /* Offsets for \1 to \9 */
763
  PCRE2_SIZE erroroffset;          /* Offset of error in pattern */
764
  class_bits_storage classbits;    /* Temporary store for classbits */
765
  uint16_t names_found;            /* Number of entries so far */
766
  uint16_t name_entry_size;        /* Size of each entry */
767
  uint16_t parens_depth;           /* Depth of nested parentheses */
768
  uint16_t assert_depth;           /* Depth of nested assertions */
769
  named_group *named_groups;       /* Points to vector in pre-compile */
770
  uint32_t named_group_list_size;  /* Number of entries in the list */
771
  uint32_t external_options;       /* External (initial) options */
772
  uint32_t external_flags;         /* External flag bits to be set */
773
  uint32_t bracount;               /* Count of capturing parentheses */
774
  uint32_t lastcapture;            /* Last capture encountered */
775
  uint32_t *parsed_pattern;        /* Parsed pattern buffer */
776
  uint32_t *parsed_pattern_end;    /* Parsed pattern should not get here */
777
  uint32_t *groupinfo;             /* Group info vector */
778
  uint32_t top_backref;            /* Maximum back reference */
779
  uint32_t backref_map;            /* Bitmap of low back refs */
780
  uint32_t nltype;                 /* Newline type */
781
  uint32_t nllen;                  /* Newline string length */
782
  PCRE2_UCHAR nl[4];               /* Newline string when fixed length */
783
  uint8_t class_op_used[ECLASS_NEST_LIMIT]; /* Operation used for
784
                                               extended classes */
785
  uint32_t req_varyopt;            /* "After variable item" flag for reqbyte */
786
  uint32_t max_varlookbehind;      /* Limit for variable lookbehinds */
787
  int  max_lookbehind;             /* Maximum lookbehind encountered (characters) */
788
  BOOL had_accept;                 /* (*ACCEPT) encountered */
789
  BOOL had_pruneorskip;            /* (*PRUNE) or (*SKIP) encountered */
790
  BOOL had_recurse;                /* Had a pattern recursion or subroutine call */
791
  BOOL dupnames;                   /* Duplicate names exist */
792
#ifdef SUPPORT_WIDE_CHARS
793
  class_ranges *cranges;           /* First class range. */
794
  class_ranges *next_cranges;      /* Next class range. */
795
  size_t char_lists_size;          /* Current size of character lists */
796
#endif
797
} compile_block;
798

799
/* Structure for keeping the properties of the in-memory stack used
800
by the JIT matcher. */
801

802
typedef struct pcre2_real_jit_stack {
803
  pcre2_memctl memctl;
804
  void* stack;
805
} pcre2_real_jit_stack;
806

807
/* Structure for items in a linked list that represents an explicit recursive
808
call within the pattern when running pcre2_dfa_match(). */
809

810
typedef struct dfa_recursion_info {
811
  struct dfa_recursion_info *prevrec;
812
  PCRE2_SPTR subject_position;
813
  PCRE2_SPTR last_used_ptr;
814
  uint32_t group_num;
815
} dfa_recursion_info;
816

817
/* Structure for "stack" frames that are used for remembering backtracking
818
positions during matching. As these are used in a vector, with the ovector item
819
being extended, the size of the structure must be a multiple of PCRE2_SIZE. The
820
only way to check this at compile time is to force an error by generating an
821
array with a negative size. By putting this in a typedef (which is never used),
822
we don't generate any code when all is well. */
823

824
typedef struct heapframe {
825

826
  /* The first set of fields are variables that have to be preserved over calls
827
  to RRMATCH(), but which do not need to be copied to new frames. */
828

829
  PCRE2_SPTR ecode;          /* The current position in the pattern */
830
  PCRE2_SPTR temp_sptr[2];   /* Used for short-term PCRE2_SPTR values */
831
  PCRE2_SIZE length;         /* Used for character, string, or code lengths */
832
  PCRE2_SIZE back_frame;     /* Amount to subtract on RRETURN */
833
  PCRE2_SIZE temp_size;      /* Used for short-term PCRE2_SIZE values */
834
  uint32_t rdepth;           /* Function "recursion" depth within pcre2_match() */
835
  uint32_t group_frame_type; /* Type information for group frames */
836
  uint32_t temp_32[4];       /* Used for short-term 32-bit or BOOL values */
837
  uint8_t return_id;         /* Where to go on in internal "return" */
838
  uint8_t op;                /* Processing opcode */
839

840
  /* At this point, the structure is 16-bit aligned. On most architectures
841
  the alignment requirement for a pointer will ensure that the eptr field below
842
  is 32-bit or 64-bit aligned. However, on m68k it is fine to have a pointer
843
  that is 16-bit aligned. We must therefore ensure that what comes between here
844
  and eptr is an odd multiple of 16 bits so as to get back into 32-bit
845
  alignment. This happens naturally when PCRE2_UCHAR is 8 bits wide, but needs
846
  fudges in the other cases. In the 32-bit case the padding comes first so that
847
  the occu field itself is 32-bit aligned. Without the padding, this structure
848
  is no longer a multiple of PCRE2_SIZE on m68k, and the check below fails. */
849

850
#if PCRE2_CODE_UNIT_WIDTH == 8
851
  PCRE2_UCHAR occu[6];       /* Used for other case code units */
852
#elif PCRE2_CODE_UNIT_WIDTH == 16
853
  PCRE2_UCHAR occu[2];       /* Used for other case code units */
854
  uint8_t unused[2];         /* Ensure 32-bit alignment (see above) */
855
#else
856
  uint8_t unused[2];         /* Ensure 32-bit alignment (see above) */
857
  PCRE2_UCHAR occu[1];       /* Used for other case code units */
858
#endif
859

860
  /* The rest have to be copied from the previous frame whenever a new frame
861
  becomes current. The final field is specified as a large vector so that
862
  runtime array bound checks don't catch references to it. However, for any
863
  specific call to pcre2_match() the memory allocated for each frame structure
864
  allows for exactly the right size ovector for the number of capturing
865
  parentheses. (See also the comment for pcre2_real_match_data above.) */
866

867
  PCRE2_SPTR eptr;              /* MUST BE FIRST */
868
  PCRE2_SPTR start_match;       /* Can be adjusted by \K */
869
  PCRE2_SPTR mark;              /* Most recent mark on the success path */
870
  PCRE2_SPTR recurse_last_used; /* Last character used at time of pattern recursion */
871
  uint32_t current_recurse;     /* Group number of current (deepest) pattern recursion */
872
  uint32_t capture_last;        /* Most recent capture */
873
  PCRE2_SIZE last_group_offset; /* Saved offset to most recent group frame */
874
  PCRE2_SIZE offset_top;        /* Offset after highest capture */
875
  PCRE2_SIZE ovector[131072];   /* Must be last in the structure */
876
} heapframe;
877

878
/* Assert that the size of the heapframe structure is a multiple of PCRE2_SIZE.
879
See various comments above. */
880

881
STATIC_ASSERT((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0, heapframe_size);
882

883
/* Structure for computing the alignment of heapframe. */
884

885
typedef struct heapframe_align {
886
  char unalign;    /* Completely unalign the current offset */
887
  heapframe frame; /* Offset is its alignment */
888
} heapframe_align;
889

890
/* This define is the minimum alignment required for a heapframe, in bytes. It
is the offset of the heapframe member that follows a single char in
heapframe_align. */

#define HEAPFRAME_ALIGNMENT offsetof(heapframe_align, frame)
893

894
/* Structure for passing "static" information around between the functions
895
doing traditional NFA matching (pcre2_match() and friends). */
896

897
typedef struct match_block {
898
  pcre2_memctl memctl;            /* For general use */
899
  uint32_t heap_limit;            /* As it says */
900
  uint32_t match_limit;           /* As it says */
901
  uint32_t match_limit_depth;     /* As it says */
902
  uint32_t match_call_count;      /* Number of times a new frame is created */
903
  BOOL hitend;                    /* Hit the end of the subject at some point */
904
  BOOL hasthen;                   /* Pattern contains (*THEN) */
905
  BOOL allowemptypartial;         /* Allow empty hard partial */
906
  const uint8_t *lcc;             /* Points to lower casing table */
907
  const uint8_t *fcc;             /* Points to case-flipping table */
908
  const uint8_t *ctypes;          /* Points to table of type maps */
909
  PCRE2_SIZE start_offset;        /* The start offset value */
910
  PCRE2_SIZE end_offset_top;      /* Highwater mark at end of match */
911
  uint16_t partial;               /* PARTIAL options */
912
  uint16_t bsr_convention;        /* \R interpretation */
913
  uint16_t name_count;            /* Number of names in name table */
914
  uint16_t name_entry_size;       /* Size of entry in names table */
915
  PCRE2_SPTR name_table;          /* Table of group names */
916
  PCRE2_SPTR start_code;          /* For use in pattern recursion */
917
  PCRE2_SPTR start_subject;       /* Start of the subject string */
918
  PCRE2_SPTR check_subject;       /* Where UTF-checked from */
919
  PCRE2_SPTR end_subject;         /* Usable end of the subject string */
920
  PCRE2_SPTR true_end_subject;    /* Actual end of the subject string */
921
  PCRE2_SPTR end_match_ptr;       /* Subject position at end match */
922
  PCRE2_SPTR start_used_ptr;      /* Earliest consulted character */
923
  PCRE2_SPTR last_used_ptr;       /* Latest consulted character */
924
  PCRE2_SPTR mark;                /* Mark pointer to pass back on success */
925
  PCRE2_SPTR nomatch_mark;        /* Mark pointer to pass back on failure */
926
  PCRE2_SPTR verb_ecode_ptr;      /* For passing back info */
927
  PCRE2_SPTR verb_skip_ptr;       /* For passing back a (*SKIP) name */
928
  uint32_t verb_current_recurse;  /* Current recursion group when (*VERB) happens */
929
  uint32_t moptions;              /* Match options */
930
  uint32_t poptions;              /* Pattern options */
931
  uint32_t skip_arg_count;        /* For counting SKIP_ARGs */
932
  uint32_t ignore_skip_arg;       /* For re-run when SKIP arg name not found */
933
  uint32_t nltype;                /* Newline type */
934
  uint32_t nllen;                 /* Newline string length */
935
  PCRE2_UCHAR nl[4];              /* Newline string when fixed */
936
  pcre2_callout_block *cb;        /* Points to a callout block */
937
  void  *callout_data;            /* To pass back to callouts */
938
  int (*callout)(pcre2_callout_block *,void *);  /* Callout function or NULL */
939
} match_block;
940

941
/* A similar structure is used for the same purpose by the DFA matching
942
functions. */
943

944
typedef struct dfa_match_block {
945
  pcre2_memctl memctl;            /* For general use */
946
  PCRE2_SPTR start_code;          /* Start of the compiled pattern */
947
  PCRE2_SPTR start_subject ;      /* Start of the subject string */
948
  PCRE2_SPTR end_subject;         /* End of subject string */
949
  PCRE2_SPTR start_used_ptr;      /* Earliest consulted character */
950
  PCRE2_SPTR last_used_ptr;       /* Latest consulted character */
951
  const uint8_t *tables;          /* Character tables */
952
  PCRE2_SIZE start_offset;        /* The start offset value */
953
  uint32_t heap_limit;            /* As it says */
954
  PCRE2_SIZE heap_used;           /* As it says */
955
  uint32_t match_limit;           /* As it says */
956
  uint32_t match_limit_depth;     /* As it says */
957
  uint32_t match_call_count;      /* Number of calls of internal function */
958
  uint32_t moptions;              /* Match options */
959
  uint32_t poptions;              /* Pattern options */
960
  uint32_t nltype;                /* Newline type */
961
  uint32_t nllen;                 /* Newline string length */
962
  BOOL allowemptypartial;         /* Allow empty hard partial */
963
  PCRE2_UCHAR nl[4];              /* Newline string when fixed */
964
  uint16_t bsr_convention;        /* \R interpretation */
965
  pcre2_callout_block *cb;        /* Points to a callout block */
966
  void *callout_data;             /* To pass back to callouts */
967
  int (*callout)(pcre2_callout_block *,void *);  /* Callout function or NULL */
968
  dfa_recursion_info *recursive;  /* Linked list of pattern recursion data */
969
} dfa_match_block;
970

971
#endif  /* PCRE2_PCRE2TEST */
972

973
/* End of pcre2_intmodedep.h */
974

975
Product

Resources

Company