/* Path: blob/master/thirdparty/pcre2/src/pcre2_intmodedep.h
   21928 views */
/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2024 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/


/* This module contains mode-dependent macro and structure definitions. The
file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
These mode-dependent items are kept in a separate file so that they can also be
#included multiple times for different code unit widths by pcre2test in order
to have access to the hidden structures at all supported widths.

Some of the mode-dependent macros are required at different widths for
different parts of the pcre2test code (in particular, the included
pcre2_printint_inc.h file). We undefine them here so that they can be re-defined
for multiple inclusions. Not all of these are used in pcre2test, but it's easier
just to undefine them all.

You can also include pcre2_intmodedep.h with PCRE2_CODE_UNIT_WIDTH defined to
zero in order to simply clear the previous macros.
*/5556#ifndef PCRE2_CODE_UNIT_WIDTH57#error PCRE2_CODE_UNIT_WIDTH must be defined58#endif5960#undef ACROSSCHAR61#undef BACKCHAR62#undef BYTES2CU63#undef CHMAX_25564#undef CU2BYTES65#undef FORWARDCHAR66#undef FORWARDCHARTEST67#undef GET68#undef GET269#undef GETCHAR70#undef GETCHARINC71#undef GETCHARINCTEST72#undef GETCHARLEN73#undef GETCHARLENTEST74#undef GETCHARTEST75#undef GET_EXTRALEN76#undef HAS_EXTRALEN77#undef IMM2_SIZE78#undef MAX_25579#undef MAX_MARK80#undef MAX_PATTERN_SIZE81#undef MAX_UTF_SINGLE_CU82#undef NOT_FIRSTCU83#undef PUT84#undef PUT285#undef PUT2INC86#undef PUTCHAR87#undef PUTINC88#undef TABLE_GET8990/*************************************************91* MACROS *92*************************************************/9394/* Macros may be undefined and re-defined if the same file handles multiple95bit-widths. */9697#if PCRE2_CODE_UNIT_WIDTH != 09899/* PCRE keeps offsets in its compiled code as at least 16-bit quantities100(always stored in big-endian order in 8-bit mode) by default. These are used,101for example, to link from the start of a subpattern to its alternatives and its102end. The use of 16 bits per offset limits the size of an 8-bit compiled regex103to around 64K, which is big enough for almost everybody. However, I received a104request for an even bigger limit. For this reason, and also to make the code105easier to maintain, the storing and loading of offsets from the compiled code106unit string is now handled by the macros that are defined here.107108The macros are controlled by the value of LINK_SIZE. This defaults to 2, but109values of 3 or 4 are also supported. 
*/110111#ifndef CONFIGURED_LINK_SIZE112#if LINK_SIZE == 2113#define CONFIGURED_LINK_SIZE 2114#elif LINK_SIZE == 3115#define CONFIGURED_LINK_SIZE 3116#elif LINK_SIZE == 4117#define CONFIGURED_LINK_SIZE 4118#else119#error LINK_SIZE must be 2, 3, or 4120#endif121#endif /* CONFIGURED_LINK_SIZE */122123/* ------------------- 8-bit support ------------------ */124125#if PCRE2_CODE_UNIT_WIDTH == 8126127#if CONFIGURED_LINK_SIZE == 2128#define PUT(a,n,d) \129(a[n] = (PCRE2_UCHAR)((d) >> 8)), \130(a[(n)+1] = (PCRE2_UCHAR)((d) & 255))131#define GET(a,n) \132(unsigned int)(((a)[n] << 8) | (a)[(n)+1])133#define MAX_PATTERN_SIZE (1 << 16)134135#elif CONFIGURED_LINK_SIZE == 3136#define PUT(a,n,d) \137(a[n] = (PCRE2_UCHAR)((d) >> 16)), \138(a[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \139(a[(n)+2] = (PCRE2_UCHAR)((d) & 255))140#define GET(a,n) \141(unsigned int)(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])142#define MAX_PATTERN_SIZE (1 << 24)143144#elif CONFIGURED_LINK_SIZE == 4145#define PUT(a,n,d) \146(a[n] = (PCRE2_UCHAR)((d) >> 24)), \147(a[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \148(a[(n)+2] = (PCRE2_UCHAR)((d) >> 8)), \149(a[(n)+3] = (PCRE2_UCHAR)((d) & 255))150#define GET(a,n) \151(unsigned int)(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])152#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */153154#endif155156157/* ------------------- 16-bit support ------------------ */158159#elif PCRE2_CODE_UNIT_WIDTH == 16160161#if CONFIGURED_LINK_SIZE == 2162#undef LINK_SIZE163#define LINK_SIZE 1164#define PUT(a,n,d) \165(a[n] = (PCRE2_UCHAR)(d))166#define GET(a,n) \167(a[n])168#define MAX_PATTERN_SIZE (1 << 16)169170#elif CONFIGURED_LINK_SIZE == 3 || CONFIGURED_LINK_SIZE == 4171#undef LINK_SIZE172#define LINK_SIZE 2173#define PUT(a,n,d) \174(a[n] = (PCRE2_UCHAR)((d) >> 16)), \175(a[(n)+1] = (PCRE2_UCHAR)((d) & 65535))176#define GET(a,n) \177(unsigned int)(((a)[n] << 16) | (a)[(n)+1])178#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive 
*/179180#endif181182183/* ------------------- 32-bit support ------------------ */184185#elif PCRE2_CODE_UNIT_WIDTH == 32186#undef LINK_SIZE187#define LINK_SIZE 1188#define PUT(a,n,d) \189(a[n] = (d))190#define GET(a,n) \191(a[n])192#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */193194#else195#error Unsupported compiling mode196#endif197198199/* --------------- Other mode-specific macros ----------------- */200201/* PCRE uses some other (at least) 16-bit quantities that do not change when202the size of offsets changes. There are used for repeat counts and for other203things such as capturing parenthesis numbers in back references.204205Define the number of code units required to hold a 16-bit count/offset, and206macros to load and store such a value. For reasons that I do not understand,207the expression in the 8-bit GET2 macro is treated by gcc as a signed208expression, even when a is declared as unsigned. It seems that any kind of209arithmetic results in a signed value. Hence the cast. */210211#if PCRE2_CODE_UNIT_WIDTH == 8212#define IMM2_SIZE 2213#define GET2(a,n) (unsigned int)(((a)[n] << 8) | (a)[(n)+1])214#define PUT2(a,n,d) a[n] = (d) >> 8, a[(n)+1] = (d) & 255215216#elif PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32217#define IMM2_SIZE 1218#define GET2(a,n) a[n]219#define PUT2(a,n,d) a[n] = d220#endif221222/* Other macros that are different for 8-bit mode. The MAX_255 macro checks223whether its argument, which is assumed to be one code unit, is less than 256.224The CHMAX_255 macro does not assume one code unit. The maximum length of a MARK225name must fit in one code unit; currently it is set to 255 or 65535. The226TABLE_GET macro is used to access elements of tables containing exactly 256227items. Its argument is a code unit. When code points can be greater than 255, a228check is needed before accessing these tables. 
*/229230#if PCRE2_CODE_UNIT_WIDTH == 8231#define MAX_255(c) TRUE232#define MAX_MARK ((1u << 8) - 1)233#define TABLE_GET(c, table, default) ((table)[c])234#ifdef SUPPORT_UNICODE235#define SUPPORT_WIDE_CHARS236#define CHMAX_255(c) ((c) <= 255u)237#else238#define CHMAX_255(c) TRUE239#endif /* SUPPORT_UNICODE */240241#elif PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32242#define CHMAX_255(c) ((c) <= 255u)243#define MAX_255(c) ((c) <= 255u)244#define MAX_MARK ((1u << 16) - 1)245#define SUPPORT_WIDE_CHARS246#define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default))247#endif248249250/* ----------------- Character-handling macros ----------------- */251252/* There is a proposed future special "UTF-21" mode, in which only the lowest25321 bits of a 32-bit character are interpreted as UTF, with the remaining 11254high-order bits available to the application for other uses. In preparation for255the future implementation of this mode, there are macros that load a data item256and, if in this special mode, mask it to 21 bits. These macros all have names257starting with UCHAR21. In all other modes, including the normal 32-bit258library, the macros all have the same simple definitions. When the new mode is259implemented, it is expected that these definitions will be varied appropriately260using #ifdef when compiling the library that supports the special mode. */261262#define UCHAR21(eptr) (*(eptr))263#define UCHAR21TEST(eptr) (*(eptr))264#define UCHAR21INC(eptr) (*(eptr)++)265#define UCHAR21INCTEST(eptr) (*(eptr)++)266267/* When UTF encoding is being used, a character is no longer just a single268byte in 8-bit mode or a single short in 16-bit mode. The macros for character269handling generate simple sequences when used in the basic mode, and more270complicated ones for UTF characters. GETCHARLENTEST and other macros are not271used when UTF is not supported. To make sure they can never even appear when272UTF support is omitted, we don't even define them. 
*/273274#ifndef SUPPORT_UNICODE275276/* #define MAX_UTF_SINGLE_CU */277/* #define HAS_EXTRALEN(c) */278/* #define GET_EXTRALEN(c) */279/* #define NOT_FIRSTCU(c) */280#define GETCHAR(c, eptr) c = *eptr;281#define GETCHARTEST(c, eptr) c = *eptr;282#define GETCHARINC(c, eptr) c = *eptr++;283#define GETCHARINCTEST(c, eptr) c = *eptr++;284#define GETCHARLEN(c, eptr, len) c = *eptr;285#define PUTCHAR(c, p) (*p = c, 1)286/* #define GETCHARLENTEST(c, eptr, len) */287/* #define BACKCHAR(eptr) */288/* #define FORWARDCHAR(eptr) */289/* #define FORWARCCHARTEST(eptr,end) */290/* #define ACROSSCHAR(condition, eptr, action) */291292#else /* SUPPORT_UNICODE */293294/* ------------------- 8-bit support ------------------ */295296#if PCRE2_CODE_UNIT_WIDTH == 8297#define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */298299/* The largest UTF code point that can be encoded as a single code unit. */300301#define MAX_UTF_SINGLE_CU 127302303/* Tests whether the code point needs extra characters to decode. */304305#define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c)306307/* Returns with the additional number of characters if HAS_EXTRALEN(c) is TRUE.308Otherwise it has an undefined behaviour. */309310#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu])311312/* Returns TRUE, if the given value is not the first code unit of a UTF313sequence. */314315#define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u)316317/* Get the next UTF-8 character, not advancing the pointer. This is called when318we know we are in UTF-8 mode. */319320#define GETCHAR(c, eptr) \321c = *eptr; \322if (c >= 0xc0u) GETUTF8(c, eptr);323324/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the325pointer. */326327#define GETCHARTEST(c, eptr) \328c = *eptr; \329if (utf && c >= 0xc0u) GETUTF8(c, eptr);330331/* Get the next UTF-8 character, advancing the pointer. This is called when we332know we are in UTF-8 mode. 
*/333334#define GETCHARINC(c, eptr) \335c = *eptr++; \336if (c >= 0xc0u) GETUTF8INC(c, eptr);337338/* Get the next character, testing for UTF-8 mode, and advancing the pointer.339This is called when we don't know if we are in UTF-8 mode. */340341#define GETCHARINCTEST(c, eptr) \342c = *eptr++; \343if (utf && c >= 0xc0u) GETUTF8INC(c, eptr);344345/* Get the next UTF-8 character, not advancing the pointer, incrementing length346if there are extra bytes. This is called when we know we are in UTF-8 mode. */347348#define GETCHARLEN(c, eptr, len) \349c = *eptr; \350if (c >= 0xc0u) GETUTF8LEN(c, eptr, len);351352/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the353pointer, incrementing length if there are extra bytes. This is called when we354do not know if we are in UTF-8 mode. */355356#define GETCHARLENTEST(c, eptr, len) \357c = *eptr; \358if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len);359360/* If the pointer is not at the start of a character, move it back until361it is. This is called only in UTF-8 mode - we don't put a test within the macro362because almost all calls are already within a block of UTF-8 only code. */363364#define BACKCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr--365366/* Same as above, just in the other direction. */367#define FORWARDCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr++368#define FORWARDCHARTEST(eptr,end) while(eptr < end && (*eptr & 0xc0u) == 0x80u) eptr++369370/* Same as above, but it allows a fully customizable form. */371#define ACROSSCHAR(condition, eptr, action) \372while((condition) && ((*eptr) & 0xc0u) == 0x80u) action373374/* Deposit a character into memory, returning the number of code units. */375376#define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? 
\377PRIV(ord2utf)(c,p) : (*p = c, 1))378379380/* ------------------- 16-bit support ------------------ */381382#elif PCRE2_CODE_UNIT_WIDTH == 16383#define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */384385/* The largest UTF code point that can be encoded as a single code unit. */386387#define MAX_UTF_SINGLE_CU 65535388389/* Tests whether the code point needs extra characters to decode. */390391#define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u)392393/* Returns with the additional number of characters if HAS_EXTRALEN(c) is TRUE.394Otherwise it has an undefined behaviour. */395396#define GET_EXTRALEN(c) 1397398/* Returns TRUE, if the given value is not the first code unit of a UTF399sequence. */400401#define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u)402403/* Base macro to pick up the low surrogate of a UTF-16 character, not404advancing the pointer. */405406#define GETUTF16(c, eptr) \407{ c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; }408409/* Get the next UTF-16 character, not advancing the pointer. This is called when410we know we are in UTF-16 mode. */411412#define GETCHAR(c, eptr) \413c = *eptr; \414if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);415416/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the417pointer. */418419#define GETCHARTEST(c, eptr) \420c = *eptr; \421if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);422423/* Base macro to pick up the low surrogate of a UTF-16 character, advancing424the pointer. */425426#define GETUTF16INC(c, eptr) \427{ c = (((c & 0x3ffu) << 10) | (*eptr++ & 0x3ffu)) + 0x10000u; }428429/* Get the next UTF-16 character, advancing the pointer. This is called when we430know we are in UTF-16 mode. */431432#define GETCHARINC(c, eptr) \433c = *eptr++; \434if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);435436/* Get the next character, testing for UTF-16 mode, and advancing the pointer.437This is called when we don't know if we are in UTF-16 mode. 
*/438439#define GETCHARINCTEST(c, eptr) \440c = *eptr++; \441if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);442443/* Base macro to pick up the low surrogate of a UTF-16 character, not444advancing the pointer, incrementing the length. */445446#define GETUTF16LEN(c, eptr, len) \447{ c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; len++; }448449/* Get the next UTF-16 character, not advancing the pointer, incrementing450length if there is a low surrogate. This is called when we know we are in451UTF-16 mode. */452453#define GETCHARLEN(c, eptr, len) \454c = *eptr; \455if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);456457/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the458pointer, incrementing length if there is a low surrogate. This is called when459we do not know if we are in UTF-16 mode. */460461#define GETCHARLENTEST(c, eptr, len) \462c = *eptr; \463if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);464465/* If the pointer is not at the start of a character, move it back until466it is. This is called only in UTF-16 mode - we don't put a test within the467macro because almost all calls are already within a block of UTF-16 only468code. */469470#define BACKCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr--471472/* Same as above, just in the other direction. */473#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr++474#define FORWARDCHARTEST(eptr,end) if (eptr < end && (*eptr & 0xfc00u) == 0xdc00u) eptr++475476/* Same as above, but it allows a fully customizable form. */477#define ACROSSCHAR(condition, eptr, action) \478if ((condition) && ((*eptr) & 0xfc00u) == 0xdc00u) action479480/* Deposit a character into memory, returning the number of code units. */481482#define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? 
\483PRIV(ord2utf)(c,p) : (*p = c, 1))484485486/* ------------------- 32-bit support ------------------ */487488#elif PCRE2_CODE_UNIT_WIDTH == 32489490/* These are trivial for the 32-bit library, since all UTF-32 characters fit491into one PCRE2_UCHAR unit. */492493#define MAX_UTF_SINGLE_CU (0x10ffffu)494#define HAS_EXTRALEN(c) (0)495#define GET_EXTRALEN(c) (0)496#define NOT_FIRSTCU(c) (0)497498/* Get the next UTF-32 character, not advancing the pointer. This is called when499we know we are in UTF-32 mode. */500501#define GETCHAR(c, eptr) \502c = *(eptr);503504/* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the505pointer. */506507#define GETCHARTEST(c, eptr) \508c = *(eptr);509510/* Get the next UTF-32 character, advancing the pointer. This is called when we511know we are in UTF-32 mode. */512513#define GETCHARINC(c, eptr) \514c = *((eptr)++);515516/* Get the next character, testing for UTF-32 mode, and advancing the pointer.517This is called when we don't know if we are in UTF-32 mode. */518519#define GETCHARINCTEST(c, eptr) \520c = *((eptr)++);521522/* Get the next UTF-32 character, not advancing the pointer, not incrementing523length (since all UTF-32 is of length 1). This is called when we know we are in524UTF-32 mode. */525526#define GETCHARLEN(c, eptr, len) \527GETCHAR(c, eptr)528529/* Get the next UTF-32character, testing for UTF-32 mode, not advancing the530pointer, not incrementing the length (since all UTF-32 is of length 1).531This is called when we do not know if we are in UTF-32 mode. */532533#define GETCHARLENTEST(c, eptr, len) \534GETCHARTEST(c, eptr)535536/* If the pointer is not at the start of a character, move it back until537it is. This is called only in UTF-32 mode - we don't put a test within the538macro because almost all calls are already within a block of UTF-32 only539code.540541These are all no-ops since all UTF-32 characters fit into one PCRE2_UCHAR. 
*/542543#define BACKCHAR(eptr) do { } while (0)544545/* Same as above, just in the other direction. */546547#define FORWARDCHAR(eptr) do { } while (0)548#define FORWARDCHARTEST(eptr,end) do { } while (0)549550/* Same as above, but it allows a fully customizable form. */551552#define ACROSSCHAR(condition, eptr, action) do { } while (0)553554/* Deposit a character into memory, returning the number of code units. */555556#define PUTCHAR(c, p) (*p = c, 1)557558#endif /* UTF-32 character handling */559#endif /* SUPPORT_UNICODE */560561562/* Mode-dependent macros that have the same definition in all modes. */563564#define CU2BYTES(x) ((x)*((PCRE2_CODE_UNIT_WIDTH/8)))565#define BYTES2CU(x) ((x)/((PCRE2_CODE_UNIT_WIDTH/8)))566#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE567#define PUT2INC(a,n,d) PUT2(a,n,d), a += IMM2_SIZE568569#endif /* PCRE2_CODE_UNIT_WIDTH != 0 */570571572573/*************************************************574* STRUCTURES *575*************************************************/576577/* We need a more complex include guard than usual, because the file can be578included once for each bit-width to define the various structures. */579580#if PCRE2_CODE_UNIT_WIDTH == 8 && !defined PCRE2_INTMODEDEP_IDEMPOTENT_GUARD_8581#define PCRE2_INTMODEDEP_IDEMPOTENT_GUARD_8582#define PCRE2_INTMODEDEP_CAN_DEFINE583#endif584#if PCRE2_CODE_UNIT_WIDTH == 16 && !defined PCRE2_INTMODEDEP_IDEMPOTENT_GUARD_16585#define PCRE2_INTMODEDEP_IDEMPOTENT_GUARD_16586#define PCRE2_INTMODEDEP_CAN_DEFINE587#endif588#if PCRE2_CODE_UNIT_WIDTH == 32 && !defined PCRE2_INTMODEDEP_IDEMPOTENT_GUARD_32589#define PCRE2_INTMODEDEP_IDEMPOTENT_GUARD_32590#define PCRE2_INTMODEDEP_CAN_DEFINE591#endif592593#ifdef PCRE2_INTMODEDEP_CAN_DEFINE594#undef PCRE2_INTMODEDEP_CAN_DEFINE595596/* ----------------------- HIDDEN STRUCTURES ----------------------------- */597598/* NOTE: All these structures *must* start with a pcre2_memctl structure. The599code that uses them is simpler because it assumes this. 
*/600601/* The real general context structure. At present it holds only data for custom602memory control. */603604/* WARNING: if this is ever changed, code in pcre2_substitute.c will have to be605changed because it builds a general context "by hand" in order to avoid the606malloc() call in pcre2_general_context)_create(). There is also code in607pcre2_match.c that makes the same assumption. */608609typedef struct pcre2_real_general_context {610pcre2_memctl memctl;611} pcre2_real_general_context;612613/* The real compile context structure */614615typedef struct pcre2_real_compile_context {616pcre2_memctl memctl;617int (*stack_guard)(uint32_t, void *);618void *stack_guard_data;619const uint8_t *tables;620PCRE2_SIZE max_pattern_length;621PCRE2_SIZE max_pattern_compiled_length;622uint16_t bsr_convention;623uint16_t newline_convention;624uint32_t parens_nest_limit;625uint32_t extra_options;626uint32_t max_varlookbehind;627uint32_t optimization_flags;628} pcre2_real_compile_context;629630/* The real match context structure. */631632typedef struct pcre2_real_match_context {633pcre2_memctl memctl;634#ifdef SUPPORT_JIT635pcre2_jit_callback jit_callback;636void *jit_callback_data;637#endif638int (*callout)(pcre2_callout_block *, void *);639void *callout_data;640int (*substitute_callout)(pcre2_substitute_callout_block *, void *);641void *substitute_callout_data;642PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,643PCRE2_SIZE, int, void *);644void *substitute_case_callout_data;645PCRE2_SIZE offset_limit;646uint32_t heap_limit;647uint32_t match_limit;648uint32_t depth_limit;649} pcre2_real_match_context;650651/* The real convert context structure. */652653typedef struct pcre2_real_convert_context {654pcre2_memctl memctl;655uint32_t glob_separator;656uint32_t glob_escape;657} pcre2_real_convert_context;658659/* The real compiled code structure. 
The type for the blocksize field is660defined specially because it is required in pcre2_serialize_decode() when661copying the size from possibly unaligned memory into a variable of the same662type. Use a macro rather than a typedef to avoid compiler warnings when this663file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the664largest lookbehind that is supported. (OP_REVERSE and OP_VREVERSE in a pattern665have 16-bit arguments in 8-bit and 16-bit modes, so we need no more than a66616-bit field here.) */667668#undef CODE_BLOCKSIZE_TYPE669#define CODE_BLOCKSIZE_TYPE PCRE2_SIZE670671#undef LOOKBEHIND_MAX672#define LOOKBEHIND_MAX ((int)UINT16_MAX)673674typedef struct pcre2_real_code {675pcre2_memctl memctl; /* Memory control fields */676const uint8_t *tables; /* The character tables */677void *executable_jit; /* Pointer to JIT code */678uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */679CODE_BLOCKSIZE_TYPE blocksize; /* Total (bytes) that was malloc-ed */680CODE_BLOCKSIZE_TYPE code_start; /* Byte code start offset */681uint32_t magic_number; /* Paranoid and endianness check */682uint32_t compile_options; /* Options passed to pcre2_compile() */683uint32_t overall_options; /* Options after processing the pattern */684uint32_t extra_options; /* Taken from compile_context */685uint32_t flags; /* Various state flags */686uint32_t limit_heap; /* Limit set in the pattern */687uint32_t limit_match; /* Limit set in the pattern */688uint32_t limit_depth; /* Limit set in the pattern */689uint32_t first_codeunit; /* Starting code unit */690uint32_t last_codeunit; /* This codeunit must be seen */691uint16_t bsr_convention; /* What \R matches */692uint16_t newline_convention; /* What is a newline? 
*/693uint16_t max_lookbehind; /* Longest lookbehind (characters) */694uint16_t minlength; /* Minimum length of match */695uint16_t top_bracket; /* Highest numbered group */696uint16_t top_backref; /* Highest numbered back reference */697uint16_t name_entry_size; /* Size (code units) of table entries */698uint16_t name_count; /* Number of name entries in the table */699uint32_t optimization_flags; /* Optimizations enabled at compile time */700} pcre2_real_code;701702/* The real match data structure. Define ovector as large as it can ever703actually be so that array bound checkers don't grumble. Memory for this704structure is obtained by calling pcre2_match_data_create(), which sets the size705as the offset of ovector plus a pair of elements for each capturable string, so706the size varies from call to call. As the maximum number of capturing707subpatterns is 65535 we must allow for 65536 strings to include the overall708match. (See also the heapframe structure below.) */709710struct heapframe; /* Forward reference */711712typedef struct pcre2_real_match_data {713pcre2_memctl memctl; /* Memory control fields */714const pcre2_real_code *code; /* The pattern used for the match */715PCRE2_SPTR subject; /* The subject that was matched */716PCRE2_SPTR mark; /* Pointer to last mark */717struct heapframe *heapframes; /* Backtracking frames heap memory */718PCRE2_SIZE heapframes_size; /* Malloc-ed size */719PCRE2_SIZE subject_length; /* Subject length */720PCRE2_SIZE start_offset; /* Offset to start of search */721PCRE2_SIZE leftchar; /* Offset to leftmost code unit */722PCRE2_SIZE rightchar; /* Offset to rightmost code unit */723PCRE2_SIZE startchar; /* Offset to starting code unit */724uint8_t matchedby; /* Type of match (normal, JIT, DFA) */725uint8_t flags; /* Various flags */726uint16_t oveccount; /* Number of pairs */727uint32_t options; /* Options passed in to the match call */728int rc; /* The return code from the match */729PCRE2_SIZE ovector[131072]; /* Must be 
last in the structure */730} pcre2_real_match_data;731732733/* ----------------------- PRIVATE STRUCTURES ----------------------------- */734735/* These structures are not needed for pcre2test. */736737#ifndef PCRE2_PCRE2TEST738739/* Structures for checking for mutual function recursion when scanning compiled740or parsed code. */741742typedef struct recurse_check {743struct recurse_check *prev;744PCRE2_SPTR group;745} recurse_check;746747typedef struct parsed_recurse_check {748struct parsed_recurse_check *prev;749uint32_t *groupptr;750} parsed_recurse_check;751752/* Structure for building a cache when filling in pattern recursion offsets. */753754typedef struct recurse_cache {755PCRE2_SPTR group;756int groupnumber;757} recurse_cache;758759/* Structure for maintaining a chain of pointers to the currently incomplete760branches, for testing for left recursion while compiling. */761762typedef struct branch_chain {763struct branch_chain *outer;764PCRE2_UCHAR *current_branch;765} branch_chain;766767/* Structure for building a list of named groups during the first pass of768compiling. When a duplicate name is stored in the list, its name is set to769the name of the first entry with the same name, and its length is set to 0. */770771typedef struct named_group {772PCRE2_SPTR name; /* Points to the name in the pattern */773uint32_t number; /* Group number */774uint16_t length; /* Length of the name */775uint16_t hash_dup; /* A concatenation of a 15 bit hash code and776a singe bit which represents duplication */777} named_group;778779/* Structure for storing compile time data. */780781typedef struct compile_data {782struct compile_data *next; /* Next compile data */783#ifdef PCRE2_DEBUG784uint8_t type; /* Debug only type of the data */785#endif786} compile_data;787788/* Structure for caching sorted ranges. This improves the performance789of translating META code to byte code. 
*/790791typedef struct class_ranges {792compile_data header; /* Common header */793size_t char_lists_size; /* Total size of encoded char lists */794size_t char_lists_start; /* Start offset of encoded char lists */795uint16_t range_list_size; /* Size of ranges array */796uint16_t char_lists_types; /* The XCL_LIST header of char lists */797/* Followed by the list of ranges (start/end pairs) */798} class_ranges;799800/* Structure for sorted recurse arguments. */801802typedef struct recurse_arguments {803compile_data header; /* Common header */804size_t size; /* Total size */805size_t skip_size; /* Space consumed by arguments */806} recurse_arguments;807808typedef union class_bits_storage {809uint8_t classbits[32];810uint32_t classwords[8];811} class_bits_storage;812813/* Structure for passing "static" information around between the functions814doing the compiling, so that they are thread-safe. */815816typedef struct compile_block {817pcre2_real_compile_context *cx; /* Points to the compile context */818const uint8_t *lcc; /* Points to lower casing table */819const uint8_t *fcc; /* Points to case-flipping table */820const uint8_t *cbits; /* Points to character type table */821const uint8_t *ctypes; /* Points to table of type maps */822PCRE2_UCHAR *start_workspace; /* The start of working space */823PCRE2_UCHAR *start_code; /* The start of the compiled code */824PCRE2_SPTR start_pattern; /* The start of the pattern */825PCRE2_SPTR end_pattern; /* The end of the pattern */826PCRE2_UCHAR *name_table; /* The name/number table */827PCRE2_SIZE workspace_size; /* Size of workspace */828PCRE2_SIZE small_ref_offset[10]; /* Offsets for \1 to \9 */829PCRE2_SIZE erroroffset; /* Offset of error in pattern */830class_bits_storage classbits; /* Temporary store for classbits */831uint16_t names_found; /* Number of entries so far */832uint16_t name_entry_size; /* Size of each entry */833uint16_t parens_depth; /* Depth of nested parentheses */834uint16_t assert_depth; /* Depth of nested 
assertions */835named_group *named_groups; /* Points to vector in pre-compile */836uint32_t named_group_list_size; /* Number of entries in the list */837uint32_t external_options; /* External (initial) options */838uint32_t external_flags; /* External flag bits to be set */839uint32_t bracount; /* Count of capturing parentheses */840uint32_t lastcapture; /* Last capture encountered */841uint32_t *parsed_pattern; /* Parsed pattern buffer */842uint32_t *parsed_pattern_end; /* Parsed pattern should not get here */843uint32_t *groupinfo; /* Group info vector */844uint32_t top_backref; /* Maximum back reference */845uint32_t backref_map; /* Bitmap of low back refs */846uint32_t nltype; /* Newline type */847uint32_t nllen; /* Newline string length */848PCRE2_UCHAR nl[4]; /* Newline string when fixed length */849uint8_t class_op_used[ECLASS_NEST_LIMIT]; /* Operation used for850extended classes */851uint32_t req_varyopt; /* "After variable item" flag for reqbyte */852uint32_t max_varlookbehind; /* Limit for variable lookbehinds */853int max_lookbehind; /* Maximum lookbehind encountered (characters) */854BOOL had_accept; /* (*ACCEPT) encountered */855BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */856BOOL had_recurse; /* Had a pattern recursion or subroutine call */857BOOL dupnames; /* Duplicate names exist */858compile_data *first_data; /* First item in the compile data list */859compile_data *last_data; /* Last item in the compile data list */860#ifdef SUPPORT_WIDE_CHARS861size_t char_lists_size; /* Current size of character lists */862#endif863} compile_block;864865/* Structure for keeping the properties of the in-memory stack used866by the JIT matcher. */867868typedef struct pcre2_real_jit_stack {869pcre2_memctl memctl;870void* stack;871} pcre2_real_jit_stack;872873/* Structure for items in a linked list that represents an explicit recursive874call within the pattern when running pcre2_dfa_match(). 
*/875876typedef struct dfa_recursion_info {877struct dfa_recursion_info *prevrec;878PCRE2_SPTR subject_position;879PCRE2_SPTR last_used_ptr;880uint32_t group_num;881} dfa_recursion_info;882883/* Structure for "stack" frames that are used for remembering backtracking884positions during matching. As these are used in a vector, with the ovector item885being extended, the size of the structure must be a multiple of PCRE2_SIZE. The886only way to check this at compile time is to force an error by generating an887array with a negative size. By putting this in a typedef (which is never used),888we don't generate any code when all is well. */889890typedef struct heapframe {891892/* The first set of fields are variables that have to be preserved over calls893to RRMATCH(), but which do not need to be copied to new frames. */894895PCRE2_SPTR ecode; /* The current position in the pattern */896PCRE2_SPTR temp_sptr[2]; /* Used for short-term PCRE2_SPTR values */897PCRE2_SIZE length; /* Used for character, string, or code lengths */898PCRE2_SIZE back_frame; /* Amount to subtract on RRETURN */899PCRE2_SIZE temp_size; /* Used for short-term PCRE2_SIZE values */900uint32_t rdepth; /* Function "recursion" depth within pcre2_match() */901uint32_t group_frame_type; /* Type information for group frames */902uint32_t temp_32[4]; /* Used for short-term 32-bit or BOOL values */903uint8_t return_id; /* Where to go on in internal "return" */904uint8_t op; /* Processing opcode */905906/* At this point, the structure is 16-bit aligned. On most architectures907the alignment requirement for a pointer will ensure that the eptr field below908is 32-bit or 64-bit aligned. However, on m68k it is fine to have a pointer909that is 16-bit aligned. We must therefore ensure that what comes between here910and eptr is an odd multiple of 16 bits so as to get back into 32-bit911alignment. This happens naturally when PCRE2_UCHAR is 8 bits wide, but needs912fudges in the other cases. 
In the 32-bit case the padding comes first so that913the occu field itself is 32-bit aligned. Without the padding, this structure914is no longer a multiple of PCRE2_SIZE on m68k, and the check below fails. */915916#if PCRE2_CODE_UNIT_WIDTH == 8917PCRE2_UCHAR occu[6]; /* Used for other case code units */918#elif PCRE2_CODE_UNIT_WIDTH == 16919PCRE2_UCHAR occu[2]; /* Used for other case code units */920uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */921#else922uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */923PCRE2_UCHAR occu[1]; /* Used for other case code units */924#endif925926/* The rest have to be copied from the previous frame whenever a new frame927becomes current. The final field is specified as a large vector so that928runtime array bound checks don't catch references to it. However, for any929specific call to pcre2_match() the memory allocated for each frame structure930allows for exactly the right size ovector for the number of capturing931parentheses. (See also the comment for pcre2_real_match_data above.) */932933PCRE2_SPTR eptr; /* MUST BE FIRST */934PCRE2_SPTR start_match; /* Can be adjusted by \K */935PCRE2_SPTR mark; /* Most recent mark on the success path */936PCRE2_SPTR recurse_last_used; /* Last character used at time of pattern recursion */937uint32_t current_recurse; /* Group number of current (deepest) pattern recursion */938uint32_t capture_last; /* Most recent capture */939PCRE2_SIZE last_group_offset; /* Saved offset to most recent group frame */940PCRE2_SIZE offset_top; /* Offset after highest capture */941PCRE2_SIZE ovector[131072]; /* Must be last in the structure */942} heapframe;943944/* Assert that the size of the heapframe structure is a multiple of PCRE2_SIZE.945See various comments above. */946947STATIC_ASSERT((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0, heapframe_size);948949/* Structure for computing the alignment of heapframe. 
*/950951typedef struct heapframe_align {952char unalign; /* Completely unalign the current offset */953heapframe frame; /* Offset is its alignment */954} heapframe_align;955956/* This define is the minimum alignment required for a heapframe, in bytes. */957958#define HEAPFRAME_ALIGNMENT offsetof(heapframe_align, frame)959960/* Structure for passing "static" information around between the functions961doing traditional NFA matching (pcre2_match() and friends). */962963typedef struct match_block {964pcre2_memctl memctl; /* For general use */965uint32_t heap_limit; /* As it says */966uint32_t match_limit; /* As it says */967uint32_t match_limit_depth; /* As it says */968uint32_t match_call_count; /* Number of times a new frame is created */969BOOL hitend; /* Hit the end of the subject at some point */970BOOL hasthen; /* Pattern contains (*THEN) */971BOOL hasbsk; /* Pattern contains \K */972BOOL allowemptypartial; /* Allow empty hard partial */973BOOL allowlookaroundbsk; /* Allow \K within lookarounds */974const uint8_t *lcc; /* Points to lower casing table */975const uint8_t *fcc; /* Points to case-flipping table */976const uint8_t *ctypes; /* Points to table of type maps */977PCRE2_SIZE start_offset; /* The start offset value */978PCRE2_SIZE end_offset_top; /* Highwater mark at end of match */979uint16_t partial; /* PARTIAL options */980uint16_t bsr_convention; /* \R interpretation */981uint16_t name_count; /* Number of names in name table */982uint16_t name_entry_size; /* Size of entry in names table */983PCRE2_SPTR name_table; /* Table of group names */984PCRE2_SPTR start_code; /* For use in pattern recursion */985PCRE2_SPTR start_subject; /* Start of the subject string */986PCRE2_SPTR check_subject; /* Where UTF-checked from */987PCRE2_SPTR end_subject; /* Usable end of the subject string */988PCRE2_SPTR true_end_subject; /* Actual end of the subject string */989PCRE2_SPTR end_match_ptr; /* Subject position at end match */990PCRE2_SPTR start_used_ptr; /* Earliest 
consulted character */991PCRE2_SPTR last_used_ptr; /* Latest consulted character */992PCRE2_SPTR mark; /* Mark pointer to pass back on success */993PCRE2_SPTR nomatch_mark; /* Mark pointer to pass back on failure */994PCRE2_SPTR verb_ecode_ptr; /* For passing back info */995PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */996uint32_t verb_current_recurse; /* Current recursion group when (*VERB) happens */997uint32_t moptions; /* Match options */998uint32_t poptions; /* Pattern options */999uint32_t skip_arg_count; /* For counting SKIP_ARGs */1000uint32_t ignore_skip_arg; /* For re-run when SKIP arg name not found */1001uint32_t nltype; /* Newline type */1002uint32_t nllen; /* Newline string length */1003PCRE2_UCHAR nl[4]; /* Newline string when fixed */1004pcre2_callout_block *cb; /* Points to a callout block */1005void *callout_data; /* To pass back to callouts */1006int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */1007} match_block;10081009/* A similar structure is used for the same purpose by the DFA matching1010functions. 
*/10111012typedef struct dfa_match_block {1013pcre2_memctl memctl; /* For general use */1014PCRE2_SPTR start_code; /* Start of the compiled pattern */1015PCRE2_SPTR start_subject ; /* Start of the subject string */1016PCRE2_SPTR end_subject; /* End of subject string */1017PCRE2_SPTR start_used_ptr; /* Earliest consulted character */1018PCRE2_SPTR last_used_ptr; /* Latest consulted character */1019const uint8_t *tables; /* Character tables */1020PCRE2_SIZE start_offset; /* The start offset value */1021uint32_t heap_limit; /* As it says */1022PCRE2_SIZE heap_used; /* As it says */1023uint32_t match_limit; /* As it says */1024uint32_t match_limit_depth; /* As it says */1025uint32_t match_call_count; /* Number of calls of internal function */1026uint32_t moptions; /* Match options */1027uint32_t poptions; /* Pattern options */1028uint32_t nltype; /* Newline type */1029uint32_t nllen; /* Newline string length */1030BOOL allowemptypartial; /* Allow empty hard partial */1031PCRE2_UCHAR nl[4]; /* Newline string when fixed */1032uint16_t bsr_convention; /* \R interpretation */1033pcre2_callout_block *cb; /* Points to a callout block */1034void *callout_data; /* To pass back to callouts */1035int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */1036dfa_recursion_info *recursive; /* Linked list of pattern recursion data */1037} dfa_match_block;10381039#endif /* PCRE2_PCRE2TEST */10401041#endif /* PCRE2_INTMODEDEP_CAN_DEFINE */10421043/* End of pcre2_intmodedep.h */104410451046