Path: blob/master/thirdparty/pcre2/src/pcre2_compile_class.c
9898 views
/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8Original API code Copyright (c) 1997-2012 University of Cambridge9New API code Copyright (c) 2016-2024 University of Cambridge1011-----------------------------------------------------------------------------12Redistribution and use in source and binary forms, with or without13modification, are permitted provided that the following conditions are met:1415* Redistributions of source code must retain the above copyright notice,16this list of conditions and the following disclaimer.1718* Redistributions in binary form must reproduce the above copyright19notice, this list of conditions and the following disclaimer in the20documentation and/or other materials provided with the distribution.2122* Neither the name of the University of Cambridge nor the names of its23contributors may be used to endorse or promote products derived from24this software without specific prior written permission.2526THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE36POSSIBILITY OF SUCH DAMAGE.37-----------------------------------------------------------------------------38*/3940#ifdef HAVE_CONFIG_H41#include "config.h"42#endif4344#include "pcre2_compile.h"4546typedef struct {47/* Option bits for eclass. */48uint32_t options;49uint32_t xoptions;50/* Rarely used members. */51int *errorcodeptr;52compile_block *cb;53/* Bitmap is needed. */54BOOL needs_bitmap;55} eclass_context;5657/* Checks the allowed tokens at the end of a class structure in debug mode.58When a new token is not processed by all loops, and the token is equals to59a) one of the cases here:60the compiler will complain about a duplicated case value.61b) none of the cases here:62the loop without the handler will stop with an assertion failure. */6364#ifdef PCRE2_DEBUG65#define CLASS_END_CASES(meta) \66default: \67PCRE2_ASSERT((meta) <= META_END); \68/* Fall through */ \69case META_CLASS: \70case META_CLASS_NOT: \71case META_CLASS_EMPTY: \72case META_CLASS_EMPTY_NOT: \73case META_CLASS_END: \74case META_ECLASS_AND: \75case META_ECLASS_OR: \76case META_ECLASS_SUB: \77case META_ECLASS_XOR: \78case META_ECLASS_NOT:79#else80#define CLASS_END_CASES(meta) \81default:82#endif8384#ifdef SUPPORT_WIDE_CHARS8586/* Heapsort algorithm. */8788static void do_heapify(uint32_t *buffer, size_t size, size_t i)89{90size_t max;91size_t left;92size_t right;93uint32_t tmp1, tmp2;9495while (TRUE)96{97max = i;98left = (i << 1) + 2;99right = left + 2;100101if (left < size && buffer[left] > buffer[max]) max = left;102if (right < size && buffer[right] > buffer[max]) max = right;103if (i == max) return;104105/* Swap items. */106tmp1 = buffer[i];107tmp2 = buffer[i + 1];108buffer[i] = buffer[max];109buffer[i + 1] = buffer[max + 1];110buffer[max] = tmp1;111buffer[max + 1] = tmp2;112i = max;113}114}115116#ifdef SUPPORT_UNICODE117118#define PARSE_CLASS_UTF 0x1119#define PARSE_CLASS_CASELESS_UTF 0x2120#define PARSE_CLASS_RESTRICTED_UTF 0x4121#define PARSE_CLASS_TURKISH_UTF 0x8122123/* Get the range of nocase characters which includes the124'c' character passed as argument, or directly follows 'c'. */125126static const uint32_t*127get_nocase_range(uint32_t c)128{129uint32_t left = 0;130uint32_t right = PRIV(ucd_nocase_ranges_size);131uint32_t middle;132133if (c > MAX_UTF_CODE_POINT) return PRIV(ucd_nocase_ranges) + right;134135while (TRUE)136{137/* Range end of the middle element. */138middle = ((left + right) >> 1) | 0x1;139140if (PRIV(ucd_nocase_ranges)[middle] <= c)141left = middle + 1;142else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c)143right = middle - 1;144else145return PRIV(ucd_nocase_ranges) + (middle - 1);146}147}148149/* Get the list of othercase characters, which belongs to the passed range.150Create ranges from these characters, and append them to the buffer argument. */151152static size_t153utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options,154uint32_t *buffer)155{156uint32_t new_start = start;157uint32_t new_end = end;158uint32_t c = start;159const uint32_t *list;160uint32_t tmp[3];161size_t result = 2;162const uint32_t *skip_range = get_nocase_range(c);163uint32_t skip_start = skip_range[0];164165#if PCRE2_CODE_UNIT_WIDTH == 8166PCRE2_ASSERT(options & PARSE_CLASS_UTF);167#endif168169#if PCRE2_CODE_UNIT_WIDTH == 32170if (end > MAX_UTF_CODE_POINT) end = MAX_UTF_CODE_POINT;171#endif172173while (c <= end)174{175uint32_t co;176177if (c > skip_start)178{179c = skip_range[1];180skip_range += 2;181skip_start = skip_range[0];182continue;183}184185/* Compute caseless set. */186187if ((options & (PARSE_CLASS_TURKISH_UTF|PARSE_CLASS_RESTRICTED_UTF)) ==188PARSE_CLASS_TURKISH_UTF &&189UCD_ANY_I(c))190{191co = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);192}193else if ((co = UCD_CASESET(c)) != 0 &&194(options & PARSE_CLASS_RESTRICTED_UTF) != 0 &&195PRIV(ucd_caseless_sets)[co] < 128)196{197co = 0; /* Ignore the caseless set if it's restricted. */198}199200if (co != 0)201list = PRIV(ucd_caseless_sets) + co;202else203{204co = UCD_OTHERCASE(c);205list = tmp;206tmp[0] = c;207tmp[1] = NOTACHAR;208209if (co != c)210{211tmp[1] = co;212tmp[2] = NOTACHAR;213}214}215c++;216217/* Add characters. */218do219{220#if PCRE2_CODE_UNIT_WIDTH == 16221if (!(options & PARSE_CLASS_UTF) && *list > 0xffff) continue;222#endif223224if (*list < new_start)225{226if (*list + 1 == new_start)227{228new_start--;229continue;230}231}232else if (*list > new_end)233{234if (*list - 1 == new_end)235{236new_end++;237continue;238}239}240else continue;241242result += 2;243if (buffer != NULL)244{245buffer[0] = *list;246buffer[1] = *list;247buffer += 2;248}249}250while (*(++list) != NOTACHAR);251}252253if (buffer != NULL)254{255buffer[0] = new_start;256buffer[1] = new_end;257buffer += 2;258(void)buffer;259}260return result;261}262263#endif264265/* Add a character list to a buffer. */266267static size_t268append_char_list(const uint32_t *p, uint32_t *buffer)269{270const uint32_t *n;271size_t result = 0;272273while (*p != NOTACHAR)274{275n = p;276while (n[0] == n[1] - 1) n++;277278PCRE2_ASSERT(*p < 0xffff);279280if (buffer != NULL)281{282buffer[0] = *p;283buffer[1] = *n;284buffer += 2;285}286287result += 2;288p = n + 1;289}290291return result;292}293294static uint32_t295get_highest_char(uint32_t options)296{297(void)options; /* Avoid compiler warning. */298299#if PCRE2_CODE_UNIT_WIDTH == 8300return MAX_UTF_CODE_POINT;301#else302#ifdef SUPPORT_UNICODE303return GET_MAX_CHAR_VALUE((options & PARSE_CLASS_UTF) != 0);304#else305return MAX_UCHAR_VALUE;306#endif307#endif308}309310/* Add a negated character list to a buffer. */311static size_t312append_negated_char_list(const uint32_t *p, uint32_t options, uint32_t *buffer)313{314const uint32_t *n;315uint32_t start = 0;316size_t result = 2;317318PCRE2_ASSERT(*p > 0);319320while (*p != NOTACHAR)321{322n = p;323while (n[0] == n[1] - 1) n++;324325PCRE2_ASSERT(*p < 0xffff);326327if (buffer != NULL)328{329buffer[0] = start;330buffer[1] = *p - 1;331buffer += 2;332}333334result += 2;335start = *n + 1;336p = n + 1;337}338339if (buffer != NULL)340{341buffer[0] = start;342buffer[1] = get_highest_char(options);343buffer += 2;344(void)buffer;345}346347return result;348}349350static uint32_t *351append_non_ascii_range(uint32_t options, uint32_t *buffer)352{353if (buffer == NULL) return NULL;354355buffer[0] = 0x100;356buffer[1] = get_highest_char(options);357return buffer + 2;358}359360static size_t361parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer)362{363size_t total_size = 0;364size_t size;365uint32_t meta_arg;366uint32_t start_char;367368while (TRUE)369{370switch (META_CODE(*ptr))371{372case META_ESCAPE:373meta_arg = META_DATA(*ptr);374switch (meta_arg)375{376case ESC_D:377case ESC_W:378case ESC_S:379buffer = append_non_ascii_range(options, buffer);380total_size += 2;381break;382383case ESC_h:384size = append_char_list(PRIV(hspace_list), buffer);385total_size += size;386if (buffer != NULL) buffer += size;387break;388389case ESC_H:390size = append_negated_char_list(PRIV(hspace_list), options, buffer);391total_size += size;392if (buffer != NULL) buffer += size;393break;394395case ESC_v:396size = append_char_list(PRIV(vspace_list), buffer);397total_size += size;398if (buffer != NULL) buffer += size;399break;400401case ESC_V:402size = append_negated_char_list(PRIV(vspace_list), options, buffer);403total_size += size;404if (buffer != NULL) buffer += size;405break;406407case ESC_p:408case ESC_P:409ptr++;410if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY)411{412if (buffer != NULL)413{414buffer[0] = 0;415buffer[1] = get_highest_char(options);416buffer += 2;417}418total_size += 2;419}420break;421}422ptr++;423continue;424case META_POSIX_NEG:425buffer = append_non_ascii_range(options, buffer);426total_size += 2;427ptr += 2;428continue;429case META_POSIX:430ptr += 2;431continue;432case META_BIGVALUE:433/* Character literal */434ptr++;435break;436CLASS_END_CASES(*ptr)437if (*ptr >= META_END) return total_size;438break;439}440441start_char = *ptr;442443if (ptr[1] == META_RANGE_LITERAL || ptr[1] == META_RANGE_ESCAPED)444{445ptr += 2;446PCRE2_ASSERT(*ptr < META_END || *ptr == META_BIGVALUE);447448if (*ptr == META_BIGVALUE) ptr++;449450#ifdef EBCDIC451#error "Missing EBCDIC support"452#endif453}454455#ifdef SUPPORT_UNICODE456if (options & PARSE_CLASS_CASELESS_UTF)457{458size = utf_caseless_extend(start_char, *ptr++, options, buffer);459if (buffer != NULL) buffer += size;460total_size += size;461continue;462}463#endif464465if (buffer != NULL)466{467buffer[0] = start_char;468buffer[1] = *ptr;469buffer += 2;470}471472ptr++;473total_size += 2;474}475476return total_size;477}478479/* Extra uint32_t values for storing the lengths of range lists in480the worst case. Two uint32_t lengths and a range end for a range481starting before 255 */482#define CHAR_LIST_EXTRA_SIZE 3483484/* Starting character values for each character list. */485486static const uint32_t char_list_starts[] = {487#if PCRE2_CODE_UNIT_WIDTH == 32488XCL_CHAR_LIST_HIGH_32_START,489#endif490#if PCRE2_CODE_UNIT_WIDTH == 32 || defined SUPPORT_UNICODE491XCL_CHAR_LIST_LOW_32_START,492#endif493XCL_CHAR_LIST_HIGH_16_START,494/* Must be terminated by XCL_CHAR_LIST_LOW_16_START,495which also represents the end of the bitset. */496XCL_CHAR_LIST_LOW_16_START,497};498499static class_ranges *500compile_optimize_class(uint32_t *start_ptr, uint32_t options,501uint32_t xoptions, compile_block *cb)502{503class_ranges* cranges;504uint32_t *ptr;505uint32_t *buffer;506uint32_t *dst;507uint32_t class_options = 0;508size_t range_list_size = 0, total_size, i;509uint32_t tmp1, tmp2;510const uint32_t *char_list_next;511uint16_t *next_char;512uint32_t char_list_start, char_list_end;513uint32_t range_start, range_end;514515#ifdef SUPPORT_UNICODE516if (options & PCRE2_UTF)517class_options |= PARSE_CLASS_UTF;518519if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF|PCRE2_UCP)))520class_options |= PARSE_CLASS_CASELESS_UTF;521522if (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT)523class_options |= PARSE_CLASS_RESTRICTED_UTF;524525if (xoptions & PCRE2_EXTRA_TURKISH_CASING)526class_options |= PARSE_CLASS_TURKISH_UTF;527#endif528529/* Compute required space for the range. */530531range_list_size = parse_class(start_ptr, class_options, NULL);532PCRE2_ASSERT((range_list_size & 0x1) == 0);533534/* Allocate buffer. The total_size also represents the end of the buffer. */535536total_size = range_list_size +537((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0);538539cranges = cb->cx->memctl.malloc(540sizeof(class_ranges) + total_size * sizeof(uint32_t),541cb->cx->memctl.memory_data);542543if (cranges == NULL) return NULL;544545cranges->next = NULL;546cranges->range_list_size = (uint16_t)range_list_size;547cranges->char_lists_types = 0;548cranges->char_lists_size = 0;549cranges->char_lists_start = 0;550551if (range_list_size == 0) return cranges;552553buffer = (uint32_t*)(cranges + 1);554parse_class(start_ptr, class_options, buffer);555556/* Using <= instead of == to help static analysis. */557if (range_list_size <= 2) return cranges;558559/* In-place sorting of ranges. */560561i = (((range_list_size >> 2) - 1) << 1);562while (TRUE)563{564do_heapify(buffer, range_list_size, i);565if (i == 0) break;566i -= 2;567}568569i = range_list_size - 2;570while (TRUE)571{572tmp1 = buffer[i];573tmp2 = buffer[i + 1];574buffer[i] = buffer[0];575buffer[i + 1] = buffer[1];576buffer[0] = tmp1;577buffer[1] = tmp2;578579do_heapify(buffer, i, 0);580if (i == 0) break;581i -= 2;582}583584/* Merge ranges whenever possible. */585dst = buffer;586ptr = buffer + 2;587range_list_size -= 2;588589/* The second condition is a very rare corner case, where the end of the last590range is the maximum character. This range cannot be extended further. */591592while (range_list_size > 0 && dst[1] != ~(uint32_t)0)593{594if (dst[1] + 1 < ptr[0])595{596dst += 2;597dst[0] = ptr[0];598dst[1] = ptr[1];599}600else if (dst[1] < ptr[1]) dst[1] = ptr[1];601602ptr += 2;603range_list_size -= 2;604}605606PCRE2_ASSERT(dst[1] <= get_highest_char(class_options));607608/* When the number of ranges are less than six,609they are not converted to range lists. */610611ptr = buffer;612while (ptr < dst && ptr[1] < 0x100) ptr += 2;613if (dst - ptr < (2 * (6 - 1)))614{615cranges->range_list_size = (uint16_t)(dst + 2 - buffer);616return cranges;617}618619/* Compute character lists structures. */620621char_list_next = char_list_starts;622char_list_start = *char_list_next++;623#if PCRE2_CODE_UNIT_WIDTH == 32624char_list_end = XCL_CHAR_LIST_HIGH_32_END;625#elif defined SUPPORT_UNICODE626char_list_end = XCL_CHAR_LIST_LOW_32_END;627#else628char_list_end = XCL_CHAR_LIST_HIGH_16_END;629#endif630next_char = (uint16_t*)(buffer + total_size);631632tmp1 = 0;633tmp2 = ((sizeof(char_list_starts) / sizeof(uint32_t)) - 1) * XCL_TYPE_BIT_LEN;634PCRE2_ASSERT(tmp2 <= 3 * XCL_TYPE_BIT_LEN && tmp2 >= XCL_TYPE_BIT_LEN);635range_start = dst[0];636range_end = dst[1];637638while (TRUE)639{640if (range_start >= char_list_start)641{642if (range_start == range_end || range_end < char_list_end)643{644tmp1++;645next_char--;646647if (char_list_start < XCL_CHAR_LIST_LOW_32_START)648*next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);649else650*(uint32_t*)(--next_char) =651(range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;652}653654if (range_start < range_end)655{656if (range_start > char_list_start)657{658tmp1++;659next_char--;660661if (char_list_start < XCL_CHAR_LIST_LOW_32_START)662*next_char = (uint16_t)(range_start << XCL_CHAR_SHIFT);663else664*(uint32_t*)(--next_char) = (range_start << XCL_CHAR_SHIFT);665}666else667cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;668}669670PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);671672if (dst > buffer)673{674dst -= 2;675range_start = dst[0];676range_end = dst[1];677continue;678}679680range_start = 0;681range_end = 0;682}683684if (range_end >= char_list_start)685{686PCRE2_ASSERT(range_start < char_list_start);687688if (range_end < char_list_end)689{690tmp1++;691next_char--;692693if (char_list_start < XCL_CHAR_LIST_LOW_32_START)694*next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);695else696*(uint32_t*)(--next_char) =697(range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;698699PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);700}701702cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;703}704705if (tmp1 >= XCL_ITEM_COUNT_MASK)706{707cranges->char_lists_types |= XCL_ITEM_COUNT_MASK << tmp2;708next_char--;709710if (char_list_start < XCL_CHAR_LIST_LOW_32_START)711*next_char = (uint16_t)tmp1;712else713*(uint32_t*)(--next_char) = tmp1;714}715else716cranges->char_lists_types |= tmp1 << tmp2;717718if (range_start < XCL_CHAR_LIST_LOW_16_START) break;719720PCRE2_ASSERT(tmp2 >= XCL_TYPE_BIT_LEN);721char_list_end = char_list_start - 1;722char_list_start = *char_list_next++;723tmp1 = 0;724tmp2 -= XCL_TYPE_BIT_LEN;725}726727if (dst[0] < XCL_CHAR_LIST_LOW_16_START) dst += 2;728PCRE2_ASSERT((uint16_t*)dst <= next_char);729730cranges->char_lists_size =731(size_t)((uint8_t*)(buffer + total_size) - (uint8_t*)next_char);732cranges->char_lists_start = (size_t)((uint8_t*)next_char - (uint8_t*)buffer);733cranges->range_list_size = (uint16_t)(dst - buffer);734return cranges;735}736737#endif /* SUPPORT_WIDE_CHARS */738739#ifdef SUPPORT_UNICODE740741void PRIV(update_classbits)(uint32_t ptype, uint32_t pdata, BOOL negated,742uint8_t *classbits)743{744/* Update PRIV(xclass) when this function is changed. */745int c, chartype;746const ucd_record *prop;747uint32_t gentype;748BOOL set_bit;749750if (ptype == PT_ANY)751{752if (!negated) memset(classbits, 0xff, 32);753return;754}755756for (c = 0; c < 256; c++)757{758prop = GET_UCD(c);759set_bit = FALSE;760(void)set_bit;761762switch (ptype)763{764case PT_LAMP:765chartype = prop->chartype;766set_bit = (chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt);767break;768769case PT_GC:770set_bit = (PRIV(ucp_gentype)[prop->chartype] == pdata);771break;772773case PT_PC:774set_bit = (prop->chartype == pdata);775break;776777case PT_SC:778set_bit = (prop->script == pdata);779break;780781case PT_SCX:782set_bit = (prop->script == pdata ||783MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);784break;785786case PT_ALNUM:787gentype = PRIV(ucp_gentype)[prop->chartype];788set_bit = (gentype == ucp_L || gentype == ucp_N);789break;790791case PT_SPACE: /* Perl space */792case PT_PXSPACE: /* POSIX space */793switch(c)794{795HSPACE_BYTE_CASES:796VSPACE_BYTE_CASES:797set_bit = TRUE;798break;799800default:801set_bit = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z);802break;803}804break;805806case PT_WORD:807chartype = prop->chartype;808gentype = PRIV(ucp_gentype)[chartype];809set_bit = (gentype == ucp_L || gentype == ucp_N ||810chartype == ucp_Mn || chartype == ucp_Pc);811break;812813case PT_UCNC:814set_bit = (c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||815c == CHAR_GRAVE_ACCENT || c >= 0xa0);816break;817818case PT_BIDICL:819set_bit = (UCD_BIDICLASS_PROP(prop) == pdata);820break;821822case PT_BOOL:823set_bit = MAPBIT(PRIV(ucd_boolprop_sets) +824UCD_BPROPS_PROP(prop), pdata) != 0;825break;826827case PT_PXGRAPH:828chartype = prop->chartype;829gentype = PRIV(ucp_gentype)[chartype];830set_bit = (gentype != ucp_Z && (gentype != ucp_C || chartype == ucp_Cf));831break;832833case PT_PXPRINT:834chartype = prop->chartype;835set_bit = (chartype != ucp_Zl && chartype != ucp_Zp &&836(PRIV(ucp_gentype)[chartype] != ucp_C || chartype == ucp_Cf));837break;838839case PT_PXPUNCT:840gentype = PRIV(ucp_gentype)[prop->chartype];841set_bit = (gentype == ucp_P || (c < 128 && gentype == ucp_S));842break;843844default:845PCRE2_ASSERT(ptype == PT_PXXDIGIT);846set_bit = (c >= CHAR_0 && c <= CHAR_9) ||847(c >= CHAR_A && c <= CHAR_F) ||848(c >= CHAR_a && c <= CHAR_f);849break;850}851852if (negated) set_bit = !set_bit;853if (set_bit) *classbits |= (uint8_t)(1 << (c & 0x7));854if ((c & 0x7) == 0x7) classbits++;855}856}857858#endif /* SUPPORT_UNICODE */859860861862#ifdef SUPPORT_WIDE_CHARS863864/*************************************************865* XClass related properties *866*************************************************/867868/* XClass needs to be generated. */869#define XCLASS_REQUIRED 0x1870/* XClass has 8 bit character. */871#define XCLASS_HAS_8BIT_CHARS 0x2872/* XClass has properties. */873#define XCLASS_HAS_PROPS 0x4874/* XClass has character lists. */875#define XCLASS_HAS_CHAR_LISTS 0x8876/* XClass matches to all >= 256 characters. */877#define XCLASS_HIGH_ANY 0x10878879#endif880881882/*************************************************883* Internal entry point for add range to class *884*************************************************/885886/* This function sets the overall range for characters < 256.887It also handles non-utf case folding.888889Arguments:890options the options bits891xoptions the extra options bits892cb compile data893start start of range character894end end of range character895896Returns: cb->classbits is updated897*/898899static void900add_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,901uint32_t start, uint32_t end)902{903uint8_t *classbits = cb->classbits.classbits;904uint32_t c, byte_start, byte_end;905uint32_t classbits_end = (end <= 0xff ? end : 0xff);906907/* If caseless matching is required, scan the range and process alternate908cases. In Unicode, there are 8-bit characters that have alternate cases that909are greater than 255 and vice-versa (though these may be ignored if caseless910restriction is in force). Sometimes we can just extend the original range. */911912if ((options & PCRE2_CASELESS) != 0)913{914#ifdef SUPPORT_UNICODE915/* UTF mode. This branch is taken if we don't support wide characters (e.g.9168-bit library, without UTF), but we do treat those characters as Unicode917(if UCP flag is set). In this case, we only need to expand the character class918set to include the case pairs which are in the 0-255 codepoint range. */919if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)920{921BOOL turkish_i = (xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==922PCRE2_EXTRA_TURKISH_CASING;923if (start < 128)924{925uint32_t lo_end = (classbits_end < 127 ? classbits_end : 127);926for (c = start; c <= lo_end; c++)927{928if (turkish_i && UCD_ANY_I(c)) continue;929SETBIT(classbits, cb->fcc[c]);930}931}932if (classbits_end >= 128)933{934uint32_t hi_start = (start > 128 ? start : 128);935for (c = hi_start; c <= classbits_end; c++)936{937uint32_t co = UCD_OTHERCASE(c);938if (co <= 0xff) SETBIT(classbits, co);939}940}941}942943else944#endif /* SUPPORT_UNICODE */945946/* Not UTF mode */947{948for (c = start; c <= classbits_end; c++)949SETBIT(classbits, cb->fcc[c]);950}951}952953/* Use the bitmap for characters < 256. Otherwise use extra data. */954955byte_start = (start + 7) >> 3;956byte_end = (classbits_end + 1) >> 3;957958if (byte_start >= byte_end)959{960for (c = start; c <= classbits_end; c++)961/* Regardless of start, c will always be <= 255. */962SETBIT(classbits, c);963return;964}965966for (c = byte_start; c < byte_end; c++)967classbits[c] = 0xff;968969byte_start <<= 3;970byte_end <<= 3;971972for (c = start; c < byte_start; c++)973SETBIT(classbits, c);974975for (c = byte_end; c <= classbits_end; c++)976SETBIT(classbits, c);977}978979980#if PCRE2_CODE_UNIT_WIDTH == 8981/*************************************************982* Internal entry point for add list to class *983*************************************************/984985/* This function is used for adding a list of horizontal or vertical whitespace986characters to a class. The list must be in order so that ranges of characters987can be detected and handled appropriately. This function sets the overall range988so that the internal functions can try to avoid duplication when handling989case-independence.990991Arguments:992options the options bits993xoptions the extra options bits994cb contains pointers to tables etc.995p points to row of 32-bit values, terminated by NOTACHAR996997Returns: cb->classbits is updated998*/9991000static void1001add_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,1002const uint32_t *p)1003{1004while (p[0] < 256)1005{1006unsigned int n = 0;10071008while(p[n+1] == p[0] + n + 1) n++;1009add_to_class(options, xoptions, cb, p[0], p[n]);10101011p += n + 1;1012}1013}1014101510161017/*************************************************1018* Add characters not in a list to a class *1019*************************************************/10201021/* This function is used for adding the complement of a list of horizontal or1022vertical whitespace to a class. The list must be in order.10231024Arguments:1025options the options bits1026xoptions the extra options bits1027cb contains pointers to tables etc.1028p points to row of 32-bit values, terminated by NOTACHAR10291030Returns: cb->classbits is updated1031*/10321033static void1034add_not_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,1035const uint32_t *p)1036{1037if (p[0] > 0)1038add_to_class(options, xoptions, cb, 0, p[0] - 1);1039while (p[0] < 256)1040{1041while (p[1] == p[0] + 1) p++;1042add_to_class(options, xoptions, cb, p[0] + 1, (p[1] > 255) ? 255 : p[1] - 1);1043p++;1044}1045}1046#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */1047104810491050/*************************************************1051* Main entry-point to compile a character class *1052*************************************************/10531054/* This function consumes a "leaf", which is a set of characters that will1055become a single OP_CLASS OP_NCLASS, OP_XCLASS, or OP_ALLANY. */10561057uint32_t *1058PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions,1059uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL* has_bitmap,1060int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr)1061{1062uint32_t *pptr = start_ptr;1063PCRE2_UCHAR *code = *pcode;1064BOOL should_flip_negation;1065const uint8_t *cbits = cb->cbits;1066/* Some functions such as add_to_class() or eclass processing1067expects that the bitset is stored in cb->classbits.classbits. */1068uint8_t *const classbits = cb->classbits.classbits;10691070#ifdef SUPPORT_UNICODE1071BOOL utf = (options & PCRE2_UTF) != 0;1072#else /* No Unicode support */1073BOOL utf = FALSE;1074#endif10751076/* Helper variables for OP_XCLASS opcode (for characters > 255). */10771078#ifdef SUPPORT_WIDE_CHARS1079uint32_t xclass_props;1080PCRE2_UCHAR *class_uchardata;1081class_ranges* cranges;1082#endif10831084/* If an XClass contains a negative special such as \S, we need to flip the1085negation flag at the end, so that support for characters > 255 works correctly1086(they are all included in the class). An XClass may need to insert specific1087matching or non-matching code for wide characters.1088*/10891090should_flip_negation = FALSE;10911092/* XClass will be used when characters > 255 might match. */10931094#ifdef SUPPORT_WIDE_CHARS1095xclass_props = 0;10961097#if PCRE2_CODE_UNIT_WIDTH == 81098cranges = NULL;10991100if (utf)1101#endif1102{1103if (lengthptr != NULL)1104{1105cranges = compile_optimize_class(pptr, options, xoptions, cb);11061107if (cranges == NULL)1108{1109*errorcodeptr = ERR21;1110return NULL;1111}11121113/* Caching the pre-processed character ranges. */1114if (cb->next_cranges != NULL)1115cb->next_cranges->next = cranges;1116else1117cb->cranges = cranges;11181119cb->next_cranges = cranges;1120}1121else1122{1123/* Reuse the pre-processed character ranges. */1124cranges = cb->cranges;1125PCRE2_ASSERT(cranges != NULL);1126cb->cranges = cranges->next;1127}11281129if (cranges->range_list_size > 0)1130{1131const uint32_t *ranges = (const uint32_t*)(cranges + 1);11321133if (ranges[0] <= 255)1134xclass_props |= XCLASS_HAS_8BIT_CHARS;11351136if (ranges[cranges->range_list_size - 1] == GET_MAX_CHAR_VALUE(utf) &&1137ranges[cranges->range_list_size - 2] <= 256)1138xclass_props |= XCLASS_HIGH_ANY;1139}1140}11411142class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */1143#endif /* SUPPORT_WIDE_CHARS */11441145/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map1146in a temporary bit of memory, in case the class contains fewer than two11478-bit characters because in that case the compiled code doesn't use the bit1148map. */11491150memset(classbits, 0, 32);11511152/* Process items until end_ptr is reached. */11531154while (TRUE)1155{1156uint32_t meta = *(pptr++);1157BOOL local_negate;1158int posix_class;1159int taboffset, tabopt;1160class_bits_storage pbits;1161uint32_t escape, c;11621163/* Handle POSIX classes such as [:alpha:] etc. */1164switch (META_CODE(meta))1165{1166case META_POSIX:1167case META_POSIX_NEG:11681169local_negate = (meta == META_POSIX_NEG);1170posix_class = *(pptr++);11711172if (local_negate) should_flip_negation = TRUE; /* Note negative special */11731174/* If matching is caseless, upper and lower are converted to alpha.1175This relies on the fact that the class table starts with alpha,1176lower, upper as the first 3 entries. */11771178if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)1179posix_class = 0;11801181/* When PCRE2_UCP is set, some of the POSIX classes are converted to1182different escape sequences that use Unicode properties \p or \P.1183Others that are not available via \p or \P have to generate1184XCL_PROP/XCL_NOTPROP directly, which is done here. */11851186#ifdef SUPPORT_UNICODE1187/* TODO This entire block of code here appears to be unreachable!? I simply1188can't see how it can be hit, given that the frontend parser doesn't emit1189META_POSIX for GRAPH/PRINT/PUNCT when UCP is set. */1190if ((options & PCRE2_UCP) != 0 &&1191(xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)1192{1193uint32_t ptype;11941195switch(posix_class)1196{1197case PC_GRAPH:1198case PC_PRINT:1199case PC_PUNCT:1200ptype = (posix_class == PC_GRAPH)? PT_PXGRAPH :1201(posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT;12021203PRIV(update_classbits)(ptype, 0, local_negate, classbits);12041205if ((xclass_props & XCLASS_HIGH_ANY) == 0)1206{1207if (lengthptr != NULL)1208*lengthptr += 3;1209else1210{1211*class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;1212*class_uchardata++ = (PCRE2_UCHAR)ptype;1213*class_uchardata++ = 0;1214}1215xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;1216}1217continue;12181219/* For the other POSIX classes (ex: ascii) we are going to1220fall through to the non-UCP case and build a bit map for1221characters with code points less than 256. However, if we are in1222a negated POSIX class, characters with code points greater than1223255 must either all match or all not match, depending on whether1224the whole class is not or is negated. For example, for1225[[:^ascii:]... they must all match, whereas for [^[:^ascii:]...1226they must not.12271228In the special case where there are no xclass items, this is1229automatically handled by the use of OP_CLASS or OP_NCLASS, but an1230explicit range is needed for OP_XCLASS. Setting a flag here1231causes the range to be generated later when it is known that1232OP_XCLASS is required. In the 8-bit library this is relevant only in1233utf mode, since no wide characters can exist otherwise. */12341235default:1236break;1237}1238}1239#endif /* SUPPORT_UNICODE */12401241/* In the non-UCP case, or when UCP makes no difference, we build the1242bit map for the POSIX class in a chunk of local store because we may1243be adding and subtracting from it, and we don't want to subtract bits1244that may be in the main map already. At the end we or the result into1245the bit map that is being built. */12461247posix_class *= 3;12481249/* Copy in the first table (always present) */12501251memcpy(pbits.classbits, cbits + PRIV(posix_class_maps)[posix_class], 32);12521253/* If there is a second table, add or remove it as required. */12541255taboffset = PRIV(posix_class_maps)[posix_class + 1];1256tabopt = PRIV(posix_class_maps)[posix_class + 2];12571258if (taboffset >= 0)1259{1260if (tabopt >= 0)1261for (int i = 0; i < 32; i++)1262pbits.classbits[i] |= cbits[i + taboffset];1263else1264for (int i = 0; i < 32; i++)1265pbits.classbits[i] &= (uint8_t)(~cbits[i + taboffset]);1266}12671268/* Now see if we need to remove any special characters. An option1269value of 1 removes vertical space and 2 removes underscore. */12701271if (tabopt < 0) tabopt = -tabopt;1272if (tabopt == 1) pbits.classbits[1] &= ~0x3c;1273else if (tabopt == 2) pbits.classbits[11] &= 0x7f;12741275/* Add the POSIX table or its complement into the main table that is1276being built and we are done. */12771278{1279uint32_t *classwords = cb->classbits.classwords;12801281if (local_negate)1282for (int i = 0; i < 8; i++)1283classwords[i] |= (uint32_t)(~pbits.classwords[i]);1284else1285for (int i = 0; i < 8; i++)1286classwords[i] |= pbits.classwords[i];1287}12881289#ifdef SUPPORT_WIDE_CHARS1290/* Every class contains at least one < 256 character. */1291xclass_props |= XCLASS_HAS_8BIT_CHARS;1292#endif1293continue; /* End of POSIX handling */12941295/* Other than POSIX classes, the only items we should encounter are1296\d-type escapes and literal characters (possibly as ranges). */1297case META_BIGVALUE:1298meta = *(pptr++);1299break;13001301case META_ESCAPE:1302escape = META_DATA(meta);13031304switch(escape)1305{1306case ESC_d:1307for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];1308break;13091310case ESC_D:1311should_flip_negation = TRUE;1312for (int i = 0; i < 32; i++)1313classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);1314break;13151316case ESC_w:1317for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];1318break;13191320case ESC_W:1321should_flip_negation = TRUE;1322for (int i = 0; i < 32; i++)1323classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);1324break;13251326/* Perl 5.004 onwards omitted VT from \s, but restored it at Perl13275.18. Before PCRE 8.34, we had to preserve the VT bit if it was1328previously set by something earlier in the character class.1329Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so1330we could just adjust the appropriate bit. From PCRE 8.34 we no1331longer treat \s and \S specially. */13321333case ESC_s:1334for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];1335break;13361337case ESC_S:1338should_flip_negation = TRUE;1339for (int i = 0; i < 32; i++)1340classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);1341break;13421343/* When adding the horizontal or vertical space lists to a class, or1344their complements, disable PCRE2_CASELESS, because it justs wastes1345time, and in the "not-x" UTF cases can create unwanted duplicates in1346the XCLASS list (provoked by characters that have more than one other1347case and by both cases being in the same "not-x" sublist). */13481349case ESC_h:1350#if PCRE2_CODE_UNIT_WIDTH == 81351#ifdef SUPPORT_UNICODE1352if (cranges != NULL) break;1353#endif1354add_list_to_class(options & ~PCRE2_CASELESS, xoptions,1355cb, PRIV(hspace_list));1356#else1357PCRE2_ASSERT(cranges != NULL);1358#endif1359break;13601361case ESC_H:1362#if PCRE2_CODE_UNIT_WIDTH == 81363#ifdef SUPPORT_UNICODE1364if (cranges != NULL) break;1365#endif1366add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,1367cb, PRIV(hspace_list));1368#else1369PCRE2_ASSERT(cranges != NULL);1370#endif1371break;13721373case ESC_v:1374#if PCRE2_CODE_UNIT_WIDTH == 81375#ifdef SUPPORT_UNICODE1376if (cranges != NULL) break;1377#endif1378add_list_to_class(options & ~PCRE2_CASELESS, xoptions,1379cb, PRIV(vspace_list));1380#else1381PCRE2_ASSERT(cranges != NULL);1382#endif1383break;13841385case ESC_V:1386#if PCRE2_CODE_UNIT_WIDTH == 81387#ifdef SUPPORT_UNICODE1388if (cranges != NULL) break;1389#endif1390add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,1391cb, PRIV(vspace_list));1392#else1393PCRE2_ASSERT(cranges != NULL);1394#endif1395break;13961397/* If Unicode is not supported, \P and \p are not allowed and are1398faulted at parse time, so will never appear here. */13991400#ifdef SUPPORT_UNICODE1401case ESC_p:1402case ESC_P:1403{1404uint32_t ptype = *pptr >> 16;1405uint32_t pdata = *(pptr++) & 0xffff;14061407/* The "Any" is processed by PRIV(update_classbits)(). */1408if (ptype == PT_ANY)1409{1410#if PCRE2_CODE_UNIT_WIDTH == 81411if (!utf && escape == ESC_p) memset(classbits, 0xff, 32);1412#endif1413continue;1414}14151416PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits);14171418if ((xclass_props & XCLASS_HIGH_ANY) == 0)1419{1420if (lengthptr != NULL)1421*lengthptr += 3;1422else1423{1424*class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;1425*class_uchardata++ = ptype;1426*class_uchardata++ = pdata;1427}1428xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;1429}1430}1431continue;1432#endif1433}14341435#ifdef SUPPORT_WIDE_CHARS1436/* Every non-property class contains at least one < 256 character. */1437xclass_props |= XCLASS_HAS_8BIT_CHARS;1438#endif1439/* End handling \d-type escapes */1440continue;14411442CLASS_END_CASES(meta)1443/* Literals. */1444if (meta < META_END) break;1445/* Non-literals: end of class contents. */1446goto END_PROCESSING;1447}14481449/* A literal character may be followed by a range meta. At parse time1450there are checks for out-of-order characters, for ranges where the two1451characters are equal, and for hyphens that cannot indicate a range. At1452this point, therefore, no checking is needed. */14531454c = meta;14551456/* Remember if \r or \n were explicitly used */14571458if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;14591460/* Process a character range */14611462if (*pptr == META_RANGE_LITERAL || *pptr == META_RANGE_ESCAPED)1463{1464uint32_t d;14651466#ifdef EBCDIC1467BOOL range_is_literal = (*pptr == META_RANGE_LITERAL);1468#endif1469++pptr;1470d = *(pptr++);1471if (d == META_BIGVALUE) d = *(pptr++);14721473/* Remember an explicit \r or \n, and add the range to the class. */14741475if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;14761477#if PCRE2_CODE_UNIT_WIDTH == 81478#ifdef SUPPORT_UNICODE1479if (cranges != NULL) continue;1480xclass_props |= XCLASS_HAS_8BIT_CHARS;1481#endif14821483/* In an EBCDIC environment, Perl treats alphabetic ranges specially1484because there are holes in the encoding, and simply using the range1485A-Z (for example) would include the characters in the holes. This1486applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */14871488#ifdef EBCDIC1489if (range_is_literal &&1490(cb->ctypes[c] & ctype_letter) != 0 &&1491(cb->ctypes[d] & ctype_letter) != 0 &&1492(c <= CHAR_z) == (d <= CHAR_z))1493{1494uint32_t uc = (d <= CHAR_z)? 0 : 64;1495uint32_t C = c - uc;1496uint32_t D = d - uc;14971498if (C <= CHAR_i)1499{1500add_to_class(options, xoptions, cb, C + uc,1501((D < CHAR_i)? D : CHAR_i) + uc);1502C = CHAR_j;1503}15041505if (C <= D && C <= CHAR_r)1506{1507add_to_class(options, xoptions, cb, C + uc,1508((D < CHAR_r)? D : CHAR_r) + uc);1509C = CHAR_s;1510}15111512if (C <= D)1513add_to_class(options, xoptions, cb, C + uc, D + uc);1514}1515else1516#endif1517/* Not an EBCDIC special range */15181519add_to_class(options, xoptions, cb, c, d);1520#else1521PCRE2_ASSERT(cranges != NULL);1522#endif1523continue;1524} /* End of range handling */15251526/* Character ranges are ignored when class_ranges is present. */1527#if PCRE2_CODE_UNIT_WIDTH == 81528#ifdef SUPPORT_UNICODE1529if (cranges != NULL) continue;1530xclass_props |= XCLASS_HAS_8BIT_CHARS;1531#endif1532/* Handle a single character. */15331534add_to_class(options, xoptions, cb, meta, meta);1535#else1536PCRE2_ASSERT(cranges != NULL);1537#endif1538} /* End of main class-processing loop */15391540END_PROCESSING:15411542#ifdef SUPPORT_WIDE_CHARS1543PCRE2_ASSERT((xclass_props & XCLASS_HAS_PROPS) == 0 ||1544(xclass_props & XCLASS_HIGH_ANY) == 0);15451546if (cranges != NULL)1547{1548uint32_t *range = (uint32_t*)(cranges + 1);1549uint32_t *end = range + cranges->range_list_size;15501551while (range < end && range[0] < 256)1552{1553PCRE2_ASSERT((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0);1554/* Add range to bitset. If we are in UTF or UCP mode, then clear the1555caseless bit, because the cranges handle caselessness (only) in this1556condition; see the condition for PARSE_CLASS_CASELESS_UTF in1557compile_optimize_class(). */1558add_to_class(((options & (PCRE2_UTF|PCRE2_UCP)) != 0)?1559(options & ~PCRE2_CASELESS) : options, xoptions, cb, range[0], range[1]);15601561if (range[1] > 255) break;1562range += 2;1563}15641565if (cranges->char_lists_size > 0)1566{1567/* The cranges structure is still used and freed later. */1568PCRE2_ASSERT((xclass_props & XCLASS_HIGH_ANY) == 0);1569xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_CHAR_LISTS;1570}1571else1572{1573if ((xclass_props & XCLASS_HIGH_ANY) != 0)1574{1575PCRE2_ASSERT(range + 2 == end && range[0] <= 256 &&1576range[1] >= GET_MAX_CHAR_VALUE(utf));1577should_flip_negation = TRUE;1578range = end;1579}15801581while (range < end)1582{1583uint32_t range_start = range[0];1584uint32_t range_end = range[1];15851586range += 2;1587xclass_props |= XCLASS_REQUIRED;15881589if (range_start < 256) range_start = 256;15901591if (lengthptr != NULL)1592{1593#ifdef SUPPORT_UNICODE1594if (utf)1595{1596*lengthptr += 1;15971598if (range_start < range_end)1599*lengthptr += PRIV(ord2utf)(range_start, class_uchardata);16001601*lengthptr += PRIV(ord2utf)(range_end, class_uchardata);1602continue;1603}1604#endif /* SUPPORT_UNICODE */16051606*lengthptr += range_start < range_end ? 3 : 2;1607continue;1608}16091610#ifdef SUPPORT_UNICODE1611if (utf)1612{1613if (range_start < range_end)1614{1615*class_uchardata++ = XCL_RANGE;1616class_uchardata += PRIV(ord2utf)(range_start, class_uchardata);1617}1618else1619*class_uchardata++ = XCL_SINGLE;16201621class_uchardata += PRIV(ord2utf)(range_end, class_uchardata);1622continue;1623}1624#endif /* SUPPORT_UNICODE */16251626/* Without UTF support, character values are constrained1627by the bit length, and can only be > 256 for 16-bit and162832-bit libraries. */1629#if PCRE2_CODE_UNIT_WIDTH != 81630if (range_start < range_end)1631{1632*class_uchardata++ = XCL_RANGE;1633*class_uchardata++ = range_start;1634}1635else1636*class_uchardata++ = XCL_SINGLE;16371638*class_uchardata++ = range_end;1639#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */1640}16411642if (lengthptr == NULL)1643cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);1644}1645}1646#endif /* SUPPORT_WIDE_CHARS */16471648/* If there are characters with values > 255, or Unicode property settings1649(\p or \P), we have to compile an extended class, with its own opcode,1650unless there were no property settings and there was a negated special such1651as \S in the class, and PCRE2_UCP is not set, because in that case all1652characters > 255 are in or not in the class, so any that were explicitly1653given as well can be ignored.16541655In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were1656were present in a class, we either have to match or not match all wide1657characters (depending on whether the whole class is or is not negated).1658This requirement is indicated by match_all_or_no_wide_chars being true.1659We do this by including an explicit range, which works in both cases.1660This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there1661cannot be any wide characters in 8-bit non-UTF mode.16621663When there *are* properties in a positive UTF-8 or any 16-bit or 32_bit1664class where \S etc is present without PCRE2_UCP, causing an extended class1665to be compiled, we make sure that all characters > 255 are included by1666forcing match_all_or_no_wide_chars to be true.16671668If, when generating an xclass, there are no characters < 256, we can omit1669the bitmap in the actual compiled code. */16701671#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */1672if ((xclass_props & XCLASS_REQUIRED) != 0)1673{1674PCRE2_UCHAR *previous = code;16751676if ((xclass_props & XCLASS_HAS_CHAR_LISTS) == 0)1677*class_uchardata++ = XCL_END; /* Marks the end of extra data */1678*code++ = OP_XCLASS;1679code += LINK_SIZE;1680*code = negate_class? XCL_NOT:0;1681if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP;16821683/* If the map is required, move up the extra data to make room for it;1684otherwise just move the code pointer to the end of the extra data. */16851686if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0 || has_bitmap != NULL)1687{1688if (negate_class)1689{1690uint32_t *classwords = cb->classbits.classwords;1691for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];1692}16931694if (has_bitmap == NULL)1695{1696*code++ |= XCL_MAP;1697(void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,1698CU2BYTES(class_uchardata - code));1699memcpy(code, classbits, 32);1700code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));1701}1702else1703{1704code = class_uchardata;1705if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0)1706*has_bitmap = TRUE;1707}1708}1709else code = class_uchardata;17101711if ((xclass_props & XCLASS_HAS_CHAR_LISTS) != 0)1712{1713/* Char lists size is an even number, because all items are 16 or 321714bit values. The character list data is always aligned to 32 bits. */1715size_t char_lists_size = cranges->char_lists_size;1716PCRE2_ASSERT((char_lists_size & 0x1) == 0 &&1717(cb->char_lists_size & 0x3) == 0);17181719if (lengthptr != NULL)1720{1721char_lists_size = CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));17221723#if PCRE2_CODE_UNIT_WIDTH == 81724*lengthptr += 2 + LINK_SIZE;1725#else1726*lengthptr += 1 + LINK_SIZE;1727#endif17281729cb->char_lists_size += char_lists_size;17301731char_lists_size /= sizeof(PCRE2_UCHAR);17321733/* Storage space for character lists is included1734in the maximum pattern size. */1735if (*lengthptr > MAX_PATTERN_SIZE ||1736MAX_PATTERN_SIZE - *lengthptr < char_lists_size)1737{1738*errorcodeptr = ERR20; /* Pattern is too large */1739return NULL;1740}1741}1742else1743{1744uint8_t *data;17451746PCRE2_ASSERT(cranges->char_lists_types <= XCL_TYPE_MASK);1747#if PCRE2_CODE_UNIT_WIDTH == 81748/* Encode as high / low bytes. */1749code[0] = (uint8_t)(XCL_LIST |1750(cranges->char_lists_types >> 8));1751code[1] = (uint8_t)cranges->char_lists_types;1752code += 2;1753#else1754*code++ = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types);1755#endif17561757/* Character lists are stored in backwards direction from1758byte code start. The non-dfa/dfa matchers can access these1759lists using the byte code start stored in match blocks.1760Each list is aligned to 32 bit with an optional unused176116 bit value at the beginning of the character list. */17621763cb->char_lists_size += char_lists_size;1764data = (uint8_t*)cb->start_code - cb->char_lists_size;17651766memcpy(data, (uint8_t*)(cranges + 1) + cranges->char_lists_start,1767char_lists_size);17681769/* Since character lists total size is less than MAX_PATTERN_SIZE,1770their starting offset fits into a value which size is LINK_SIZE. */17711772char_lists_size = cb->char_lists_size;1773PUT(code, 0, (uint32_t)(char_lists_size >> 1));1774code += LINK_SIZE;17751776#if defined PCRE2_DEBUG || defined SUPPORT_VALGRIND1777if ((char_lists_size & 0x2) != 0)1778{1779/* In debug the unused 16 bit value is set1780to a fixed value and marked unused. */1781((uint16_t*)data)[-1] = 0x5555;1782#ifdef SUPPORT_VALGRIND1783VALGRIND_MAKE_MEM_NOACCESS(data - 2, 2);1784#endif1785}1786#endif17871788cb->char_lists_size =1789CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));17901791cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);1792}1793}17941795/* Now fill in the complete length of the item */17961797PUT(previous, 1, (int)(code - previous));1798goto DONE; /* End of class handling */1799}1800#endif /* SUPPORT_WIDE_CHARS */18011802/* If there are no characters > 255, or they are all to be included or1803excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the1804whole class was negated and whether there were negative specials such as \S1805(non-UCP) in the class. Then copy the 32-byte map into the code vector,1806negating it if necessary. */18071808if (negate_class)1809{1810uint32_t *classwords = cb->classbits.classwords;18111812for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];1813}18141815if ((SELECT_VALUE8(!utf, 0) || negate_class != should_flip_negation) &&1816cb->classbits.classwords[0] == ~(uint32_t)0)1817{1818const uint32_t *classwords = cb->classbits.classwords;1819int i;18201821for (i = 0; i < 8; i++)1822if (classwords[i] != ~(uint32_t)0) break;18231824if (i == 8)1825{1826*code++ = OP_ALLANY;1827goto DONE; /* End of class handling */1828}1829}18301831*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;1832memcpy(code, classbits, 32);1833code += 32 / sizeof(PCRE2_UCHAR);18341835DONE:1836*pcode = code;1837return pptr - 1;1838}1839184018411842/* ===================================================================*/1843/* Here follows a block of ECLASS-compiling functions. You may well want to1844read them from top to bottom; they are ordered from leafmost (at the top) to1845outermost parser (at the bottom of the file). */18461847/* This function folds one operand using the negation operator.1848The new, combined chunk of stack code is written out to *pop_info. */18491850static void1851fold_negation(eclass_op_info *pop_info, PCRE2_SIZE *lengthptr,1852BOOL preserve_classbits)1853{1854/* If the chunk of stack code is already composed of multiple ops, we won't1855descend in and try and propagate the negation down the tree. (That would lead1856to O(n^2) compile-time, which could be exploitable with a malicious regex -1857although maybe that's not really too much of a worry in a library that offers1858an exponential-time matching function!) */18591860if (pop_info->op_single_type == 0)1861{1862if (lengthptr != NULL)1863*lengthptr += 1;1864else1865pop_info->code_start[pop_info->length] = ECL_NOT;1866pop_info->length += 1;1867}18681869/* Otherwise, it's a nice single-op item, so we can easily fold in the negation1870without needing to produce an ECL_NOT. */18711872else if (pop_info->op_single_type == ECL_ANY ||1873pop_info->op_single_type == ECL_NONE)1874{1875pop_info->op_single_type = (pop_info->op_single_type == ECL_NONE)?1876ECL_ANY : ECL_NONE;1877if (lengthptr == NULL)1878*(pop_info->code_start) = pop_info->op_single_type;1879}1880else1881{1882PCRE2_ASSERT(pop_info->op_single_type == ECL_XCLASS &&1883pop_info->length >= 1 + LINK_SIZE + 1);1884if (lengthptr == NULL)1885pop_info->code_start[1 + LINK_SIZE] ^= XCL_NOT;1886}18871888if (!preserve_classbits)1889{1890for (int i = 0; i < 8; i++)1891pop_info->bits.classwords[i] = ~pop_info->bits.classwords[i];1892}1893}1894189518961897/* This function folds together two operands using a binary operator.1898The new, combined chunk of stack code is written out to *lhs_op_info. */18991900static void1901fold_binary(int op, eclass_op_info *lhs_op_info, eclass_op_info *rhs_op_info,1902PCRE2_SIZE *lengthptr)1903{1904switch (op)1905{1906/* ECL_AND truth table:19071908LHS RHS RESULT1909----------------1910ANY * RHS1911* ANY LHS1912NONE * NONE1913* NONE NONE1914X Y X & Y1915*/19161917case ECL_AND:1918if (rhs_op_info->op_single_type == ECL_ANY)1919{1920/* no-op: drop the RHS */1921}1922else if (lhs_op_info->op_single_type == ECL_ANY)1923{1924/* no-op: drop the LHS, and memmove the RHS into its place */1925if (lengthptr == NULL)1926memmove(lhs_op_info->code_start, rhs_op_info->code_start,1927CU2BYTES(rhs_op_info->length));1928lhs_op_info->length = rhs_op_info->length;1929lhs_op_info->op_single_type = rhs_op_info->op_single_type;1930}1931else if (rhs_op_info->op_single_type == ECL_NONE)1932{1933/* the result is ECL_NONE: write into the LHS */1934if (lengthptr == NULL)1935lhs_op_info->code_start[0] = ECL_NONE;1936lhs_op_info->length = 1;1937lhs_op_info->op_single_type = ECL_NONE;1938}1939else if (lhs_op_info->op_single_type == ECL_NONE)1940{1941/* the result is ECL_NONE: drop the RHS */1942}1943else1944{1945/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */1946if (lengthptr != NULL)1947*lengthptr += 1;1948else1949{1950PCRE2_ASSERT(rhs_op_info->code_start ==1951lhs_op_info->code_start + lhs_op_info->length);1952rhs_op_info->code_start[rhs_op_info->length] = ECL_AND;1953}1954lhs_op_info->length += rhs_op_info->length + 1;1955lhs_op_info->op_single_type = 0;1956}19571958for (int i = 0; i < 8; i++)1959lhs_op_info->bits.classwords[i] &= rhs_op_info->bits.classwords[i];1960break;19611962/* ECL_OR truth table:19631964LHS RHS RESULT1965----------------1966ANY * ANY1967* ANY ANY1968NONE * RHS1969* NONE LHS1970X Y X | Y1971*/19721973case ECL_OR:1974if (rhs_op_info->op_single_type == ECL_NONE)1975{1976/* no-op: drop the RHS */1977}1978else if (lhs_op_info->op_single_type == ECL_NONE)1979{1980/* no-op: drop the LHS, and memmove the RHS into its place */1981if (lengthptr == NULL)1982memmove(lhs_op_info->code_start, rhs_op_info->code_start,1983CU2BYTES(rhs_op_info->length));1984lhs_op_info->length = rhs_op_info->length;1985lhs_op_info->op_single_type = rhs_op_info->op_single_type;1986}1987else if (rhs_op_info->op_single_type == ECL_ANY)1988{1989/* the result is ECL_ANY: write into the LHS */1990if (lengthptr == NULL)1991lhs_op_info->code_start[0] = ECL_ANY;1992lhs_op_info->length = 1;1993lhs_op_info->op_single_type = ECL_ANY;1994}1995else if (lhs_op_info->op_single_type == ECL_ANY)1996{1997/* the result is ECL_ANY: drop the RHS */1998}1999else2000{2001/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */2002if (lengthptr != NULL)2003*lengthptr += 1;2004else2005{2006PCRE2_ASSERT(rhs_op_info->code_start ==2007lhs_op_info->code_start + lhs_op_info->length);2008rhs_op_info->code_start[rhs_op_info->length] = ECL_OR;2009}2010lhs_op_info->length += rhs_op_info->length + 1;2011lhs_op_info->op_single_type = 0;2012}20132014for (int i = 0; i < 8; i++)2015lhs_op_info->bits.classwords[i] |= rhs_op_info->bits.classwords[i];2016break;20172018/* ECL_XOR truth table:20192020LHS RHS RESULT2021----------------2022ANY * !RHS2023* ANY !LHS2024NONE * RHS2025* NONE LHS2026X Y X ^ Y2027*/20282029case ECL_XOR:2030if (rhs_op_info->op_single_type == ECL_NONE)2031{2032/* no-op: drop the RHS */2033}2034else if (lhs_op_info->op_single_type == ECL_NONE)2035{2036/* no-op: drop the LHS, and memmove the RHS into its place */2037if (lengthptr == NULL)2038memmove(lhs_op_info->code_start, rhs_op_info->code_start,2039CU2BYTES(rhs_op_info->length));2040lhs_op_info->length = rhs_op_info->length;2041lhs_op_info->op_single_type = rhs_op_info->op_single_type;2042}2043else if (rhs_op_info->op_single_type == ECL_ANY)2044{2045/* the result is !LHS: fold in the negation, and drop the RHS */2046/* Preserve the classbits, because we promise to deal with them later. */2047fold_negation(lhs_op_info, lengthptr, TRUE);2048}2049else if (lhs_op_info->op_single_type == ECL_ANY)2050{2051/* the result is !RHS: drop the LHS, memmove the RHS into its place, and2052fold in the negation */2053if (lengthptr == NULL)2054memmove(lhs_op_info->code_start, rhs_op_info->code_start,2055CU2BYTES(rhs_op_info->length));2056lhs_op_info->length = rhs_op_info->length;2057lhs_op_info->op_single_type = rhs_op_info->op_single_type;20582059/* Preserve the classbits, because we promise to deal with them later. */2060fold_negation(lhs_op_info, lengthptr, TRUE);2061}2062else2063{2064/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */2065if (lengthptr != NULL)2066*lengthptr += 1;2067else2068{2069PCRE2_ASSERT(rhs_op_info->code_start ==2070lhs_op_info->code_start + lhs_op_info->length);2071rhs_op_info->code_start[rhs_op_info->length] = ECL_XOR;2072}2073lhs_op_info->length += rhs_op_info->length + 1;2074lhs_op_info->op_single_type = 0;2075}20762077for (int i = 0; i < 8; i++)2078lhs_op_info->bits.classwords[i] ^= rhs_op_info->bits.classwords[i];2079break;20802081default:2082PCRE2_DEBUG_UNREACHABLE();2083break;2084}2085}2086208720882089static BOOL2090compile_eclass_nested(eclass_context *context, BOOL negated,2091uint32_t **pptr, PCRE2_UCHAR **pcode,2092eclass_op_info *pop_info, PCRE2_SIZE *lengthptr);20932094/* This function consumes a group of implicitly-unioned class elements.2095These can be characters, ranges, properties, or nested classes, as long2096as they are all joined by being placed adjacently. */20972098static BOOL2099compile_class_operand(eclass_context *context, BOOL negated,2100uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,2101PCRE2_SIZE *lengthptr)2102{2103uint32_t *ptr = *pptr;2104uint32_t *prev_ptr;2105PCRE2_UCHAR *code = *pcode;2106PCRE2_UCHAR *code_start = code;2107PCRE2_SIZE prev_length = (lengthptr != NULL)? *lengthptr : 0;2108PCRE2_SIZE extra_length;2109uint32_t meta = META_CODE(*ptr);21102111switch (meta)2112{2113case META_CLASS_EMPTY_NOT:2114case META_CLASS_EMPTY:2115++ptr;2116pop_info->length = 1;2117if ((meta == META_CLASS_EMPTY) == negated)2118{2119*code++ = pop_info->op_single_type = ECL_ANY;2120memset(pop_info->bits.classbits, 0xff, 32);2121}2122else2123{2124*code++ = pop_info->op_single_type = ECL_NONE;2125memset(pop_info->bits.classbits, 0, 32);2126}2127break;21282129case META_CLASS:2130case META_CLASS_NOT:2131if ((*ptr & CLASS_IS_ECLASS) != 0)2132{2133if (!compile_eclass_nested(context, negated, &ptr, &code,2134pop_info, lengthptr))2135return FALSE;21362137PCRE2_ASSERT(*ptr == META_CLASS_END);2138ptr++;2139goto DONE;2140}21412142ptr++;2143/* Fall through */21442145default:2146/* Scan forward characters, ranges, and properties.2147For example: inside [a-z_ -- m] we don't have brackets around "a-z_" but2148we still need to collect that fragment up into a "leaf" OP_CLASS. */21492150prev_ptr = ptr;2151ptr = PRIV(compile_class_not_nested)(2152context->options, context->xoptions, ptr, &code,2153(meta != META_CLASS_NOT) == negated, &context->needs_bitmap,2154context->errorcodeptr, context->cb, lengthptr);2155if (ptr == NULL) return FALSE;21562157/* We must have a 100% guarantee that ptr increases when2158compile_class_operand() returns, even on Release builds, so that we can2159statically prove our loops terminate. */2160if (ptr <= prev_ptr)2161{2162PCRE2_DEBUG_UNREACHABLE();2163return FALSE;2164}21652166/* If we fell through above, consume the closing ']'. */2167if (meta == META_CLASS || meta == META_CLASS_NOT)2168{2169PCRE2_ASSERT(*ptr == META_CLASS_END);2170ptr++;2171}21722173/* Regardless of whether (lengthptr == NULL), some data will still be written2174out to *pcode, which we need: we have to peek at it, to transform the opcode2175into the ECLASS version (since we need to hoist up the bitmaps). */2176PCRE2_ASSERT(code > code_start);2177extra_length = (lengthptr != NULL)? *lengthptr - prev_length : 0;21782179/* Easiest case: convert OP_ALLANY to ECL_ANY */21802181if (*code_start == OP_ALLANY)2182{2183PCRE2_ASSERT(code - code_start == 1 && extra_length == 0);2184pop_info->length = 1;2185*code_start = pop_info->op_single_type = ECL_ANY;2186memset(pop_info->bits.classbits, 0xff, 32);2187}21882189/* For OP_CLASS and OP_NCLASS, we hoist out the bitmap and convert to2190ECL_NONE / ECL_ANY respectively. */21912192else if (*code_start == OP_CLASS || *code_start == OP_NCLASS)2193{2194PCRE2_ASSERT(code - code_start == 1 + 32 / sizeof(PCRE2_UCHAR) &&2195extra_length == 0);2196pop_info->length = 1;2197*code_start = pop_info->op_single_type =2198(*code_start == OP_CLASS)? ECL_NONE : ECL_ANY;2199memcpy(pop_info->bits.classbits, code_start + 1, 32);2200/* Rewind the code pointer, but make sure we adjust *lengthptr, because we2201do need to reserve that space (even though we only use it temporarily). */2202if (lengthptr != NULL)2203*lengthptr += code - (code_start + 1);2204code = code_start + 1;22052206if (!context->needs_bitmap && *code_start == ECL_NONE)2207{2208uint32_t *classwords = pop_info->bits.classwords;22092210for (int i = 0; i < 8; i++)2211if (classwords[i] != 0)2212{2213context->needs_bitmap = TRUE;2214break;2215}2216}2217else2218context->needs_bitmap = TRUE;2219}22202221/* Finally, for OP_XCLASS we hoist out the bitmap (if any), and convert to2222ECL_XCLASS. */22232224else2225{2226PCRE2_ASSERT(*code_start == OP_XCLASS);2227*code_start = pop_info->op_single_type = ECL_XCLASS;22282229PCRE2_ASSERT(code - code_start >= 1 + LINK_SIZE + 1);22302231memcpy(pop_info->bits.classbits, context->cb->classbits.classbits, 32);2232pop_info->length = (code - code_start) + extra_length;2233}22342235break;2236} /* End of switch(meta) */22372238pop_info->code_start = (lengthptr == NULL)? code_start : NULL;22392240if (lengthptr != NULL)2241{2242*lengthptr += code - code_start;2243code = code_start;2244}22452246DONE:2247PCRE2_ASSERT(lengthptr == NULL || (code == code_start));22482249*pptr = ptr;2250*pcode = code;2251return TRUE;2252}2253225422552256/* This function consumes a group of implicitly-unioned class elements.2257These can be characters, ranges, properties, or nested classes, as long2258as they are all joined by being placed adjacently. */22592260static BOOL2261compile_class_juxtaposition(eclass_context *context, BOOL negated,2262uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,2263PCRE2_SIZE *lengthptr)2264{2265uint32_t *ptr = *pptr;2266PCRE2_UCHAR *code = *pcode;2267#ifdef PCRE2_DEBUG2268PCRE2_UCHAR *start_code = *pcode;2269#endif22702271/* See compile_class_binary_loose() for comments on compile-time folding of2272the "negated" flag. */22732274/* Because it's a non-empty class, there must be an operand at the start. */2275if (!compile_class_operand(context, negated, &ptr, &code, pop_info, lengthptr))2276return FALSE;22772278while (*ptr != META_CLASS_END &&2279!(*ptr >= META_ECLASS_AND && *ptr <= META_ECLASS_NOT))2280{2281uint32_t op;2282BOOL rhs_negated;2283eclass_op_info rhs_op_info;22842285if (negated)2286{2287/* !(A juxtapose B) -> !A && !B */2288op = ECL_AND;2289rhs_negated = TRUE;2290}2291else2292{2293/* A juxtapose B -> A || B */2294op = ECL_OR;2295rhs_negated = FALSE;2296}22972298/* An operand must follow the operator. */2299if (!compile_class_operand(context, rhs_negated, &ptr, &code,2300&rhs_op_info, lengthptr))2301return FALSE;23022303/* Convert infix to postfix (RPN). */2304fold_binary(op, pop_info, &rhs_op_info, lengthptr);2305if (lengthptr == NULL)2306code = pop_info->code_start + pop_info->length;2307}23082309PCRE2_ASSERT(lengthptr == NULL || code == start_code);23102311*pptr = ptr;2312*pcode = code;2313return TRUE;2314}2315231623172318/* This function consumes unary prefix operators. */23192320static BOOL2321compile_class_unary(eclass_context *context, BOOL negated,2322uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,2323PCRE2_SIZE *lengthptr)2324{2325uint32_t *ptr = *pptr;2326#ifdef PCRE2_DEBUG2327PCRE2_UCHAR *start_code = *pcode;2328#endif23292330while (*ptr == META_ECLASS_NOT)2331{2332++ptr;2333negated = !negated;2334}23352336*pptr = ptr;2337/* Because it's a non-empty class, there must be an operand. */2338if (!compile_class_juxtaposition(context, negated, pptr, pcode,2339pop_info, lengthptr))2340return FALSE;23412342PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);2343return TRUE;2344}2345234623472348/* This function consumes tightly-binding binary operators. */23492350static BOOL2351compile_class_binary_tight(eclass_context *context, BOOL negated,2352uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,2353PCRE2_SIZE *lengthptr)2354{2355uint32_t *ptr = *pptr;2356PCRE2_UCHAR *code = *pcode;2357#ifdef PCRE2_DEBUG2358PCRE2_UCHAR *start_code = *pcode;2359#endif23602361/* See compile_class_binary_loose() for comments on compile-time folding of2362the "negated" flag. */23632364/* Because it's a non-empty class, there must be an operand at the start. */2365if (!compile_class_unary(context, negated, &ptr, &code, pop_info, lengthptr))2366return FALSE;23672368while (*ptr == META_ECLASS_AND)2369{2370uint32_t op;2371BOOL rhs_negated;2372eclass_op_info rhs_op_info;23732374if (negated)2375{2376/* !(A && B) -> !A || !B */2377op = ECL_OR;2378rhs_negated = TRUE;2379}2380else2381{2382/* A && B -> A && B */2383op = ECL_AND;2384rhs_negated = FALSE;2385}23862387++ptr;23882389/* An operand must follow the operator. */2390if (!compile_class_unary(context, rhs_negated, &ptr, &code,2391&rhs_op_info, lengthptr))2392return FALSE;23932394/* Convert infix to postfix (RPN). */2395fold_binary(op, pop_info, &rhs_op_info, lengthptr);2396if (lengthptr == NULL)2397code = pop_info->code_start + pop_info->length;2398}23992400PCRE2_ASSERT(lengthptr == NULL || code == start_code);24012402*pptr = ptr;2403*pcode = code;2404return TRUE;2405}2406240724082409/* This function consumes loosely-binding binary operators. */24102411static BOOL2412compile_class_binary_loose(eclass_context *context, BOOL negated,2413uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,2414PCRE2_SIZE *lengthptr)2415{2416uint32_t *ptr = *pptr;2417PCRE2_UCHAR *code = *pcode;2418#ifdef PCRE2_DEBUG2419PCRE2_UCHAR *start_code = *pcode;2420#endif24212422/* We really want to fold the negation operator, if at all possible, so that2423simple cases can be reduced down. In particular, in 8-bit no-UTF mode, we want2424to produce a fully-folded expression, so that we can guarantee not to emit any2425OP_ECLASS codes (in the same way that we never emit OP_XCLASS in this mode).24262427This has the consequence that with a little ingenuity, we can in fact avoid2428emitting (nearly...) all cases of the "NOT" operator. Imagine that we have:2429!(A ...2430We have parsed the preceding "!", and we are about to parse the "A" operand. We2431don't know yet whether there will even be a following binary operand! Both of2432these are possibilities for what follows:2433!(A && B)2434!(A)2435However, we can still fold the "!" into the "A" operand, because no matter what2436the following binary operator will be, we can produce an expression which is2437equivalent. */24382439/* Because it's a non-empty class, there must be an operand at the start. */2440if (!compile_class_binary_tight(context, negated, &ptr, &code,2441pop_info, lengthptr))2442return FALSE;24432444while (*ptr >= META_ECLASS_OR && *ptr <= META_ECLASS_XOR)2445{2446uint32_t op;2447BOOL op_neg;2448BOOL rhs_negated;2449eclass_op_info rhs_op_info;24502451if (negated)2452{2453/* The whole expression is being negated; we respond by unconditionally2454negating the LHS A, before seeing what follows. And hooray! We can recover,2455no matter what follows. */2456/* !(A || B) -> !A && !B */2457/* !(A -- B) -> !(A && !B) -> !A || B */2458/* !(A XOR B) -> !(!A XOR !B) -> !A XNOR !B */2459op = (*ptr == META_ECLASS_OR )? ECL_AND :2460(*ptr == META_ECLASS_SUB)? ECL_OR :2461/*ptr == META_ECLASS_XOR*/ ECL_XOR;2462op_neg = (*ptr == META_ECLASS_XOR);2463rhs_negated = *ptr != META_ECLASS_SUB;2464}2465else2466{2467/* A || B -> A || B */2468/* A -- B -> A && !B */2469/* A XOR B -> A XOR B */2470op = (*ptr == META_ECLASS_OR )? ECL_OR :2471(*ptr == META_ECLASS_SUB)? ECL_AND :2472/*ptr == META_ECLASS_XOR*/ ECL_XOR;2473op_neg = FALSE;2474rhs_negated = *ptr == META_ECLASS_SUB;2475}24762477++ptr;24782479/* An operand must follow the operator. */2480if (!compile_class_binary_tight(context, rhs_negated, &ptr, &code,2481&rhs_op_info, lengthptr))2482return FALSE;24832484/* Convert infix to postfix (RPN). */2485fold_binary(op, pop_info, &rhs_op_info, lengthptr);2486if (op_neg) fold_negation(pop_info, lengthptr, FALSE);2487if (lengthptr == NULL)2488code = pop_info->code_start + pop_info->length;2489}24902491PCRE2_ASSERT(lengthptr == NULL || code == start_code);24922493*pptr = ptr;2494*pcode = code;2495return TRUE;2496}2497249824992500/* This function converts the META codes in pptr into opcodes written to2501pcode. The pptr must start at a META_CLASS or META_CLASS_NOT.25022503The class is compiled as a left-associative sequence of operator2504applications.25052506The pptr will be left pointing at the matching META_CLASS_END. */25072508static BOOL2509compile_eclass_nested(eclass_context *context, BOOL negated,2510uint32_t **pptr, PCRE2_UCHAR **pcode,2511eclass_op_info *pop_info, PCRE2_SIZE *lengthptr)2512{2513uint32_t *ptr = *pptr;2514#ifdef PCRE2_DEBUG2515PCRE2_UCHAR *start_code = *pcode;2516#endif25172518/* The CLASS_IS_ECLASS bit must be set since it is a nested class. */2519PCRE2_ASSERT(*ptr == (META_CLASS | CLASS_IS_ECLASS) ||2520*ptr == (META_CLASS_NOT | CLASS_IS_ECLASS));25212522if (*ptr++ == (META_CLASS_NOT | CLASS_IS_ECLASS))2523negated = !negated;25242525(*pptr)++;25262527/* Because it's a non-empty class, there must be an operand at the start. */2528if (!compile_class_binary_loose(context, negated, pptr, pcode,2529pop_info, lengthptr))2530return FALSE;25312532PCRE2_ASSERT(**pptr == META_CLASS_END);2533PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);2534return TRUE;2535}25362537BOOL2538PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions,2539uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr,2540compile_block *cb, PCRE2_SIZE *lengthptr)2541{2542eclass_context context;2543eclass_op_info op_info;2544PCRE2_SIZE previous_length = (lengthptr != NULL)? *lengthptr : 0;2545PCRE2_UCHAR *code = *pcode;2546PCRE2_UCHAR *previous;2547BOOL allbitsone = TRUE;25482549context.needs_bitmap = FALSE;2550context.options = options;2551context.xoptions = xoptions;2552context.errorcodeptr = errorcodeptr;2553context.cb = cb;25542555previous = code;2556*code++ = OP_ECLASS;2557code += LINK_SIZE;2558*code++ = 0; /* Flags, currently zero. */2559if (!compile_eclass_nested(&context, FALSE, pptr, &code, &op_info, lengthptr))2560return FALSE;25612562if (lengthptr != NULL)2563{2564*lengthptr += code - previous;2565code = previous;2566/* (*lengthptr - previous_length) now holds the amount of buffer that2567we require to make the call to compile_class_nested() with2568lengthptr = NULL, and including the (1+LINK_SIZE+1) that we write out2569before that call. */2570}25712572/* Do some useful counting of what's in the bitmap. */2573for (int i = 0; i < 8; i++)2574if (op_info.bits.classwords[i] != 0xffffffff)2575{2576allbitsone = FALSE;2577break;2578}25792580/* After constant-folding the extended class syntax, it may turn out to be2581a simple class after all. In that case, we can unwrap it from the2582OP_ECLASS container - and in fact, we must do so, because in 8-bit2583no-Unicode mode the matcher is compiled without support for OP_ECLASS. */25842585#ifndef SUPPORT_WIDE_CHARS2586PCRE2_ASSERT(op_info.op_single_type != 0);2587#else2588if (op_info.op_single_type != 0)2589#endif2590{2591/* Rewind back over the OP_ECLASS. */2592code = previous;25932594/* If the bits are all ones, and the "high characters" are all matched2595too, we use a special-cased encoding of OP_ALLANY. */25962597if (op_info.op_single_type == ECL_ANY && allbitsone)2598{2599/* Advancing code means rewinding lengthptr, at this point. */2600if (lengthptr != NULL) *lengthptr -= 1;2601*code++ = OP_ALLANY;2602}26032604/* If the high bits are all matched / all not-matched, then we emit an2605OP_NCLASS/OP_CLASS respectively. */26062607else if (op_info.op_single_type == ECL_ANY ||2608op_info.op_single_type == ECL_NONE)2609{2610PCRE2_SIZE required_len = 1 + (32 / sizeof(PCRE2_UCHAR));26112612if (lengthptr != NULL)2613{2614if (required_len > (*lengthptr - previous_length))2615*lengthptr = previous_length + required_len;2616}26172618/* Advancing code means rewinding lengthptr, at this point. */2619if (lengthptr != NULL) *lengthptr -= required_len;2620*code++ = (op_info.op_single_type == ECL_ANY)? OP_NCLASS : OP_CLASS;2621memcpy(code, op_info.bits.classbits, 32);2622code += 32 / sizeof(PCRE2_UCHAR);2623}26242625/* Otherwise, we have an ECL_XCLASS, so we have the OP_XCLASS data2626there, but, we pulled out its bitmap into op_info, so now we have to2627put that back into the OP_XCLASS. */26282629else2630{2631#ifndef SUPPORT_WIDE_CHARS2632PCRE2_DEBUG_UNREACHABLE();2633#else2634BOOL need_map = context.needs_bitmap;2635PCRE2_SIZE required_len;26362637PCRE2_ASSERT(op_info.op_single_type == ECL_XCLASS);2638required_len = op_info.length + (need_map? 32/sizeof(PCRE2_UCHAR) : 0);26392640if (lengthptr != NULL)2641{2642/* Don't unconditionally request all the space we need - we may2643already have asked for more during processing of the ECLASS. */2644if (required_len > (*lengthptr - previous_length))2645*lengthptr = previous_length + required_len;26462647/* The code we write out here won't be ignored, even during the2648(lengthptr != NULL) phase, because if there's a following quantifier2649it will peek backwards. So we do have to write out a (truncated)2650OP_XCLASS, even on this branch. */2651*lengthptr -= 1 + LINK_SIZE + 1;2652*code++ = OP_XCLASS;2653PUT(code, 0, 1 + LINK_SIZE + 1);2654code += LINK_SIZE;2655*code++ = 0;2656}2657else2658{2659PCRE2_UCHAR *rest;2660PCRE2_SIZE rest_len;2661PCRE2_UCHAR flags;26622663/* 1 unit: OP_XCLASS | LINK_SIZE units | 1 unit: flags | ...rest */2664PCRE2_ASSERT(op_info.length >= 1 + LINK_SIZE + 1);2665rest = op_info.code_start + 1 + LINK_SIZE + 1;2666rest_len = (op_info.code_start + op_info.length) - rest;26672668/* First read any data we use, before memmove splats it. */2669flags = op_info.code_start[1 + LINK_SIZE];2670PCRE2_ASSERT((flags & XCL_MAP) == 0);26712672/* Next do the memmove before any writes. */2673memmove(code + 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0),2674rest, CU2BYTES(rest_len));26752676/* Finally write the header data. */2677*code++ = OP_XCLASS;2678PUT(code, 0, (int)required_len);2679code += LINK_SIZE;2680*code++ = flags | (need_map? XCL_MAP : 0);2681if (need_map)2682{2683memcpy(code, op_info.bits.classbits, 32);2684code += 32 / sizeof(PCRE2_UCHAR);2685}2686code += rest_len;2687}2688#endif /* SUPPORT_WIDE_CHARS */2689}2690}26912692/* Otherwise, we're going to keep the OP_ECLASS. However, again we need2693to do some adjustment to insert the bitmap if we have one. */26942695#ifdef SUPPORT_WIDE_CHARS2696else2697{2698BOOL need_map = context.needs_bitmap;2699PCRE2_SIZE required_len =27001 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0) + op_info.length;27012702if (lengthptr != NULL)2703{2704if (required_len > (*lengthptr - previous_length))2705*lengthptr = previous_length + required_len;27062707/* As for the XCLASS branch above, we do have to write out a dummy2708OP_ECLASS, because of the backwards peek by the quantifier code. Write2709out a (truncated) OP_ECLASS, even on this branch. */2710*lengthptr -= 1 + LINK_SIZE + 1;2711*code++ = OP_ECLASS;2712PUT(code, 0, 1 + LINK_SIZE + 1);2713code += LINK_SIZE;2714*code++ = 0;2715}2716else2717{2718if (need_map)2719{2720PCRE2_UCHAR *map_start = previous + 1 + LINK_SIZE + 1;2721previous[1 + LINK_SIZE] |= ECL_MAP;2722memmove(map_start + 32/sizeof(PCRE2_UCHAR), map_start,2723CU2BYTES(code - map_start));2724memcpy(map_start, op_info.bits.classbits, 32);2725code += 32 / sizeof(PCRE2_UCHAR);2726}2727PUT(previous, 1, (int)(code - previous));2728}2729}2730#endif /* SUPPORT_WIDE_CHARS */27312732*pcode = code;2733return TRUE;2734}27352736/* End of pcre2_compile_class.c */273727382739