Path: blob/master/thirdparty/pcre2/src/pcre2_compile_class.c
21733 views
/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8Original API code Copyright (c) 1997-2012 University of Cambridge9New API code Copyright (c) 2016-2024 University of Cambridge1011-----------------------------------------------------------------------------12Redistribution and use in source and binary forms, with or without13modification, are permitted provided that the following conditions are met:1415* Redistributions of source code must retain the above copyright notice,16this list of conditions and the following disclaimer.1718* Redistributions in binary form must reproduce the above copyright19notice, this list of conditions and the following disclaimer in the20documentation and/or other materials provided with the distribution.2122* Neither the name of the University of Cambridge nor the names of its23contributors may be used to endorse or promote products derived from24this software without specific prior written permission.2526THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE36POSSIBILITY OF SUCH DAMAGE.37-----------------------------------------------------------------------------38*/394041#include "pcre2_compile.h"42434445typedef struct {46/* Option bits for eclass. */47uint32_t options;48uint32_t xoptions;49/* Rarely used members. */50int *errorcodeptr;51compile_block *cb;52/* Bitmap is needed. */53BOOL needs_bitmap;54} eclass_context;5556/* Checks the allowed tokens at the end of a class structure in debug mode.57When a new token is not processed by all loops, and the token is equals to58a) one of the cases here:59the compiler will complain about a duplicated case value.60b) none of the cases here:61the loop without the handler will stop with an assertion failure. */6263#ifdef PCRE2_DEBUG64#define CLASS_END_CASES(meta) \65default: \66PCRE2_ASSERT((meta) <= META_END); \67PCRE2_FALLTHROUGH /* Fall through */ \68case META_CLASS: \69case META_CLASS_NOT: \70case META_CLASS_EMPTY: \71case META_CLASS_EMPTY_NOT: \72case META_CLASS_END: \73case META_ECLASS_AND: \74case META_ECLASS_OR: \75case META_ECLASS_SUB: \76case META_ECLASS_XOR: \77case META_ECLASS_NOT:78#else79#define CLASS_END_CASES(meta) \80default:81#endif8283#ifdef SUPPORT_WIDE_CHARS8485/* Heapsort algorithm. */8687static void do_heapify(uint32_t *buffer, size_t size, size_t i)88{89size_t max;90size_t left;91size_t right;92uint32_t tmp1, tmp2;9394while (TRUE)95{96max = i;97left = (i << 1) + 2;98right = left + 2;99100if (left < size && buffer[left] > buffer[max]) max = left;101if (right < size && buffer[right] > buffer[max]) max = right;102if (i == max) return;103104/* Swap items. */105tmp1 = buffer[i];106tmp2 = buffer[i + 1];107buffer[i] = buffer[max];108buffer[i + 1] = buffer[max + 1];109buffer[max] = tmp1;110buffer[max + 1] = tmp2;111i = max;112}113}114115#ifdef SUPPORT_UNICODE116117#define PARSE_CLASS_UTF 0x1118#define PARSE_CLASS_CASELESS_UTF 0x2119#define PARSE_CLASS_RESTRICTED_UTF 0x4120#define PARSE_CLASS_TURKISH_UTF 0x8121122/* Get the range of nocase characters which includes the123'c' character passed as argument, or directly follows 'c'. */124125static const uint32_t*126get_nocase_range(uint32_t c)127{128uint32_t left = 0;129uint32_t right = PRIV(ucd_nocase_ranges_size);130uint32_t middle;131132if (c > MAX_UTF_CODE_POINT) return PRIV(ucd_nocase_ranges) + right;133134while (TRUE)135{136/* Range end of the middle element. */137middle = ((left + right) >> 1) | 0x1;138139if (PRIV(ucd_nocase_ranges)[middle] <= c)140left = middle + 1;141else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c)142right = middle - 1;143else144return PRIV(ucd_nocase_ranges) + (middle - 1);145}146}147148/* Get the list of othercase characters, which belongs to the passed range.149Create ranges from these characters, and append them to the buffer argument. */150151static size_t152utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options,153uint32_t *buffer)154{155uint32_t new_start = start;156uint32_t new_end = end;157uint32_t c = start;158const uint32_t *list;159uint32_t tmp[3];160size_t result = 2;161const uint32_t *skip_range = get_nocase_range(c);162uint32_t skip_start = skip_range[0];163164#if PCRE2_CODE_UNIT_WIDTH == 8165PCRE2_ASSERT(options & PARSE_CLASS_UTF);166#endif167168#if PCRE2_CODE_UNIT_WIDTH == 32169if (end > MAX_UTF_CODE_POINT) end = MAX_UTF_CODE_POINT;170#endif171172while (c <= end)173{174uint32_t co;175176if (c > skip_start)177{178c = skip_range[1];179skip_range += 2;180skip_start = skip_range[0];181continue;182}183184/* Compute caseless set. */185186if ((options & (PARSE_CLASS_TURKISH_UTF|PARSE_CLASS_RESTRICTED_UTF)) ==187PARSE_CLASS_TURKISH_UTF &&188UCD_ANY_I(c))189{190co = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);191}192else if ((co = UCD_CASESET(c)) != 0 &&193(options & PARSE_CLASS_RESTRICTED_UTF) != 0 &&194PRIV(ucd_caseless_sets)[co] < 128)195{196co = 0; /* Ignore the caseless set if it's restricted. */197}198199if (co != 0)200list = PRIV(ucd_caseless_sets) + co;201else202{203co = UCD_OTHERCASE(c);204list = tmp;205tmp[0] = c;206tmp[1] = NOTACHAR;207208if (co != c)209{210tmp[1] = co;211tmp[2] = NOTACHAR;212}213}214c++;215216/* Add characters. */217do218{219#if PCRE2_CODE_UNIT_WIDTH == 16220if (!(options & PARSE_CLASS_UTF) && *list > 0xffff) continue;221#endif222223if (*list < new_start)224{225if (*list + 1 == new_start)226{227new_start--;228continue;229}230}231else if (*list > new_end)232{233if (*list - 1 == new_end)234{235new_end++;236continue;237}238}239else continue;240241result += 2;242if (buffer != NULL)243{244buffer[0] = *list;245buffer[1] = *list;246buffer += 2;247}248}249while (*(++list) != NOTACHAR);250}251252if (buffer != NULL)253{254buffer[0] = new_start;255buffer[1] = new_end;256buffer += 2;257(void)buffer;258}259return result;260}261262#endif263264/* Add a character list to a buffer. */265266static size_t267append_char_list(const uint32_t *p, uint32_t *buffer)268{269const uint32_t *n;270size_t result = 0;271272while (*p != NOTACHAR)273{274n = p;275while (n[0] == n[1] - 1) n++;276277PCRE2_ASSERT(*p < 0xffff);278279if (buffer != NULL)280{281buffer[0] = *p;282buffer[1] = *n;283buffer += 2;284}285286result += 2;287p = n + 1;288}289290return result;291}292293static uint32_t294get_highest_char(uint32_t options)295{296(void)options; /* Avoid compiler warning. */297298#if PCRE2_CODE_UNIT_WIDTH == 8299return MAX_UTF_CODE_POINT;300#else301#ifdef SUPPORT_UNICODE302return GET_MAX_CHAR_VALUE((options & PARSE_CLASS_UTF) != 0);303#else304return MAX_UCHAR_VALUE;305#endif306#endif307}308309/* Add a negated character list to a buffer. */310static size_t311append_negated_char_list(const uint32_t *p, uint32_t options, uint32_t *buffer)312{313const uint32_t *n;314uint32_t start = 0;315size_t result = 2;316317PCRE2_ASSERT(*p > 0);318319while (*p != NOTACHAR)320{321n = p;322while (n[0] == n[1] - 1) n++;323324PCRE2_ASSERT(*p < 0xffff);325326if (buffer != NULL)327{328buffer[0] = start;329buffer[1] = *p - 1;330buffer += 2;331}332333result += 2;334start = *n + 1;335p = n + 1;336}337338if (buffer != NULL)339{340buffer[0] = start;341buffer[1] = get_highest_char(options);342buffer += 2;343(void)buffer;344}345346return result;347}348349static uint32_t *350append_non_ascii_range(uint32_t options, uint32_t *buffer)351{352if (buffer == NULL) return NULL;353354buffer[0] = 0x100;355buffer[1] = get_highest_char(options);356return buffer + 2;357}358359static size_t360parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer)361{362size_t total_size = 0;363size_t size;364uint32_t meta_arg;365uint32_t start_char;366367while (TRUE)368{369switch (META_CODE(*ptr))370{371case META_ESCAPE:372meta_arg = META_DATA(*ptr);373switch (meta_arg)374{375case ESC_D:376case ESC_W:377case ESC_S:378buffer = append_non_ascii_range(options, buffer);379total_size += 2;380break;381382case ESC_h:383size = append_char_list(PRIV(hspace_list), buffer);384total_size += size;385if (buffer != NULL) buffer += size;386break;387388case ESC_H:389size = append_negated_char_list(PRIV(hspace_list), options, buffer);390total_size += size;391if (buffer != NULL) buffer += size;392break;393394case ESC_v:395size = append_char_list(PRIV(vspace_list), buffer);396total_size += size;397if (buffer != NULL) buffer += size;398break;399400case ESC_V:401size = append_negated_char_list(PRIV(vspace_list), options, buffer);402total_size += size;403if (buffer != NULL) buffer += size;404break;405406case ESC_p:407case ESC_P:408ptr++;409if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY)410{411if (buffer != NULL)412{413buffer[0] = 0;414buffer[1] = get_highest_char(options);415buffer += 2;416}417total_size += 2;418}419break;420}421ptr++;422continue;423case META_POSIX_NEG:424buffer = append_non_ascii_range(options, buffer);425total_size += 2;426ptr += 2;427continue;428case META_POSIX:429ptr += 2;430continue;431case META_BIGVALUE:432/* Character literal */433ptr++;434break;435CLASS_END_CASES(*ptr)436if (*ptr >= META_END) return total_size;437break;438}439440start_char = *ptr;441442if (ptr[1] == META_RANGE_LITERAL || ptr[1] == META_RANGE_ESCAPED)443{444ptr += 2;445PCRE2_ASSERT(*ptr < META_END || *ptr == META_BIGVALUE);446447if (*ptr == META_BIGVALUE) ptr++;448449#ifdef EBCDIC450#error "Missing EBCDIC support"451#endif452}453454#ifdef SUPPORT_UNICODE455if (options & PARSE_CLASS_CASELESS_UTF)456{457size = utf_caseless_extend(start_char, *ptr++, options, buffer);458if (buffer != NULL) buffer += size;459total_size += size;460continue;461}462#endif463464if (buffer != NULL)465{466buffer[0] = start_char;467buffer[1] = *ptr;468buffer += 2;469}470471ptr++;472total_size += 2;473}474475return total_size;476}477478/* Extra uint32_t values for storing the lengths of range lists in479the worst case. Two uint32_t lengths and a range end for a range480starting before 255 */481#define CHAR_LIST_EXTRA_SIZE 3482483/* Starting character values for each character list. */484485static const uint32_t char_list_starts[] = {486#if PCRE2_CODE_UNIT_WIDTH == 32487XCL_CHAR_LIST_HIGH_32_START,488#endif489#if PCRE2_CODE_UNIT_WIDTH == 32 || defined SUPPORT_UNICODE490XCL_CHAR_LIST_LOW_32_START,491#endif492XCL_CHAR_LIST_HIGH_16_START,493/* Must be terminated by XCL_CHAR_LIST_LOW_16_START,494which also represents the end of the bitset. */495XCL_CHAR_LIST_LOW_16_START,496};497498static class_ranges *499compile_optimize_class(uint32_t *start_ptr, uint32_t options,500uint32_t xoptions, compile_block *cb)501{502class_ranges* cranges;503uint32_t *ptr;504uint32_t *buffer;505uint32_t *dst;506uint32_t class_options = 0;507size_t range_list_size = 0, total_size, i;508uint32_t tmp1, tmp2;509const uint32_t *char_list_next;510uint16_t *next_char;511uint32_t char_list_start, char_list_end;512uint32_t range_start, range_end;513514#ifdef SUPPORT_UNICODE515if (options & PCRE2_UTF)516class_options |= PARSE_CLASS_UTF;517518if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF|PCRE2_UCP)))519class_options |= PARSE_CLASS_CASELESS_UTF;520521if (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT)522class_options |= PARSE_CLASS_RESTRICTED_UTF;523524if (xoptions & PCRE2_EXTRA_TURKISH_CASING)525class_options |= PARSE_CLASS_TURKISH_UTF;526#else527(void)options; /* Avoid compiler warning. */528(void)xoptions; /* Avoid compiler warning. */529#endif530531/* Compute required space for the range. */532533range_list_size = parse_class(start_ptr, class_options, NULL);534PCRE2_ASSERT((range_list_size & 0x1) == 0);535536/* Allocate buffer. The total_size also represents the end of the buffer. */537538total_size = range_list_size +539((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0);540541cranges = cb->cx->memctl.malloc(542sizeof(class_ranges) + total_size * sizeof(uint32_t),543cb->cx->memctl.memory_data);544545if (cranges == NULL) return NULL;546547cranges->header.next = NULL;548#ifdef PCRE2_DEBUG549cranges->header.type = CDATA_CRANGE;550#endif551cranges->range_list_size = (uint16_t)range_list_size;552cranges->char_lists_types = 0;553cranges->char_lists_size = 0;554cranges->char_lists_start = 0;555556if (range_list_size == 0) return cranges;557558buffer = (uint32_t*)(cranges + 1);559parse_class(start_ptr, class_options, buffer);560561/* Using <= instead of == to help static analysis. */562if (range_list_size <= 2) return cranges;563564/* In-place sorting of ranges. */565566i = (((range_list_size >> 2) - 1) << 1);567while (TRUE)568{569do_heapify(buffer, range_list_size, i);570if (i == 0) break;571i -= 2;572}573574i = range_list_size - 2;575while (TRUE)576{577tmp1 = buffer[i];578tmp2 = buffer[i + 1];579buffer[i] = buffer[0];580buffer[i + 1] = buffer[1];581buffer[0] = tmp1;582buffer[1] = tmp2;583584do_heapify(buffer, i, 0);585if (i == 0) break;586i -= 2;587}588589/* Merge ranges whenever possible. */590dst = buffer;591ptr = buffer + 2;592range_list_size -= 2;593594/* The second condition is a very rare corner case, where the end of the last595range is the maximum character. This range cannot be extended further. */596597while (range_list_size > 0 && dst[1] != ~(uint32_t)0)598{599if (dst[1] + 1 < ptr[0])600{601dst += 2;602dst[0] = ptr[0];603dst[1] = ptr[1];604}605else if (dst[1] < ptr[1]) dst[1] = ptr[1];606607ptr += 2;608range_list_size -= 2;609}610611PCRE2_ASSERT(dst[1] <= get_highest_char(class_options));612613/* When the number of ranges are less than six,614they are not converted to range lists. */615616ptr = buffer;617while (ptr < dst && ptr[1] < 0x100) ptr += 2;618if (dst - ptr < (2 * (6 - 1)))619{620cranges->range_list_size = (uint16_t)(dst + 2 - buffer);621return cranges;622}623624/* Compute character lists structures. */625626char_list_next = char_list_starts;627char_list_start = *char_list_next++;628#if PCRE2_CODE_UNIT_WIDTH == 32629char_list_end = XCL_CHAR_LIST_HIGH_32_END;630#elif defined SUPPORT_UNICODE631char_list_end = XCL_CHAR_LIST_LOW_32_END;632#else633char_list_end = XCL_CHAR_LIST_HIGH_16_END;634#endif635next_char = (uint16_t*)(buffer + total_size);636637tmp1 = 0;638tmp2 = ((sizeof(char_list_starts) / sizeof(uint32_t)) - 1) * XCL_TYPE_BIT_LEN;639PCRE2_ASSERT(tmp2 <= 3 * XCL_TYPE_BIT_LEN && tmp2 >= XCL_TYPE_BIT_LEN);640range_start = dst[0];641range_end = dst[1];642643while (TRUE)644{645if (range_start >= char_list_start)646{647if (range_start == range_end || range_end < char_list_end)648{649tmp1++;650next_char--;651652if (char_list_start < XCL_CHAR_LIST_LOW_32_START)653*next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);654else655*(uint32_t*)(--next_char) =656(range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;657}658659if (range_start < range_end)660{661if (range_start > char_list_start)662{663tmp1++;664next_char--;665666if (char_list_start < XCL_CHAR_LIST_LOW_32_START)667*next_char = (uint16_t)(range_start << XCL_CHAR_SHIFT);668else669*(uint32_t*)(--next_char) = (range_start << XCL_CHAR_SHIFT);670}671else672cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;673}674675PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);676677if (dst > buffer)678{679dst -= 2;680range_start = dst[0];681range_end = dst[1];682continue;683}684685range_start = 0;686range_end = 0;687}688689if (range_end >= char_list_start)690{691PCRE2_ASSERT(range_start < char_list_start);692693if (range_end < char_list_end)694{695tmp1++;696next_char--;697698if (char_list_start < XCL_CHAR_LIST_LOW_32_START)699*next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);700else701*(uint32_t*)(--next_char) =702(range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;703704PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);705}706707cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;708}709710if (tmp1 >= XCL_ITEM_COUNT_MASK)711{712cranges->char_lists_types |= XCL_ITEM_COUNT_MASK << tmp2;713next_char--;714715if (char_list_start < XCL_CHAR_LIST_LOW_32_START)716*next_char = (uint16_t)tmp1;717else718*(uint32_t*)(--next_char) = tmp1;719}720else721cranges->char_lists_types |= tmp1 << tmp2;722723if (range_start < XCL_CHAR_LIST_LOW_16_START) break;724725PCRE2_ASSERT(tmp2 >= XCL_TYPE_BIT_LEN);726char_list_end = char_list_start - 1;727char_list_start = *char_list_next++;728tmp1 = 0;729tmp2 -= XCL_TYPE_BIT_LEN;730}731732if (dst[0] < XCL_CHAR_LIST_LOW_16_START) dst += 2;733PCRE2_ASSERT((uint16_t*)dst <= next_char);734735cranges->char_lists_size =736(size_t)((uint8_t*)(buffer + total_size) - (uint8_t*)next_char);737cranges->char_lists_start = (size_t)((uint8_t*)next_char - (uint8_t*)buffer);738cranges->range_list_size = (uint16_t)(dst - buffer);739return cranges;740}741742#endif /* SUPPORT_WIDE_CHARS */743744#ifdef SUPPORT_UNICODE745746void PRIV(update_classbits)(uint32_t ptype, uint32_t pdata, BOOL negated,747uint8_t *classbits)748{749/* Update PRIV(xclass) when this function is changed. */750int c, chartype;751const ucd_record *prop;752uint32_t gentype;753BOOL set_bit;754755if (ptype == PT_ANY)756{757if (!negated) memset(classbits, 0xff, 32);758return;759}760761for (c = 0; c < 256; c++)762{763prop = GET_UCD(c);764set_bit = FALSE;765(void)set_bit;766767switch (ptype)768{769case PT_LAMP:770chartype = prop->chartype;771set_bit = (chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt);772break;773774case PT_GC:775set_bit = (PRIV(ucp_gentype)[prop->chartype] == pdata);776break;777778case PT_PC:779set_bit = (prop->chartype == pdata);780break;781782case PT_SC:783set_bit = (prop->script == pdata);784break;785786case PT_SCX:787set_bit = (prop->script == pdata ||788MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);789break;790791case PT_ALNUM:792gentype = PRIV(ucp_gentype)[prop->chartype];793set_bit = (gentype == ucp_L || gentype == ucp_N);794break;795796case PT_SPACE: /* Perl space */797case PT_PXSPACE: /* POSIX space */798switch(c)799{800HSPACE_BYTE_CASES:801VSPACE_BYTE_CASES:802set_bit = TRUE;803break;804805default:806set_bit = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z);807break;808}809break;810811case PT_WORD:812chartype = prop->chartype;813gentype = PRIV(ucp_gentype)[chartype];814set_bit = (gentype == ucp_L || gentype == ucp_N ||815chartype == ucp_Mn || chartype == ucp_Pc);816break;817818case PT_UCNC:819set_bit = (c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||820c == CHAR_GRAVE_ACCENT || c >= 0xa0);821break;822823case PT_BIDICL:824set_bit = (UCD_BIDICLASS_PROP(prop) == pdata);825break;826827case PT_BOOL:828set_bit = MAPBIT(PRIV(ucd_boolprop_sets) +829UCD_BPROPS_PROP(prop), pdata) != 0;830break;831832case PT_PXGRAPH:833chartype = prop->chartype;834gentype = PRIV(ucp_gentype)[chartype];835set_bit = (gentype != ucp_Z && (gentype != ucp_C || chartype == ucp_Cf));836break;837838case PT_PXPRINT:839chartype = prop->chartype;840set_bit = (chartype != ucp_Zl && chartype != ucp_Zp &&841(PRIV(ucp_gentype)[chartype] != ucp_C || chartype == ucp_Cf));842break;843844case PT_PXPUNCT:845gentype = PRIV(ucp_gentype)[prop->chartype];846set_bit = (gentype == ucp_P || (c < 128 && gentype == ucp_S));847break;848849default:850PCRE2_ASSERT(ptype == PT_PXXDIGIT);851set_bit = (c >= CHAR_0 && c <= CHAR_9) ||852(c >= CHAR_A && c <= CHAR_F) ||853(c >= CHAR_a && c <= CHAR_f);854break;855}856857if (negated) set_bit = !set_bit;858if (set_bit) *classbits |= (uint8_t)(1 << (c & 0x7));859if ((c & 0x7) == 0x7) classbits++;860}861}862863#endif /* SUPPORT_UNICODE */864865866867#ifdef SUPPORT_WIDE_CHARS868869/*************************************************870* XClass related properties *871*************************************************/872873/* XClass needs to be generated. */874#define XCLASS_REQUIRED 0x1875/* XClass has 8 bit character. */876#define XCLASS_HAS_8BIT_CHARS 0x2877/* XClass has properties. */878#define XCLASS_HAS_PROPS 0x4879/* XClass has character lists. */880#define XCLASS_HAS_CHAR_LISTS 0x8881/* XClass matches to all >= 256 characters. */882#define XCLASS_HIGH_ANY 0x10883884#endif885886887/*************************************************888* Internal entry point for add range to class *889*************************************************/890891/* This function sets the overall range for characters < 256.892It also handles non-utf case folding.893894Arguments:895options the options bits896xoptions the extra options bits897cb compile data898start start of range character899end end of range character900901Returns: cb->classbits is updated902*/903904static void905add_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,906uint32_t start, uint32_t end)907{908uint8_t *classbits = cb->classbits.classbits;909uint32_t c, byte_start, byte_end;910uint32_t classbits_end = (end <= 0xff ? end : 0xff);911912#ifndef SUPPORT_UNICODE913(void)xoptions; /* Avoid compiler warning. */914#endif915916/* If caseless matching is required, scan the range and process alternate917cases. In Unicode, there are 8-bit characters that have alternate cases that918are greater than 255 and vice-versa (though these may be ignored if caseless919restriction is in force). Sometimes we can just extend the original range. */920921if ((options & PCRE2_CASELESS) != 0)922{923#ifdef SUPPORT_UNICODE924/* UTF mode. This branch is taken if we don't support wide characters (e.g.9258-bit library, without UTF), but we do treat those characters as Unicode926(if UCP flag is set). In this case, we only need to expand the character class927set to include the case pairs which are in the 0-255 codepoint range. */928if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)929{930BOOL turkish_i = (xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==931PCRE2_EXTRA_TURKISH_CASING;932if (start < 128)933{934uint32_t lo_end = (classbits_end < 127 ? classbits_end : 127);935for (c = start; c <= lo_end; c++)936{937if (turkish_i && UCD_ANY_I(c)) continue;938SETBIT(classbits, cb->fcc[c]);939}940}941if (classbits_end >= 128)942{943uint32_t hi_start = (start > 128 ? start : 128);944for (c = hi_start; c <= classbits_end; c++)945{946uint32_t co = UCD_OTHERCASE(c);947if (co <= 0xff) SETBIT(classbits, co);948}949}950}951952else953#endif /* SUPPORT_UNICODE */954955/* Not UTF mode */956{957for (c = start; c <= classbits_end; c++)958SETBIT(classbits, cb->fcc[c]);959}960}961962/* Use the bitmap for characters < 256. Otherwise use extra data. */963964byte_start = (start + 7) >> 3;965byte_end = (classbits_end + 1) >> 3;966967if (byte_start >= byte_end)968{969for (c = start; c <= classbits_end; c++)970/* Regardless of start, c will always be <= 255. */971SETBIT(classbits, c);972return;973}974975for (c = byte_start; c < byte_end; c++)976classbits[c] = 0xff;977978byte_start <<= 3;979byte_end <<= 3;980981for (c = start; c < byte_start; c++)982SETBIT(classbits, c);983984for (c = byte_end; c <= classbits_end; c++)985SETBIT(classbits, c);986}987988989#if PCRE2_CODE_UNIT_WIDTH == 8990/*************************************************991* Internal entry point for add list to class *992*************************************************/993994/* This function is used for adding a list of horizontal or vertical whitespace995characters to a class. The list must be in order so that ranges of characters996can be detected and handled appropriately. This function sets the overall range997so that the internal functions can try to avoid duplication when handling998case-independence.9991000Arguments:1001options the options bits1002xoptions the extra options bits1003cb contains pointers to tables etc.1004p points to row of 32-bit values, terminated by NOTACHAR10051006Returns: cb->classbits is updated1007*/10081009static void1010add_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,1011const uint32_t *p)1012{1013while (p[0] < 256)1014{1015unsigned int n = 0;10161017while(p[n+1] == p[0] + n + 1) n++;1018add_to_class(options, xoptions, cb, p[0], p[n]);10191020p += n + 1;1021}1022}1023102410251026/*************************************************1027* Add characters not in a list to a class *1028*************************************************/10291030/* This function is used for adding the complement of a list of horizontal or1031vertical whitespace to a class. The list must be in order.10321033Arguments:1034options the options bits1035xoptions the extra options bits1036cb contains pointers to tables etc.1037p points to row of 32-bit values, terminated by NOTACHAR10381039Returns: cb->classbits is updated1040*/10411042static void1043add_not_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,1044const uint32_t *p)1045{1046if (p[0] > 0)1047add_to_class(options, xoptions, cb, 0, p[0] - 1);1048while (p[0] < 256)1049{1050while (p[1] == p[0] + 1) p++;1051add_to_class(options, xoptions, cb, p[0] + 1, (p[1] > 255) ? 255 : p[1] - 1);1052p++;1053}1054}1055#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */1056105710581059/*************************************************1060* Main entry-point to compile a character class *1061*************************************************/10621063/* This function consumes a "leaf", which is a set of characters that will1064become a single OP_CLASS OP_NCLASS, OP_XCLASS, or OP_ALLANY. */10651066uint32_t *1067PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions,1068uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL* has_bitmap,1069int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr)1070{1071uint32_t *pptr = start_ptr;1072PCRE2_UCHAR *code = *pcode;1073BOOL should_flip_negation;1074const uint8_t *cbits = cb->cbits;1075/* Some functions such as add_to_class() or eclass processing1076expects that the bitset is stored in cb->classbits.classbits. */1077uint8_t *const classbits = cb->classbits.classbits;10781079#ifdef SUPPORT_UNICODE1080BOOL utf = (options & PCRE2_UTF) != 0;1081#else /* No Unicode support */1082BOOL utf = FALSE;1083#endif10841085/* Helper variables for OP_XCLASS opcode (for characters > 255). */10861087#ifdef SUPPORT_WIDE_CHARS1088uint32_t xclass_props;1089PCRE2_UCHAR *class_uchardata;1090class_ranges* cranges;1091#else1092(void)has_bitmap; /* Avoid compiler warning. */1093(void)errorcodeptr; /* Avoid compiler warning. */1094(void)lengthptr; /* Avoid compiler warning. */1095#endif10961097/* If an XClass contains a negative special such as \S, we need to flip the1098negation flag at the end, so that support for characters > 255 works correctly1099(they are all included in the class). An XClass may need to insert specific1100matching or non-matching code for wide characters.1101*/11021103should_flip_negation = FALSE;11041105/* XClass will be used when characters > 255 might match. */11061107#ifdef SUPPORT_WIDE_CHARS1108xclass_props = 0;11091110#if PCRE2_CODE_UNIT_WIDTH == 81111cranges = NULL;11121113if (utf)1114#endif1115{1116if (lengthptr != NULL)1117{1118cranges = compile_optimize_class(pptr, options, xoptions, cb);11191120if (cranges == NULL)1121{1122*errorcodeptr = ERR21;1123return NULL;1124}11251126/* Caching the pre-processed character ranges. */1127if (cb->last_data != NULL)1128cb->last_data->next = &cranges->header;1129else1130cb->first_data = &cranges->header;11311132cb->last_data = &cranges->header;1133}1134else1135{1136/* Reuse the pre-processed character ranges. */1137cranges = (class_ranges*)cb->first_data;1138PCRE2_ASSERT(cranges != NULL && cranges->header.type == CDATA_CRANGE);1139cb->first_data = cranges->header.next;1140}11411142if (cranges->range_list_size > 0)1143{1144const uint32_t *ranges = (const uint32_t*)(cranges + 1);11451146if (ranges[0] <= 255)1147xclass_props |= XCLASS_HAS_8BIT_CHARS;11481149if (ranges[cranges->range_list_size - 1] == GET_MAX_CHAR_VALUE(utf) &&1150ranges[cranges->range_list_size - 2] <= 256)1151xclass_props |= XCLASS_HIGH_ANY;1152}1153}11541155class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */1156#endif /* SUPPORT_WIDE_CHARS */11571158/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map1159in a temporary bit of memory, in case the class contains fewer than two11608-bit characters because in that case the compiled code doesn't use the bit1161map. */11621163memset(classbits, 0, 32);11641165/* Process items until end_ptr is reached. */11661167while (TRUE)1168{1169uint32_t meta = *(pptr++);1170BOOL local_negate;1171int posix_class;1172int taboffset, tabopt;1173class_bits_storage pbits;1174uint32_t escape, c;11751176/* Handle POSIX classes such as [:alpha:] etc. */1177switch (META_CODE(meta))1178{1179case META_POSIX:1180case META_POSIX_NEG:11811182local_negate = (meta == META_POSIX_NEG);1183posix_class = *(pptr++);11841185if (local_negate) should_flip_negation = TRUE; /* Note negative special */11861187/* If matching is caseless, upper and lower are converted to alpha.1188This relies on the fact that the class table starts with alpha,1189lower, upper as the first 3 entries. */11901191if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)1192posix_class = 0;11931194/* When PCRE2_UCP is set, some of the POSIX classes are converted to1195different escape sequences that use Unicode properties \p or \P.1196Others that are not available via \p or \P have to generate1197XCL_PROP/XCL_NOTPROP directly, which is done here. */11981199#ifdef SUPPORT_UNICODE1200/* TODO This entire block of code here appears to be unreachable!? I simply1201can't see how it can be hit, given that the frontend parser doesn't emit1202META_POSIX for GRAPH/PRINT/PUNCT when UCP is set. */1203if ((options & PCRE2_UCP) != 0 &&1204(xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)1205{1206uint32_t ptype;12071208switch(posix_class)1209{1210case PC_GRAPH:1211case PC_PRINT:1212case PC_PUNCT:1213ptype = (posix_class == PC_GRAPH)? PT_PXGRAPH :1214(posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT;12151216PRIV(update_classbits)(ptype, 0, local_negate, classbits);12171218if ((xclass_props & XCLASS_HIGH_ANY) == 0)1219{1220if (lengthptr != NULL)1221*lengthptr += 3;1222else1223{1224*class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;1225*class_uchardata++ = (PCRE2_UCHAR)ptype;1226*class_uchardata++ = 0;1227}1228xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;1229}1230continue;12311232/* For the other POSIX classes (ex: ascii) we are going to1233fall through to the non-UCP case and build a bit map for1234characters with code points less than 256. However, if we are in1235a negated POSIX class, characters with code points greater than1236255 must either all match or all not match, depending on whether1237the whole class is not or is negated. For example, for1238[[:^ascii:]... they must all match, whereas for [^[:^ascii:]...1239they must not.12401241In the special case where there are no xclass items, this is1242automatically handled by the use of OP_CLASS or OP_NCLASS, but an1243explicit range is needed for OP_XCLASS. Setting a flag here1244causes the range to be generated later when it is known that1245OP_XCLASS is required. In the 8-bit library this is relevant only in1246utf mode, since no wide characters can exist otherwise. */12471248default:1249break;1250}1251}1252#endif /* SUPPORT_UNICODE */12531254/* In the non-UCP case, or when UCP makes no difference, we build the1255bit map for the POSIX class in a chunk of local store because we may1256be adding and subtracting from it, and we don't want to subtract bits1257that may be in the main map already. At the end we or the result into1258the bit map that is being built. */12591260posix_class *= 3;12611262/* Copy in the first table (always present) */12631264memcpy(pbits.classbits, cbits + PRIV(posix_class_maps)[posix_class], 32);12651266/* If there is a second table, add or remove it as required. */12671268taboffset = PRIV(posix_class_maps)[posix_class + 1];1269tabopt = PRIV(posix_class_maps)[posix_class + 2];12701271if (taboffset >= 0)1272{1273if (tabopt >= 0)1274for (int i = 0; i < 32; i++)1275pbits.classbits[i] |= cbits[i + taboffset];1276else1277for (int i = 0; i < 32; i++)1278pbits.classbits[i] &= (uint8_t)(~cbits[i + taboffset]);1279}12801281/* Now see if we need to remove any special characters. An option1282value of 1 removes vertical space and 2 removes underscore. */12831284if (tabopt < 0) tabopt = -tabopt;1285#ifdef EBCDIC1286{1287uint8_t posix_vertical[4] = { CHAR_LF, CHAR_VT, CHAR_FF, CHAR_CR };1288uint8_t posix_underscore = CHAR_UNDERSCORE;1289uint8_t *chars = NULL;1290int n = 0;12911292if (tabopt == 1) { chars = posix_vertical; n = 4; }1293else if (tabopt == 2) { chars = &posix_underscore; n = 1; }12941295for (; n > 0; ++chars, --n)1296pbits.classbits[*chars/8] &= ~(1u << (*chars&7));1297}1298#else1299if (tabopt == 1) pbits.classbits[1] &= ~0x3c;1300else if (tabopt == 2) pbits.classbits[11] &= 0x7f;1301#endif13021303/* Add the POSIX table or its complement into the main table that is1304being built and we are done. */13051306{1307uint32_t *classwords = cb->classbits.classwords;13081309if (local_negate)1310for (int i = 0; i < 8; i++)1311classwords[i] |= (uint32_t)(~pbits.classwords[i]);1312else1313for (int i = 0; i < 8; i++)1314classwords[i] |= pbits.classwords[i];1315}13161317#ifdef SUPPORT_WIDE_CHARS1318/* Every class contains at least one < 256 character. */1319xclass_props |= XCLASS_HAS_8BIT_CHARS;1320#endif1321continue; /* End of POSIX handling */13221323/* Other than POSIX classes, the only items we should encounter are1324\d-type escapes and literal characters (possibly as ranges). */1325case META_BIGVALUE:1326meta = *(pptr++);1327break;13281329case META_ESCAPE:1330escape = META_DATA(meta);13311332switch(escape)1333{1334case ESC_d:1335for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];1336break;13371338case ESC_D:1339should_flip_negation = TRUE;1340for (int i = 0; i < 32; i++)1341classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);1342break;13431344case ESC_w:1345for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];1346break;13471348case ESC_W:1349should_flip_negation = TRUE;1350for (int i = 0; i < 32; i++)1351classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);1352break;13531354/* Perl 5.004 onwards omitted VT from \s, but restored it at Perl13555.18. Before PCRE 8.34, we had to preserve the VT bit if it was1356previously set by something earlier in the character class.1357Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so1358we could just adjust the appropriate bit. From PCRE 8.34 we no1359longer treat \s and \S specially. */13601361case ESC_s:1362for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];1363break;13641365case ESC_S:1366should_flip_negation = TRUE;1367for (int i = 0; i < 32; i++)1368classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);1369break;13701371/* When adding the horizontal or vertical space lists to a class, or1372their complements, disable PCRE2_CASELESS, because it justs wastes1373time, and in the "not-x" UTF cases can create unwanted duplicates in1374the XCLASS list (provoked by characters that have more than one other1375case and by both cases being in the same "not-x" sublist). */13761377case ESC_h:1378#if PCRE2_CODE_UNIT_WIDTH == 81379#ifdef SUPPORT_UNICODE1380if (cranges != NULL) break;1381#endif1382add_list_to_class(options & ~PCRE2_CASELESS, xoptions,1383cb, PRIV(hspace_list));1384#else1385PCRE2_ASSERT(cranges != NULL);1386#endif1387break;13881389case ESC_H:1390#if PCRE2_CODE_UNIT_WIDTH == 81391#ifdef SUPPORT_UNICODE1392if (cranges != NULL) break;1393#endif1394add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,1395cb, PRIV(hspace_list));1396#else1397PCRE2_ASSERT(cranges != NULL);1398#endif1399break;14001401case ESC_v:1402#if PCRE2_CODE_UNIT_WIDTH == 81403#ifdef SUPPORT_UNICODE1404if (cranges != NULL) break;1405#endif1406add_list_to_class(options & ~PCRE2_CASELESS, xoptions,1407cb, PRIV(vspace_list));1408#else1409PCRE2_ASSERT(cranges != NULL);1410#endif1411break;14121413case ESC_V:1414#if PCRE2_CODE_UNIT_WIDTH == 81415#ifdef SUPPORT_UNICODE1416if (cranges != NULL) break;1417#endif1418add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,1419cb, PRIV(vspace_list));1420#else1421PCRE2_ASSERT(cranges != NULL);1422#endif1423break;14241425/* If Unicode is not supported, \P and \p are not allowed and are1426faulted at parse time, so will never appear here. */14271428#ifdef SUPPORT_UNICODE1429case ESC_p:1430case ESC_P:1431{1432uint32_t ptype = *pptr >> 16;1433uint32_t pdata = *(pptr++) & 0xffff;14341435/* The "Any" is processed by PRIV(update_classbits)(). */1436if (ptype == PT_ANY)1437{1438#if PCRE2_CODE_UNIT_WIDTH == 81439if (!utf && escape == ESC_p) memset(classbits, 0xff, 32);1440#endif1441continue;1442}14431444PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits);14451446if ((xclass_props & XCLASS_HIGH_ANY) == 0)1447{1448if (lengthptr != NULL)1449*lengthptr += 3;1450else1451{1452*class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;1453*class_uchardata++ = ptype;1454*class_uchardata++ = pdata;1455}1456xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;1457}1458}1459continue;1460#endif1461}14621463#ifdef SUPPORT_WIDE_CHARS1464/* Every non-property class contains at least one < 256 character. */1465xclass_props |= XCLASS_HAS_8BIT_CHARS;1466#endif1467/* End handling \d-type escapes */1468continue;14691470CLASS_END_CASES(meta)1471/* Literals. */1472if (meta < META_END) break;1473/* Non-literals: end of class contents. */1474goto END_PROCESSING;1475}14761477/* A literal character may be followed by a range meta. At parse time1478there are checks for out-of-order characters, for ranges where the two1479characters are equal, and for hyphens that cannot indicate a range. At1480this point, therefore, no checking is needed. */14811482c = meta;14831484/* Remember if \r or \n were explicitly used */14851486if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;14871488/* Process a character range */14891490if (*pptr == META_RANGE_LITERAL || *pptr == META_RANGE_ESCAPED)1491{1492uint32_t d;14931494#ifdef EBCDIC1495BOOL range_is_literal = (*pptr == META_RANGE_LITERAL);1496#endif1497++pptr;1498d = *(pptr++);1499if (d == META_BIGVALUE) d = *(pptr++);15001501/* Remember an explicit \r or \n, and add the range to the class. */15021503if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;15041505#if PCRE2_CODE_UNIT_WIDTH == 81506#ifdef SUPPORT_UNICODE1507if (cranges != NULL) continue;1508xclass_props |= XCLASS_HAS_8BIT_CHARS;1509#endif15101511/* In an EBCDIC environment, Perl treats alphabetic ranges specially1512because there are holes in the encoding, and simply using the range1513A-Z (for example) would include the characters in the holes. This1514applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */15151516#ifdef EBCDIC1517if (range_is_literal &&1518(cb->ctypes[c] & ctype_letter) != 0 &&1519(cb->ctypes[d] & ctype_letter) != 0 &&1520(c <= CHAR_z) == (d <= CHAR_z))1521{1522uint32_t uc = (d <= CHAR_z)? 0 : 64;1523uint32_t C = c - uc;1524uint32_t D = d - uc;15251526if (C <= CHAR_i)1527{1528add_to_class(options, xoptions, cb, C + uc,1529((D < CHAR_i)? D : CHAR_i) + uc);1530C = CHAR_j;1531}15321533if (C <= D && C <= CHAR_r)1534{1535add_to_class(options, xoptions, cb, C + uc,1536((D < CHAR_r)? D : CHAR_r) + uc);1537C = CHAR_s;1538}15391540if (C <= D)1541add_to_class(options, xoptions, cb, C + uc, D + uc);1542}1543else1544#endif1545/* Not an EBCDIC special range */15461547add_to_class(options, xoptions, cb, c, d);1548#else1549PCRE2_ASSERT(cranges != NULL);1550#endif1551continue;1552} /* End of range handling */15531554/* Character ranges are ignored when class_ranges is present. */1555#if PCRE2_CODE_UNIT_WIDTH == 81556#ifdef SUPPORT_UNICODE1557if (cranges != NULL) continue;1558xclass_props |= XCLASS_HAS_8BIT_CHARS;1559#endif1560/* Handle a single character. */15611562add_to_class(options, xoptions, cb, meta, meta);1563#else1564PCRE2_ASSERT(cranges != NULL);1565#endif1566} /* End of main class-processing loop */15671568END_PROCESSING:15691570#ifdef SUPPORT_WIDE_CHARS1571PCRE2_ASSERT((xclass_props & XCLASS_HAS_PROPS) == 0 ||1572(xclass_props & XCLASS_HIGH_ANY) == 0);15731574if (cranges != NULL)1575{1576uint32_t *range = (uint32_t*)(cranges + 1);1577uint32_t *end = range + cranges->range_list_size;15781579while (range < end && range[0] < 256)1580{1581PCRE2_ASSERT((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0);1582/* Add range to bitset. If we are in UTF or UCP mode, then clear the1583caseless bit, because the cranges handle caselessness (only) in this1584condition; see the condition for PARSE_CLASS_CASELESS_UTF in1585compile_optimize_class(). */1586add_to_class(((options & (PCRE2_UTF|PCRE2_UCP)) != 0)?1587(options & ~PCRE2_CASELESS) : options, xoptions, cb, range[0], range[1]);15881589if (range[1] > 255) break;1590range += 2;1591}15921593if (cranges->char_lists_size > 0)1594{1595/* The cranges structure is still used and freed later. */1596PCRE2_ASSERT((xclass_props & XCLASS_HIGH_ANY) == 0);1597xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_CHAR_LISTS;1598}1599else1600{1601if ((xclass_props & XCLASS_HIGH_ANY) != 0)1602{1603PCRE2_ASSERT(range + 2 == end && range[0] <= 256 &&1604range[1] >= GET_MAX_CHAR_VALUE(utf));1605should_flip_negation = TRUE;1606range = end;1607}16081609while (range < end)1610{1611uint32_t range_start = range[0];1612uint32_t range_end = range[1];16131614range += 2;1615xclass_props |= XCLASS_REQUIRED;16161617if (range_start < 256) range_start = 256;16181619if (lengthptr != NULL)1620{1621#ifdef SUPPORT_UNICODE1622if (utf)1623{1624*lengthptr += 1;16251626if (range_start < range_end)1627*lengthptr += PRIV(ord2utf)(range_start, class_uchardata);16281629*lengthptr += PRIV(ord2utf)(range_end, class_uchardata);1630continue;1631}1632#endif /* SUPPORT_UNICODE */16331634*lengthptr += range_start < range_end ? 3 : 2;1635continue;1636}16371638#ifdef SUPPORT_UNICODE1639if (utf)1640{1641if (range_start < range_end)1642{1643*class_uchardata++ = XCL_RANGE;1644class_uchardata += PRIV(ord2utf)(range_start, class_uchardata);1645}1646else1647*class_uchardata++ = XCL_SINGLE;16481649class_uchardata += PRIV(ord2utf)(range_end, class_uchardata);1650continue;1651}1652#endif /* SUPPORT_UNICODE */16531654/* Without UTF support, character values are constrained1655by the bit length, and can only be > 256 for 16-bit and165632-bit libraries. */1657#if PCRE2_CODE_UNIT_WIDTH != 81658if (range_start < range_end)1659{1660*class_uchardata++ = XCL_RANGE;1661*class_uchardata++ = range_start;1662}1663else1664*class_uchardata++ = XCL_SINGLE;16651666*class_uchardata++ = range_end;1667#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */1668}16691670if (lengthptr == NULL)1671cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);1672}1673}1674#endif /* SUPPORT_WIDE_CHARS */16751676/* If there are characters with values > 255, or Unicode property settings1677(\p or \P), we have to compile an extended class, with its own opcode,1678unless there were no property settings and there was a negated special such1679as \S in the class, and PCRE2_UCP is not set, because in that case all1680characters > 255 are in or not in the class, so any that were explicitly1681given as well can be ignored.16821683In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were1684were present in a class, we either have to match or not match all wide1685characters (depending on whether the whole class is or is not negated).1686This requirement is indicated by match_all_or_no_wide_chars being true.1687We do this by including an explicit range, which works in both cases.1688This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there1689cannot be any wide characters in 8-bit non-UTF mode.16901691When there *are* properties in a positive UTF-8 or any 16-bit or 32_bit1692class where \S etc is present without PCRE2_UCP, causing an extended class1693to be compiled, we make sure that all characters > 255 are included by1694forcing match_all_or_no_wide_chars to be true.16951696If, when generating an xclass, there are no characters < 256, we can omit1697the bitmap in the actual compiled code. */16981699#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */1700if ((xclass_props & XCLASS_REQUIRED) != 0)1701{1702PCRE2_UCHAR *previous = code;17031704if ((xclass_props & XCLASS_HAS_CHAR_LISTS) == 0)1705*class_uchardata++ = XCL_END; /* Marks the end of extra data */1706*code++ = OP_XCLASS;1707code += LINK_SIZE;1708*code = negate_class? XCL_NOT:0;1709if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP;17101711/* If the map is required, move up the extra data to make room for it;1712otherwise just move the code pointer to the end of the extra data. */17131714if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0 || has_bitmap != NULL)1715{1716if (negate_class)1717{1718uint32_t *classwords = cb->classbits.classwords;1719for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];1720}17211722if (has_bitmap == NULL)1723{1724*code++ |= XCL_MAP;1725(void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,1726CU2BYTES(class_uchardata - code));1727memcpy(code, classbits, 32);1728code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));1729}1730else1731{1732code = class_uchardata;1733if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0)1734*has_bitmap = TRUE;1735}1736}1737else code = class_uchardata;17381739if ((xclass_props & XCLASS_HAS_CHAR_LISTS) != 0)1740{1741/* Char lists size is an even number, because all items are 16 or 321742bit values. The character list data is always aligned to 32 bits. */1743size_t char_lists_size = cranges->char_lists_size;1744PCRE2_ASSERT((char_lists_size & 0x1) == 0 &&1745(cb->char_lists_size & 0x3) == 0);17461747if (lengthptr != NULL)1748{1749char_lists_size = CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));17501751#if PCRE2_CODE_UNIT_WIDTH == 81752*lengthptr += 2 + LINK_SIZE;1753#else1754*lengthptr += 1 + LINK_SIZE;1755#endif17561757cb->char_lists_size += char_lists_size;17581759char_lists_size /= sizeof(PCRE2_UCHAR);17601761/* Storage space for character lists is included1762in the maximum pattern size. */1763if (*lengthptr > MAX_PATTERN_SIZE ||1764MAX_PATTERN_SIZE - *lengthptr < char_lists_size)1765{1766*errorcodeptr = ERR20; /* Pattern is too large */1767return NULL;1768}1769}1770else1771{1772uint8_t *data;17731774PCRE2_ASSERT(cranges->char_lists_types <= XCL_TYPE_MASK);1775#if PCRE2_CODE_UNIT_WIDTH == 81776/* Encode as high / low bytes. */1777code[0] = (uint8_t)(XCL_LIST |1778(cranges->char_lists_types >> 8));1779code[1] = (uint8_t)cranges->char_lists_types;1780code += 2;1781#else1782*code++ = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types);1783#endif17841785/* Character lists are stored in backwards direction from1786byte code start. The non-dfa/dfa matchers can access these1787lists using the byte code start stored in match blocks.1788Each list is aligned to 32 bit with an optional unused178916 bit value at the beginning of the character list. */17901791cb->char_lists_size += char_lists_size;1792data = (uint8_t*)cb->start_code - cb->char_lists_size;17931794memcpy(data, (uint8_t*)(cranges + 1) + cranges->char_lists_start,1795char_lists_size);17961797/* Since character lists total size is less than MAX_PATTERN_SIZE,1798their starting offset fits into a value which size is LINK_SIZE. */17991800char_lists_size = cb->char_lists_size;1801PUT(code, 0, (uint32_t)(char_lists_size >> 1));1802code += LINK_SIZE;18031804#if defined PCRE2_DEBUG || defined SUPPORT_VALGRIND1805if ((char_lists_size & 0x2) != 0)1806{1807/* In debug the unused 16 bit value is set1808to a fixed value and marked unused. */1809((uint16_t*)data)[-1] = 0x5555;1810#ifdef SUPPORT_VALGRIND1811VALGRIND_MAKE_MEM_NOACCESS(data - 2, 2);1812#endif1813}1814#endif18151816cb->char_lists_size =1817CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));18181819cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);1820}1821}18221823/* Now fill in the complete length of the item */18241825PUT(previous, 1, (int)(code - previous));1826goto DONE; /* End of class handling */1827}1828#endif /* SUPPORT_WIDE_CHARS */18291830/* If there are no characters > 255, or they are all to be included or1831excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the1832whole class was negated and whether there were negative specials such as \S1833(non-UCP) in the class. Then copy the 32-byte map into the code vector,1834negating it if necessary. */18351836if (negate_class)1837{1838uint32_t *classwords = cb->classbits.classwords;18391840for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];1841}18421843if ((SELECT_VALUE8(!utf, 0) || negate_class != should_flip_negation) &&1844cb->classbits.classwords[0] == ~(uint32_t)0)1845{1846const uint32_t *classwords = cb->classbits.classwords;1847int i;18481849for (i = 0; i < 8; i++)1850if (classwords[i] != ~(uint32_t)0) break;18511852if (i == 8)1853{1854*code++ = OP_ALLANY;1855goto DONE; /* End of class handling */1856}1857}18581859*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;1860memcpy(code, classbits, 32);1861code += 32 / sizeof(PCRE2_UCHAR);18621863DONE:1864*pcode = code;1865return pptr - 1;1866}1867186818691870/* ===================================================================*/1871/* Here follows a block of ECLASS-compiling functions. You may well want to1872read them from top to bottom; they are ordered from leafmost (at the top) to1873outermost parser (at the bottom of the file). */18741875/* This function folds one operand using the negation operator.1876The new, combined chunk of stack code is written out to *pop_info. */18771878static void1879fold_negation(eclass_op_info *pop_info, PCRE2_SIZE *lengthptr,1880BOOL preserve_classbits)1881{1882/* If the chunk of stack code is already composed of multiple ops, we won't1883descend in and try and propagate the negation down the tree. (That would lead1884to O(n^2) compile-time, which could be exploitable with a malicious regex -1885although maybe that's not really too much of a worry in a library that offers1886an exponential-time matching function!) */18871888if (pop_info->op_single_type == 0)1889{1890if (lengthptr != NULL)1891*lengthptr += 1;1892else1893pop_info->code_start[pop_info->length] = ECL_NOT;1894pop_info->length += 1;1895}18961897/* Otherwise, it's a nice single-op item, so we can easily fold in the negation1898without needing to produce an ECL_NOT. */18991900else if (pop_info->op_single_type == ECL_ANY ||1901pop_info->op_single_type == ECL_NONE)1902{1903pop_info->op_single_type = (pop_info->op_single_type == ECL_NONE)?1904ECL_ANY : ECL_NONE;1905if (lengthptr == NULL)1906*(pop_info->code_start) = pop_info->op_single_type;1907}1908else1909{1910PCRE2_ASSERT(pop_info->op_single_type == ECL_XCLASS &&1911pop_info->length >= 1 + LINK_SIZE + 1);1912if (lengthptr == NULL)1913pop_info->code_start[1 + LINK_SIZE] ^= XCL_NOT;1914}19151916if (!preserve_classbits)1917{1918for (int i = 0; i < 8; i++)1919pop_info->bits.classwords[i] = ~pop_info->bits.classwords[i];1920}1921}1922192319241925/* This function folds together two operands using a binary operator.1926The new, combined chunk of stack code is written out to *lhs_op_info. */19271928static void1929fold_binary(int op, eclass_op_info *lhs_op_info, eclass_op_info *rhs_op_info,1930PCRE2_SIZE *lengthptr)1931{1932switch (op)1933{1934/* ECL_AND truth table:19351936LHS RHS RESULT1937----------------1938ANY * RHS1939* ANY LHS1940NONE * NONE1941* NONE NONE1942X Y X & Y1943*/19441945case ECL_AND:1946if (rhs_op_info->op_single_type == ECL_ANY)1947{1948/* no-op: drop the RHS */1949}1950else if (lhs_op_info->op_single_type == ECL_ANY)1951{1952/* no-op: drop the LHS, and memmove the RHS into its place */1953if (lengthptr == NULL)1954memmove(lhs_op_info->code_start, rhs_op_info->code_start,1955CU2BYTES(rhs_op_info->length));1956lhs_op_info->length = rhs_op_info->length;1957lhs_op_info->op_single_type = rhs_op_info->op_single_type;1958}1959else if (rhs_op_info->op_single_type == ECL_NONE)1960{1961/* the result is ECL_NONE: write into the LHS */1962if (lengthptr == NULL)1963lhs_op_info->code_start[0] = ECL_NONE;1964lhs_op_info->length = 1;1965lhs_op_info->op_single_type = ECL_NONE;1966}1967else if (lhs_op_info->op_single_type == ECL_NONE)1968{1969/* the result is ECL_NONE: drop the RHS */1970}1971else1972{1973/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */1974if (lengthptr != NULL)1975*lengthptr += 1;1976else1977{1978PCRE2_ASSERT(rhs_op_info->code_start ==1979lhs_op_info->code_start + lhs_op_info->length);1980rhs_op_info->code_start[rhs_op_info->length] = ECL_AND;1981}1982lhs_op_info->length += rhs_op_info->length + 1;1983lhs_op_info->op_single_type = 0;1984}19851986for (int i = 0; i < 8; i++)1987lhs_op_info->bits.classwords[i] &= rhs_op_info->bits.classwords[i];1988break;19891990/* ECL_OR truth table:19911992LHS RHS RESULT1993----------------1994ANY * ANY1995* ANY ANY1996NONE * RHS1997* NONE LHS1998X Y X | Y1999*/20002001case ECL_OR:2002if (rhs_op_info->op_single_type == ECL_NONE)2003{2004/* no-op: drop the RHS */2005}2006else if (lhs_op_info->op_single_type == ECL_NONE)2007{2008/* no-op: drop the LHS, and memmove the RHS into its place */2009if (lengthptr == NULL)2010memmove(lhs_op_info->code_start, rhs_op_info->code_start,2011CU2BYTES(rhs_op_info->length));2012lhs_op_info->length = rhs_op_info->length;2013lhs_op_info->op_single_type = rhs_op_info->op_single_type;2014}2015else if (rhs_op_info->op_single_type == ECL_ANY)2016{2017/* the result is ECL_ANY: write into the LHS */2018if (lengthptr == NULL)2019lhs_op_info->code_start[0] = ECL_ANY;2020lhs_op_info->length = 1;2021lhs_op_info->op_single_type = ECL_ANY;2022}2023else if (lhs_op_info->op_single_type == ECL_ANY)2024{2025/* the result is ECL_ANY: drop the RHS */2026}2027else2028{2029/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */2030if (lengthptr != NULL)2031*lengthptr += 1;2032else2033{2034PCRE2_ASSERT(rhs_op_info->code_start ==2035lhs_op_info->code_start + lhs_op_info->length);2036rhs_op_info->code_start[rhs_op_info->length] = ECL_OR;2037}2038lhs_op_info->length += rhs_op_info->length + 1;2039lhs_op_info->op_single_type = 0;2040}20412042for (int i = 0; i < 8; i++)2043lhs_op_info->bits.classwords[i] |= rhs_op_info->bits.classwords[i];2044break;20452046/* ECL_XOR truth table:20472048LHS RHS RESULT2049----------------2050ANY * !RHS2051* ANY !LHS2052NONE * RHS2053* NONE LHS2054X Y X ^ Y2055*/20562057case ECL_XOR:2058if (rhs_op_info->op_single_type == ECL_NONE)2059{2060/* no-op: drop the RHS */2061}2062else if (lhs_op_info->op_single_type == ECL_NONE)2063{2064/* no-op: drop the LHS, and memmove the RHS into its place */2065if (lengthptr == NULL)2066memmove(lhs_op_info->code_start, rhs_op_info->code_start,2067CU2BYTES(rhs_op_info->length));2068lhs_op_info->length = rhs_op_info->length;2069lhs_op_info->op_single_type = rhs_op_info->op_single_type;2070}2071else if (rhs_op_info->op_single_type == ECL_ANY)2072{2073/* the result is !LHS: fold in the negation, and drop the RHS */2074/* Preserve the classbits, because we promise to deal with them later. */2075fold_negation(lhs_op_info, lengthptr, TRUE);2076}2077else if (lhs_op_info->op_single_type == ECL_ANY)2078{2079/* the result is !RHS: drop the LHS, memmove the RHS into its place, and2080fold in the negation */2081if (lengthptr == NULL)2082memmove(lhs_op_info->code_start, rhs_op_info->code_start,2083CU2BYTES(rhs_op_info->length));2084lhs_op_info->length = rhs_op_info->length;2085lhs_op_info->op_single_type = rhs_op_info->op_single_type;20862087/* Preserve the classbits, because we promise to deal with them later. */2088fold_negation(lhs_op_info, lengthptr, TRUE);2089}2090else2091{2092/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */2093if (lengthptr != NULL)2094*lengthptr += 1;2095else2096{2097PCRE2_ASSERT(rhs_op_info->code_start ==2098lhs_op_info->code_start + lhs_op_info->length);2099rhs_op_info->code_start[rhs_op_info->length] = ECL_XOR;2100}2101lhs_op_info->length += rhs_op_info->length + 1;2102lhs_op_info->op_single_type = 0;2103}21042105for (int i = 0; i < 8; i++)2106lhs_op_info->bits.classwords[i] ^= rhs_op_info->bits.classwords[i];2107break;21082109/* LCOV_EXCL_START */2110default:2111PCRE2_DEBUG_UNREACHABLE();2112break;2113/* LCOV_EXCL_STOP */2114}2115}2116211721182119static BOOL2120compile_eclass_nested(eclass_context *context, BOOL negated,2121uint32_t **pptr, PCRE2_UCHAR **pcode,2122eclass_op_info *pop_info, PCRE2_SIZE *lengthptr);21232124/* This function consumes a group of implicitly-unioned class elements.2125These can be characters, ranges, properties, or nested classes, as long2126as they are all joined by being placed adjacently. */21272128static BOOL2129compile_class_operand(eclass_context *context, BOOL negated,2130uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,2131PCRE2_SIZE *lengthptr)2132{2133uint32_t *ptr = *pptr;2134uint32_t *prev_ptr;2135PCRE2_UCHAR *code = *pcode;2136PCRE2_UCHAR *code_start = code;2137PCRE2_SIZE prev_length = (lengthptr != NULL)? *lengthptr : 0;2138PCRE2_SIZE extra_length;2139uint32_t meta = META_CODE(*ptr);21402141switch (meta)2142{2143case META_CLASS_EMPTY_NOT:2144case META_CLASS_EMPTY:2145++ptr;2146pop_info->length = 1;2147if ((meta == META_CLASS_EMPTY) == negated)2148{2149*code++ = pop_info->op_single_type = ECL_ANY;2150memset(pop_info->bits.classbits, 0xff, 32);2151}2152else2153{2154*code++ = pop_info->op_single_type = ECL_NONE;2155memset(pop_info->bits.classbits, 0, 32);2156}2157break;21582159case META_CLASS:2160case META_CLASS_NOT:2161if ((*ptr & CLASS_IS_ECLASS) != 0)2162{2163if (!compile_eclass_nested(context, negated, &ptr, &code,2164pop_info, lengthptr))2165return FALSE;21662167PCRE2_ASSERT(*ptr == META_CLASS_END);2168ptr++;2169goto DONE;2170}21712172ptr++;2173PCRE2_FALLTHROUGH /* Fall through */21742175default:2176/* Scan forward characters, ranges, and properties.2177For example: inside [a-z_ -- m] we don't have brackets around "a-z_" but2178we still need to collect that fragment up into a "leaf" OP_CLASS. */21792180prev_ptr = ptr;2181ptr = PRIV(compile_class_not_nested)(2182context->options, context->xoptions, ptr, &code,2183(meta != META_CLASS_NOT) == negated, &context->needs_bitmap,2184context->errorcodeptr, context->cb, lengthptr);2185if (ptr == NULL) return FALSE;21862187/* We must have a 100% guarantee that ptr increases when2188compile_class_operand() returns, even on Release builds, so that we can2189statically prove our loops terminate. */2190/* LCOV_EXCL_START */2191if (ptr <= prev_ptr)2192{2193PCRE2_DEBUG_UNREACHABLE();2194return FALSE;2195}2196/* LCOV_EXCL_STOP */21972198/* If we fell through above, consume the closing ']'. */2199if (meta == META_CLASS || meta == META_CLASS_NOT)2200{2201PCRE2_ASSERT(*ptr == META_CLASS_END);2202ptr++;2203}22042205/* Regardless of whether (lengthptr == NULL), some data will still be written2206out to *pcode, which we need: we have to peek at it, to transform the opcode2207into the ECLASS version (since we need to hoist up the bitmaps). */2208PCRE2_ASSERT(code > code_start);2209extra_length = (lengthptr != NULL)? *lengthptr - prev_length : 0;22102211/* Easiest case: convert OP_ALLANY to ECL_ANY */22122213if (*code_start == OP_ALLANY)2214{2215PCRE2_ASSERT(code - code_start == 1 && extra_length == 0);2216pop_info->length = 1;2217*code_start = pop_info->op_single_type = ECL_ANY;2218memset(pop_info->bits.classbits, 0xff, 32);2219}22202221/* For OP_CLASS and OP_NCLASS, we hoist out the bitmap and convert to2222ECL_NONE / ECL_ANY respectively. */22232224else if (*code_start == OP_CLASS || *code_start == OP_NCLASS)2225{2226PCRE2_ASSERT(code - code_start == 1 + 32 / sizeof(PCRE2_UCHAR) &&2227extra_length == 0);2228pop_info->length = 1;2229*code_start = pop_info->op_single_type =2230(*code_start == OP_CLASS)? ECL_NONE : ECL_ANY;2231memcpy(pop_info->bits.classbits, code_start + 1, 32);2232/* Rewind the code pointer, but make sure we adjust *lengthptr, because we2233do need to reserve that space (even though we only use it temporarily). */2234if (lengthptr != NULL)2235*lengthptr += code - (code_start + 1);2236code = code_start + 1;22372238if (!context->needs_bitmap && *code_start == ECL_NONE)2239{2240uint32_t *classwords = pop_info->bits.classwords;22412242for (int i = 0; i < 8; i++)2243if (classwords[i] != 0)2244{2245context->needs_bitmap = TRUE;2246break;2247}2248}2249else2250context->needs_bitmap = TRUE;2251}22522253/* Finally, for OP_XCLASS we hoist out the bitmap (if any), and convert to2254ECL_XCLASS. */22552256else2257{2258PCRE2_ASSERT(*code_start == OP_XCLASS);2259*code_start = pop_info->op_single_type = ECL_XCLASS;22602261PCRE2_ASSERT(code - code_start >= 1 + LINK_SIZE + 1);22622263memcpy(pop_info->bits.classbits, context->cb->classbits.classbits, 32);2264pop_info->length = (code - code_start) + extra_length;2265}22662267break;2268} /* End of switch(meta) */22692270pop_info->code_start = (lengthptr == NULL)? code_start : NULL;22712272if (lengthptr != NULL)2273{2274*lengthptr += code - code_start;2275code = code_start;2276}22772278DONE:2279PCRE2_ASSERT(lengthptr == NULL || (code == code_start));22802281*pptr = ptr;2282*pcode = code;2283return TRUE;2284}2285228622872288/* This function consumes a group of implicitly-unioned class elements.2289These can be characters, ranges, properties, or nested classes, as long2290as they are all joined by being placed adjacently. */22912292static BOOL2293compile_class_juxtaposition(eclass_context *context, BOOL negated,2294uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,2295PCRE2_SIZE *lengthptr)2296{2297uint32_t *ptr = *pptr;2298PCRE2_UCHAR *code = *pcode;2299#ifdef PCRE2_DEBUG2300PCRE2_UCHAR *start_code = *pcode;2301#endif23022303/* See compile_class_binary_loose() for comments on compile-time folding of2304the "negated" flag. */23052306/* Because it's a non-empty class, there must be an operand at the start. */2307if (!compile_class_operand(context, negated, &ptr, &code, pop_info, lengthptr))2308return FALSE;23092310while (*ptr != META_CLASS_END &&2311!(*ptr >= META_ECLASS_AND && *ptr <= META_ECLASS_NOT))2312{2313uint32_t op;2314BOOL rhs_negated;2315eclass_op_info rhs_op_info;23162317if (negated)2318{2319/* !(A juxtapose B) -> !A && !B */2320op = ECL_AND;2321rhs_negated = TRUE;2322}2323else2324{2325/* A juxtapose B -> A || B */2326op = ECL_OR;2327rhs_negated = FALSE;2328}23292330/* An operand must follow the operator. */2331if (!compile_class_operand(context, rhs_negated, &ptr, &code,2332&rhs_op_info, lengthptr))2333return FALSE;23342335/* Convert infix to postfix (RPN). */2336fold_binary(op, pop_info, &rhs_op_info, lengthptr);2337if (lengthptr == NULL)2338code = pop_info->code_start + pop_info->length;2339}23402341PCRE2_ASSERT(lengthptr == NULL || code == start_code);23422343*pptr = ptr;2344*pcode = code;2345return TRUE;2346}2347234823492350/* This function consumes unary prefix operators. */23512352static BOOL2353compile_class_unary(eclass_context *context, BOOL negated,2354uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,2355PCRE2_SIZE *lengthptr)2356{2357uint32_t *ptr = *pptr;2358#ifdef PCRE2_DEBUG2359PCRE2_UCHAR *start_code = *pcode;2360#endif23612362while (*ptr == META_ECLASS_NOT)2363{2364++ptr;2365negated = !negated;2366}23672368*pptr = ptr;2369/* Because it's a non-empty class, there must be an operand. */2370if (!compile_class_juxtaposition(context, negated, pptr, pcode,2371pop_info, lengthptr))2372return FALSE;23732374PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);2375return TRUE;2376}2377237823792380/* This function consumes tightly-binding binary operators. */23812382static BOOL2383compile_class_binary_tight(eclass_context *context, BOOL negated,2384uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,2385PCRE2_SIZE *lengthptr)2386{2387uint32_t *ptr = *pptr;2388PCRE2_UCHAR *code = *pcode;2389#ifdef PCRE2_DEBUG2390PCRE2_UCHAR *start_code = *pcode;2391#endif23922393/* See compile_class_binary_loose() for comments on compile-time folding of2394the "negated" flag. */23952396/* Because it's a non-empty class, there must be an operand at the start. */2397if (!compile_class_unary(context, negated, &ptr, &code, pop_info, lengthptr))2398return FALSE;23992400while (*ptr == META_ECLASS_AND)2401{2402uint32_t op;2403BOOL rhs_negated;2404eclass_op_info rhs_op_info;24052406if (negated)2407{2408/* !(A && B) -> !A || !B */2409op = ECL_OR;2410rhs_negated = TRUE;2411}2412else2413{2414/* A && B -> A && B */2415op = ECL_AND;2416rhs_negated = FALSE;2417}24182419++ptr;24202421/* An operand must follow the operator. */2422if (!compile_class_unary(context, rhs_negated, &ptr, &code,2423&rhs_op_info, lengthptr))2424return FALSE;24252426/* Convert infix to postfix (RPN). */2427fold_binary(op, pop_info, &rhs_op_info, lengthptr);2428if (lengthptr == NULL)2429code = pop_info->code_start + pop_info->length;2430}24312432PCRE2_ASSERT(lengthptr == NULL || code == start_code);24332434*pptr = ptr;2435*pcode = code;2436return TRUE;2437}2438243924402441/* This function consumes loosely-binding binary operators. */24422443static BOOL2444compile_class_binary_loose(eclass_context *context, BOOL negated,2445uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,2446PCRE2_SIZE *lengthptr)2447{2448uint32_t *ptr = *pptr;2449PCRE2_UCHAR *code = *pcode;2450#ifdef PCRE2_DEBUG2451PCRE2_UCHAR *start_code = *pcode;2452#endif24532454/* We really want to fold the negation operator, if at all possible, so that2455simple cases can be reduced down. In particular, in 8-bit no-UTF mode, we want2456to produce a fully-folded expression, so that we can guarantee not to emit any2457OP_ECLASS codes (in the same way that we never emit OP_XCLASS in this mode).24582459This has the consequence that with a little ingenuity, we can in fact avoid2460emitting (nearly...) all cases of the "NOT" operator. Imagine that we have:2461!(A ...2462We have parsed the preceding "!", and we are about to parse the "A" operand. We2463don't know yet whether there will even be a following binary operand! Both of2464these are possibilities for what follows:2465!(A && B)2466!(A)2467However, we can still fold the "!" into the "A" operand, because no matter what2468the following binary operator will be, we can produce an expression which is2469equivalent. */24702471/* Because it's a non-empty class, there must be an operand at the start. */2472if (!compile_class_binary_tight(context, negated, &ptr, &code,2473pop_info, lengthptr))2474return FALSE;24752476while (*ptr >= META_ECLASS_OR && *ptr <= META_ECLASS_XOR)2477{2478uint32_t op;2479BOOL op_neg;2480BOOL rhs_negated;2481eclass_op_info rhs_op_info;24822483if (negated)2484{2485/* The whole expression is being negated; we respond by unconditionally2486negating the LHS A, before seeing what follows. And hooray! We can recover,2487no matter what follows. */2488/* !(A || B) -> !A && !B */2489/* !(A -- B) -> !(A && !B) -> !A || B */2490/* !(A XOR B) -> !(!A XOR !B) -> !A XNOR !B */2491op = (*ptr == META_ECLASS_OR )? ECL_AND :2492(*ptr == META_ECLASS_SUB)? ECL_OR :2493/*ptr == META_ECLASS_XOR*/ ECL_XOR;2494op_neg = (*ptr == META_ECLASS_XOR);2495rhs_negated = *ptr != META_ECLASS_SUB;2496}2497else2498{2499/* A || B -> A || B */2500/* A -- B -> A && !B */2501/* A XOR B -> A XOR B */2502op = (*ptr == META_ECLASS_OR )? ECL_OR :2503(*ptr == META_ECLASS_SUB)? ECL_AND :2504/*ptr == META_ECLASS_XOR*/ ECL_XOR;2505op_neg = FALSE;2506rhs_negated = *ptr == META_ECLASS_SUB;2507}25082509++ptr;25102511/* An operand must follow the operator. */2512if (!compile_class_binary_tight(context, rhs_negated, &ptr, &code,2513&rhs_op_info, lengthptr))2514return FALSE;25152516/* Convert infix to postfix (RPN). */2517fold_binary(op, pop_info, &rhs_op_info, lengthptr);2518if (op_neg) fold_negation(pop_info, lengthptr, FALSE);2519if (lengthptr == NULL)2520code = pop_info->code_start + pop_info->length;2521}25222523PCRE2_ASSERT(lengthptr == NULL || code == start_code);25242525*pptr = ptr;2526*pcode = code;2527return TRUE;2528}2529253025312532/* This function converts the META codes in pptr into opcodes written to2533pcode. The pptr must start at a META_CLASS or META_CLASS_NOT.25342535The class is compiled as a left-associative sequence of operator2536applications.25372538The pptr will be left pointing at the matching META_CLASS_END. */25392540static BOOL2541compile_eclass_nested(eclass_context *context, BOOL negated,2542uint32_t **pptr, PCRE2_UCHAR **pcode,2543eclass_op_info *pop_info, PCRE2_SIZE *lengthptr)2544{2545uint32_t *ptr = *pptr;2546#ifdef PCRE2_DEBUG2547PCRE2_UCHAR *start_code = *pcode;2548#endif25492550/* The CLASS_IS_ECLASS bit must be set since it is a nested class. */2551PCRE2_ASSERT(*ptr == (META_CLASS | CLASS_IS_ECLASS) ||2552*ptr == (META_CLASS_NOT | CLASS_IS_ECLASS));25532554if (*ptr++ == (META_CLASS_NOT | CLASS_IS_ECLASS))2555negated = !negated;25562557(*pptr)++;25582559/* Because it's a non-empty class, there must be an operand at the start. */2560if (!compile_class_binary_loose(context, negated, pptr, pcode,2561pop_info, lengthptr))2562return FALSE;25632564PCRE2_ASSERT(**pptr == META_CLASS_END);2565PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);2566return TRUE;2567}25682569BOOL2570PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions,2571uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr,2572compile_block *cb, PCRE2_SIZE *lengthptr)2573{2574eclass_context context;2575eclass_op_info op_info;2576PCRE2_SIZE previous_length = (lengthptr != NULL)? *lengthptr : 0;2577PCRE2_UCHAR *code = *pcode;2578PCRE2_UCHAR *previous;2579BOOL allbitsone = TRUE;25802581context.needs_bitmap = FALSE;2582context.options = options;2583context.xoptions = xoptions;2584context.errorcodeptr = errorcodeptr;2585context.cb = cb;25862587previous = code;2588*code++ = OP_ECLASS;2589code += LINK_SIZE;2590*code++ = 0; /* Flags, currently zero. */2591if (!compile_eclass_nested(&context, FALSE, pptr, &code, &op_info, lengthptr))2592return FALSE;25932594if (lengthptr != NULL)2595{2596*lengthptr += code - previous;2597code = previous;2598/* (*lengthptr - previous_length) now holds the amount of buffer that2599we require to make the call to compile_class_nested() with2600lengthptr = NULL, and including the (1+LINK_SIZE+1) that we write out2601before that call. */2602}26032604/* Do some useful counting of what's in the bitmap. */2605for (int i = 0; i < 8; i++)2606if (op_info.bits.classwords[i] != 0xffffffff)2607{2608allbitsone = FALSE;2609break;2610}26112612/* After constant-folding the extended class syntax, it may turn out to be2613a simple class after all. In that case, we can unwrap it from the2614OP_ECLASS container - and in fact, we must do so, because in 8-bit2615no-Unicode mode the matcher is compiled without support for OP_ECLASS. */26162617#ifndef SUPPORT_WIDE_CHARS2618PCRE2_ASSERT(op_info.op_single_type != 0);2619#else2620if (op_info.op_single_type != 0)2621#endif2622{2623/* Rewind back over the OP_ECLASS. */2624code = previous;26252626/* If the bits are all ones, and the "high characters" are all matched2627too, we use a special-cased encoding of OP_ALLANY. */26282629if (op_info.op_single_type == ECL_ANY && allbitsone)2630{2631/* Advancing code means rewinding lengthptr, at this point. */2632if (lengthptr != NULL) *lengthptr -= 1;2633*code++ = OP_ALLANY;2634}26352636/* If the high bits are all matched / all not-matched, then we emit an2637OP_NCLASS/OP_CLASS respectively. */26382639else if (op_info.op_single_type == ECL_ANY ||2640op_info.op_single_type == ECL_NONE)2641{2642PCRE2_SIZE required_len = 1 + (32 / sizeof(PCRE2_UCHAR));26432644if (lengthptr != NULL)2645{2646if (required_len > (*lengthptr - previous_length))2647*lengthptr = previous_length + required_len;2648}26492650/* Advancing code means rewinding lengthptr, at this point. */2651if (lengthptr != NULL) *lengthptr -= required_len;2652*code++ = (op_info.op_single_type == ECL_ANY)? OP_NCLASS : OP_CLASS;2653memcpy(code, op_info.bits.classbits, 32);2654code += 32 / sizeof(PCRE2_UCHAR);2655}26562657/* Otherwise, we have an ECL_XCLASS, so we have the OP_XCLASS data2658there, but, we pulled out its bitmap into op_info, so now we have to2659put that back into the OP_XCLASS. */26602661else2662{2663#ifndef SUPPORT_WIDE_CHARS2664PCRE2_DEBUG_UNREACHABLE();2665#else2666BOOL need_map = context.needs_bitmap;2667PCRE2_SIZE required_len;26682669PCRE2_ASSERT(op_info.op_single_type == ECL_XCLASS);2670required_len = op_info.length + (need_map? 32/sizeof(PCRE2_UCHAR) : 0);26712672if (lengthptr != NULL)2673{2674/* Don't unconditionally request all the space we need - we may2675already have asked for more during processing of the ECLASS. */2676if (required_len > (*lengthptr - previous_length))2677*lengthptr = previous_length + required_len;26782679/* The code we write out here won't be ignored, even during the2680(lengthptr != NULL) phase, because if there's a following quantifier2681it will peek backwards. So we do have to write out a (truncated)2682OP_XCLASS, even on this branch. */2683*lengthptr -= 1 + LINK_SIZE + 1;2684*code++ = OP_XCLASS;2685PUT(code, 0, 1 + LINK_SIZE + 1);2686code += LINK_SIZE;2687*code++ = 0;2688}2689else2690{2691PCRE2_UCHAR *rest;2692PCRE2_SIZE rest_len;2693PCRE2_UCHAR flags;26942695/* 1 unit: OP_XCLASS | LINK_SIZE units | 1 unit: flags | ...rest */2696PCRE2_ASSERT(op_info.length >= 1 + LINK_SIZE + 1);2697rest = op_info.code_start + 1 + LINK_SIZE + 1;2698rest_len = (op_info.code_start + op_info.length) - rest;26992700/* First read any data we use, before memmove splats it. */2701flags = op_info.code_start[1 + LINK_SIZE];2702PCRE2_ASSERT((flags & XCL_MAP) == 0);27032704/* Next do the memmove before any writes. */2705memmove(code + 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0),2706rest, CU2BYTES(rest_len));27072708/* Finally write the header data. */2709*code++ = OP_XCLASS;2710PUT(code, 0, (int)required_len);2711code += LINK_SIZE;2712*code++ = flags | (need_map? XCL_MAP : 0);2713if (need_map)2714{2715memcpy(code, op_info.bits.classbits, 32);2716code += 32 / sizeof(PCRE2_UCHAR);2717}2718code += rest_len;2719}2720#endif /* SUPPORT_WIDE_CHARS */2721}2722}27232724/* Otherwise, we're going to keep the OP_ECLASS. However, again we need2725to do some adjustment to insert the bitmap if we have one. */27262727#ifdef SUPPORT_WIDE_CHARS2728else2729{2730BOOL need_map = context.needs_bitmap;2731PCRE2_SIZE required_len =27321 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0) + op_info.length;27332734if (lengthptr != NULL)2735{2736if (required_len > (*lengthptr - previous_length))2737*lengthptr = previous_length + required_len;27382739/* As for the XCLASS branch above, we do have to write out a dummy2740OP_ECLASS, because of the backwards peek by the quantifier code. Write2741out a (truncated) OP_ECLASS, even on this branch. */2742*lengthptr -= 1 + LINK_SIZE + 1;2743*code++ = OP_ECLASS;2744PUT(code, 0, 1 + LINK_SIZE + 1);2745code += LINK_SIZE;2746*code++ = 0;2747}2748else2749{2750if (need_map)2751{2752PCRE2_UCHAR *map_start = previous + 1 + LINK_SIZE + 1;2753previous[1 + LINK_SIZE] |= ECL_MAP;2754memmove(map_start + 32/sizeof(PCRE2_UCHAR), map_start,2755CU2BYTES(code - map_start));2756memcpy(map_start, op_info.bits.classbits, 32);2757code += 32 / sizeof(PCRE2_UCHAR);2758}2759PUT(previous, 1, (int)(code - previous));2760}2761}2762#endif /* SUPPORT_WIDE_CHARS */27632764*pcode = code;2765return TRUE;2766}27672768/* End of pcre2_compile_class.c */276927702771