Path: blob/master/thirdparty/pcre2/src/pcre2_jit_char_inc.h
9898 views
/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8This module by Zoltan Herczeg9Original API code Copyright (c) 1997-2012 University of Cambridge10New API code Copyright (c) 2016-2024 University of Cambridge1112-----------------------------------------------------------------------------13Redistribution and use in source and binary forms, with or without14modification, are permitted provided that the following conditions are met:1516* Redistributions of source code must retain the above copyright notice,17this list of conditions and the following disclaimer.1819* Redistributions in binary form must reproduce the above copyright20notice, this list of conditions and the following disclaimer in the21documentation and/or other materials provided with the distribution.2223* Neither the name of the University of Cambridge nor the names of its24contributors may be used to endorse or promote products derived from25this software without specific prior written permission.2627THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE37POSSIBILITY OF SUCH DAMAGE.38-----------------------------------------------------------------------------39*/4041/* XClass matching code. */4243#ifdef SUPPORT_WIDE_CHARS4445#define ECLASS_CHAR_DATA STACK_TOP46#define ECLASS_STACK_DATA STACK_LIMIT4748#define SET_CHAR_OFFSET(value) \49if ((value) != charoffset) \50{ \51if ((value) < charoffset) \52OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(charoffset - (value))); \53else \54OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)((value) - charoffset)); \55} \56charoffset = (value);5758#define READ_FROM_CHAR_LIST(destination) \59if (list_ind <= 1) \60{ \61destination = *(const uint16_t*)next_char; \62next_char += 2; \63} \64else \65{ \66destination = *(const uint32_t*)next_char; \67next_char += 4; \68}6970#define XCLASS_LOCAL_RANGES_SIZE 3271#define XCLASS_LOCAL_RANGES_LOG2_SIZE 57273typedef struct xclass_stack_item {74sljit_u32 first_item;75sljit_u32 last_item;76struct sljit_jump *jump;77} xclass_stack_item;7879typedef struct xclass_ranges {80size_t range_count;81/* Pointer to ranges. A stack area is provided when a small buffer is enough. */82uint32_t *ranges;83uint32_t local_ranges[XCLASS_LOCAL_RANGES_SIZE * 2];84/* Stack size must be log2(ranges / 2). */85xclass_stack_item *stack;86xclass_stack_item local_stack[XCLASS_LOCAL_RANGES_LOG2_SIZE];87} xclass_ranges;8889static void xclass_compute_ranges(compiler_common *common, PCRE2_SPTR cc, xclass_ranges *ranges)90{91DEFINE_COMPILER;92size_t range_count = 0, est_range_count;93size_t est_stack_size, tmp;94uint32_t type, list_ind;95uint32_t est_type;96uint32_t char_list_add, range_start, range_end;97const uint8_t *next_char;98const uint8_t *est_next_char;99#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)100BOOL utf = common->utf;101#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */102103if (*cc == XCL_SINGLE || *cc == XCL_RANGE)104{105/* Only a few ranges are present. */106do107{108type = *cc++;109SLJIT_ASSERT(type == XCL_SINGLE || type == XCL_RANGE);110GETCHARINCTEST(range_end, cc);111ranges->ranges[range_count] = range_end;112113if (type == XCL_RANGE)114{115GETCHARINCTEST(range_end, cc);116}117118ranges->ranges[range_count + 1] = range_end;119range_count += 2;120}121while (*cc != XCL_END);122123SLJIT_ASSERT(range_count <= XCLASS_LOCAL_RANGES_SIZE);124ranges->range_count = range_count;125return;126}127128SLJIT_ASSERT(cc[0] >= XCL_LIST);129#if PCRE2_CODE_UNIT_WIDTH == 8130type = (uint32_t)(cc[0] << 8) | cc[1];131cc += 2;132#else133type = cc[0];134cc++;135#endif /* CODE_UNIT_WIDTH */136137/* Align characters. */138next_char = (const uint8_t*)common->start - (GET(cc, 0) << 1);139type &= XCL_TYPE_MASK;140141/* Estimate size. */142est_next_char = next_char;143est_type = type;144est_range_count = 0;145list_ind = 0;146147while (est_type > 0)148{149uint32_t item_count = est_type & XCL_ITEM_COUNT_MASK;150151if (item_count == XCL_ITEM_COUNT_MASK)152{153if (list_ind <= 1)154{155item_count = *(const uint16_t*)est_next_char;156est_next_char += 2;157}158else159{160item_count = *(const uint32_t*)est_next_char;161est_next_char += 4;162}163}164165est_type >>= XCL_TYPE_BIT_LEN;166est_next_char += (size_t)item_count << (list_ind <= 1 ? 1 : 2);167list_ind++;168est_range_count += item_count + 1;169}170171if (est_range_count > XCLASS_LOCAL_RANGES_SIZE)172{173est_stack_size = 0;174tmp = est_range_count - 1;175176/* Compute log2(est_range_count) */177while (tmp > 0)178{179est_stack_size++;180tmp >>= 1;181}182183ranges->stack = (xclass_stack_item*)SLJIT_MALLOC((sizeof(xclass_stack_item) * est_stack_size)184+ ((sizeof(uint32_t) << 1) * (size_t)est_range_count), compiler->allocator_data);185186if (ranges->stack == NULL)187{188sljit_set_compiler_memory_error(compiler);189ranges->ranges = NULL;190return;191}192193ranges->ranges = (uint32_t*)(ranges->stack + est_stack_size);194}195196char_list_add = XCL_CHAR_LIST_LOW_16_ADD;197range_start = ~(uint32_t)0;198list_ind = 0;199200if ((type & XCL_BEGIN_WITH_RANGE) != 0)201range_start = XCL_CHAR_LIST_LOW_16_START;202203while (type > 0)204{205uint32_t item_count = type & XCL_ITEM_COUNT_MASK;206207if (item_count == XCL_ITEM_COUNT_MASK)208{209READ_FROM_CHAR_LIST(item_count);210SLJIT_ASSERT(item_count >= XCL_ITEM_COUNT_MASK);211}212213while (item_count > 0)214{215READ_FROM_CHAR_LIST(range_end);216217if ((range_end & XCL_CHAR_END) != 0)218{219range_end = char_list_add + (range_end >> XCL_CHAR_SHIFT);220221if (range_start == ~(uint32_t)0)222range_start = range_end;223224ranges->ranges[range_count] = range_start;225ranges->ranges[range_count + 1] = range_end;226range_count += 2;227range_start = ~(uint32_t)0;228}229else230range_start = char_list_add + (range_end >> XCL_CHAR_SHIFT);231232item_count--;233}234235list_ind++;236type >>= XCL_TYPE_BIT_LEN;237238if (range_start == ~(uint32_t)0)239{240if ((type & XCL_BEGIN_WITH_RANGE) != 0)241{242if (list_ind == 1) range_start = XCL_CHAR_LIST_HIGH_16_START;243#if PCRE2_CODE_UNIT_WIDTH == 32244else if (list_ind == 2) range_start = XCL_CHAR_LIST_LOW_32_START;245else range_start = XCL_CHAR_LIST_HIGH_32_START;246#else247else range_start = XCL_CHAR_LIST_LOW_32_START;248#endif249}250}251else if ((type & XCL_BEGIN_WITH_RANGE) == 0)252{253if (list_ind == 1) range_end = XCL_CHAR_LIST_LOW_16_END;254else if (list_ind == 2) range_end = XCL_CHAR_LIST_HIGH_16_END;255#if PCRE2_CODE_UNIT_WIDTH == 32256else if (list_ind == 3) range_end = XCL_CHAR_LIST_LOW_32_END;257else range_end = XCL_CHAR_LIST_HIGH_32_END;258#else259else range_end = XCL_CHAR_LIST_LOW_32_END;260#endif261262ranges->ranges[range_count] = range_start;263ranges->ranges[range_count + 1] = range_end;264range_count += 2;265range_start = ~(uint32_t)0;266}267268if (list_ind == 1) char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;269#if PCRE2_CODE_UNIT_WIDTH == 32270else if (list_ind == 2) char_list_add = XCL_CHAR_LIST_LOW_32_ADD;271else char_list_add = XCL_CHAR_LIST_HIGH_32_ADD;272#else273else char_list_add = XCL_CHAR_LIST_LOW_32_ADD;274#endif275}276277SLJIT_ASSERT(range_count > 0 && range_count <= (est_range_count << 1));278SLJIT_ASSERT(next_char <= (const uint8_t*)common->start);279ranges->range_count = range_count;280}281282static void xclass_check_bitset(compiler_common *common, const sljit_u8 *bitset, jump_list **found, jump_list **backtracks)283{284DEFINE_COMPILER;285struct sljit_jump *jump;286287jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);288if (!optimize_class(common, bitset, (bitset[31] & 0x80) != 0, TRUE, found))289{290OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);291OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);292OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)bitset);293OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0);294OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, TMP2, 0);295add_jump(compiler, found, JUMP(SLJIT_NOT_ZERO));296}297298add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));299JUMPHERE(jump);300}301302#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)303304static void xclass_update_min_max(compiler_common *common, PCRE2_SPTR cc, sljit_u32 *min_ptr, sljit_u32 *max_ptr)305{306uint32_t type, list_ind, c;307sljit_u32 min = *min_ptr;308sljit_u32 max = *max_ptr;309uint32_t char_list_add;310const uint8_t *next_char;311BOOL utf = TRUE;312313/* This function is pointless without utf 8/16. */314SLJIT_ASSERT(common->utf);315if (*cc == XCL_SINGLE || *cc == XCL_RANGE)316{317/* Only a few ranges are present. */318do319{320type = *cc++;321SLJIT_ASSERT(type == XCL_SINGLE || type == XCL_RANGE);322GETCHARINCTEST(c, cc);323324if (c < min)325min = c;326327if (type == XCL_RANGE)328{329GETCHARINCTEST(c, cc);330}331332if (c > max)333max = c;334}335while (*cc != XCL_END);336337SLJIT_ASSERT(min <= MAX_UTF_CODE_POINT && max <= MAX_UTF_CODE_POINT && min <= max);338*min_ptr = min;339*max_ptr = max;340return;341}342343SLJIT_ASSERT(cc[0] >= XCL_LIST);344#if PCRE2_CODE_UNIT_WIDTH == 8345type = (uint32_t)(cc[0] << 8) | cc[1];346cc += 2;347#else348type = cc[0];349cc++;350#endif /* CODE_UNIT_WIDTH */351352/* Align characters. */353next_char = (const uint8_t*)common->start - (GET(cc, 0) << 1);354type &= XCL_TYPE_MASK;355356SLJIT_ASSERT(type != 0);357358/* Detect minimum. */359360/* Skip unused ranges. */361list_ind = 0;362while ((type & (XCL_BEGIN_WITH_RANGE | XCL_ITEM_COUNT_MASK)) == 0)363{364type >>= XCL_TYPE_BIT_LEN;365list_ind++;366}367368SLJIT_ASSERT(list_ind <= 2);369switch (list_ind)370{371case 0:372char_list_add = XCL_CHAR_LIST_LOW_16_ADD;373c = XCL_CHAR_LIST_LOW_16_START;374break;375376case 1:377char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;378c = XCL_CHAR_LIST_HIGH_16_START;379break;380381default:382char_list_add = XCL_CHAR_LIST_LOW_32_ADD;383c = XCL_CHAR_LIST_LOW_32_START;384break;385}386387if ((type & XCL_BEGIN_WITH_RANGE) != 0)388{389if (c < min)390min = c;391}392else393{394if ((type & XCL_ITEM_COUNT_MASK) == XCL_ITEM_COUNT_MASK)395{396if (list_ind <= 1)397c = *(const uint16_t*)(next_char + 2);398else399c = *(const uint32_t*)(next_char + 4);400}401else402{403if (list_ind <= 1)404c = *(const uint16_t*)next_char;405else406c = *(const uint32_t*)next_char;407}408409c = char_list_add + (c >> XCL_CHAR_SHIFT);410if (c < min)411min = c;412}413414/* Detect maximum. */415416/* Skip intermediate ranges. */417while (TRUE)418{419if ((type & XCL_ITEM_COUNT_MASK) == XCL_ITEM_COUNT_MASK)420{421if (list_ind <= 1)422{423c = *(const uint16_t*)next_char;424next_char += (c + 1) << 1;425}426else427{428c = *(const uint32_t*)next_char;429next_char += (c + 1) << 2;430}431}432else433next_char += (type & XCL_ITEM_COUNT_MASK) << (list_ind <= 1 ? 1 : 2);434435if ((type >> XCL_TYPE_BIT_LEN) == 0)436break;437438list_ind++;439type >>= XCL_TYPE_BIT_LEN;440}441442SLJIT_ASSERT(list_ind <= 2 && type != 0);443switch (list_ind)444{445case 0:446char_list_add = XCL_CHAR_LIST_LOW_16_ADD;447c = XCL_CHAR_LIST_LOW_16_END;448break;449450case 1:451char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;452c = XCL_CHAR_LIST_HIGH_16_END;453break;454455default:456char_list_add = XCL_CHAR_LIST_LOW_32_ADD;457c = XCL_CHAR_LIST_LOW_32_END;458break;459}460461if ((type & XCL_ITEM_COUNT_MASK) != 0)462{463/* Type is reused as temporary. */464if (list_ind <= 1)465type = *(const uint16_t*)(next_char - 2);466else467type = *(const uint32_t*)(next_char - 4);468469if (type & XCL_CHAR_END)470c = char_list_add + (type >> XCL_CHAR_SHIFT);471}472473if (c > max)474max = c;475476SLJIT_ASSERT(min <= MAX_UTF_CODE_POINT && max <= MAX_UTF_CODE_POINT && min <= max);477*min_ptr = min;478*max_ptr = max;479}480481#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */482483#define XCLASS_IS_ECLASS 0x001484#ifdef SUPPORT_UNICODE485#define XCLASS_SAVE_CHAR 0x002486#define XCLASS_HAS_TYPE 0x004487#define XCLASS_HAS_SCRIPT 0x008488#define XCLASS_HAS_SCRIPT_EXTENSION 0x010489#define XCLASS_HAS_BOOL 0x020490#define XCLASS_HAS_BIDICL 0x040491#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BOOL | XCLASS_HAS_BIDICL)492#define XCLASS_SCRIPT_EXTENSION_NOTPROP 0x080493#define XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR 0x100494#define XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0 0x200495#endif /* SUPPORT_UNICODE */496497static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr);498499/* TMP3 must be preserved because it is used by compile_iterator_matchingpath. */500static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks, sljit_u32 status)501{502DEFINE_COMPILER;503jump_list *found = NULL;504jump_list *check_result = NULL;505jump_list **list = (cc[0] & XCL_NOT) == 0 ? &found : backtracks;506sljit_uw c, charoffset;507sljit_u32 max = READ_CHAR_MAX, min = 0;508struct sljit_jump *jump = NULL;509PCRE2_UCHAR flags;510PCRE2_SPTR ccbegin;511sljit_u32 compares, invertcmp, depth;512sljit_u32 first_item, last_item, mid_item;513sljit_u32 range_start, range_end;514xclass_ranges ranges;515BOOL has_cmov, last_range_set;516517#ifdef SUPPORT_UNICODE518sljit_u32 category_list = 0;519sljit_u32 items;520int typereg = TMP1;521#endif /* SUPPORT_UNICODE */522523SLJIT_ASSERT(common->locals_size >= SSIZE_OF(sw));524/* Scanning the necessary info. */525flags = *cc++;526ccbegin = cc;527compares = 0;528529if (flags & XCL_MAP)530cc += 32 / sizeof(PCRE2_UCHAR);531532#ifdef SUPPORT_UNICODE533while (*cc == XCL_PROP || *cc == XCL_NOTPROP)534{535compares++;536cc++;537538items = 0;539540switch(*cc)541{542case PT_LAMP:543items = UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt);544break;545546case PT_GC:547items = UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1] * 2], PRIV(ucp_typerange)[(int)cc[1] * 2 + 1]);548break;549550case PT_PC:551items = UCPCAT(cc[1]);552break;553554case PT_WORD:555items = UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N;556break;557558case PT_ALNUM:559items = UCPCAT_L | UCPCAT_N;560break;561562case PT_SCX:563status |= XCLASS_HAS_SCRIPT_EXTENSION;564if (cc[-1] == XCL_NOTPROP)565{566status |= XCLASS_SCRIPT_EXTENSION_NOTPROP;567break;568}569compares++;570/* Fall through */571572case PT_SC:573status |= XCLASS_HAS_SCRIPT;574break;575576case PT_SPACE:577case PT_PXSPACE:578case PT_PXGRAPH:579case PT_PXPRINT:580case PT_PXPUNCT:581status |= XCLASS_SAVE_CHAR | XCLASS_HAS_TYPE;582break;583584case PT_UCNC:585case PT_PXXDIGIT:586status |= XCLASS_SAVE_CHAR;587break;588589case PT_BOOL:590status |= XCLASS_HAS_BOOL;591break;592593case PT_BIDICL:594status |= XCLASS_HAS_BIDICL;595break;596597default:598SLJIT_UNREACHABLE();599break;600}601602if (items > 0)603{604if (cc[-1] == XCL_NOTPROP)605items ^= UCPCAT_ALL;606category_list |= items;607status |= XCLASS_HAS_TYPE;608compares--;609}610611cc += 2;612}613614if (category_list == UCPCAT_ALL)615{616/* All or no characters are accepted, same as dotall. */617if (status & XCLASS_IS_ECLASS)618{619if (list != backtracks)620OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);621return;622}623624compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE);625if (list == backtracks)626add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));627return;628}629630if (category_list != 0)631compares++;632#endif633634if (*cc != XCL_END)635{636#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)637if (common->utf && compares == 0 && !(status & XCLASS_IS_ECLASS))638{639SLJIT_ASSERT(category_list == 0);640max = 0;641min = (flags & XCL_MAP) != 0 ? 0 : READ_CHAR_MAX;642xclass_update_min_max(common, cc, &min, &max);643}644#endif645compares++;646#ifdef SUPPORT_UNICODE647status |= XCLASS_SAVE_CHAR;648#endif /* SUPPORT_UNICODE */649}650651#ifdef SUPPORT_UNICODE652SLJIT_ASSERT(compares > 0 || category_list != 0);653#else /* !SUPPORT_UNICODE */654SLJIT_ASSERT(compares > 0);655#endif /* SUPPORT_UNICODE */656657/* We are not necessary in utf mode even in 8 bit mode. */658cc = ccbegin;659if (!(status & XCLASS_IS_ECLASS))660{661if ((flags & XCL_NOT) != 0)662read_char(common, min, max, backtracks, READ_CHAR_UPDATE_STR_PTR);663else664{665#ifdef SUPPORT_UNICODE666read_char(common, min, max, (status & XCLASS_NEEDS_UCD) ? backtracks : NULL, 0);667#else /* !SUPPORT_UNICODE */668read_char(common, min, max, NULL, 0);669#endif /* SUPPORT_UNICODE */670}671}672673if ((flags & XCL_MAP) != 0)674{675SLJIT_ASSERT(!(status & XCLASS_IS_ECLASS));676xclass_check_bitset(common, (const sljit_u8 *)cc, &found, backtracks);677cc += 32 / sizeof(PCRE2_UCHAR);678}679680#ifdef SUPPORT_UNICODE681if (status & XCLASS_NEEDS_UCD)682{683if ((status & (XCLASS_SAVE_CHAR | XCLASS_IS_ECLASS)) == XCLASS_SAVE_CHAR)684OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP1, 0);685686#if PCRE2_CODE_UNIT_WIDTH == 32687if (!common->utf)688{689OP2U(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1);690SELECT(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, UNASSIGNED_UTF_CHAR, TMP1);691}692#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */693694OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);695OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);696OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1));697OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK);698OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);699OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);700OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2));701OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1);702OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 3);703OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);704OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);705706ccbegin = cc;707708if (status & XCLASS_HAS_BIDICL)709{710OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx_bidiclass));711OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BIDICLASS_SHIFT);712713while (*cc == XCL_PROP || *cc == XCL_NOTPROP)714{715cc++;716717if (*cc == PT_BIDICL)718{719compares--;720invertcmp = (compares == 0 && list != backtracks);721if (cc[-1] == XCL_NOTPROP)722invertcmp ^= 0x1;723jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]);724add_jump(compiler, compares > 0 ? list : backtracks, jump);725}726cc += 2;727}728729cc = ccbegin;730}731732if (status & XCLASS_HAS_BOOL)733{734OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, bprops));735OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BPROPS_MASK);736OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);737738while (*cc == XCL_PROP || *cc == XCL_NOTPROP)739{740cc++;741if (*cc == PT_BOOL)742{743compares--;744invertcmp = (compares == 0 && list != backtracks);745if (cc[-1] == XCL_NOTPROP)746invertcmp ^= 0x1;747748OP2U(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_boolprop_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)(1u << (cc[1] & 0x1f)));749add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));750}751cc += 2;752}753754cc = ccbegin;755}756757if (status & XCLASS_HAS_SCRIPT)758{759OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));760761while (*cc == XCL_PROP || *cc == XCL_NOTPROP)762{763cc++;764765switch (*cc)766{767case PT_SCX:768if (cc[-1] == XCL_NOTPROP)769break;770/* Fall through */771772case PT_SC:773compares--;774invertcmp = (compares == 0 && list != backtracks);775if (cc[-1] == XCL_NOTPROP)776invertcmp ^= 0x1;777778add_jump(compiler, compares > 0 ? list : backtracks, CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]));779}780cc += 2;781}782783cc = ccbegin;784}785786if (status & XCLASS_HAS_SCRIPT_EXTENSION)787{788OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx_bidiclass));789OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_SCRIPTX_MASK);790OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);791792if (status & XCLASS_SCRIPT_EXTENSION_NOTPROP)793{794if (status & XCLASS_HAS_TYPE)795{796if ((status & (XCLASS_SAVE_CHAR | XCLASS_IS_ECLASS)) == XCLASS_SAVE_CHAR)797{798OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL0, TMP2, 0);799status |= XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0;800}801else802{803OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP2, 0);804status |= XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR;805}806}807OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));808}809810while (*cc == XCL_PROP || *cc == XCL_NOTPROP)811{812cc++;813814if (*cc == PT_SCX)815{816compares--;817invertcmp = (compares == 0 && list != backtracks);818819jump = NULL;820if (cc[-1] == XCL_NOTPROP)821{822jump = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, (int)cc[1]);823if (invertcmp)824{825add_jump(compiler, backtracks, jump);826jump = NULL;827}828invertcmp ^= 0x1;829}830831OP2U(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)(1u << (cc[1] & 0x1f)));832add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));833834if (jump != NULL)835JUMPHERE(jump);836}837cc += 2;838}839840if (status & XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0)841OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), LOCAL0);842else if (status & XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR)843OP1(SLJIT_MOV, TMP2, 0, RETURN_ADDR, 0);844cc = ccbegin;845}846847if (status & XCLASS_SAVE_CHAR)848OP1(SLJIT_MOV, TMP1, 0, (status & XCLASS_IS_ECLASS) ? ECLASS_CHAR_DATA : RETURN_ADDR, 0);849850if (status & XCLASS_HAS_TYPE)851{852if (status & XCLASS_SAVE_CHAR)853typereg = RETURN_ADDR;854855OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));856OP2(SLJIT_SHL, typereg, 0, SLJIT_IMM, 1, TMP2, 0);857858if (category_list > 0)859{860compares--;861invertcmp = (compares == 0 && list != backtracks);862OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, category_list);863add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));864}865}866}867#endif /* SUPPORT_UNICODE */868869/* Generating code. */870charoffset = 0;871872#ifdef SUPPORT_UNICODE873while (*cc == XCL_PROP || *cc == XCL_NOTPROP)874{875compares--;876invertcmp = (compares == 0 && list != backtracks);877jump = NULL;878879if (*cc == XCL_NOTPROP)880invertcmp ^= 0x1;881cc++;882switch(*cc)883{884case PT_LAMP:885case PT_GC:886case PT_PC:887case PT_SC:888case PT_SCX:889case PT_BOOL:890case PT_BIDICL:891case PT_WORD:892case PT_ALNUM:893compares++;894/* Already handled. */895break;896897case PT_SPACE:898case PT_PXSPACE:899SET_CHAR_OFFSET(9);900OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xd - 0x9);901OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);902903OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x85 - 0x9);904OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);905906OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x180e - 0x9);907OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);908909OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Zl, ucp_Zs));910OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_NOT_ZERO);911jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);912break;913914case PT_UCNC:915OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_DOLLAR_SIGN - charoffset));916OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);917OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_COMMERCIAL_AT - charoffset));918OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);919OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_GRAVE_ACCENT - charoffset));920OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);921922SET_CHAR_OFFSET(0xa0);923OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(0xd7ff - charoffset));924OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);925SET_CHAR_OFFSET(0);926OP2U(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xe000 - 0);927OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_GREATER_EQUAL);928jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);929break;930931case PT_PXGRAPH:932OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Cc, ucp_Cs) | UCPCAT_RANGE(ucp_Zl, ucp_Zs));933OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);934935OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(ucp_Cf));936jump = JUMP(SLJIT_ZERO);937938c = charoffset;939/* In case of ucp_Cf, we overwrite the result. */940SET_CHAR_OFFSET(0x2066);941OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x2069 - 0x2066);942OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);943944OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x061c - 0x2066);945OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);946947OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x180e - 0x2066);948OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);949950/* Restore charoffset. */951SET_CHAR_OFFSET(c);952953JUMPHERE(jump);954jump = CMP(SLJIT_ZERO ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);955break;956957case PT_PXPRINT:958OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Cc, ucp_Cs) | UCPCAT2(ucp_Zl, ucp_Zp));959OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);960961OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(ucp_Cf));962jump = JUMP(SLJIT_ZERO);963964c = charoffset;965/* In case of ucp_Cf, we overwrite the result. */966SET_CHAR_OFFSET(0x2066);967OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x2069 - 0x2066);968OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);969970OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x061c - 0x2066);971OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);972973/* Restore charoffset. */974SET_CHAR_OFFSET(c);975976JUMPHERE(jump);977jump = CMP(SLJIT_ZERO ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);978break;979980case PT_PXPUNCT:981OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Sc, ucp_So));982OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);983984SET_CHAR_OFFSET(0);985OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x7f);986OP_FLAGS(SLJIT_AND, TMP2, 0, SLJIT_LESS_EQUAL);987988OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Pc, ucp_Ps));989OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_NOT_ZERO);990jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);991break;992993case PT_PXXDIGIT:994SET_CHAR_OFFSET(CHAR_A);995OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, ~0x20);996OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP2, 0, SLJIT_IMM, CHAR_F - CHAR_A);997OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);998999SET_CHAR_OFFSET(CHAR_0);1000OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_9 - CHAR_0);1001OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);10021003SET_CHAR_OFFSET(0xff10);1004jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 0xff46 - 0xff10);10051006OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff19 - 0xff10);1007OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);10081009SET_CHAR_OFFSET(0xff21);1010OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff26 - 0xff21);1011OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);10121013SET_CHAR_OFFSET(0xff41);1014OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff46 - 0xff41);1015OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);10161017SET_CHAR_OFFSET(0xff10);10181019JUMPHERE(jump);1020OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, 0);1021jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);1022break;10231024default:1025SLJIT_UNREACHABLE();1026break;1027}10281029cc += 2;10301031if (jump != NULL)1032add_jump(compiler, compares > 0 ? list : backtracks, jump);1033}10341035if (compares == 0)1036{1037if (found != NULL)1038set_jumps(found, LABEL());10391040if (status & XCLASS_IS_ECLASS)1041OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);1042return;1043}1044#endif /* SUPPORT_UNICODE */10451046SLJIT_ASSERT(compares == 1);1047ranges.range_count = 0;1048ranges.ranges = ranges.local_ranges;1049ranges.stack = ranges.local_stack;10501051xclass_compute_ranges(common, cc, &ranges);10521053/* Memory error is set for the compiler. */1054if (ranges.stack == NULL)1055return;10561057#if (defined SLJIT_DEBUG && SLJIT_DEBUG) && \1058defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)1059if (common->utf)1060{1061min = READ_CHAR_MAX;1062max = 0;1063xclass_update_min_max(common, cc, &min, &max);1064SLJIT_ASSERT(ranges.ranges[0] == min && ranges.ranges[ranges.range_count - 1] == max);1065}1066#endif /* SLJIT_DEBUG && SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */10671068invertcmp = (list != backtracks);10691070if (ranges.range_count == 2)1071{1072range_start = ranges.ranges[0];1073range_end = ranges.ranges[1];10741075if (range_start < range_end)1076{1077SET_CHAR_OFFSET(range_start);1078jump = CMP(SLJIT_LESS_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));1079}1080else1081jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));10821083add_jump(compiler, backtracks, jump);10841085SLJIT_ASSERT(ranges.stack == ranges.local_stack);1086if (found != NULL)1087set_jumps(found, LABEL());10881089if (status & XCLASS_IS_ECLASS)1090OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);1091return;1092}10931094range_start = ranges.ranges[0];1095SET_CHAR_OFFSET(range_start);1096if (ranges.range_count >= 6)1097{1098/* Early fail. */1099range_end = ranges.ranges[ranges.range_count - 1];1100add_jump(compiler, (flags & XCL_NOT) == 0 ? backtracks : &found,1101CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start)));1102}11031104depth = 0;1105first_item = 0;1106last_item = ranges.range_count - 2;1107has_cmov = sljit_has_cpu_feature(SLJIT_HAS_CMOV) != 0;11081109while (TRUE)1110{1111/* At least two items are present. */1112SLJIT_ASSERT(first_item < last_item && charoffset == ranges.ranges[0]);1113last_range_set = FALSE;11141115if (first_item + 6 <= last_item)1116{1117mid_item = ((first_item + last_item) >> 1) & ~(sljit_u32)1;1118SLJIT_ASSERT(last_item >= mid_item + 4);11191120range_end = ranges.ranges[mid_item + 1];1121if (first_item + 6 > mid_item && ranges.ranges[mid_item] == range_end)1122{1123OP2U(SLJIT_SUB | SLJIT_SET_GREATER | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - charoffset));1124ranges.stack[depth].jump = JUMP(SLJIT_GREATER);1125OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);1126last_range_set = TRUE;1127}1128else1129ranges.stack[depth].jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - charoffset));11301131ranges.stack[depth].first_item = (sljit_u32)(mid_item + 2);1132ranges.stack[depth].last_item = (sljit_u32)last_item;11331134depth++;1135SLJIT_ASSERT(ranges.stack == ranges.local_stack ?1136depth <= XCLASS_LOCAL_RANGES_LOG2_SIZE : (ranges.stack + depth) <= (xclass_stack_item*)ranges.ranges);11371138last_item = mid_item;1139if (!last_range_set)1140continue;11411142last_item -= 2;1143}11441145if (!last_range_set)1146{1147range_start = ranges.ranges[first_item];1148range_end = ranges.ranges[first_item + 1];11491150if (range_start < range_end)1151{1152SET_CHAR_OFFSET(range_start);1153OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));1154OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);1155}1156else1157{1158OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));1159OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);1160}1161first_item += 2;1162}11631164SLJIT_ASSERT(first_item <= last_item);11651166do1167{1168range_start = ranges.ranges[first_item];1169range_end = ranges.ranges[first_item + 1];11701171if (range_start < range_end)1172{1173SET_CHAR_OFFSET(range_start);1174OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));11751176if (has_cmov)1177SELECT(SLJIT_LESS_EQUAL, TMP2, STR_END, 0, TMP2);1178else1179OP_FLAGS(SLJIT_OR | ((first_item == last_item) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_LESS_EQUAL);1180}1181else1182{1183OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));11841185if (has_cmov)1186SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2);1187else1188OP_FLAGS(SLJIT_OR | ((first_item == last_item) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);1189}11901191first_item += 2;1192}1193while (first_item <= last_item);11941195if (depth == 0) break;11961197add_jump(compiler, &check_result, JUMP(SLJIT_JUMP));11981199/* The charoffset resets after the end of a branch is reached. */1200charoffset = ranges.ranges[0];1201depth--;1202first_item = ranges.stack[depth].first_item;1203last_item = ranges.stack[depth].last_item;1204JUMPHERE(ranges.stack[depth].jump);1205}12061207if (check_result != NULL)1208set_jumps(check_result, LABEL());12091210if (has_cmov)1211jump = CMP(SLJIT_NOT_EQUAL ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);1212else1213{1214sljit_set_current_flags(compiler, SLJIT_SET_Z);1215jump = JUMP(SLJIT_NOT_EQUAL ^ invertcmp);1216}12171218add_jump(compiler, backtracks, jump);12191220if (found != NULL)1221set_jumps(found, LABEL());12221223if (status & XCLASS_IS_ECLASS)1224OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);12251226if (ranges.stack != ranges.local_stack)1227SLJIT_FREE(ranges.stack, compiler->allocator_data);1228}12291230static PCRE2_SPTR compile_eclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)1231{1232DEFINE_COMPILER;1233PCRE2_SPTR end = cc + GET(cc, 0) - 1;1234PCRE2_SPTR begin;1235jump_list *not_found;1236jump_list *found = NULL;12371238cc += LINK_SIZE;12391240/* Should be optimized later. */1241read_char(common, 0, READ_CHAR_MAX, backtracks, 0);12421243if (((*cc++) & ECL_MAP) != 0)1244{1245xclass_check_bitset(common, (const sljit_u8 *)cc, &found, backtracks);1246cc += 32 / sizeof(PCRE2_UCHAR);1247}12481249begin = cc;12501251OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL0, ECLASS_CHAR_DATA, 0);1252OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL1, ECLASS_STACK_DATA, 0);1253OP1(SLJIT_MOV, ECLASS_STACK_DATA, 0, SLJIT_IMM, 0);1254OP1(SLJIT_MOV, ECLASS_CHAR_DATA, 0, TMP1, 0);12551256/* All eclass must start with an xclass. */1257SLJIT_ASSERT(*cc == ECL_XCLASS);12581259while (cc < end)1260{1261switch (*cc)1262{1263case ECL_AND:1264++cc;1265OP2(SLJIT_OR, TMP2, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, ~(sljit_sw)1);1266OP2(SLJIT_LSHR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);1267OP2(SLJIT_AND, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, TMP2, 0);1268break;12691270case ECL_OR:1271++cc;1272OP2(SLJIT_AND, TMP2, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);1273OP2(SLJIT_LSHR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);1274OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, TMP2, 0);1275break;12761277case ECL_XOR:1278++cc;1279OP2(SLJIT_AND, TMP2, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);1280OP2(SLJIT_LSHR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);1281OP2(SLJIT_XOR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, TMP2, 0);1282break;12831284case ECL_NOT:1285++cc;1286OP2(SLJIT_XOR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);1287break;12881289default:1290SLJIT_ASSERT(*cc == ECL_XCLASS);1291if (cc != begin)1292{1293OP1(SLJIT_MOV, TMP1, 0, ECLASS_CHAR_DATA, 0);1294OP2(SLJIT_SHL, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);1295}12961297not_found = NULL;1298compile_xclass_matchingpath(common, cc + 1 + LINK_SIZE, ¬_found, XCLASS_IS_ECLASS);1299set_jumps(not_found, LABEL());13001301cc += GET(cc, 1);1302break;1303}1304}13051306OP2U(SLJIT_SUB | SLJIT_SET_Z, ECLASS_STACK_DATA, 0, SLJIT_IMM, 0);1307OP1(SLJIT_MOV, ECLASS_CHAR_DATA, 0, SLJIT_MEM1(SLJIT_SP), LOCAL0);1308OP1(SLJIT_MOV, ECLASS_STACK_DATA, 0, SLJIT_MEM1(SLJIT_SP), LOCAL1);1309add_jump(compiler, backtracks, JUMP(SLJIT_EQUAL));1310set_jumps(found, LABEL());1311return end;1312}13131314/* Generic character matching code. */13151316#undef SET_CHAR_OFFSET1317#undef READ_FROM_CHAR_LIST1318#undef XCLASS_LOCAL_RANGES_SIZE1319#undef XCLASS_LOCAL_RANGES_LOG2_SIZE13201321#endif /* SUPPORT_WIDE_CHARS */13221323static PCRE2_SPTR byte_sequence_compare(compiler_common *common, BOOL caseless, PCRE2_SPTR cc,1324compare_context *context, jump_list **backtracks)1325{1326DEFINE_COMPILER;1327unsigned int othercasebit = 0;1328PCRE2_SPTR othercasechar = NULL;1329#ifdef SUPPORT_UNICODE1330int utflength;1331#endif13321333if (caseless && char_has_othercase(common, cc))1334{1335othercasebit = char_get_othercase_bit(common, cc);1336SLJIT_ASSERT(othercasebit);1337/* Extracting bit difference info. */1338#if PCRE2_CODE_UNIT_WIDTH == 81339othercasechar = cc + (othercasebit >> 8);1340othercasebit &= 0xff;1341#elif PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 321342/* Note that this code only handles characters in the BMP. If there1343ever are characters outside the BMP whose othercase differs in only one1344bit from itself (there currently are none), this code will need to be1345revised for PCRE2_CODE_UNIT_WIDTH == 32. */1346othercasechar = cc + (othercasebit >> 9);1347if ((othercasebit & 0x100) != 0)1348othercasebit = (othercasebit & 0xff) << 8;1349else1350othercasebit &= 0xff;1351#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */1352}13531354if (context->sourcereg == -1)1355{1356#if PCRE2_CODE_UNIT_WIDTH == 81357#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED1358if (context->length >= 4)1359OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);1360else if (context->length >= 2)1361OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);1362else1363#endif1364OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);1365#elif PCRE2_CODE_UNIT_WIDTH == 161366#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED1367if (context->length >= 4)1368OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);1369else1370#endif1371OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);1372#elif PCRE2_CODE_UNIT_WIDTH == 321373OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);1374#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */1375context->sourcereg = TMP2;1376}13771378#ifdef SUPPORT_UNICODE1379utflength = 1;1380if (common->utf && HAS_EXTRALEN(*cc))1381utflength += GET_EXTRALEN(*cc);13821383do1384{1385#endif13861387context->length -= IN_UCHARS(1);1388#if (defined SLJIT_UNALIGNED && SLJIT_UNALIGNED) && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)13891390/* Unaligned read is supported. */1391if (othercasebit != 0 && othercasechar == cc)1392{1393context->c.asuchars[context->ucharptr] = *cc | othercasebit;1394context->oc.asuchars[context->ucharptr] = othercasebit;1395}1396else1397{1398context->c.asuchars[context->ucharptr] = *cc;1399context->oc.asuchars[context->ucharptr] = 0;1400}1401context->ucharptr++;14021403#if PCRE2_CODE_UNIT_WIDTH == 81404if (context->ucharptr >= 4 || context->length == 0 || (context->ucharptr == 2 && context->length == 1))1405#else1406if (context->ucharptr >= 2 || context->length == 0)1407#endif1408{1409if (context->length >= 4)1410OP1(SLJIT_MOV_S32, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);1411else if (context->length >= 2)1412OP1(SLJIT_MOV_U16, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);1413#if PCRE2_CODE_UNIT_WIDTH == 81414else if (context->length >= 1)1415OP1(SLJIT_MOV_U8, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);1416#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */1417context->sourcereg = context->sourcereg == TMP1 ? TMP2 : TMP1;14181419switch(context->ucharptr)1420{1421case 4 / sizeof(PCRE2_UCHAR):1422if (context->oc.asint != 0)1423OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asint);1424add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asint | context->oc.asint));1425break;14261427case 2 / sizeof(PCRE2_UCHAR):1428if (context->oc.asushort != 0)1429OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asushort);1430add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asushort | context->oc.asushort));1431break;14321433#if PCRE2_CODE_UNIT_WIDTH == 81434case 1:1435if (context->oc.asbyte != 0)1436OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asbyte);1437add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asbyte | context->oc.asbyte));1438break;1439#endif14401441default:1442SLJIT_UNREACHABLE();1443break;1444}1445context->ucharptr = 0;1446}14471448#else14491450/* Unaligned read is unsupported or in 32 bit mode. */1451if (context->length >= 1)1452OP1(MOV_UCHAR, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);14531454context->sourcereg = context->sourcereg == TMP1 ? TMP2 : TMP1;14551456if (othercasebit != 0 && othercasechar == cc)1457{1458OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, othercasebit);1459add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, *cc | othercasebit));1460}1461else1462add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, *cc));14631464#endif14651466cc++;1467#ifdef SUPPORT_UNICODE1468utflength--;1469}1470while (utflength > 0);1471#endif14721473return cc;1474}14751476#ifdef SUPPORT_UNICODE14771478#if PCRE2_CODE_UNIT_WIDTH != 3214791480/* The code in this function copies the logic of the interpreter function that1481is defined in the pcre2_extuni.c source. If that code is updated, this1482function, and those below it, must be kept in step (note by PH, June 2024). */14831484static PCRE2_SPTR SLJIT_FUNC do_extuni_utf(jit_arguments *args, PCRE2_SPTR cc)1485{1486PCRE2_SPTR start_subject = args->begin;1487PCRE2_SPTR end_subject = args->end;1488int lgb, rgb, ricount;1489PCRE2_SPTR prevcc, endcc, bptr;1490BOOL first = TRUE;1491BOOL was_ep_ZWJ = FALSE;1492uint32_t c;14931494prevcc = cc;1495endcc = NULL;1496do1497{1498GETCHARINC(c, cc);1499rgb = UCD_GRAPHBREAK(c);15001501if (first)1502{1503lgb = rgb;1504endcc = cc;1505first = FALSE;1506continue;1507}15081509if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0)1510break;15111512/* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was1513preceded by Extended Pictographic. */15141515if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ)1516break;15171518/* Not breaking between Regional Indicators is allowed only if there1519are an even number of preceding RIs. */15201521if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator)1522{1523ricount = 0;1524bptr = prevcc;15251526/* bptr is pointing to the left-hand character */1527while (bptr > start_subject)1528{1529bptr--;1530BACKCHAR(bptr);1531GETCHAR(c, bptr);15321533if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator)1534break;15351536ricount++;1537}15381539if ((ricount & 1) != 0) break; /* Grapheme break required */1540}15411542/* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in1543between; see next statement). */15441545was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ);15461547/* If Extend follows Extended_Pictographic, do not update lgb; this allows1548any number of them before a following ZWJ. */15491550if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic)1551lgb = rgb;15521553prevcc = endcc;1554endcc = cc;1555}1556while (cc < end_subject);15571558return endcc;1559}15601561#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */15621563/* The code in this function copies the logic of the interpreter function that1564is defined in the pcre2_extuni.c source. If that code is updated, this1565function, and the one below it, must be kept in step (note by PH, June 2024). */15661567static PCRE2_SPTR SLJIT_FUNC do_extuni_utf_invalid(jit_arguments *args, PCRE2_SPTR cc)1568{1569PCRE2_SPTR start_subject = args->begin;1570PCRE2_SPTR end_subject = args->end;1571int lgb, rgb, ricount;1572PCRE2_SPTR prevcc, endcc, bptr;1573BOOL first = TRUE;1574BOOL was_ep_ZWJ = FALSE;1575uint32_t c;15761577prevcc = cc;1578endcc = NULL;1579do1580{1581GETCHARINC_INVALID(c, cc, end_subject, break);1582rgb = UCD_GRAPHBREAK(c);15831584if (first)1585{1586lgb = rgb;1587endcc = cc;1588first = FALSE;1589continue;1590}15911592if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0)1593break;15941595/* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was1596preceded by Extended Pictographic. */15971598if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ)1599break;16001601/* Not breaking between Regional Indicators is allowed only if there1602are an even number of preceding RIs. */16031604if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator)1605{1606ricount = 0;1607bptr = prevcc;16081609/* bptr is pointing to the left-hand character */1610while (bptr > start_subject)1611{1612GETCHARBACK_INVALID(c, bptr, start_subject, break);16131614if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator)1615break;16161617ricount++;1618}16191620if ((ricount & 1) != 0)1621break; /* Grapheme break required */1622}16231624/* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in1625between; see next statement). */16261627was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ);16281629/* If Extend follows Extended_Pictographic, do not update lgb; this allows1630any number of them before a following ZWJ. */16311632if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic)1633lgb = rgb;16341635prevcc = endcc;1636endcc = cc;1637}1638while (cc < end_subject);16391640return endcc;1641}16421643/* The code in this function copies the logic of the interpreter function that1644is defined in the pcre2_extuni.c source. If that code is updated, this1645function must be kept in step (note by PH, June 2024). */16461647static PCRE2_SPTR SLJIT_FUNC do_extuni_no_utf(jit_arguments *args, PCRE2_SPTR cc)1648{1649PCRE2_SPTR start_subject = args->begin;1650PCRE2_SPTR end_subject = args->end;1651int lgb, rgb, ricount;1652PCRE2_SPTR bptr;1653uint32_t c;1654BOOL was_ep_ZWJ = FALSE;16551656/* Patch by PH */1657/* GETCHARINC(c, cc); */1658c = *cc++;16591660#if PCRE2_CODE_UNIT_WIDTH == 321661if (c >= 0x110000)1662return cc;1663#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */1664lgb = UCD_GRAPHBREAK(c);16651666while (cc < end_subject)1667{1668c = *cc;1669#if PCRE2_CODE_UNIT_WIDTH == 321670if (c >= 0x110000)1671break;1672#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */1673rgb = UCD_GRAPHBREAK(c);16741675if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0)1676break;16771678/* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was1679preceded by Extended Pictographic. */16801681if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ)1682break;16831684/* Not breaking between Regional Indicators is allowed only if there1685are an even number of preceding RIs. */16861687if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator)1688{1689ricount = 0;1690bptr = cc - 1;16911692/* bptr is pointing to the left-hand character */1693while (bptr > start_subject)1694{1695bptr--;1696c = *bptr;1697#if PCRE2_CODE_UNIT_WIDTH == 321698if (c >= 0x110000)1699break;1700#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */17011702if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break;17031704ricount++;1705}17061707if ((ricount & 1) != 0)1708break; /* Grapheme break required */1709}17101711/* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in1712between; see next statement). */17131714was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ);17151716/* If Extend follows Extended_Pictographic, do not update lgb; this allows1717any number of them before a following ZWJ. */17181719if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic)1720lgb = rgb;17211722cc++;1723}17241725return cc;1726}17271728static void compile_clist(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)1729{1730DEFINE_COMPILER;1731const sljit_u32 *other_cases;1732struct sljit_jump *jump;1733sljit_u32 min = 0, max = READ_CHAR_MAX;1734BOOL has_cmov = sljit_has_cpu_feature(SLJIT_HAS_CMOV) != 0;17351736SLJIT_ASSERT(cc[1] == PT_CLIST);17371738if (cc[0] == OP_PROP)1739{1740other_cases = PRIV(ucd_caseless_sets) + cc[2];17411742min = *other_cases++;1743max = min;17441745while (*other_cases != NOTACHAR)1746{1747if (*other_cases > max) max = *other_cases;1748if (*other_cases < min) min = *other_cases;1749other_cases++;1750}1751}17521753other_cases = PRIV(ucd_caseless_sets) + cc[2];1754SLJIT_ASSERT(other_cases[0] != NOTACHAR && other_cases[1] != NOTACHAR);1755/* The NOTACHAR is higher than any character. */1756SLJIT_ASSERT(other_cases[0] < other_cases[1] && other_cases[1] < other_cases[2]);17571758read_char(common, min, max, backtracks, READ_CHAR_UPDATE_STR_PTR);17591760/* At least two characters are required.1761Otherwise this case would be handled by the normal code path. */1762/* NOTACHAR is the unsigned maximum. */17631764/* Optimizing character pairs, if their difference is power of 2. */1765if (is_powerof2(other_cases[1] ^ other_cases[0]))1766{1767OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(other_cases[1] ^ other_cases[0]));1768OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, other_cases[1]);1769OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);1770other_cases += 2;1771}1772else if (is_powerof2(other_cases[2] ^ other_cases[1]))1773{1774SLJIT_ASSERT(other_cases[2] != NOTACHAR);17751776OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(other_cases[2] ^ other_cases[1]));1777OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, other_cases[2]);1778OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);17791780OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)other_cases[0]);17811782if (has_cmov)1783SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2);1784else1785OP_FLAGS(SLJIT_OR | ((other_cases[3] == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);17861787other_cases += 3;1788}1789else1790{1791OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(*other_cases++));1792OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);1793}17941795while (*other_cases != NOTACHAR)1796{1797OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(*other_cases++));17981799if (has_cmov)1800SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2);1801else1802OP_FLAGS(SLJIT_OR | ((*other_cases == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);1803}18041805if (has_cmov)1806jump = CMP(cc[0] == OP_PROP ? SLJIT_EQUAL : SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0);1807else1808jump = JUMP(cc[0] == OP_PROP ? SLJIT_ZERO : SLJIT_NOT_ZERO);18091810add_jump(compiler, backtracks, jump);1811}18121813#endif /* SUPPORT_UNICODE */18141815static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr)1816{1817DEFINE_COMPILER;1818int length;1819unsigned int c, oc, bit;1820compare_context context;1821struct sljit_jump *jump[3];1822jump_list *end_list;1823#ifdef SUPPORT_UNICODE1824PCRE2_UCHAR propdata[5];1825#endif /* SUPPORT_UNICODE */18261827switch(type)1828{1829case OP_NOT_DIGIT:1830case OP_DIGIT:1831/* Digits are usually 0-9, so it is worth to optimize them. */1832if (check_str_ptr)1833detect_partial_match(common, backtracks);1834#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 81835if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_digit, FALSE))1836read_char7_type(common, backtracks, type == OP_NOT_DIGIT);1837else1838#endif1839read_char8_type(common, backtracks, type == OP_NOT_DIGIT);1840/* Flip the starting bit in the negative case. */1841OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_digit);1842add_jump(compiler, backtracks, JUMP(type == OP_DIGIT ? SLJIT_ZERO : SLJIT_NOT_ZERO));1843return cc;18441845case OP_NOT_WHITESPACE:1846case OP_WHITESPACE:1847if (check_str_ptr)1848detect_partial_match(common, backtracks);1849#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 81850if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_space, FALSE))1851read_char7_type(common, backtracks, type == OP_NOT_WHITESPACE);1852else1853#endif1854read_char8_type(common, backtracks, type == OP_NOT_WHITESPACE);1855OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_space);1856add_jump(compiler, backtracks, JUMP(type == OP_WHITESPACE ? SLJIT_ZERO : SLJIT_NOT_ZERO));1857return cc;18581859case OP_NOT_WORDCHAR:1860case OP_WORDCHAR:1861if (check_str_ptr)1862detect_partial_match(common, backtracks);1863#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 81864if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_word, FALSE))1865read_char7_type(common, backtracks, type == OP_NOT_WORDCHAR);1866else1867#endif1868read_char8_type(common, backtracks, type == OP_NOT_WORDCHAR);1869OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_word);1870add_jump(compiler, backtracks, JUMP(type == OP_WORDCHAR ? SLJIT_ZERO : SLJIT_NOT_ZERO));1871return cc;18721873case OP_ANY:1874if (check_str_ptr)1875detect_partial_match(common, backtracks);1876read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR);1877if (common->nltype == NLTYPE_FIXED && common->newline > 255)1878{1879jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);1880end_list = NULL;1881if (common->mode != PCRE2_JIT_PARTIAL_HARD)1882add_jump(compiler, &end_list, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));1883else1884check_str_end(common, &end_list);18851886OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);1887add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline & 0xff));1888set_jumps(end_list, LABEL());1889JUMPHERE(jump[0]);1890}1891else1892check_newlinechar(common, common->nltype, backtracks, TRUE);1893return cc;18941895case OP_ALLANY:1896if (check_str_ptr)1897detect_partial_match(common, backtracks);1898#ifdef SUPPORT_UNICODE1899if (common->utf && common->invalid_utf)1900{1901read_char(common, 0, READ_CHAR_MAX, backtracks, READ_CHAR_UPDATE_STR_PTR);1902return cc;1903}1904#endif /* SUPPORT_UNICODE */19051906skip_valid_char(common);1907return cc;19081909case OP_ANYBYTE:1910if (check_str_ptr)1911detect_partial_match(common, backtracks);1912OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));1913return cc;19141915#ifdef SUPPORT_UNICODE1916case OP_NOTPROP:1917case OP_PROP:1918if (check_str_ptr)1919detect_partial_match(common, backtracks);1920if (cc[0] == PT_CLIST)1921{1922compile_clist(common, cc - 1, backtracks);1923return cc + 2;1924}19251926propdata[0] = 0;1927propdata[1] = type == OP_NOTPROP ? XCL_NOTPROP : XCL_PROP;1928propdata[2] = cc[0];1929propdata[3] = cc[1];1930propdata[4] = XCL_END;1931compile_xclass_matchingpath(common, propdata, backtracks, 0);1932return cc + 2;1933#endif19341935case OP_ANYNL:1936if (check_str_ptr)1937detect_partial_match(common, backtracks);1938read_char(common, common->bsr_nlmin, common->bsr_nlmax, NULL, 0);1939jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);1940/* We don't need to handle soft partial matching case. */1941end_list = NULL;1942if (common->mode != PCRE2_JIT_PARTIAL_HARD)1943add_jump(compiler, &end_list, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));1944else1945check_str_end(common, &end_list);1946OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);1947OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, CHAR_NL);1948OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);1949#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 321950OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT);1951#endif1952OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);1953jump[1] = JUMP(SLJIT_JUMP);1954JUMPHERE(jump[0]);1955check_newlinechar(common, common->bsr_nltype, backtracks, FALSE);1956set_jumps(end_list, LABEL());1957JUMPHERE(jump[1]);1958return cc;19591960case OP_NOT_HSPACE:1961case OP_HSPACE:1962if (check_str_ptr)1963detect_partial_match(common, backtracks);19641965if (type == OP_NOT_HSPACE)1966read_char(common, 0x9, 0x3000, backtracks, READ_CHAR_UPDATE_STR_PTR);1967else1968read_char(common, 0x9, 0x3000, NULL, 0);19691970add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL));1971sljit_set_current_flags(compiler, SLJIT_SET_Z);1972add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));1973return cc;19741975case OP_NOT_VSPACE:1976case OP_VSPACE:1977if (check_str_ptr)1978detect_partial_match(common, backtracks);19791980if (type == OP_NOT_VSPACE)1981read_char(common, 0xa, 0x2029, backtracks, READ_CHAR_UPDATE_STR_PTR);1982else1983read_char(common, 0xa, 0x2029, NULL, 0);19841985add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL));1986sljit_set_current_flags(compiler, SLJIT_SET_Z);1987add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));1988return cc;19891990#ifdef SUPPORT_UNICODE1991case OP_EXTUNI:1992if (check_str_ptr)1993detect_partial_match(common, backtracks);19941995SLJIT_ASSERT(TMP1 == SLJIT_R0 && STR_PTR == SLJIT_R1);1996OP1(SLJIT_MOV, SLJIT_R0, 0, ARGUMENTS, 0);19971998#if PCRE2_CODE_UNIT_WIDTH != 321999sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2(W, W, W), SLJIT_IMM,2000common->utf ? (common->invalid_utf ? SLJIT_FUNC_ADDR(do_extuni_utf_invalid) : SLJIT_FUNC_ADDR(do_extuni_utf)) : SLJIT_FUNC_ADDR(do_extuni_no_utf));2001if (common->invalid_utf)2002add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));2003#else2004sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2(W, W, W), SLJIT_IMM,2005common->invalid_utf ? SLJIT_FUNC_ADDR(do_extuni_utf_invalid) : SLJIT_FUNC_ADDR(do_extuni_no_utf));2006if (common->invalid_utf)2007add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));2008#endif20092010OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);20112012if (common->mode == PCRE2_JIT_PARTIAL_HARD)2013{2014jump[0] = CMP(SLJIT_LESS, SLJIT_RETURN_REG, 0, STR_END, 0);2015/* Since we successfully read a char above, partial matching must occur. */2016check_partial(common, TRUE);2017JUMPHERE(jump[0]);2018}2019return cc;2020#endif20212022case OP_CHAR:2023case OP_CHARI:2024length = 1;2025#ifdef SUPPORT_UNICODE2026if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc);2027#endif20282029if (check_str_ptr && common->mode != PCRE2_JIT_COMPLETE)2030detect_partial_match(common, backtracks);20312032if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0)2033{2034OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(length));2035if (length > 1 || (check_str_ptr && common->mode == PCRE2_JIT_COMPLETE))2036add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0));20372038context.length = IN_UCHARS(length);2039context.sourcereg = -1;2040#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED2041context.ucharptr = 0;2042#endif2043return byte_sequence_compare(common, type == OP_CHARI, cc, &context, backtracks);2044}20452046#ifdef SUPPORT_UNICODE2047if (common->utf)2048{2049GETCHAR(c, cc);2050}2051else2052#endif2053c = *cc;20542055SLJIT_ASSERT(type == OP_CHARI && char_has_othercase(common, cc));20562057if (check_str_ptr && common->mode == PCRE2_JIT_COMPLETE)2058add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));20592060oc = char_othercase(common, c);2061read_char(common, c < oc ? c : oc, c > oc ? c : oc, NULL, 0);20622063SLJIT_ASSERT(!is_powerof2(c ^ oc));20642065if (sljit_has_cpu_feature(SLJIT_HAS_CMOV))2066{2067OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, oc);2068SELECT(SLJIT_EQUAL, TMP1, SLJIT_IMM, c, TMP1);2069add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c));2070}2071else2072{2073jump[0] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c);2074add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc));2075JUMPHERE(jump[0]);2076}2077return cc + length;20782079case OP_NOT:2080case OP_NOTI:2081if (check_str_ptr)2082detect_partial_match(common, backtracks);20832084length = 1;2085#ifdef SUPPORT_UNICODE2086if (common->utf)2087{2088#if PCRE2_CODE_UNIT_WIDTH == 82089c = *cc;2090if (c < 128 && !common->invalid_utf)2091{2092OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);2093if (type == OP_NOT || !char_has_othercase(common, cc))2094add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));2095else2096{2097/* Since UTF8 code page is fixed, we know that c is in [a-z] or [A-Z] range. */2098OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x20);2099add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, c | 0x20));2100}2101/* Skip the variable-length character. */2102OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));2103jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0);2104OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);2105OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);2106JUMPHERE(jump[0]);2107return cc + 1;2108}2109else2110#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */2111{2112GETCHARLEN(c, cc, length);2113}2114}2115else2116#endif /* SUPPORT_UNICODE */2117c = *cc;21182119if (type == OP_NOT || !char_has_othercase(common, cc))2120{2121read_char(common, c, c, backtracks, READ_CHAR_UPDATE_STR_PTR);2122add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));2123}2124else2125{2126oc = char_othercase(common, c);2127read_char(common, c < oc ? c : oc, c > oc ? c : oc, backtracks, READ_CHAR_UPDATE_STR_PTR);2128bit = c ^ oc;2129if (is_powerof2(bit))2130{2131OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, bit);2132add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit));2133}2134else2135{2136add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));2137add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, oc));2138}2139}2140return cc + length;21412142case OP_CLASS:2143case OP_NCLASS:2144if (check_str_ptr)2145detect_partial_match(common, backtracks);21462147#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 82148bit = (common->utf && is_char7_bitset((const sljit_u8 *)cc, type == OP_NCLASS)) ? 127 : 255;2149if (type == OP_NCLASS)2150read_char(common, 0, bit, backtracks, READ_CHAR_UPDATE_STR_PTR);2151else2152read_char(common, 0, bit, NULL, 0);2153#else2154if (type == OP_NCLASS)2155read_char(common, 0, 255, backtracks, READ_CHAR_UPDATE_STR_PTR);2156else2157read_char(common, 0, 255, NULL, 0);2158#endif21592160if (optimize_class(common, (const sljit_u8 *)cc, type == OP_NCLASS, FALSE, backtracks))2161return cc + 32 / sizeof(PCRE2_UCHAR);21622163#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 82164jump[0] = NULL;2165if (common->utf)2166{2167jump[0] = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, bit);2168if (type == OP_CLASS)2169{2170add_jump(compiler, backtracks, jump[0]);2171jump[0] = NULL;2172}2173}2174#elif PCRE2_CODE_UNIT_WIDTH != 82175jump[0] = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);2176if (type == OP_CLASS)2177{2178add_jump(compiler, backtracks, jump[0]);2179jump[0] = NULL;2180}2181#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */21822183OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);2184OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);2185OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)cc);2186OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0);2187OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, TMP2, 0);2188add_jump(compiler, backtracks, JUMP(SLJIT_ZERO));21892190#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 82191if (jump[0] != NULL)2192JUMPHERE(jump[0]);2193#endif2194return cc + 32 / sizeof(PCRE2_UCHAR);21952196#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 322197case OP_XCLASS:2198if (check_str_ptr)2199detect_partial_match(common, backtracks);2200compile_xclass_matchingpath(common, cc + LINK_SIZE, backtracks, 0);2201return cc + GET(cc, 0) - 1;22022203case OP_ECLASS:2204if (check_str_ptr)2205detect_partial_match(common, backtracks);2206return compile_eclass_matchingpath(common, cc, backtracks);2207#endif2208}2209SLJIT_UNREACHABLE();2210return cc;2211}22122213static SLJIT_INLINE PCRE2_SPTR compile_charn_matchingpath(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend, jump_list **backtracks)2214{2215/* This function consumes at least one input character. */2216/* To decrease the number of length checks, we try to concatenate the fixed length character sequences. */2217DEFINE_COMPILER;2218PCRE2_SPTR ccbegin = cc;2219compare_context context;2220int size;22212222context.length = 0;2223do2224{2225if (cc >= ccend)2226break;22272228if (*cc == OP_CHAR)2229{2230size = 1;2231#ifdef SUPPORT_UNICODE2232if (common->utf && HAS_EXTRALEN(cc[1]))2233size += GET_EXTRALEN(cc[1]);2234#endif2235}2236else if (*cc == OP_CHARI)2237{2238size = 1;2239#ifdef SUPPORT_UNICODE2240if (common->utf)2241{2242if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0)2243size = 0;2244else if (HAS_EXTRALEN(cc[1]))2245size += GET_EXTRALEN(cc[1]);2246}2247else2248#endif2249if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0)2250size = 0;2251}2252else2253size = 0;22542255cc += 1 + size;2256context.length += IN_UCHARS(size);2257}2258while (size > 0 && context.length <= 128);22592260cc = ccbegin;2261if (context.length > 0)2262{2263/* We have a fixed-length byte sequence. */2264OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, context.length);2265add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0));22662267context.sourcereg = -1;2268#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED2269context.ucharptr = 0;2270#endif2271do cc = byte_sequence_compare(common, *cc == OP_CHARI, cc + 1, &context, backtracks); while (context.length > 0);2272return cc;2273}22742275/* A non-fixed length character will be checked if length == 0. */2276return compile_char1_matchingpath(common, *cc, cc + 1, backtracks, TRUE);2277}22782279228022812282