Path: blob/master/thirdparty/pcre2/src/pcre2_jit_simd_inc.h
21807 views
/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8This module by Zoltan Herczeg9Original API code Copyright (c) 1997-2012 University of Cambridge10New API code Copyright (c) 2016-2019 University of Cambridge1112-----------------------------------------------------------------------------13Redistribution and use in source and binary forms, with or without14modification, are permitted provided that the following conditions are met:1516* Redistributions of source code must retain the above copyright notice,17this list of conditions and the following disclaimer.1819* Redistributions in binary form must reproduce the above copyright20notice, this list of conditions and the following disclaimer in the21documentation and/or other materials provided with the distribution.2223* Neither the name of the University of Cambridge nor the names of its24contributors may be used to endorse or promote products derived from25this software without specific prior written permission.2627THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE30ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE37POSSIBILITY OF SUCH DAMAGE.38-----------------------------------------------------------------------------39*/4041#if !(defined SUPPORT_VALGRIND)4243#if ((defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \44|| (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \45|| (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) \46|| (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64))4748typedef enum {49vector_compare_match1,50vector_compare_match1i,51vector_compare_match2,52} vector_compare_type;5354#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)55static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)56{57#if PCRE2_CODE_UNIT_WIDTH == 858/* The AVX2 code path is currently disabled. */59/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 31 : 15; */60return 15;61#elif PCRE2_CODE_UNIT_WIDTH == 1662/* The AVX2 code path is currently disabled. */63/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 15 : 7; */64return 7;65#elif PCRE2_CODE_UNIT_WIDTH == 3266/* The AVX2 code path is currently disabled. */67/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 
7 : 3; */68return 3;69#else70#error "Unsupported unit width"71#endif72}73#else /* !SLJIT_CONFIG_X86 */74static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)75{76#if PCRE2_CODE_UNIT_WIDTH == 877return 15;78#elif PCRE2_CODE_UNIT_WIDTH == 1679return 7;80#elif PCRE2_CODE_UNIT_WIDTH == 3281return 3;82#else83#error "Unsupported unit width"84#endif85}86#endif /* SLJIT_CONFIG_X86 */8788#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 3289static struct sljit_jump *jump_if_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg)90{91#if PCRE2_CODE_UNIT_WIDTH == 892OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xc0);93return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0x80);94#elif PCRE2_CODE_UNIT_WIDTH == 1695OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xfc00);96return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0xdc00);97#else98#error "Unknown code width"99#endif100}101#endif102103#endif /* SLJIT_CONFIG_X86 || SLJIT_CONFIG_ARM_64 || SLJIT_CONFIG_S390X || SLJIT_CONFIG_LOONGARCH_64 */104105#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)106107static sljit_s32 character_to_int32(PCRE2_UCHAR chr)108{109sljit_u32 value = chr;110#if PCRE2_CODE_UNIT_WIDTH == 8111#define SIMD_COMPARE_TYPE_INDEX 0112return (sljit_s32)((value << 24) | (value << 16) | (value << 8) | value);113#elif PCRE2_CODE_UNIT_WIDTH == 16114#define SIMD_COMPARE_TYPE_INDEX 1115return (sljit_s32)((value << 16) | value);116#elif PCRE2_CODE_UNIT_WIDTH == 32117#define SIMD_COMPARE_TYPE_INDEX 2118return (sljit_s32)(value);119#else120#error "Unsupported unit width"121#endif122}123124static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,125sljit_s32 reg_type, int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)126{127sljit_u8 instruction[4];128129if (reg_type == SLJIT_SIMD_REG_128)130{131instruction[0] = 0x66;132instruction[1] = 0x0f;133}134else135{136/* Two byte VEX prefix. 
*/137instruction[0] = 0xc5;138instruction[1] = 0xfd;139}140141SLJIT_ASSERT(step >= 0 && step <= 3);142143if (compare_type != vector_compare_match2)144{145if (step == 0)146{147if (compare_type == vector_compare_match1i)148{149/* POR xmm1, xmm2/m128 */150if (reg_type == SLJIT_SIMD_REG_256)151instruction[1] ^= (dst_ind << 3);152153/* Prefix is filled. */154instruction[2] = 0xeb;155instruction[3] = 0xc0 | (dst_ind << 3) | cmp2_ind;156sljit_emit_op_custom(compiler, instruction, 4);157}158return;159}160161if (step != 2)162return;163164/* PCMPEQB/W/D xmm1, xmm2/m128 */165if (reg_type == SLJIT_SIMD_REG_256)166instruction[1] ^= (dst_ind << 3);167168/* Prefix is filled. */169instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;170instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;171sljit_emit_op_custom(compiler, instruction, 4);172return;173}174175if (reg_type == SLJIT_SIMD_REG_256)176{177if (step == 2)178return;179180if (step == 0)181{182step = 2;183instruction[1] ^= (dst_ind << 3);184}185}186187switch (step)188{189case 0:190SLJIT_ASSERT(reg_type == SLJIT_SIMD_REG_128);191192/* MOVDQA xmm1, xmm2/m128 */193/* Prefix is filled. */194instruction[2] = 0x6f;195instruction[3] = 0xc0 | (tmp_ind << 3) | dst_ind;196sljit_emit_op_custom(compiler, instruction, 4);197return;198199case 1:200/* PCMPEQB/W/D xmm1, xmm2/m128 */201if (reg_type == SLJIT_SIMD_REG_256)202instruction[1] ^= (dst_ind << 3);203204/* Prefix is filled. */205instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;206instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;207sljit_emit_op_custom(compiler, instruction, 4);208return;209210case 2:211/* PCMPEQB/W/D xmm1, xmm2/m128 */212/* Prefix is filled. */213instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;214instruction[3] = 0xc0 | (tmp_ind << 3) | cmp2_ind;215sljit_emit_op_custom(compiler, instruction, 4);216return;217218case 3:219/* POR xmm1, xmm2/m128 */220if (reg_type == SLJIT_SIMD_REG_256)221instruction[1] ^= (dst_ind << 3);222223/* Prefix is filled. 
*/224instruction[2] = 0xeb;225instruction[3] = 0xc0 | (dst_ind << 3) | tmp_ind;226sljit_emit_op_custom(compiler, instruction, 4);227return;228}229}230231/* The AVX2 code path is currently disabled.232#define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))233*/234#if defined(SLJIT_CONFIG_X86_64) && SLJIT_CONFIG_X86_64235#define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1236#else237#define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_FPU))238#endif239240static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)241{242DEFINE_COMPILER;243sljit_u8 instruction[8];244/* The AVX2 code path is currently disabled. */245/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */246sljit_s32 reg_type = SLJIT_SIMD_REG_128;247sljit_s32 value;248struct sljit_label *start;249#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32250struct sljit_label *restart;251#endif252struct sljit_jump *quit;253struct sljit_jump *partial_quit[2];254vector_compare_type compare_type = vector_compare_match1;255sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);256sljit_s32 data_ind = sljit_get_register_index(reg_type, SLJIT_VR0);257sljit_s32 cmp1_ind = sljit_get_register_index(reg_type, SLJIT_VR1);258sljit_s32 cmp2_ind = sljit_get_register_index(reg_type, SLJIT_VR2);259sljit_s32 tmp_ind = sljit_get_register_index(reg_type, SLJIT_VR3);260sljit_u32 bit = 0;261int i;262263SLJIT_UNUSED_ARG(offset);264265if (char1 != char2)266{267bit = char1 ^ char2;268compare_type = vector_compare_match1i;269270if (!is_powerof2(bit))271{272bit = 0;273compare_type = vector_compare_match2;274}275}276277partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);278if (common->mode == PCRE2_JIT_COMPLETE)279add_jump(compiler, &common->failed_match, partial_quit[0]);280281/* First part (unaligned start) */282value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 
| SLJIT_SIMD_LANE_ZERO;283sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));284285if (char1 != char2)286sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));287288OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);289290sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR1, SLJIT_VR1, 0);291292if (char1 != char2)293sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR2, SLJIT_VR2, 0);294295#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32296restart = LABEL();297#endif298299value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;300OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);301OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);302303value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;304sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);305306for (i = 0; i < 4; i++)307fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);308309sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);310OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);311OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);312313quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);314315OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);316317/* Second part (aligned) */318start = LABEL();319320value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;321OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);322323partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);324if (common->mode == PCRE2_JIT_COMPLETE)325add_jump(compiler, &common->failed_match, partial_quit[1]);326327value = (reg_type == SLJIT_SIMD_REG_256) ? 
SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;328sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);329for (i = 0; i < 4; i++)330fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);331332sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);333CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);334335JUMPHERE(quit);336337SLJIT_ASSERT(tmp1_reg_ind < 8);338/* BSF r32, r/m32 */339instruction[0] = 0x0f;340instruction[1] = 0xbc;341instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;342sljit_emit_op_custom(compiler, instruction, 3);343344OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);345346if (common->mode != PCRE2_JIT_COMPLETE)347{348JUMPHERE(partial_quit[0]);349JUMPHERE(partial_quit[1]);350OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);351SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);352}353else354add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));355356#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32357if (common->utf && offset > 0)358{359SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);360361OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));362363quit = jump_if_utf_char_start(compiler, TMP1);364365OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));366add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));367OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);368JUMPTO(SLJIT_JUMP, restart);369370JUMPHERE(quit);371}372#endif373}374375/* The AVX2 code path is currently disabled.376#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))377*/378#if defined(SLJIT_CONFIG_X86_64) && SLJIT_CONFIG_X86_64379#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD 1380#else381#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_FPU))382#endif383384static jump_list 
*fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)385{386DEFINE_COMPILER;387sljit_u8 instruction[8];388/* The AVX2 code path is currently disabled. */389/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */390sljit_s32 reg_type = SLJIT_SIMD_REG_128;391sljit_s32 value;392struct sljit_label *start;393struct sljit_jump *quit;394jump_list *not_found = NULL;395vector_compare_type compare_type = vector_compare_match1;396sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);397sljit_s32 data_ind = sljit_get_register_index(reg_type, SLJIT_VR0);398sljit_s32 cmp1_ind = sljit_get_register_index(reg_type, SLJIT_VR1);399sljit_s32 cmp2_ind = sljit_get_register_index(reg_type, SLJIT_VR2);400sljit_s32 tmp_ind = sljit_get_register_index(reg_type, SLJIT_VR3);401sljit_u32 bit = 0;402int i;403404if (char1 != char2)405{406bit = char1 ^ char2;407compare_type = vector_compare_match1i;408409if (!is_powerof2(bit))410{411bit = 0;412compare_type = vector_compare_match2;413}414}415416add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));417OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);418OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);419420/* First part (unaligned start) */421422value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;423sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));424425if (char1 != char2)426sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));427428OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);429430sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR1, SLJIT_VR1, 0);431432if (char1 != char2)433sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR2, SLJIT_VR2, 0);434435value = (reg_type == SLJIT_SIMD_REG_256) ? 
0x1f : 0xf;436OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);437OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);438439value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;440sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);441442for (i = 0; i < 4; i++)443fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);444445sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);446OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);447OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);448449quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);450451OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);452453/* Second part (aligned) */454start = LABEL();455456value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;457OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);458459add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));460461value = (reg_type == SLJIT_SIMD_REG_256) ? 
SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;462sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);463464for (i = 0; i < 4; i++)465fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);466467sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);468CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);469470JUMPHERE(quit);471472SLJIT_ASSERT(tmp1_reg_ind < 8);473/* BSF r32, r/m32 */474instruction[0] = 0x0f;475instruction[1] = 0xbc;476instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;477sljit_emit_op_custom(compiler, instruction, 3);478479OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);480add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));481482OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);483return not_found;484}485486#ifndef _WIN64487488/* The AVX2 code path is currently disabled.489#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))490*/491#if defined(SLJIT_CONFIG_X86_64) && SLJIT_CONFIG_X86_64492#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1493#else494#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_FPU))495#endif496497static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,498PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)499{500DEFINE_COMPILER;501sljit_u8 instruction[8];502/* The AVX2 code path is currently disabled. */503/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 
SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */504sljit_s32 reg_type = SLJIT_SIMD_REG_128;505sljit_s32 value;506vector_compare_type compare1_type = vector_compare_match1;507vector_compare_type compare2_type = vector_compare_match1;508sljit_u32 bit1 = 0;509sljit_u32 bit2 = 0;510sljit_u32 diff = IN_UCHARS(offs1 - offs2);511sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);512sljit_s32 data1_ind = sljit_get_register_index(reg_type, SLJIT_VR0);513sljit_s32 data2_ind = sljit_get_register_index(reg_type, SLJIT_VR1);514sljit_s32 cmp1a_ind = sljit_get_register_index(reg_type, SLJIT_VR2);515sljit_s32 cmp2a_ind = sljit_get_register_index(reg_type, SLJIT_VR3);516sljit_s32 cmp1b_ind = sljit_get_register_index(reg_type, SLJIT_VR4);517sljit_s32 cmp2b_ind = sljit_get_register_index(reg_type, SLJIT_VR5);518sljit_s32 tmp1_ind = sljit_get_register_index(reg_type, SLJIT_VR6);519sljit_s32 tmp2_ind = sljit_get_register_index(reg_type, SLJIT_TMP_DEST_VREG);520struct sljit_label *start;521#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32522struct sljit_label *restart;523#endif524struct sljit_jump *jump[2];525int i;526527SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2 && offs2 >= 0);528SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));529530/* Initialize. 
*/531if (common->match_end_ptr != 0)532{533OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);534OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);535OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));536537OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);538SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);539}540541OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));542add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));543544if (char1a == char1b)545OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));546else547{548bit1 = char1a ^ char1b;549if (is_powerof2(bit1))550{551compare1_type = vector_compare_match1i;552OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a | bit1));553OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit1));554}555else556{557compare1_type = vector_compare_match2;558bit1 = 0;559OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));560OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char1b));561}562}563564value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;565sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR2, 0, TMP1, 0);566567if (char1a != char1b)568sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR4, 0, TMP2, 0);569570if (char2a == char2b)571OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));572else573{574bit2 = char2a ^ char2b;575if (is_powerof2(bit2))576{577compare2_type = vector_compare_match1i;578OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a | bit2));579OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit2));580}581else582{583compare2_type = vector_compare_match2;584bit2 = 0;585OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));586OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char2b));587}588}589590sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR3, 0, TMP1, 0);591592if (char2a != char2b)593sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR5, 0, TMP2, 
0);594595sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR2, SLJIT_VR2, 0);596if (char1a != char1b)597sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR4, SLJIT_VR4, 0);598599sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR3, SLJIT_VR3, 0);600if (char2a != char2b)601sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR5, SLJIT_VR5, 0);602603#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32604restart = LABEL();605#endif606607OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);608OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);609value = (reg_type == SLJIT_SIMD_REG_256) ? ~0x1f : ~0xf;610OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);611612value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;613sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);614615jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);616617sljit_emit_simd_mov(compiler, reg_type, SLJIT_VR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);618jump[1] = JUMP(SLJIT_JUMP);619620JUMPHERE(jump[0]);621622if (reg_type == SLJIT_SIMD_REG_256)623{624if (diff != 16)625{626/* PSLLDQ ymm1, ymm2, imm8 */627instruction[0] = 0xc5;628instruction[1] = (sljit_u8)(0xf9 ^ (data2_ind << 3));629instruction[2] = 0x73;630instruction[3] = 0xc0 | (7 << 3) | data1_ind;631instruction[4] = diff & 0xf;632sljit_emit_op_custom(compiler, instruction, 5);633}634635instruction[0] = 0xc4;636instruction[1] = 0xe3;637if (diff < 16)638{639/* VINSERTI128 xmm1, xmm2, xmm3/m128 */640/* instruction[0] = 0xc4; */641/* instruction[1] = 0xe3; */642instruction[2] = (sljit_u8)(0x7d ^ (data2_ind << 3));643instruction[3] = 0x38;644SLJIT_ASSERT(sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR) <= 7);645instruction[4] = 0x40 | (data2_ind << 3) | sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);646instruction[5] = (sljit_u8)(16 - 
diff);647instruction[6] = 1;648sljit_emit_op_custom(compiler, instruction, 7);649}650else651{652/* VPERM2I128 xmm1, xmm2, xmm3/m128 */653/* instruction[0] = 0xc4; */654/* instruction[1] = 0xe3; */655value = (diff == 16) ? data1_ind : data2_ind;656instruction[2] = (sljit_u8)(0x7d ^ (value << 3));657instruction[3] = 0x46;658instruction[4] = 0xc0 | (data2_ind << 3) | value;659instruction[5] = 0x08;660sljit_emit_op_custom(compiler, instruction, 6);661}662}663else664{665/* MOVDQA xmm1, xmm2/m128 */666instruction[0] = 0x66;667instruction[1] = 0x0f;668instruction[2] = 0x6f;669instruction[3] = 0xc0 | (data2_ind << 3) | data1_ind;670sljit_emit_op_custom(compiler, instruction, 4);671672/* PSLLDQ xmm1, imm8 */673/* instruction[0] = 0x66; */674/* instruction[1] = 0x0f; */675instruction[2] = 0x73;676instruction[3] = 0xc0 | (7 << 3) | data2_ind;677instruction[4] = diff;678sljit_emit_op_custom(compiler, instruction, 5);679}680681JUMPHERE(jump[1]);682683value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;684OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);685686for (i = 0; i < 4; i++)687{688fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);689fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);690}691692sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_VR0, SLJIT_VR0, SLJIT_VR1, 0);693sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);694695/* Ignore matches before the first STR_PTR. */696OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);697OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);698699jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);700701OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);702703/* Main loop. */704start = LABEL();705706value = (reg_type == SLJIT_SIMD_REG_256) ? 
32 : 16;707OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);708add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));709710value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;711sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);712sljit_emit_simd_mov(compiler, reg_type, SLJIT_VR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);713714for (i = 0; i < 4; i++)715{716fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);717fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);718}719720sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_VR0, SLJIT_VR0, SLJIT_VR1, 0);721sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);722723CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);724725JUMPHERE(jump[0]);726727SLJIT_ASSERT(tmp1_reg_ind < 8);728/* BSF r32, r/m32 */729instruction[0] = 0x0f;730instruction[1] = 0xbc;731instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;732sljit_emit_op_custom(compiler, instruction, 3);733734OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);735736add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));737738#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32739if (common->utf)740{741OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));742743jump[0] = jump_if_utf_char_start(compiler, TMP1);744745OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));746CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);747748add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));749750JUMPHERE(jump[0]);751}752#endif753754OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));755756if (common->match_end_ptr != 0)757OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);758}759760#endif /* !_WIN64 
*/761762#undef SIMD_COMPARE_TYPE_INDEX763764#endif /* SLJIT_CONFIG_X86 */765766#if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 && (defined __ARM_NEON || defined __ARM_NEON__))767768#if PCRE2_CODE_UNIT_WIDTH == 8769#define PCRE2_REPLICATE_TYPE (SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_8)770#elif PCRE2_CODE_UNIT_WIDTH == 16771#define PCRE2_REPLICATE_TYPE (SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_16)772#elif PCRE2_CODE_UNIT_WIDTH == 32773#define PCRE2_REPLICATE_TYPE (SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32)774#else775#error "Unsupported unit width"776#endif777778#define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1779780static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,781int step, sljit_u32 dst_ind, sljit_u32 cmp1_ind, sljit_u32 cmp2_ind, sljit_u32 tmp_ind)782{783sljit_u32 instruction;784#if PCRE2_CODE_UNIT_WIDTH == 8785sljit_u32 size = 0 << 22;786#elif PCRE2_CODE_UNIT_WIDTH == 16787sljit_u32 size = 1 << 22;788#elif PCRE2_CODE_UNIT_WIDTH == 32789sljit_u32 size = 2 << 22;790#else791#error "Unsupported unit width"792#endif793794SLJIT_ASSERT(step >= 0 && step <= 2);795796if (step == 1)797{798/* CMEQ */799instruction = 0x6e208c00 | size | (cmp1_ind << 16) | (dst_ind << 5) | dst_ind;800sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));801return;802}803804if (compare_type != vector_compare_match2)805{806if (step == 0 && compare_type == vector_compare_match1i)807{808/* ORR */809instruction = 0x4ea01c00 | (cmp2_ind << 16) | (dst_ind << 5) | dst_ind;810sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));811}812return;813}814815switch (step)816{817case 0:818/* CMEQ */819instruction = 0x6e208c00 | size | (cmp2_ind << 16) | (dst_ind << 5) | tmp_ind;820sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));821return;822823case 2:824/* ORR */825instruction = 0x4ea01c00 | (tmp_ind << 16) | (dst_ind << 5) | dst_ind;826sljit_emit_op_custom(compiler, &instruction, 
sizeof(sljit_u32));827return;828}829}830831static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)832{833DEFINE_COMPILER;834sljit_u32 instruction;835struct sljit_label *start;836#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32837struct sljit_label *restart;838#endif839struct sljit_jump *quit;840struct sljit_jump *partial_quit[2];841vector_compare_type compare_type = vector_compare_match1;842sljit_u32 data_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR0);843sljit_u32 cmp1_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR1);844sljit_u32 cmp2_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR2);845sljit_u32 tmp_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR3);846sljit_u32 bit = 0;847int i;848849SLJIT_UNUSED_ARG(offset);850851if (char1 != char2)852{853bit = char1 ^ char2;854compare_type = vector_compare_match1i;855856if (!is_powerof2(bit))857{858bit = 0;859compare_type = vector_compare_match2;860}861}862863partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);864if (common->mode == PCRE2_JIT_COMPLETE)865add_jump(compiler, &common->failed_match, partial_quit[0]);866867/* First part (unaligned start) */868if (char1 != char2)869sljit_emit_op1(compiler, SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit != 0 ? 
bit : char2);870sljit_emit_op1(compiler, SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);871872if (char1 != char2)873sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR2, TMP2, 0);874sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR1, TMP1, 0);875876OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);877878#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32879restart = LABEL();880#endif881882OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~(sljit_sw)0xf);883OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);884885sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_MEM_ALIGNED_128, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);886887for (i = 0; i < 3; i++)888fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);889890/* SHRN */891instruction = 0x0f0c8400 | (data_ind << 5) | data_ind;892sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));893894sljit_emit_simd_lane_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_64, SLJIT_VR0, 0, TMP1, 0);895896OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);897OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);898OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, TMP2, 0);899900quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);901902/* Second part (aligned) */903start = LABEL();904905OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);906907partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);908if (common->mode == PCRE2_JIT_COMPLETE)909add_jump(compiler, &common->failed_match, partial_quit[1]);910911sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_MEM_ALIGNED_128, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);912for (i = 0; i < 3; i++)913fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);914915/* SHRN */916instruction = 0x0f0c8400 | (data_ind << 5) | data_ind;917sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));918919sljit_emit_simd_lane_mov(compiler, SLJIT_SIMD_STORE | 
SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_64, SLJIT_VR0, 0, TMP1, 0);920CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);921922JUMPHERE(quit);923924sljit_emit_op1(compiler, SLJIT_CTZ, TMP1, 0, TMP1, 0);925OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);926OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);927928if (common->mode != PCRE2_JIT_COMPLETE)929{930JUMPHERE(partial_quit[0]);931JUMPHERE(partial_quit[1]);932OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);933SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);934}935else936add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));937938#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32939if (common->utf && offset > 0)940{941SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);942943OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));944945quit = jump_if_utf_char_start(compiler, TMP1);946947OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));948add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));949OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);950JUMPTO(SLJIT_JUMP, restart);951952JUMPHERE(quit);953}954#endif955}956957#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD 1958959static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)960{961DEFINE_COMPILER;962sljit_u32 instruction;963struct sljit_label *start;964struct sljit_jump *quit;965jump_list *not_found = NULL;966vector_compare_type compare_type = vector_compare_match1;967sljit_u32 data_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR0);968sljit_u32 cmp1_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR1);969sljit_u32 cmp2_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR2);970sljit_u32 tmp_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR3);971sljit_u32 bit = 0;972int i;973974if (char1 != char2)975{976bit = char1 ^ char2;977compare_type = 
vector_compare_match1i;978979if (!is_powerof2(bit))980{981bit = 0;982compare_type = vector_compare_match2;983}984}985986add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));987988/* First part (unaligned start) */989990if (char1 != char2)991sljit_emit_op1(compiler, SLJIT_MOV, TMP3, 0, SLJIT_IMM, bit != 0 ? bit : char2);992sljit_emit_op1(compiler, SLJIT_MOV, TMP2, 0, SLJIT_IMM, char1 | bit);993994if (char1 != char2)995sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR2, TMP3, 0);996sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR1, TMP2, 0);997998OP2(SLJIT_AND, TMP3, 0, TMP1, 0, SLJIT_IMM, ~(sljit_sw)0xf);999OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xf);10001001sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_MEM_ALIGNED_128, SLJIT_VR0, SLJIT_MEM1(TMP3), 0);10021003for (i = 0; i < 3; i++)1004fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);10051006/* SHRN */1007instruction = 0x0f0c8400 | (data_ind << 5) | data_ind;1008sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));10091010sljit_emit_simd_lane_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_64, SLJIT_VR0, 0, TMP1, 0);10111012OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);1013OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);1014OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, TMP2, 0);10151016quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);10171018/* Second part (aligned) */1019start = LABEL();10201021OP2(SLJIT_ADD, TMP3, 0, TMP3, 0, SLJIT_IMM, 16);10221023add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP3, 0, STR_END, 0));10241025sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_MEM_ALIGNED_128, SLJIT_VR0, SLJIT_MEM1(TMP3), 0);10261027for (i = 0; i < 3; i++)1028fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);10291030/* SHRN */1031instruction = 0x0f0c8400 | (data_ind << 5) | 
data_ind;1032sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));10331034sljit_emit_simd_lane_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_64, SLJIT_VR0, 0, TMP1, 0);1035CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);10361037JUMPHERE(quit);10381039sljit_emit_op1(compiler, SLJIT_CTZ, TMP1, 0, TMP1, 0);1040OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);1041OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);1042add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));10431044return not_found;1045}10461047#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 110481049static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,1050PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)1051{1052DEFINE_COMPILER;1053sljit_u32 instruction;1054vector_compare_type compare1_type = vector_compare_match1;1055vector_compare_type compare2_type = vector_compare_match1;1056sljit_u32 bit1 = 0;1057sljit_u32 bit2 = 0;1058sljit_u32 diff = IN_UCHARS(offs1 - offs2);1059sljit_u32 data1_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR0);1060sljit_u32 data2_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR1);1061sljit_u32 cmp1a_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR2);1062sljit_u32 cmp2a_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR3);1063sljit_u32 cmp1b_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR4);1064sljit_u32 cmp2b_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR5);1065sljit_u32 tmp1_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR6);1066sljit_u32 tmp2_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR7);1067struct sljit_label *start;1068#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321069struct sljit_label *restart;1070#endif1071struct sljit_jump *jump[2];1072int 
i;10731074SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2 && offs2 >= 0);1075SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));10761077if (char1a != char1b)1078{1079bit1 = char1a ^ char1b;1080compare1_type = vector_compare_match1i;10811082if (!is_powerof2(bit1))1083{1084bit1 = 0;1085compare1_type = vector_compare_match2;1086}1087}10881089if (char2a != char2b)1090{1091bit2 = char2a ^ char2b;1092compare2_type = vector_compare_match1i;10931094if (!is_powerof2(bit2))1095{1096bit2 = 0;1097compare2_type = vector_compare_match2;1098}1099}11001101/* Initialize. */1102if (common->match_end_ptr != 0)1103{1104OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);1105OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);1106OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));11071108OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);1109SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);1110}11111112OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));1113add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));11141115sljit_emit_op1(compiler, SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a | bit1);1116sljit_emit_op1(compiler, SLJIT_MOV, TMP2, 0, SLJIT_IMM, char2a | bit2);1117sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR2, TMP1, 0);1118sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR3, TMP2, 0);11191120if (char1a != char1b)1121sljit_emit_op1(compiler, SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit1 != 0 ? bit1 : char1b);11221123if (char2a != char2b)1124sljit_emit_op1(compiler, SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit2 != 0 ? 
bit2 : char2b);11251126if (char1a != char1b)1127sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR4, TMP1, 0);11281129if (char2a != char2b)1130sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR5, TMP2, 0);11311132#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321133restart = LABEL();1134#endif11351136OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);1137OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);1138OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~(sljit_sw)0xf);11391140sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_MEM_ALIGNED_128, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);11411142jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);11431144sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128, SLJIT_VR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);1145jump[1] = JUMP(SLJIT_JUMP);11461147JUMPHERE(jump[0]);11481149if (diff >= 8)1150{1151/* MOV (element) */1152instruction = 0x6e180400 | (data1_ind << 5) | data2_ind;1153sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));11541155if (diff > 8)1156{1157/* SHL */1158instruction = 0x4f405400 | ((diff - 8) << 19) | (data2_ind << 5) | data2_ind;1159sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));1160}1161}1162else1163{1164/* MOV (element) */1165instruction = 0x6e180400 | (data1_ind << 5) | tmp1_ind;1166sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));11671168/* SHL */1169instruction = 0x4f405400 | (diff << 19) | (data1_ind << 5) | data2_ind;1170sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));11711172/* USHR */1173instruction = 0x6f400400 | (diff << 19) | (tmp1_ind << 5) | tmp1_ind;1174sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));11751176sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_OR | SLJIT_SIMD_REG_128, SLJIT_VR1, SLJIT_VR1, SLJIT_VR6, 0);1177}11781179JUMPHERE(jump[1]);11801181OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);11821183for (i = 0; i < 3; 
i++)1184{1185fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);1186fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);1187}11881189sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | SLJIT_SIMD_REG_128, SLJIT_VR0, SLJIT_VR0, SLJIT_VR1, 0);11901191/* SHRN */1192instruction = 0x0f0c8400 | (data1_ind << 5) | data1_ind;1193sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));11941195sljit_emit_simd_lane_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_64, SLJIT_VR0, 0, TMP1, 0);11961197/* Ignore matches before the first STR_PTR. */1198OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);1199OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);1200OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, TMP2, 0);12011202jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);12031204/* Main loop. */1205start = LABEL();12061207OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);1208add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));12091210sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_MEM_ALIGNED_128, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);1211sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128, SLJIT_VR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);12121213for (i = 0; i < 3; i++)1214{1215fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);1216fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);1217}12181219sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | SLJIT_SIMD_REG_128, SLJIT_VR0, SLJIT_VR0, SLJIT_VR1, 0);12201221/* SHRN */1222instruction = 0x0f0c8400 | (data1_ind << 5) | data1_ind;1223sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));12241225sljit_emit_simd_lane_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_64, SLJIT_VR0, 0, TMP1, 0);12261227CMPTO(SLJIT_ZERO, TMP1, 0, 
SLJIT_IMM, 0, start);12281229JUMPHERE(jump[0]);12301231sljit_emit_op1(compiler, SLJIT_CTZ, TMP1, 0, TMP1, 0);1232OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);1233OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);12341235add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));12361237#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321238if (common->utf)1239{1240OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));12411242jump[0] = jump_if_utf_char_start(compiler, TMP1);12431244OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));1245CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);12461247add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));12481249JUMPHERE(jump[0]);1250}1251#endif12521253OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));12541255if (common->match_end_ptr != 0)1256OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);1257}12581259#undef PCRE2_REPLICATE_TYPE12601261#endif /* SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 */12621263#if (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)12641265#if PCRE2_CODE_UNIT_WIDTH == 81266#define VECTOR_ELEMENT_SIZE 01267#elif PCRE2_CODE_UNIT_WIDTH == 161268#define VECTOR_ELEMENT_SIZE 11269#elif PCRE2_CODE_UNIT_WIDTH == 321270#define VECTOR_ELEMENT_SIZE 21271#else1272#error "Unsupported unit width"1273#endif12741275static void load_from_mem_vector(struct sljit_compiler *compiler, BOOL vlbb, sljit_s32 dst_vreg,1276sljit_s32 base_reg, sljit_s32 index_reg)1277{1278sljit_u16 instruction[3];12791280instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | index_reg);1281instruction[1] = (sljit_u16)(base_reg << 12);1282instruction[2] = (sljit_u16)((0x8 << 8) | (vlbb ? 
0x07 : 0x06));12831284sljit_emit_op_custom(compiler, instruction, 6);1285}12861287#if PCRE2_CODE_UNIT_WIDTH == 3212881289static void replicate_imm_vector(struct sljit_compiler *compiler, int step, sljit_s32 dst_vreg,1290PCRE2_UCHAR chr, sljit_s32 tmp_general_reg)1291{1292sljit_u16 instruction[3];12931294SLJIT_ASSERT(step >= 0 && step <= 1);12951296if (chr < 0x7fff)1297{1298if (step == 1)1299return;13001301/* VREPI */1302instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4));1303instruction[1] = (sljit_u16)chr;1304instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);1305sljit_emit_op_custom(compiler, instruction, 6);1306return;1307}13081309if (step == 0)1310{1311OP1(SLJIT_MOV, tmp_general_reg, 0, SLJIT_IMM, chr);13121313/* VLVG */1314instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(SLJIT_GP_REGISTER, tmp_general_reg));1315instruction[1] = 0;1316instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x22);1317sljit_emit_op_custom(compiler, instruction, 6);1318return;1319}13201321/* VREP */1322instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | dst_vreg);1323instruction[1] = 0;1324instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xc << 8) | 0x4d);1325sljit_emit_op_custom(compiler, instruction, 6);1326}13271328#endif13291330static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,1331int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)1332{1333sljit_u16 instruction[3];13341335SLJIT_ASSERT(step >= 0 && step <= 2);13361337if (step == 1)1338{1339/* VCEQ */1340instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);1341instruction[1] = (sljit_u16)(cmp1_ind << 12);1342instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);1343sljit_emit_op_custom(compiler, instruction, 6);1344return;1345}13461347if (compare_type != vector_compare_match2)1348{1349if (step == 0 && 
compare_type == vector_compare_match1i)1350{1351/* VO */1352instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);1353instruction[1] = (sljit_u16)(cmp2_ind << 12);1354instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);1355sljit_emit_op_custom(compiler, instruction, 6);1356}1357return;1358}13591360switch (step)1361{1362case 0:1363/* VCEQ */1364instruction[0] = (sljit_u16)(0xe700 | (tmp_ind << 4) | dst_ind);1365instruction[1] = (sljit_u16)(cmp2_ind << 12);1366instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);1367sljit_emit_op_custom(compiler, instruction, 6);1368return;13691370case 2:1371/* VO */1372instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);1373instruction[1] = (sljit_u16)(tmp_ind << 12);1374instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);1375sljit_emit_op_custom(compiler, instruction, 6);1376return;1377}1378}13791380#define JIT_HAS_FAST_FORWARD_CHAR_SIMD 113811382static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)1383{1384DEFINE_COMPILER;1385sljit_u16 instruction[3];1386struct sljit_label *start;1387#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321388struct sljit_label *restart;1389#endif1390struct sljit_jump *quit;1391struct sljit_jump *partial_quit[2];1392vector_compare_type compare_type = vector_compare_match1;1393sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);1394sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);1395sljit_s32 data_ind = 0;1396sljit_s32 tmp_ind = 1;1397sljit_s32 cmp1_ind = 2;1398sljit_s32 cmp2_ind = 3;1399sljit_s32 zero_ind = 4;1400sljit_u32 bit = 0;1401int i;14021403SLJIT_UNUSED_ARG(offset);14041405if (char1 != char2)1406{1407bit = char1 ^ char2;1408compare_type = vector_compare_match1i;14091410if (!is_powerof2(bit))1411{1412bit = 0;1413compare_type = vector_compare_match2;1414}1415}14161417partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 
0);1418if (common->mode == PCRE2_JIT_COMPLETE)1419add_jump(compiler, &common->failed_match, partial_quit[0]);14201421/* First part (unaligned start) */14221423OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);14241425#if PCRE2_CODE_UNIT_WIDTH != 3214261427/* VREPI */1428instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));1429instruction[1] = (sljit_u16)(char1 | bit);1430instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);1431sljit_emit_op_custom(compiler, instruction, 6);14321433if (char1 != char2)1434{1435/* VREPI */1436instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));1437instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);1438/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */1439sljit_emit_op_custom(compiler, instruction, 6);1440}14411442#else /* PCRE2_CODE_UNIT_WIDTH == 32 */14431444for (i = 0; i < 2; i++)1445{1446replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP1);14471448if (char1 != char2)1449replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? 
bit : char2, TMP1);1450}14511452#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */14531454if (compare_type == vector_compare_match2)1455{1456/* VREPI */1457instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));1458instruction[1] = 0;1459instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);1460sljit_emit_op_custom(compiler, instruction, 6);1461}14621463#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321464restart = LABEL();1465#endif14661467load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);1468OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);14691470if (compare_type != vector_compare_match2)1471{1472if (compare_type == vector_compare_match1i)1473fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);14741475/* VFEE */1476instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1477instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));1478instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);1479sljit_emit_op_custom(compiler, instruction, 6);1480}1481else1482{1483for (i = 0; i < 3; i++)1484fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);14851486/* VFENE */1487instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1488instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));1489instruction[2] = (sljit_u16)((0xe << 8) | 0x81);1490sljit_emit_op_custom(compiler, instruction, 6);1491}14921493/* VLGVB */1494instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);1495instruction[1] = 7;1496instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);1497sljit_emit_op_custom(compiler, instruction, 6);14981499OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);1500quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);15011502OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);15031504/* Second part (aligned) */1505start = LABEL();15061507OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);15081509partial_quit[1] = 
CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);1510if (common->mode == PCRE2_JIT_COMPLETE)1511add_jump(compiler, &common->failed_match, partial_quit[1]);15121513load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);15141515if (compare_type != vector_compare_match2)1516{1517if (compare_type == vector_compare_match1i)1518fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);15191520/* VFEE */1521instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1522instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));1523instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);1524sljit_emit_op_custom(compiler, instruction, 6);1525}1526else1527{1528for (i = 0; i < 3; i++)1529fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);15301531/* VFENE */1532instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1533instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));1534instruction[2] = (sljit_u16)((0xe << 8) | 0x81);1535sljit_emit_op_custom(compiler, instruction, 6);1536}15371538sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);1539JUMPTO(SLJIT_OVERFLOW, start);15401541/* VLGVB */1542instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);1543instruction[1] = 7;1544instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);1545sljit_emit_op_custom(compiler, instruction, 6);15461547OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);15481549JUMPHERE(quit);15501551if (common->mode != PCRE2_JIT_COMPLETE)1552{1553JUMPHERE(partial_quit[0]);1554JUMPHERE(partial_quit[1]);1555OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);1556SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);1557}1558else1559add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));15601561#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321562if (common->utf && offset > 
0)1563{1564SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);15651566OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));15671568quit = jump_if_utf_char_start(compiler, TMP1);15691570OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));1571add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));15721573OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);1574JUMPTO(SLJIT_JUMP, restart);15751576JUMPHERE(quit);1577}1578#endif1579}15801581#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD 115821583static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)1584{1585DEFINE_COMPILER;1586sljit_u16 instruction[3];1587struct sljit_label *start;1588struct sljit_jump *quit;1589jump_list *not_found = NULL;1590vector_compare_type compare_type = vector_compare_match1;1591sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);1592sljit_s32 tmp3_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP3);1593sljit_s32 data_ind = 0;1594sljit_s32 tmp_ind = 1;1595sljit_s32 cmp1_ind = 2;1596sljit_s32 cmp2_ind = 3;1597sljit_s32 zero_ind = 4;1598sljit_u32 bit = 0;1599int i;16001601if (char1 != char2)1602{1603bit = char1 ^ char2;1604compare_type = vector_compare_match1i;16051606if (!is_powerof2(bit))1607{1608bit = 0;1609compare_type = vector_compare_match2;1610}1611}16121613add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));16141615/* First part (unaligned start) */16161617OP2(SLJIT_ADD, TMP2, 0, TMP1, 0, SLJIT_IMM, 16);16181619#if PCRE2_CODE_UNIT_WIDTH != 3216201621/* VREPI */1622instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));1623instruction[1] = (sljit_u16)(char1 | bit);1624instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);1625sljit_emit_op_custom(compiler, instruction, 6);16261627if (char1 != char2)1628{1629/* VREPI */1630instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));1631instruction[1] = (sljit_u16)(bit != 
0 ? bit : char2);1632/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */1633sljit_emit_op_custom(compiler, instruction, 6);1634}16351636#else /* PCRE2_CODE_UNIT_WIDTH == 32 */16371638for (i = 0; i < 2; i++)1639{1640replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP3);16411642if (char1 != char2)1643replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP3);1644}16451646#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */16471648if (compare_type == vector_compare_match2)1649{1650/* VREPI */1651instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));1652instruction[1] = 0;1653instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);1654sljit_emit_op_custom(compiler, instruction, 6);1655}16561657load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);1658OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);16591660if (compare_type != vector_compare_match2)1661{1662if (compare_type == vector_compare_match1i)1663fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);16641665/* VFEE */1666instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1667instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));1668instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);1669sljit_emit_op_custom(compiler, instruction, 6);1670}1671else1672{1673for (i = 0; i < 3; i++)1674fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);16751676/* VFENE */1677instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1678instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));1679instruction[2] = (sljit_u16)((0xe << 8) | 0x81);1680sljit_emit_op_custom(compiler, instruction, 6);1681}16821683/* VLGVB */1684instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);1685instruction[1] = 7;1686instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);1687sljit_emit_op_custom(compiler, instruction, 6);16881689OP2(SLJIT_ADD, 
TMP1, 0, TMP1, 0, TMP3, 0);1690quit = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);16911692OP2(SLJIT_SUB, TMP1, 0, TMP2, 0, SLJIT_IMM, 16);16931694/* Second part (aligned) */1695start = LABEL();16961697OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 16);16981699add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));17001701load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);17021703if (compare_type != vector_compare_match2)1704{1705if (compare_type == vector_compare_match1i)1706fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);17071708/* VFEE */1709instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1710instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));1711instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);1712sljit_emit_op_custom(compiler, instruction, 6);1713}1714else1715{1716for (i = 0; i < 3; i++)1717fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);17181719/* VFENE */1720instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1721instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));1722instruction[2] = (sljit_u16)((0xe << 8) | 0x81);1723sljit_emit_op_custom(compiler, instruction, 6);1724}17251726sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);1727JUMPTO(SLJIT_OVERFLOW, start);17281729/* VLGVB */1730instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);1731instruction[1] = 7;1732instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);1733sljit_emit_op_custom(compiler, instruction, 6);17341735OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);17361737JUMPHERE(quit);1738add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));17391740return not_found;1741}17421743#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 117441745static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,1746PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, 
PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)1747{1748DEFINE_COMPILER;1749sljit_u16 instruction[3];1750struct sljit_label *start;1751#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321752struct sljit_label *restart;1753#endif1754struct sljit_jump *quit;1755struct sljit_jump *jump[2];1756vector_compare_type compare1_type = vector_compare_match1;1757vector_compare_type compare2_type = vector_compare_match1;1758sljit_u32 bit1 = 0;1759sljit_u32 bit2 = 0;1760sljit_s32 diff = IN_UCHARS(offs2 - offs1);1761sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);1762sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);1763sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);1764sljit_s32 data1_ind = 0;1765sljit_s32 data2_ind = 1;1766sljit_s32 tmp1_ind = 2;1767sljit_s32 tmp2_ind = 3;1768sljit_s32 cmp1a_ind = 4;1769sljit_s32 cmp1b_ind = 5;1770sljit_s32 cmp2a_ind = 6;1771sljit_s32 cmp2b_ind = 7;1772sljit_s32 zero_ind = 8;1773int i;17741775SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);1776SLJIT_ASSERT(-diff <= (sljit_s32)IN_UCHARS(max_fast_forward_char_pair_offset()));1777SLJIT_ASSERT(tmp1_reg_ind != 0 && tmp2_reg_ind != 0);17781779if (char1a != char1b)1780{1781bit1 = char1a ^ char1b;1782compare1_type = vector_compare_match1i;17831784if (!is_powerof2(bit1))1785{1786bit1 = 0;1787compare1_type = vector_compare_match2;1788}1789}17901791if (char2a != char2b)1792{1793bit2 = char2a ^ char2b;1794compare2_type = vector_compare_match1i;17951796if (!is_powerof2(bit2))1797{1798bit2 = 0;1799compare2_type = vector_compare_match2;1800}1801}18021803/* Initialize. 
*/1804if (common->match_end_ptr != 0)1805{1806OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);1807OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);1808OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));18091810OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);1811SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);1812}18131814OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));1815add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));1816OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);18171818#if PCRE2_CODE_UNIT_WIDTH != 3218191820OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);18211822/* VREPI */1823instruction[0] = (sljit_u16)(0xe700 | (cmp1a_ind << 4));1824instruction[1] = (sljit_u16)(char1a | bit1);1825instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);1826sljit_emit_op_custom(compiler, instruction, 6);18271828if (char1a != char1b)1829{1830/* VREPI */1831instruction[0] = (sljit_u16)(0xe700 | (cmp1b_ind << 4));1832instruction[1] = (sljit_u16)(bit1 != 0 ? bit1 : char1b);1833/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */1834sljit_emit_op_custom(compiler, instruction, 6);1835}18361837/* VREPI */1838instruction[0] = (sljit_u16)(0xe700 | (cmp2a_ind << 4));1839instruction[1] = (sljit_u16)(char2a | bit2);1840/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */1841sljit_emit_op_custom(compiler, instruction, 6);18421843if (char2a != char2b)1844{1845/* VREPI */1846instruction[0] = (sljit_u16)(0xe700 | (cmp2b_ind << 4));1847instruction[1] = (sljit_u16)(bit2 != 0 ? 
bit2 : char2b);1848/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */1849sljit_emit_op_custom(compiler, instruction, 6);1850}18511852#else /* PCRE2_CODE_UNIT_WIDTH == 32 */18531854for (i = 0; i < 2; i++)1855{1856replicate_imm_vector(compiler, i, cmp1a_ind, char1a | bit1, TMP1);18571858if (char1a != char1b)1859replicate_imm_vector(compiler, i, cmp1b_ind, bit1 != 0 ? bit1 : char1b, TMP1);18601861replicate_imm_vector(compiler, i, cmp2a_ind, char2a | bit2, TMP1);18621863if (char2a != char2b)1864replicate_imm_vector(compiler, i, cmp2b_ind, bit2 != 0 ? bit2 : char2b, TMP1);1865}18661867OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);18681869#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */18701871/* VREPI */1872instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));1873instruction[1] = 0;1874instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);1875sljit_emit_op_custom(compiler, instruction, 6);18761877#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321878restart = LABEL();1879#endif18801881jump[0] = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);1882load_from_mem_vector(compiler, TRUE, data2_ind, tmp1_reg_ind, 0);1883jump[1] = JUMP(SLJIT_JUMP);1884JUMPHERE(jump[0]);1885load_from_mem_vector(compiler, FALSE, data2_ind, tmp1_reg_ind, 0);1886JUMPHERE(jump[1]);18871888load_from_mem_vector(compiler, TRUE, data1_ind, str_ptr_reg_ind, 0);1889OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 16);18901891for (i = 0; i < 3; i++)1892{1893fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);1894fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);1895}18961897/* VN */1898instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);1899instruction[1] = (sljit_u16)(data2_ind << 12);1900instruction[2] = (sljit_u16)((0xe << 8) | 0x68);1901sljit_emit_op_custom(compiler, instruction, 6);19021903/* VFENE */1904instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 
4) | data1_ind);1905instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));1906instruction[2] = (sljit_u16)((0xe << 8) | 0x81);1907sljit_emit_op_custom(compiler, instruction, 6);19081909/* VLGVB */1910instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data1_ind);1911instruction[1] = 7;1912instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);1913sljit_emit_op_custom(compiler, instruction, 6);19141915OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);1916quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);19171918OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);1919OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, diff);19201921/* Main loop. */1922start = LABEL();19231924OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);1925add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));19261927load_from_mem_vector(compiler, FALSE, data1_ind, str_ptr_reg_ind, 0);1928load_from_mem_vector(compiler, FALSE, data2_ind, str_ptr_reg_ind, tmp1_reg_ind);19291930for (i = 0; i < 3; i++)1931{1932fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);1933fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);1934}19351936/* VN */1937instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);1938instruction[1] = (sljit_u16)(data2_ind << 12);1939instruction[2] = (sljit_u16)((0xe << 8) | 0x68);1940sljit_emit_op_custom(compiler, instruction, 6);19411942/* VFENE */1943instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);1944instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));1945instruction[2] = (sljit_u16)((0xe << 8) | 0x81);1946sljit_emit_op_custom(compiler, instruction, 6);19471948sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);1949JUMPTO(SLJIT_OVERFLOW, start);19501951/* VLGVB */1952instruction[0] = (sljit_u16)(0xe700 | (tmp2_reg_ind << 4) | data1_ind);1953instruction[1] = 7;1954instruction[2] = (sljit_u16)((0x4 << 8) | 
0x21);1955sljit_emit_op_custom(compiler, instruction, 6);19561957OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);19581959JUMPHERE(quit);19601961add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));19621963#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321964if (common->utf)1965{1966SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);19671968OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));19691970quit = jump_if_utf_char_start(compiler, TMP1);19711972OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));1973add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));19741975/* TMP1 contains diff. */1976OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);1977OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);1978JUMPTO(SLJIT_JUMP, restart);19791980JUMPHERE(quit);1981}1982#endif19831984OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));19851986if (common->match_end_ptr != 0)1987OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);1988}19891990#endif /* SLJIT_CONFIG_S390X */19911992#if (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64)19931994#ifdef __linux__1995/* Using getauxval(AT_HWCAP) under Linux for detecting whether LSX is available */1996#include <sys/auxv.h>1997#define LOONGARCH_HWCAP_LSX (1 << 4)1998#define HAS_LSX_SUPPORT ((getauxval(AT_HWCAP) & LOONGARCH_HWCAP_LSX) != 0)1999#else2000#define HAS_LSX_SUPPORT 02001#endif20022003typedef sljit_ins sljit_u32;20042005#define SI12_IMM_MASK 0x003ffc002006#define UI5_IMM_MASK 0x00007c002007#define UI2_IMM_MASK 0x00000c0020082009#define VD(vd) ((sljit_ins)vd << 0)2010#define VJ(vj) ((sljit_ins)vj << 5)2011#define VK(vk) ((sljit_ins)vk << 10)2012#define RD_V(rd) ((sljit_ins)rd << 0)2013#define RJ_V(rj) ((sljit_ins)rj << 5)20142015#define IMM_SI12(imm) (((sljit_ins)(imm) << 10) & SI12_IMM_MASK)2016#define IMM_UI5(imm) (((sljit_ins)(imm) << 10) & UI5_IMM_MASK)2017#define IMM_UI2(imm) (((sljit_ins)(imm) 
<< 10) & UI2_IMM_MASK)20182019// LSX OPCODES:2020#define VBSLL_V 0x728e00002021#define VMSKLTZ_B 0x729c40002022#define VPICKVE2GR_WU 0x72f3e00020232024#if PCRE2_CODE_UNIT_WIDTH == 82025#define VREPLGR2VR_X 0x729f00002026#define VSEQ 0x700000002027#elif PCRE2_CODE_UNIT_WIDTH == 162028#define VREPLGR2VR_X 0x729f04002029#define VSEQ 0x700080002030#else2031#define VREPLGR2VR_X 0x729f08002032#define VSEQ 0x700100002033#endif20342035static void fast_forward_char_pair_lsx_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,2036sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)2037{2038if (compare_type != vector_compare_match2)2039{2040if (compare_type == vector_compare_match1i)2041{2042/* VOR.V vd, vj, vk */2043push_inst(compiler, VOR_V | VD(dst_ind) | VJ(cmp2_ind) | VK(dst_ind));2044}20452046/* VSEQ.B/H/W vd, vj, vk */2047push_inst(compiler, VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind));2048return;2049}20502051/* VBSLL.V vd, vj, ui5 */2052push_inst(compiler, VBSLL_V | VD(tmp_ind) | VJ(dst_ind) | IMM_UI5(0));20532054/* VSEQ.B/H/W vd, vj, vk */2055push_inst(compiler, VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind));20562057/* VSEQ.B/H/W vd, vj, vk */2058push_inst(compiler, VSEQ | VD(tmp_ind) | VJ(tmp_ind) | VK(cmp2_ind));20592060/* VOR vd, vj, vk */2061push_inst(compiler, VOR_V | VD(dst_ind) | VJ(tmp_ind) | VK(dst_ind));2062return;2063}20642065#define JIT_HAS_FAST_FORWARD_CHAR_SIMD HAS_LSX_SUPPORT20662067static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)2068{2069DEFINE_COMPILER;2070struct sljit_label *start;2071#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 322072struct sljit_label *restart;2073#endif2074struct sljit_jump *quit;2075struct sljit_jump *partial_quit[2];2076vector_compare_type compare_type = vector_compare_match1;2077sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);2078sljit_s32 str_ptr_reg_ind = 
sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);2079sljit_s32 data_ind = 0;2080sljit_s32 tmp_ind = 1;2081sljit_s32 cmp1_ind = 2;2082sljit_s32 cmp2_ind = 3;2083sljit_u32 bit = 0;20842085SLJIT_UNUSED_ARG(offset);20862087if (char1 != char2)2088{2089bit = char1 ^ char2;2090compare_type = vector_compare_match1i;20912092if (!is_powerof2(bit))2093{2094bit = 0;2095compare_type = vector_compare_match2;2096}2097}20982099partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);2100if (common->mode == PCRE2_JIT_COMPLETE)2101add_jump(compiler, &common->failed_match, partial_quit[0]);21022103/* First part (unaligned start) */21042105OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);21062107/* VREPLGR2VR.B/H/W vd, rj */2108push_inst(compiler, VREPLGR2VR_X | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));21092110if (char1 != char2)2111{2112OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);21132114/* VREPLGR2VR.B/H/W vd, rj */2115push_inst(compiler, VREPLGR2VR_X | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));2116}21172118OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);21192120#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 322121restart = LABEL();2122#endif21232124OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);2125OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);21262127/* VLD vd, rj, si12 */2128push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));2129fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);21302131/* VMSKLTZ.B vd, vj */2132push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));21332134/* VPICKVE2GR.WU rd, vj, ui2 */2135push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));21362137OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);2138OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);21392140quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);21412142OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);21432144/* Second part (aligned) */2145start = LABEL();21462147OP2(SLJIT_ADD, 
STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);21482149partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);2150if (common->mode == PCRE2_JIT_COMPLETE)2151add_jump(compiler, &common->failed_match, partial_quit[1]);21522153/* VLD vd, rj, si12 */2154push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));2155fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);21562157/* VMSKLTZ.B vd, vj */2158push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));21592160/* VPICKVE2GR.WU rd, vj, ui2 */2161push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));21622163CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);21642165JUMPHERE(quit);21662167/* CTZ.W rd, rj */2168push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));21692170OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);21712172if (common->mode != PCRE2_JIT_COMPLETE)2173{2174JUMPHERE(partial_quit[0]);2175JUMPHERE(partial_quit[1]);2176OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);2177SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);2178}2179else2180add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));21812182#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 322183if (common->utf && offset > 0)2184{2185SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);21862187OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));21882189quit = jump_if_utf_char_start(compiler, TMP1);21902191OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));2192add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));2193OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);2194JUMPTO(SLJIT_JUMP, restart);21952196JUMPHERE(quit);2197}2198#endif2199}22002201#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD HAS_LSX_SUPPORT22022203static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR 
char2)
{
/* Emits an LSX scan for char1 or char2 which starts at the address held in
TMP1 and reads the subject in 16 byte aligned blocks.

STR_PTR is saved in TMP3, used as the scan pointer, and restored on the found
path emitted here; on that path TMP1 holds the address of the hit. The
returned jump list collects the jumps taken when the start address is already
at or after STR_END, when the aligned scan reaches STR_END, and when the hit
lies at or after STR_END. */
DEFINE_COMPILER;
struct sljit_label *start;
struct sljit_jump *quit;
jump_list *not_found = NULL;
vector_compare_type compare_type = vector_compare_match1;
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data_ind = 0;
sljit_s32 tmp_ind = 1;
sljit_s32 cmp1_ind = 2;
sljit_s32 cmp2_ind = 3;
sljit_u32 bit = 0;

/* Characters differing in one bit are matched by forcing that bit on
(match1i); any other pair needs two comparisons (match2). */
if (char1 != char2)
  {
  bit = char1 ^ char2;
  compare_type = vector_compare_match1i;

  if (!is_powerof2(bit))
    {
    bit = 0;
    compare_type = vector_compare_match2;
    }
  }

add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);

/* First part (unaligned start) */

OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);

/* VREPLGR2VR.B/H/W vd, rj */
push_inst(compiler, VREPLGR2VR_X | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));

if (char1 != char2)
  {
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);

  /* VREPLGR2VR.B/H/W vd, rj */
  push_inst(compiler, VREPLGR2VR_X | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));
  }

/* STR_PTR = 16 byte block containing the start address, TMP2 = offset of the
start address inside that block. */
OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));

/* Shift out the mask bits which belong to positions before the start. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Second part (aligned) */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);

add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(quit);

/* CTZ.W rd, rj */
push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));

/* TMP1 = address of the hit; it must still lie before STR_END. */
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));

OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
return not_found;
}

#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 
HAS_LSX_SUPPORT23012302static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,2303PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)2304{2305DEFINE_COMPILER;2306vector_compare_type compare1_type = vector_compare_match1;2307vector_compare_type compare2_type = vector_compare_match1;2308sljit_u32 bit1 = 0;2309sljit_u32 bit2 = 0;2310sljit_u32 diff = IN_UCHARS(offs1 - offs2);2311sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);2312sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);2313sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);2314sljit_s32 data1_ind = 0;2315sljit_s32 data2_ind = 1;2316sljit_s32 tmp1_ind = 2;2317sljit_s32 tmp2_ind = 3;2318sljit_s32 cmp1a_ind = 4;2319sljit_s32 cmp1b_ind = 5;2320sljit_s32 cmp2a_ind = 6;2321sljit_s32 cmp2b_ind = 7;2322struct sljit_label *start;2323#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 322324struct sljit_label *restart;2325#endif2326struct sljit_jump *jump[2];23272328SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);2329SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));23302331/* Initialize. 
*/2332if (common->match_end_ptr != 0)2333{2334OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);2335OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));2336OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);23372338OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);2339SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);2340}23412342OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));2343add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));23442345if (char1a == char1b)2346OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);2347else2348{2349bit1 = char1a ^ char1b;2350if (is_powerof2(bit1))2351{2352compare1_type = vector_compare_match1i;2353OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a | bit1);2354OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit1);2355}2356else2357{2358compare1_type = vector_compare_match2;2359bit1 = 0;2360OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);2361OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char1b);2362}2363}23642365/* VREPLGR2VR.B/H/W vd, rj */2366push_inst(compiler, VREPLGR2VR_X | VD(cmp1a_ind) | RJ_V(tmp1_reg_ind));23672368if (char1a != char1b)2369{2370/* VREPLGR2VR.B/H/W vd, rj */2371push_inst(compiler, VREPLGR2VR_X | VD(cmp1b_ind) | RJ_V(tmp2_reg_ind));2372}23732374if (char2a == char2b)2375OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);2376else2377{2378bit2 = char2a ^ char2b;2379if (is_powerof2(bit2))2380{2381compare2_type = vector_compare_match1i;2382OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a | bit2);2383OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit2);2384}2385else2386{2387compare2_type = vector_compare_match2;2388bit2 = 0;2389OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);2390OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char2b);2391}2392}23932394/* VREPLGR2VR.B/H/W vd, rj */2395push_inst(compiler, VREPLGR2VR_X | VD(cmp2a_ind) | RJ_V(tmp1_reg_ind));23962397if (char2a != char2b)2398{2399/* VREPLGR2VR.B/H/W vd, rj */2400push_inst(compiler, VREPLGR2VR_X | VD(cmp2b_ind) | RJ_V(tmp2_reg_ind));2401}24022403#if defined SUPPORT_UNICODE 
&& PCRE2_CODE_UNIT_WIDTH != 322404restart = LABEL();2405#endif24062407OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);2408OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);2409OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);2410OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);24112412/* VLD vd, rj, si12 */2413push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));24142415jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);24162417/* VLD vd, rj, si12 */2418push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));2419jump[1] = JUMP(SLJIT_JUMP);24202421JUMPHERE(jump[0]);24222423/* VBSLL.V vd, vj, ui5 */2424push_inst(compiler, VBSLL_V | VD(data2_ind) | VJ(data1_ind) | IMM_UI5(diff));24252426JUMPHERE(jump[1]);24272428fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);2429fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);24302431/* VAND vd, vj, vk */2432push_inst(compiler, VOR_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));24332434/* VMSKLTZ.B vd, vj */2435push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));24362437/* VPICKVE2GR.WU rd, vj, ui2 */2438push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));24392440/* Ignore matches before the first STR_PTR. */2441OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);2442OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);24432444jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);24452446OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);24472448/* Main loop. 
*/2449start = LABEL();24502451OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);2452add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));24532454/* VLD vd, rj, si12 */2455push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));2456push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));24572458fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);2459fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);24602461/* VAND.V vd, vj, vk */2462push_inst(compiler, VAND_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));24632464/* VMSKLTZ.B vd, vj */2465push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));24662467/* VPICKVE2GR.WU rd, vj, ui2 */2468push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));24692470CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);24712472JUMPHERE(jump[0]);24732474/* CTZ.W rd, rj */2475push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));24762477OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);24782479add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));24802481#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 322482if (common->utf)2483{2484OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));24852486jump[0] = jump_if_utf_char_start(compiler, TMP1);24872488OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));2489CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);24902491add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));24922493JUMPHERE(jump[0]);2494}2495#endif24962497OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));24982499if (common->match_end_ptr != 0)2500OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);2501}25022503#endif /* SLJIT_CONFIG_LOONGARCH_64 */25042505#endif /* !SUPPORT_VALGRIND */250625072508