Path: blob/master/thirdparty/pcre2/src/pcre2_jit_simd_inc.h
9898 views
/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8This module by Zoltan Herczeg9Original API code Copyright (c) 1997-2012 University of Cambridge10New API code Copyright (c) 2016-2019 University of Cambridge1112-----------------------------------------------------------------------------13Redistribution and use in source and binary forms, with or without14modification, are permitted provided that the following conditions are met:1516* Redistributions of source code must retain the above copyright notice,17this list of conditions and the following disclaimer.1819* Redistributions in binary form must reproduce the above copyright20notice, this list of conditions and the following disclaimer in the21documentation and/or other materials provided with the distribution.2223* Neither the name of the University of Cambridge nor the names of its24contributors may be used to endorse or promote products derived from25this software without specific prior written permission.2627THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE30ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE37POSSIBILITY OF SUCH DAMAGE.38-----------------------------------------------------------------------------39*/4041#if !(defined SUPPORT_VALGRIND)4243#if ((defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \44|| (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) \45|| (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64))4647typedef enum {48vector_compare_match1,49vector_compare_match1i,50vector_compare_match2,51} vector_compare_type;5253#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)54static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)55{56#if PCRE2_CODE_UNIT_WIDTH == 857/* The AVX2 code path is currently disabled. */58/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 31 : 15; */59return 15;60#elif PCRE2_CODE_UNIT_WIDTH == 1661/* The AVX2 code path is currently disabled. */62/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 15 : 7; */63return 7;64#elif PCRE2_CODE_UNIT_WIDTH == 3265/* The AVX2 code path is currently disabled. */66/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 
7 : 3; */67return 3;68#else69#error "Unsupported unit width"70#endif71}72#else /* !SLJIT_CONFIG_X86 */73static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)74{75#if PCRE2_CODE_UNIT_WIDTH == 876return 15;77#elif PCRE2_CODE_UNIT_WIDTH == 1678return 7;79#elif PCRE2_CODE_UNIT_WIDTH == 3280return 3;81#else82#error "Unsupported unit width"83#endif84}85#endif /* SLJIT_CONFIG_X86 */8687#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 3288static struct sljit_jump *jump_if_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg)89{90#if PCRE2_CODE_UNIT_WIDTH == 891OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xc0);92return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0x80);93#elif PCRE2_CODE_UNIT_WIDTH == 1694OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xfc00);95return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0xdc00);96#else97#error "Unknown code width"98#endif99}100#endif101102#endif /* SLJIT_CONFIG_X86 || SLJIT_CONFIG_S390X */103104#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)105106static sljit_s32 character_to_int32(PCRE2_UCHAR chr)107{108sljit_u32 value = chr;109#if PCRE2_CODE_UNIT_WIDTH == 8110#define SIMD_COMPARE_TYPE_INDEX 0111return (sljit_s32)((value << 24) | (value << 16) | (value << 8) | value);112#elif PCRE2_CODE_UNIT_WIDTH == 16113#define SIMD_COMPARE_TYPE_INDEX 1114return (sljit_s32)((value << 16) | value);115#elif PCRE2_CODE_UNIT_WIDTH == 32116#define SIMD_COMPARE_TYPE_INDEX 2117return (sljit_s32)(value);118#else119#error "Unsupported unit width"120#endif121}122123static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,124sljit_s32 reg_type, int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)125{126sljit_u8 instruction[4];127128if (reg_type == SLJIT_SIMD_REG_128)129{130instruction[0] = 0x66;131instruction[1] = 0x0f;132}133else134{135/* Two byte VEX prefix. 
*/136instruction[0] = 0xc5;137instruction[1] = 0xfd;138}139140SLJIT_ASSERT(step >= 0 && step <= 3);141142if (compare_type != vector_compare_match2)143{144if (step == 0)145{146if (compare_type == vector_compare_match1i)147{148/* POR xmm1, xmm2/m128 */149if (reg_type == SLJIT_SIMD_REG_256)150instruction[1] ^= (dst_ind << 3);151152/* Prefix is filled. */153instruction[2] = 0xeb;154instruction[3] = 0xc0 | (dst_ind << 3) | cmp2_ind;155sljit_emit_op_custom(compiler, instruction, 4);156}157return;158}159160if (step != 2)161return;162163/* PCMPEQB/W/D xmm1, xmm2/m128 */164if (reg_type == SLJIT_SIMD_REG_256)165instruction[1] ^= (dst_ind << 3);166167/* Prefix is filled. */168instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;169instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;170sljit_emit_op_custom(compiler, instruction, 4);171return;172}173174if (reg_type == SLJIT_SIMD_REG_256)175{176if (step == 2)177return;178179if (step == 0)180{181step = 2;182instruction[1] ^= (dst_ind << 3);183}184}185186switch (step)187{188case 0:189SLJIT_ASSERT(reg_type == SLJIT_SIMD_REG_128);190191/* MOVDQA xmm1, xmm2/m128 */192/* Prefix is filled. */193instruction[2] = 0x6f;194instruction[3] = 0xc0 | (tmp_ind << 3) | dst_ind;195sljit_emit_op_custom(compiler, instruction, 4);196return;197198case 1:199/* PCMPEQB/W/D xmm1, xmm2/m128 */200if (reg_type == SLJIT_SIMD_REG_256)201instruction[1] ^= (dst_ind << 3);202203/* Prefix is filled. */204instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;205instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;206sljit_emit_op_custom(compiler, instruction, 4);207return;208209case 2:210/* PCMPEQB/W/D xmm1, xmm2/m128 */211/* Prefix is filled. */212instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;213instruction[3] = 0xc0 | (tmp_ind << 3) | cmp2_ind;214sljit_emit_op_custom(compiler, instruction, 4);215return;216217case 3:218/* POR xmm1, xmm2/m128 */219if (reg_type == SLJIT_SIMD_REG_256)220instruction[1] ^= (dst_ind << 3);221222/* Prefix is filled. 
*/223instruction[2] = 0xeb;224instruction[3] = 0xc0 | (dst_ind << 3) | tmp_ind;225sljit_emit_op_custom(compiler, instruction, 4);226return;227}228}229230#define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))231232static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)233{234DEFINE_COMPILER;235sljit_u8 instruction[8];236/* The AVX2 code path is currently disabled. */237/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */238sljit_s32 reg_type = SLJIT_SIMD_REG_128;239sljit_s32 value;240struct sljit_label *start;241#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32242struct sljit_label *restart;243#endif244struct sljit_jump *quit;245struct sljit_jump *partial_quit[2];246vector_compare_type compare_type = vector_compare_match1;247sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);248sljit_s32 data_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR0);249sljit_s32 cmp1_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR1);250sljit_s32 cmp2_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR2);251sljit_s32 tmp_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR3);252sljit_u32 bit = 0;253int i;254255SLJIT_UNUSED_ARG(offset);256257if (char1 != char2)258{259bit = char1 ^ char2;260compare_type = vector_compare_match1i;261262if (!is_powerof2(bit))263{264bit = 0;265compare_type = vector_compare_match2;266}267}268269partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);270if (common->mode == PCRE2_JIT_COMPLETE)271add_jump(compiler, &common->failed_match, partial_quit[0]);272273/* First part (unaligned start) */274value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;275sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));276277if (char1 != char2)278sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR2, 0, 
SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));279280OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);281282sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR1, SLJIT_VR1, 0);283284if (char1 != char2)285sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR2, SLJIT_VR2, 0);286287#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32288restart = LABEL();289#endif290291value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;292OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);293OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);294295value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;296sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);297298for (i = 0; i < 4; i++)299fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);300301sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);302OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);303OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);304305quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);306307OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);308309/* Second part (aligned) */310start = LABEL();311312value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;313OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);314315partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);316if (common->mode == PCRE2_JIT_COMPLETE)317add_jump(compiler, &common->failed_match, partial_quit[1]);318319value = (reg_type == SLJIT_SIMD_REG_256) ? 
SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;320sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);321for (i = 0; i < 4; i++)322fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);323324sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);325CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);326327JUMPHERE(quit);328329SLJIT_ASSERT(tmp1_reg_ind < 8);330/* BSF r32, r/m32 */331instruction[0] = 0x0f;332instruction[1] = 0xbc;333instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;334sljit_emit_op_custom(compiler, instruction, 3);335336OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);337338if (common->mode != PCRE2_JIT_COMPLETE)339{340JUMPHERE(partial_quit[0]);341JUMPHERE(partial_quit[1]);342OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);343SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);344}345else346add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));347348#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32349if (common->utf && offset > 0)350{351SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);352353OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));354355quit = jump_if_utf_char_start(compiler, TMP1);356357OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));358add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));359OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);360JUMPTO(SLJIT_JUMP, restart);361362JUMPHERE(quit);363}364#endif365}366367#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))368369static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)370{371DEFINE_COMPILER;372sljit_u8 instruction[8];373/* The AVX2 code path is currently disabled. */374/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 
SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */375sljit_s32 reg_type = SLJIT_SIMD_REG_128;376sljit_s32 value;377struct sljit_label *start;378struct sljit_jump *quit;379jump_list *not_found = NULL;380vector_compare_type compare_type = vector_compare_match1;381sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);382sljit_s32 data_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR0);383sljit_s32 cmp1_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR1);384sljit_s32 cmp2_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR2);385sljit_s32 tmp_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR3);386sljit_u32 bit = 0;387int i;388389if (char1 != char2)390{391bit = char1 ^ char2;392compare_type = vector_compare_match1i;393394if (!is_powerof2(bit))395{396bit = 0;397compare_type = vector_compare_match2;398}399}400401add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));402OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);403OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);404405/* First part (unaligned start) */406407value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;408sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));409410if (char1 != char2)411sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));412413OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);414415sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR1, SLJIT_VR1, 0);416417if (char1 != char2)418sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR2, SLJIT_VR2, 0);419420value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;421OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);422OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);423424value = (reg_type == SLJIT_SIMD_REG_256) ? 
SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;425sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);426427for (i = 0; i < 4; i++)428fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);429430sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);431OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);432OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);433434quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);435436OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);437438/* Second part (aligned) */439start = LABEL();440441value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;442OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);443444add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));445446value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;447sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);448449for (i = 0; i < 4; i++)450fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);451452sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);453CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);454455JUMPHERE(quit);456457SLJIT_ASSERT(tmp1_reg_ind < 8);458/* BSF r32, r/m32 */459instruction[0] = 0x0f;460instruction[1] = 0xbc;461instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;462sljit_emit_op_custom(compiler, instruction, 3);463464OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);465add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));466467OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);468return not_found;469}470471#ifndef _WIN64472473#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))474475static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,476PCRE2_UCHAR 
char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)477{478DEFINE_COMPILER;479sljit_u8 instruction[8];480/* The AVX2 code path is currently disabled. */481/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */482sljit_s32 reg_type = SLJIT_SIMD_REG_128;483sljit_s32 value;484vector_compare_type compare1_type = vector_compare_match1;485vector_compare_type compare2_type = vector_compare_match1;486sljit_u32 bit1 = 0;487sljit_u32 bit2 = 0;488sljit_u32 diff = IN_UCHARS(offs1 - offs2);489sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);490sljit_s32 data1_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR0);491sljit_s32 data2_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR1);492sljit_s32 cmp1a_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR2);493sljit_s32 cmp2a_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR3);494sljit_s32 cmp1b_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR4);495sljit_s32 cmp2b_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR5);496sljit_s32 tmp1_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR6);497sljit_s32 tmp2_ind = sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_TMP_DEST_VREG);498struct sljit_label *start;499#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32500struct sljit_label *restart;501#endif502struct sljit_jump *jump[2];503int i;504505SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2 && offs2 >= 0);506SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));507508/* Initialize. 
*/509if (common->match_end_ptr != 0)510{511OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);512OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);513OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));514515OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);516SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);517}518519OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));520add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));521522if (char1a == char1b)523OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));524else525{526bit1 = char1a ^ char1b;527if (is_powerof2(bit1))528{529compare1_type = vector_compare_match1i;530OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a | bit1));531OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit1));532}533else534{535compare1_type = vector_compare_match2;536bit1 = 0;537OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));538OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char1b));539}540}541542value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;543sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR2, 0, TMP1, 0);544545if (char1a != char1b)546sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR4, 0, TMP2, 0);547548if (char2a == char2b)549OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));550else551{552bit2 = char2a ^ char2b;553if (is_powerof2(bit2))554{555compare2_type = vector_compare_match1i;556OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a | bit2));557OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit2));558}559else560{561compare2_type = vector_compare_match2;562bit2 = 0;563OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));564OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char2b));565}566}567568sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR3, 0, TMP1, 0);569570if (char2a != char2b)571sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR5, 0, TMP2, 
0);572573sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR2, SLJIT_VR2, 0);574if (char1a != char1b)575sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR4, SLJIT_VR4, 0);576577sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR3, SLJIT_VR3, 0);578if (char2a != char2b)579sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR5, SLJIT_VR5, 0);580581#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32582restart = LABEL();583#endif584585OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);586OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);587value = (reg_type == SLJIT_SIMD_REG_256) ? ~0x1f : ~0xf;588OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);589590value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;591sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);592593jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);594595sljit_emit_simd_mov(compiler, reg_type, SLJIT_VR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);596jump[1] = JUMP(SLJIT_JUMP);597598JUMPHERE(jump[0]);599600if (reg_type == SLJIT_SIMD_REG_256)601{602if (diff != 16)603{604/* PSLLDQ ymm1, ymm2, imm8 */605instruction[0] = 0xc5;606instruction[1] = (sljit_u8)(0xf9 ^ (data2_ind << 3));607instruction[2] = 0x73;608instruction[3] = 0xc0 | (7 << 3) | data1_ind;609instruction[4] = diff & 0xf;610sljit_emit_op_custom(compiler, instruction, 5);611}612613instruction[0] = 0xc4;614instruction[1] = 0xe3;615if (diff < 16)616{617/* VINSERTI128 xmm1, xmm2, xmm3/m128 */618/* instruction[0] = 0xc4; */619/* instruction[1] = 0xe3; */620instruction[2] = (sljit_u8)(0x7d ^ (data2_ind << 3));621instruction[3] = 0x38;622SLJIT_ASSERT(sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR) <= 7);623instruction[4] = 0x40 | (data2_ind << 3) | sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);624instruction[5] = (sljit_u8)(16 - 
diff);625instruction[6] = 1;626sljit_emit_op_custom(compiler, instruction, 7);627}628else629{630/* VPERM2I128 xmm1, xmm2, xmm3/m128 */631/* instruction[0] = 0xc4; */632/* instruction[1] = 0xe3; */633value = (diff == 16) ? data1_ind : data2_ind;634instruction[2] = (sljit_u8)(0x7d ^ (value << 3));635instruction[3] = 0x46;636instruction[4] = 0xc0 | (data2_ind << 3) | value;637instruction[5] = 0x08;638sljit_emit_op_custom(compiler, instruction, 6);639}640}641else642{643/* MOVDQA xmm1, xmm2/m128 */644instruction[0] = 0x66;645instruction[1] = 0x0f;646instruction[2] = 0x6f;647instruction[3] = 0xc0 | (data2_ind << 3) | data1_ind;648sljit_emit_op_custom(compiler, instruction, 4);649650/* PSLLDQ xmm1, imm8 */651/* instruction[0] = 0x66; */652/* instruction[1] = 0x0f; */653instruction[2] = 0x73;654instruction[3] = 0xc0 | (7 << 3) | data2_ind;655instruction[4] = diff;656sljit_emit_op_custom(compiler, instruction, 5);657}658659JUMPHERE(jump[1]);660661value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;662OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);663664for (i = 0; i < 4; i++)665{666fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);667fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);668}669670sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_VR0, SLJIT_VR0, SLJIT_VR1, 0);671sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);672673/* Ignore matches before the first STR_PTR. */674OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);675OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);676677jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);678679OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);680681/* Main loop. */682start = LABEL();683684value = (reg_type == SLJIT_SIMD_REG_256) ? 
32 : 16;685OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);686add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));687688value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;689sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);690sljit_emit_simd_mov(compiler, reg_type, SLJIT_VR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);691692for (i = 0; i < 4; i++)693{694fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);695fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);696}697698sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_VR0, SLJIT_VR0, SLJIT_VR1, 0);699sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);700701CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);702703JUMPHERE(jump[0]);704705SLJIT_ASSERT(tmp1_reg_ind < 8);706/* BSF r32, r/m32 */707instruction[0] = 0x0f;708instruction[1] = 0xbc;709instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;710sljit_emit_op_custom(compiler, instruction, 3);711712OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);713714add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));715716#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32717if (common->utf)718{719OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));720721jump[0] = jump_if_utf_char_start(compiler, TMP1);722723OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));724CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);725726add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));727728JUMPHERE(jump[0]);729}730#endif731732OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));733734if (common->match_end_ptr != 0)735OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);736}737738#endif /* !_WIN64 
*/739740#undef SIMD_COMPARE_TYPE_INDEX741742#endif /* SLJIT_CONFIG_X86 */743744#if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 && (defined __ARM_NEON || defined __ARM_NEON__))745746#include <arm_neon.h>747748typedef union {749unsigned int x;750struct { unsigned char c1, c2, c3, c4; } c;751} int_char;752753#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32754static SLJIT_INLINE int utf_continue(PCRE2_SPTR s)755{756#if PCRE2_CODE_UNIT_WIDTH == 8757return (*s & 0xc0) == 0x80;758#elif PCRE2_CODE_UNIT_WIDTH == 16759return (*s & 0xfc00) == 0xdc00;760#else761#error "Unknown code width"762#endif763}764#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */765766#if PCRE2_CODE_UNIT_WIDTH == 8767# define VECTOR_FACTOR 16768# define vect_t uint8x16_t769# define VLD1Q(X) vld1q_u8((sljit_u8 *)(X))770# define VCEQQ vceqq_u8771# define VORRQ vorrq_u8772# define VST1Q vst1q_u8773# define VDUPQ vdupq_n_u8774# define VEXTQ vextq_u8775# define VANDQ vandq_u8776typedef union {777uint8_t mem[16];778uint64_t dw[2];779} quad_word;780#elif PCRE2_CODE_UNIT_WIDTH == 16781# define VECTOR_FACTOR 8782# define vect_t uint16x8_t783# define VLD1Q(X) vld1q_u16((sljit_u16 *)(X))784# define VCEQQ vceqq_u16785# define VORRQ vorrq_u16786# define VST1Q vst1q_u16787# define VDUPQ vdupq_n_u16788# define VEXTQ vextq_u16789# define VANDQ vandq_u16790typedef union {791uint16_t mem[8];792uint64_t dw[2];793} quad_word;794#else795# define VECTOR_FACTOR 4796# define vect_t uint32x4_t797# define VLD1Q(X) vld1q_u32((sljit_u32 *)(X))798# define VCEQQ vceqq_u32799# define VORRQ vorrq_u32800# define VST1Q vst1q_u32801# define VDUPQ vdupq_n_u32802# define VEXTQ vextq_u32803# define VANDQ vandq_u32804typedef union {805uint32_t mem[4];806uint64_t dw[2];807} quad_word;808#endif809810#define FFCS811#include "pcre2_jit_neon_inc.h"812#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32813# define FF_UTF814# include "pcre2_jit_neon_inc.h"815# undef FF_UTF816#endif817#undef FFCS818819#define 
FFCS_2820#include "pcre2_jit_neon_inc.h"821#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32822# define FF_UTF823# include "pcre2_jit_neon_inc.h"824# undef FF_UTF825#endif826#undef FFCS_2827828#define FFCS_MASK829#include "pcre2_jit_neon_inc.h"830#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32831# define FF_UTF832# include "pcre2_jit_neon_inc.h"833# undef FF_UTF834#endif835#undef FFCS_MASK836837#define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1838839static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)840{841DEFINE_COMPILER;842int_char ic;843struct sljit_jump *partial_quit, *quit;844/* Save temporary registers. */845SLJIT_ASSERT(common->locals_size >= 2 * (int)sizeof(sljit_sw));846OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL0, STR_PTR, 0);847OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL1, TMP3, 0);848849/* Prepare function arguments */850OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);851GET_LOCAL_BASE(SLJIT_R1, 0, LOCAL0);852OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, offset);853854if (char1 == char2)855{856ic.c.c1 = char1;857ic.c.c2 = char2;858OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);859860#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32861if (common->utf && offset > 0)862sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),863SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_utf));864else865sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),866SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));867#else868sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),869SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));870#endif871}872else873{874PCRE2_UCHAR mask = char1 ^ char2;875if (is_powerof2(mask))876{877ic.c.c1 = char1 | mask;878ic.c.c2 = mask;879OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);880881#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32882if (common->utf && offset > 0)883sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),884SLJIT_IMM, 
SLJIT_FUNC_ADDR(ffcs_mask_utf));885else886sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),887SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));888#else889sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),890SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));891#endif892}893else894{895ic.c.c1 = char1;896ic.c.c2 = char2;897OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);898899#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32900if (common->utf && offset > 0)901sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),902SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2_utf));903else904sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),905SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));906#else907sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),908SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));909#endif910}911}912/* Restore registers. */913OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCAL0);914OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(SLJIT_SP), LOCAL1);915916/* Check return value. */917partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);918if (common->mode == PCRE2_JIT_COMPLETE)919add_jump(compiler, &common->failed_match, partial_quit);920921/* Fast forward STR_PTR to the result of memchr. 
*/922OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);923if (common->mode != PCRE2_JIT_COMPLETE)924{925quit = CMP(SLJIT_NOT_ZERO, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);926JUMPHERE(partial_quit);927OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);928SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);929JUMPHERE(quit);930}931}932933typedef enum {934compare_match1,935compare_match1i,936compare_match2,937} compare_type;938939static inline vect_t fast_forward_char_pair_compare(compare_type ctype, vect_t dst, vect_t cmp1, vect_t cmp2)940{941if (ctype == compare_match2)942{943vect_t tmp = dst;944dst = VCEQQ(dst, cmp1);945tmp = VCEQQ(tmp, cmp2);946dst = VORRQ(dst, tmp);947return dst;948}949950if (ctype == compare_match1i)951dst = VORRQ(dst, cmp2);952dst = VCEQQ(dst, cmp1);953return dst;954}955956static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void)957{958#if PCRE2_CODE_UNIT_WIDTH == 8959return 15;960#elif PCRE2_CODE_UNIT_WIDTH == 16961return 7;962#elif PCRE2_CODE_UNIT_WIDTH == 32963return 3;964#else965#error "Unsupported unit width"966#endif967}968969/* ARM doesn't have a shift left across lanes. */970static SLJIT_INLINE vect_t shift_left_n_lanes(vect_t a, sljit_u8 n)971{972vect_t zero = VDUPQ(0);973SLJIT_ASSERT(0 < n && n < VECTOR_FACTOR);974/* VEXTQ takes an immediate as last argument. */975#define C(X) case X: return VEXTQ(zero, a, VECTOR_FACTOR - X);976switch (n)977{978C(1); C(2); C(3);979#if PCRE2_CODE_UNIT_WIDTH != 32980C(4); C(5); C(6); C(7);981# if PCRE2_CODE_UNIT_WIDTH != 16982C(8); C(9); C(10); C(11); C(12); C(13); C(14); C(15);983# endif984#endif985default:986/* Based on the ASSERT(0 < n && n < VECTOR_FACTOR) above, this won't987happen. The return is still here for compilers to not warn. 
*/988return a;989}990}991992#define FFCPS993#define FFCPS_DIFF1994#define FFCPS_CHAR1A2A995996#define FFCPS_0997#include "pcre2_jit_neon_inc.h"998#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32999# define FF_UTF1000# include "pcre2_jit_neon_inc.h"1001# undef FF_UTF1002#endif1003#undef FFCPS_010041005#undef FFCPS_CHAR1A2A10061007#define FFCPS_11008#include "pcre2_jit_neon_inc.h"1009#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321010# define FF_UTF1011# include "pcre2_jit_neon_inc.h"1012# undef FF_UTF1013#endif1014#undef FFCPS_110151016#undef FFCPS_DIFF110171018#define FFCPS_DEFAULT1019#include "pcre2_jit_neon_inc.h"1020#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321021# define FF_UTF1022# include "pcre2_jit_neon_inc.h"1023# undef FF_UTF1024#endif1025#undef FFCPS10261027#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 110281029static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,1030PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)1031{1032DEFINE_COMPILER;1033sljit_u32 diff = IN_UCHARS(offs1 - offs2);1034struct sljit_jump *partial_quit;1035int_char ic;1036SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);1037SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));1038SLJIT_ASSERT(compiler->scratches == 5);10391040/* Save temporary register STR_PTR. */1041OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL0, STR_PTR, 0);10421043/* Prepare arguments for the function call. 
*/1044if (common->match_end_ptr == 0)1045OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);1046else1047{1048OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);1049OP2(SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));10501051OP2U(SLJIT_SUB | SLJIT_SET_LESS, STR_END, 0, SLJIT_R0, 0);1052SELECT(SLJIT_LESS, SLJIT_R0, STR_END, 0, SLJIT_R0);1053}10541055GET_LOCAL_BASE(SLJIT_R1, 0, LOCAL0);1056OP1(SLJIT_MOV_S32, SLJIT_R2, 0, SLJIT_IMM, offs1);1057OP1(SLJIT_MOV_S32, SLJIT_R3, 0, SLJIT_IMM, offs2);1058ic.c.c1 = char1a;1059ic.c.c2 = char1b;1060ic.c.c3 = char2a;1061ic.c.c4 = char2b;1062OP1(SLJIT_MOV_U32, SLJIT_R4, 0, SLJIT_IMM, ic.x);10631064if (diff == 1) {1065if (char1a == char1b && char2a == char2b) {1066#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321067if (common->utf)1068sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),1069SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0_utf));1070else1071#endif1072sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),1073SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0));1074} else {1075#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321076if (common->utf)1077sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),1078SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1_utf));1079else1080#endif1081sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),1082SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1));1083}1084} else {1085#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321086if (common->utf)1087sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),1088SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default_utf));1089else1090#endif1091sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),1092SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default));1093}10941095/* Restore STR_PTR register. */1096OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCAL0);10971098/* Check return value. 
*/1099partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);1100add_jump(compiler, &common->failed_match, partial_quit);11011102/* Fast forward STR_PTR to the result of memchr. */1103OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);11041105JUMPHERE(partial_quit);1106}11071108#endif /* SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 */11091110#if (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)11111112#if PCRE2_CODE_UNIT_WIDTH == 81113#define VECTOR_ELEMENT_SIZE 01114#elif PCRE2_CODE_UNIT_WIDTH == 161115#define VECTOR_ELEMENT_SIZE 11116#elif PCRE2_CODE_UNIT_WIDTH == 321117#define VECTOR_ELEMENT_SIZE 21118#else1119#error "Unsupported unit width"1120#endif11211122static void load_from_mem_vector(struct sljit_compiler *compiler, BOOL vlbb, sljit_s32 dst_vreg,1123sljit_s32 base_reg, sljit_s32 index_reg)1124{1125sljit_u16 instruction[3];11261127instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | index_reg);1128instruction[1] = (sljit_u16)(base_reg << 12);1129instruction[2] = (sljit_u16)((0x8 << 8) | (vlbb ? 
0x07 : 0x06));11301131sljit_emit_op_custom(compiler, instruction, 6);1132}11331134#if PCRE2_CODE_UNIT_WIDTH == 3211351136static void replicate_imm_vector(struct sljit_compiler *compiler, int step, sljit_s32 dst_vreg,1137PCRE2_UCHAR chr, sljit_s32 tmp_general_reg)1138{1139sljit_u16 instruction[3];11401141SLJIT_ASSERT(step >= 0 && step <= 1);11421143if (chr < 0x7fff)1144{1145if (step == 1)1146return;11471148/* VREPI */1149instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4));1150instruction[1] = (sljit_u16)chr;1151instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);1152sljit_emit_op_custom(compiler, instruction, 6);1153return;1154}11551156if (step == 0)1157{1158OP1(SLJIT_MOV, tmp_general_reg, 0, SLJIT_IMM, chr);11591160/* VLVG */1161instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(SLJIT_GP_REGISTER, tmp_general_reg));1162instruction[1] = 0;1163instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x22);1164sljit_emit_op_custom(compiler, instruction, 6);1165return;1166}11671168/* VREP */1169instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | dst_vreg);1170instruction[1] = 0;1171instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xc << 8) | 0x4d);1172sljit_emit_op_custom(compiler, instruction, 6);1173}11741175#endif11761177static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,1178int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)1179{1180sljit_u16 instruction[3];11811182SLJIT_ASSERT(step >= 0 && step <= 2);11831184if (step == 1)1185{1186/* VCEQ */1187instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);1188instruction[1] = (sljit_u16)(cmp1_ind << 12);1189instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);1190sljit_emit_op_custom(compiler, instruction, 6);1191return;1192}11931194if (compare_type != vector_compare_match2)1195{1196if (step == 0 && 
compare_type == vector_compare_match1i)1197{1198/* VO */1199instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);1200instruction[1] = (sljit_u16)(cmp2_ind << 12);1201instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);1202sljit_emit_op_custom(compiler, instruction, 6);1203}1204return;1205}12061207switch (step)1208{1209case 0:1210/* VCEQ */1211instruction[0] = (sljit_u16)(0xe700 | (tmp_ind << 4) | dst_ind);1212instruction[1] = (sljit_u16)(cmp2_ind << 12);1213instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);1214sljit_emit_op_custom(compiler, instruction, 6);1215return;12161217case 2:1218/* VO */1219instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);1220instruction[1] = (sljit_u16)(tmp_ind << 12);1221instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);1222sljit_emit_op_custom(compiler, instruction, 6);1223return;1224}1225}12261227#define JIT_HAS_FAST_FORWARD_CHAR_SIMD 112281229static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)1230{1231DEFINE_COMPILER;1232sljit_u16 instruction[3];1233struct sljit_label *start;1234#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321235struct sljit_label *restart;1236#endif1237struct sljit_jump *quit;1238struct sljit_jump *partial_quit[2];1239vector_compare_type compare_type = vector_compare_match1;1240sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);1241sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);1242sljit_s32 data_ind = 0;1243sljit_s32 tmp_ind = 1;1244sljit_s32 cmp1_ind = 2;1245sljit_s32 cmp2_ind = 3;1246sljit_s32 zero_ind = 4;1247sljit_u32 bit = 0;1248int i;12491250SLJIT_UNUSED_ARG(offset);12511252if (char1 != char2)1253{1254bit = char1 ^ char2;1255compare_type = vector_compare_match1i;12561257if (!is_powerof2(bit))1258{1259bit = 0;1260compare_type = vector_compare_match2;1261}1262}12631264partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 
0);1265if (common->mode == PCRE2_JIT_COMPLETE)1266add_jump(compiler, &common->failed_match, partial_quit[0]);12671268/* First part (unaligned start) */12691270OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);12711272#if PCRE2_CODE_UNIT_WIDTH != 3212731274/* VREPI */1275instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));1276instruction[1] = (sljit_u16)(char1 | bit);1277instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);1278sljit_emit_op_custom(compiler, instruction, 6);12791280if (char1 != char2)1281{1282/* VREPI */1283instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));1284instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);1285/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */1286sljit_emit_op_custom(compiler, instruction, 6);1287}12881289#else /* PCRE2_CODE_UNIT_WIDTH == 32 */12901291for (int i = 0; i < 2; i++)1292{1293replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP1);12941295if (char1 != char2)1296replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? 
bit : char2, TMP1);1297}12981299#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */13001301if (compare_type == vector_compare_match2)1302{1303/* VREPI */1304instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));1305instruction[1] = 0;1306instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);1307sljit_emit_op_custom(compiler, instruction, 6);1308}13091310#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321311restart = LABEL();1312#endif13131314load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);1315OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);13161317if (compare_type != vector_compare_match2)1318{1319if (compare_type == vector_compare_match1i)1320fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);13211322/* VFEE */1323instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1324instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));1325instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);1326sljit_emit_op_custom(compiler, instruction, 6);1327}1328else1329{1330for (i = 0; i < 3; i++)1331fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);13321333/* VFENE */1334instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1335instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));1336instruction[2] = (sljit_u16)((0xe << 8) | 0x81);1337sljit_emit_op_custom(compiler, instruction, 6);1338}13391340/* VLGVB */1341instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);1342instruction[1] = 7;1343instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);1344sljit_emit_op_custom(compiler, instruction, 6);13451346OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);1347quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);13481349OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);13501351/* Second part (aligned) */1352start = LABEL();13531354OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);13551356partial_quit[1] = 
CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);1357if (common->mode == PCRE2_JIT_COMPLETE)1358add_jump(compiler, &common->failed_match, partial_quit[1]);13591360load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);13611362if (compare_type != vector_compare_match2)1363{1364if (compare_type == vector_compare_match1i)1365fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);13661367/* VFEE */1368instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1369instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));1370instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);1371sljit_emit_op_custom(compiler, instruction, 6);1372}1373else1374{1375for (i = 0; i < 3; i++)1376fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);13771378/* VFENE */1379instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1380instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));1381instruction[2] = (sljit_u16)((0xe << 8) | 0x81);1382sljit_emit_op_custom(compiler, instruction, 6);1383}13841385sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);1386JUMPTO(SLJIT_OVERFLOW, start);13871388/* VLGVB */1389instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);1390instruction[1] = 7;1391instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);1392sljit_emit_op_custom(compiler, instruction, 6);13931394OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);13951396JUMPHERE(quit);13971398if (common->mode != PCRE2_JIT_COMPLETE)1399{1400JUMPHERE(partial_quit[0]);1401JUMPHERE(partial_quit[1]);1402OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);1403SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);1404}1405else1406add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));14071408#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321409if (common->utf && offset > 
0)1410{1411SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);14121413OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));14141415quit = jump_if_utf_char_start(compiler, TMP1);14161417OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));1418add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));14191420OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);1421JUMPTO(SLJIT_JUMP, restart);14221423JUMPHERE(quit);1424}1425#endif1426}14271428#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD 114291430static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)1431{1432DEFINE_COMPILER;1433sljit_u16 instruction[3];1434struct sljit_label *start;1435struct sljit_jump *quit;1436jump_list *not_found = NULL;1437vector_compare_type compare_type = vector_compare_match1;1438sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);1439sljit_s32 tmp3_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP3);1440sljit_s32 data_ind = 0;1441sljit_s32 tmp_ind = 1;1442sljit_s32 cmp1_ind = 2;1443sljit_s32 cmp2_ind = 3;1444sljit_s32 zero_ind = 4;1445sljit_u32 bit = 0;1446int i;14471448if (char1 != char2)1449{1450bit = char1 ^ char2;1451compare_type = vector_compare_match1i;14521453if (!is_powerof2(bit))1454{1455bit = 0;1456compare_type = vector_compare_match2;1457}1458}14591460add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));14611462/* First part (unaligned start) */14631464OP2(SLJIT_ADD, TMP2, 0, TMP1, 0, SLJIT_IMM, 16);14651466#if PCRE2_CODE_UNIT_WIDTH != 3214671468/* VREPI */1469instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));1470instruction[1] = (sljit_u16)(char1 | bit);1471instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);1472sljit_emit_op_custom(compiler, instruction, 6);14731474if (char1 != char2)1475{1476/* VREPI */1477instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));1478instruction[1] = (sljit_u16)(bit != 
0 ? bit : char2);1479/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */1480sljit_emit_op_custom(compiler, instruction, 6);1481}14821483#else /* PCRE2_CODE_UNIT_WIDTH == 32 */14841485for (int i = 0; i < 2; i++)1486{1487replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP3);14881489if (char1 != char2)1490replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP3);1491}14921493#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */14941495if (compare_type == vector_compare_match2)1496{1497/* VREPI */1498instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));1499instruction[1] = 0;1500instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);1501sljit_emit_op_custom(compiler, instruction, 6);1502}15031504load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);1505OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);15061507if (compare_type != vector_compare_match2)1508{1509if (compare_type == vector_compare_match1i)1510fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);15111512/* VFEE */1513instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1514instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));1515instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);1516sljit_emit_op_custom(compiler, instruction, 6);1517}1518else1519{1520for (i = 0; i < 3; i++)1521fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);15221523/* VFENE */1524instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1525instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));1526instruction[2] = (sljit_u16)((0xe << 8) | 0x81);1527sljit_emit_op_custom(compiler, instruction, 6);1528}15291530/* VLGVB */1531instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);1532instruction[1] = 7;1533instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);1534sljit_emit_op_custom(compiler, instruction, 
6);15351536OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);1537quit = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);15381539OP2(SLJIT_SUB, TMP1, 0, TMP2, 0, SLJIT_IMM, 16);15401541/* Second part (aligned) */1542start = LABEL();15431544OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 16);15451546add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));15471548load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);15491550if (compare_type != vector_compare_match2)1551{1552if (compare_type == vector_compare_match1i)1553fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);15541555/* VFEE */1556instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1557instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));1558instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);1559sljit_emit_op_custom(compiler, instruction, 6);1560}1561else1562{1563for (i = 0; i < 3; i++)1564fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);15651566/* VFENE */1567instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);1568instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));1569instruction[2] = (sljit_u16)((0xe << 8) | 0x81);1570sljit_emit_op_custom(compiler, instruction, 6);1571}15721573sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);1574JUMPTO(SLJIT_OVERFLOW, start);15751576/* VLGVB */1577instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);1578instruction[1] = 7;1579instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);1580sljit_emit_op_custom(compiler, instruction, 6);15811582OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);15831584JUMPHERE(quit);1585add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));15861587return not_found;1588}15891590#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 115911592static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,1593PCRE2_UCHAR char1a, PCRE2_UCHAR 
char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)1594{1595DEFINE_COMPILER;1596sljit_u16 instruction[3];1597struct sljit_label *start;1598#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321599struct sljit_label *restart;1600#endif1601struct sljit_jump *quit;1602struct sljit_jump *jump[2];1603vector_compare_type compare1_type = vector_compare_match1;1604vector_compare_type compare2_type = vector_compare_match1;1605sljit_u32 bit1 = 0;1606sljit_u32 bit2 = 0;1607sljit_s32 diff = IN_UCHARS(offs2 - offs1);1608sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);1609sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);1610sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);1611sljit_s32 data1_ind = 0;1612sljit_s32 data2_ind = 1;1613sljit_s32 tmp1_ind = 2;1614sljit_s32 tmp2_ind = 3;1615sljit_s32 cmp1a_ind = 4;1616sljit_s32 cmp1b_ind = 5;1617sljit_s32 cmp2a_ind = 6;1618sljit_s32 cmp2b_ind = 7;1619sljit_s32 zero_ind = 8;1620int i;16211622SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);1623SLJIT_ASSERT(-diff <= (sljit_s32)IN_UCHARS(max_fast_forward_char_pair_offset()));1624SLJIT_ASSERT(tmp1_reg_ind != 0 && tmp2_reg_ind != 0);16251626if (char1a != char1b)1627{1628bit1 = char1a ^ char1b;1629compare1_type = vector_compare_match1i;16301631if (!is_powerof2(bit1))1632{1633bit1 = 0;1634compare1_type = vector_compare_match2;1635}1636}16371638if (char2a != char2b)1639{1640bit2 = char2a ^ char2b;1641compare2_type = vector_compare_match1i;16421643if (!is_powerof2(bit2))1644{1645bit2 = 0;1646compare2_type = vector_compare_match2;1647}1648}16491650/* Initialize. 
*/1651if (common->match_end_ptr != 0)1652{1653OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);1654OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);1655OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));16561657OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);1658SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);1659}16601661OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));1662add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));1663OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);16641665#if PCRE2_CODE_UNIT_WIDTH != 3216661667OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);16681669/* VREPI */1670instruction[0] = (sljit_u16)(0xe700 | (cmp1a_ind << 4));1671instruction[1] = (sljit_u16)(char1a | bit1);1672instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);1673sljit_emit_op_custom(compiler, instruction, 6);16741675if (char1a != char1b)1676{1677/* VREPI */1678instruction[0] = (sljit_u16)(0xe700 | (cmp1b_ind << 4));1679instruction[1] = (sljit_u16)(bit1 != 0 ? bit1 : char1b);1680/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */1681sljit_emit_op_custom(compiler, instruction, 6);1682}16831684/* VREPI */1685instruction[0] = (sljit_u16)(0xe700 | (cmp2a_ind << 4));1686instruction[1] = (sljit_u16)(char2a | bit2);1687/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */1688sljit_emit_op_custom(compiler, instruction, 6);16891690if (char2a != char2b)1691{1692/* VREPI */1693instruction[0] = (sljit_u16)(0xe700 | (cmp2b_ind << 4));1694instruction[1] = (sljit_u16)(bit2 != 0 ? 
bit2 : char2b);1695/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */1696sljit_emit_op_custom(compiler, instruction, 6);1697}16981699#else /* PCRE2_CODE_UNIT_WIDTH == 32 */17001701for (int i = 0; i < 2; i++)1702{1703replicate_imm_vector(compiler, i, cmp1a_ind, char1a | bit1, TMP1);17041705if (char1a != char1b)1706replicate_imm_vector(compiler, i, cmp1b_ind, bit1 != 0 ? bit1 : char1b, TMP1);17071708replicate_imm_vector(compiler, i, cmp2a_ind, char2a | bit2, TMP1);17091710if (char2a != char2b)1711replicate_imm_vector(compiler, i, cmp2b_ind, bit2 != 0 ? bit2 : char2b, TMP1);1712}17131714OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);17151716#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */17171718/* VREPI */1719instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));1720instruction[1] = 0;1721instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);1722sljit_emit_op_custom(compiler, instruction, 6);17231724#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321725restart = LABEL();1726#endif17271728jump[0] = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);1729load_from_mem_vector(compiler, TRUE, data2_ind, tmp1_reg_ind, 0);1730jump[1] = JUMP(SLJIT_JUMP);1731JUMPHERE(jump[0]);1732load_from_mem_vector(compiler, FALSE, data2_ind, tmp1_reg_ind, 0);1733JUMPHERE(jump[1]);17341735load_from_mem_vector(compiler, TRUE, data1_ind, str_ptr_reg_ind, 0);1736OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 16);17371738for (i = 0; i < 3; i++)1739{1740fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);1741fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);1742}17431744/* VN */1745instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);1746instruction[1] = (sljit_u16)(data2_ind << 12);1747instruction[2] = (sljit_u16)((0xe << 8) | 0x68);1748sljit_emit_op_custom(compiler, instruction, 6);17491750/* VFENE */1751instruction[0] = (sljit_u16)(0xe700 | (data1_ind 
<< 4) | data1_ind);1752instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));1753instruction[2] = (sljit_u16)((0xe << 8) | 0x81);1754sljit_emit_op_custom(compiler, instruction, 6);17551756/* VLGVB */1757instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data1_ind);1758instruction[1] = 7;1759instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);1760sljit_emit_op_custom(compiler, instruction, 6);17611762OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);1763quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);17641765OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);1766OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, diff);17671768/* Main loop. */1769start = LABEL();17701771OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);1772add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));17731774load_from_mem_vector(compiler, FALSE, data1_ind, str_ptr_reg_ind, 0);1775load_from_mem_vector(compiler, FALSE, data2_ind, str_ptr_reg_ind, tmp1_reg_ind);17761777for (i = 0; i < 3; i++)1778{1779fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);1780fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);1781}17821783/* VN */1784instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);1785instruction[1] = (sljit_u16)(data2_ind << 12);1786instruction[2] = (sljit_u16)((0xe << 8) | 0x68);1787sljit_emit_op_custom(compiler, instruction, 6);17881789/* VFENE */1790instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);1791instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));1792instruction[2] = (sljit_u16)((0xe << 8) | 0x81);1793sljit_emit_op_custom(compiler, instruction, 6);17941795sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);1796JUMPTO(SLJIT_OVERFLOW, start);17971798/* VLGVB */1799instruction[0] = (sljit_u16)(0xe700 | (tmp2_reg_ind << 4) | data1_ind);1800instruction[1] = 7;1801instruction[2] = (sljit_u16)((0x4 << 8) | 
0x21);1802sljit_emit_op_custom(compiler, instruction, 6);18031804OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);18051806JUMPHERE(quit);18071808add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));18091810#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321811if (common->utf)1812{1813SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);18141815OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));18161817quit = jump_if_utf_char_start(compiler, TMP1);18181819OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));1820add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));18211822/* TMP1 contains diff. */1823OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);1824OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);1825JUMPTO(SLJIT_JUMP, restart);18261827JUMPHERE(quit);1828}1829#endif18301831OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));18321833if (common->match_end_ptr != 0)1834OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);1835}18361837#endif /* SLJIT_CONFIG_S390X */18381839#if (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64)18401841#ifdef __linux__1842/* Using getauxval(AT_HWCAP) under Linux for detecting whether LSX is available */1843#include <sys/auxv.h>1844#define LOONGARCH_HWCAP_LSX (1 << 4)1845#define HAS_LSX_SUPPORT ((getauxval(AT_HWCAP) & LOONGARCH_HWCAP_LSX) != 0)1846#else1847#define HAS_LSX_SUPPORT 01848#endif18491850typedef sljit_ins sljit_u32;18511852#define SI12_IMM_MASK 0x003ffc001853#define UI5_IMM_MASK 0x00007c001854#define UI2_IMM_MASK 0x00000c0018551856#define VD(vd) ((sljit_ins)vd << 0)1857#define VJ(vj) ((sljit_ins)vj << 5)1858#define VK(vk) ((sljit_ins)vk << 10)1859#define RD_V(rd) ((sljit_ins)rd << 0)1860#define RJ_V(rj) ((sljit_ins)rj << 5)18611862#define IMM_SI12(imm) (((sljit_ins)(imm) << 10) & SI12_IMM_MASK)1863#define IMM_UI5(imm) (((sljit_ins)(imm) << 10) & UI5_IMM_MASK)1864#define IMM_UI2(imm) (((sljit_ins)(imm) 
<< 10) & UI2_IMM_MASK)18651866// LSX OPCODES:1867#define VLD 0x2c0000001868#define VOR_V 0x712680001869#define VAND_V 0x712600001870#define VBSLL_V 0x728e00001871#define VMSKLTZ_B 0x729c40001872#define VPICKVE2GR_WU 0x72f3e00018731874#if PCRE2_CODE_UNIT_WIDTH == 81875#define VREPLGR2VR 0x729f00001876#define VSEQ 0x700000001877#elif PCRE2_CODE_UNIT_WIDTH == 161878#define VREPLGR2VR 0x729f04001879#define VSEQ 0x700080001880#else1881#define VREPLGR2VR 0x729f08001882#define VSEQ 0x700100001883#endif18841885static void fast_forward_char_pair_lsx_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,1886sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)1887{1888if (compare_type != vector_compare_match2)1889{1890if (compare_type == vector_compare_match1i)1891{1892/* VOR.V vd, vj, vk */1893push_inst(compiler, VOR_V | VD(dst_ind) | VJ(cmp2_ind) | VK(dst_ind));1894}18951896/* VSEQ.B/H/W vd, vj, vk */1897push_inst(compiler, VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind));1898return;1899}19001901/* VBSLL.V vd, vj, ui5 */1902push_inst(compiler, VBSLL_V | VD(tmp_ind) | VJ(dst_ind) | IMM_UI5(0));19031904/* VSEQ.B/H/W vd, vj, vk */1905push_inst(compiler, VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind));19061907/* VSEQ.B/H/W vd, vj, vk */1908push_inst(compiler, VSEQ | VD(tmp_ind) | VJ(tmp_ind) | VK(cmp2_ind));19091910/* VOR vd, vj, vk */1911push_inst(compiler, VOR_V | VD(dst_ind) | VJ(tmp_ind) | VK(dst_ind));1912return;1913}19141915#define JIT_HAS_FAST_FORWARD_CHAR_SIMD HAS_LSX_SUPPORT19161917static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)1918{1919DEFINE_COMPILER;1920struct sljit_label *start;1921#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321922struct sljit_label *restart;1923#endif1924struct sljit_jump *quit;1925struct sljit_jump *partial_quit[2];1926vector_compare_type compare_type = vector_compare_match1;1927sljit_s32 tmp1_reg_ind = 
sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);1928sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);1929sljit_s32 data_ind = 0;1930sljit_s32 tmp_ind = 1;1931sljit_s32 cmp1_ind = 2;1932sljit_s32 cmp2_ind = 3;1933sljit_u32 bit = 0;19341935SLJIT_UNUSED_ARG(offset);19361937if (char1 != char2)1938{1939bit = char1 ^ char2;1940compare_type = vector_compare_match1i;19411942if (!is_powerof2(bit))1943{1944bit = 0;1945compare_type = vector_compare_match2;1946}1947}19481949partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);1950if (common->mode == PCRE2_JIT_COMPLETE)1951add_jump(compiler, &common->failed_match, partial_quit[0]);19521953/* First part (unaligned start) */19541955OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);19561957/* VREPLGR2VR.B/H/W vd, rj */1958push_inst(compiler, VREPLGR2VR | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));19591960if (char1 != char2)1961{1962OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);19631964/* VREPLGR2VR.B/H/W vd, rj */1965push_inst(compiler, VREPLGR2VR | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));1966}19671968OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);19691970#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 321971restart = LABEL();1972#endif19731974OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);1975OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);19761977/* VLD vd, rj, si12 */1978push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));1979fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);19801981/* VMSKLTZ.B vd, vj */1982push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));19831984/* VPICKVE2GR.WU rd, vj, ui2 */1985push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));19861987OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);1988OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);19891990quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);19911992OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 
0);19931994/* Second part (aligned) */1995start = LABEL();19961997OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);19981999partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);2000if (common->mode == PCRE2_JIT_COMPLETE)2001add_jump(compiler, &common->failed_match, partial_quit[1]);20022003/* VLD vd, rj, si12 */2004push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));2005fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);20062007/* VMSKLTZ.B vd, vj */2008push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));20092010/* VPICKVE2GR.WU rd, vj, ui2 */2011push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));20122013CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);20142015JUMPHERE(quit);20162017/* CTZ.W rd, rj */2018push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));20192020OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);20212022if (common->mode != PCRE2_JIT_COMPLETE)2023{2024JUMPHERE(partial_quit[0]);2025JUMPHERE(partial_quit[1]);2026OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);2027SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);2028}2029else2030add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));20312032#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 322033if (common->utf && offset > 0)2034{2035SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);20362037OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));20382039quit = jump_if_utf_char_start(compiler, TMP1);20402041OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));2042add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));2043OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);2044JUMPTO(SLJIT_JUMP, restart);20452046JUMPHERE(quit);2047}2048#endif2049}20502051#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD HAS_LSX_SUPPORT20522053static jump_list 
*fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
{
/* Emits code which searches for a code unit equal to char1 or char2,
starting at the address held in TMP1 and scanning 16 bytes per iteration.

On a successful search TMP1 holds the address of the match. STR_PTR is
used as the scan pointer and its original value (saved in TMP3) is
restored before the emitted code continues.

Returns the list of jumps taken when the start address or the scan
position reaches STR_END, or when the match found is at or after STR_END.
Note that STR_PTR is not restored on those paths by this function. */
DEFINE_COMPILER;
struct sljit_label *start;
struct sljit_jump *quit;
jump_list *not_found = NULL;
vector_compare_type compare_type = vector_compare_match1;
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data_ind = 0;
sljit_s32 tmp_ind = 1;
sljit_s32 cmp1_ind = 2;
sljit_s32 cmp2_ind = 3;
sljit_u32 bit = 0;

if (char1 != char2)
  {
  bit = char1 ^ char2;
  compare_type = vector_compare_match1i;

  if (!is_powerof2(bit))
    {
    bit = 0;
    compare_type = vector_compare_match2;
    }
  }

add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
/* TMP1 is needed as a scratch register below, so the start address
is kept in TMP2 and the caller's STR_PTR in TMP3. */
OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);

/* First part (unaligned start) */

OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);

/* VREPLGR2VR vd, rj */
push_inst(compiler, VREPLGR2VR | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));

if (char1 != char2)
  {
  /* cmp2 holds the differing bit (match1i) or the second character (match2). */
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);
  /* VREPLGR2VR vd, rj */
  push_inst(compiler, VREPLGR2VR | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));
  }

/* TMP2 = misalignment of the start, STR_PTR = start rounded down to 16 bytes. */
OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));

/* Restore the start address and drop the mask bits of the bytes before it. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Second part (aligned) */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);

add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(quit);

/* The index of the lowest set mask bit is the byte offset of the match. */
/* CTZ.W rd, rj */
push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));

OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));

OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
return not_found;
}

/* The replacement text of this macro (HAS_LSX_SUPPORT) is on the
following line, the line continuation keeps the two joined. */
#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD \
HAS_LSX_SUPPORT21512152static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,2153PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)2154{2155DEFINE_COMPILER;2156vector_compare_type compare1_type = vector_compare_match1;2157vector_compare_type compare2_type = vector_compare_match1;2158sljit_u32 bit1 = 0;2159sljit_u32 bit2 = 0;2160sljit_u32 diff = IN_UCHARS(offs1 - offs2);2161sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);2162sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);2163sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);2164sljit_s32 data1_ind = 0;2165sljit_s32 data2_ind = 1;2166sljit_s32 tmp1_ind = 2;2167sljit_s32 tmp2_ind = 3;2168sljit_s32 cmp1a_ind = 4;2169sljit_s32 cmp1b_ind = 5;2170sljit_s32 cmp2a_ind = 6;2171sljit_s32 cmp2b_ind = 7;2172struct sljit_label *start;2173#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 322174struct sljit_label *restart;2175#endif2176struct sljit_jump *jump[2];21772178SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);2179SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));21802181/* Initialize. 
*/2182if (common->match_end_ptr != 0)2183{2184OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);2185OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));2186OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);21872188OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);2189SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);2190}21912192OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));2193add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));21942195if (char1a == char1b)2196OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);2197else2198{2199bit1 = char1a ^ char1b;2200if (is_powerof2(bit1))2201{2202compare1_type = vector_compare_match1i;2203OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a | bit1);2204OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit1);2205}2206else2207{2208compare1_type = vector_compare_match2;2209bit1 = 0;2210OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);2211OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char1b);2212}2213}22142215/* VREPLGR2VR vd, rj */2216push_inst(compiler, VREPLGR2VR | VD(cmp1a_ind) | RJ_V(tmp1_reg_ind));22172218if (char1a != char1b)2219{2220/* VREPLGR2VR vd, rj */2221push_inst(compiler, VREPLGR2VR | VD(cmp1b_ind) | RJ_V(tmp2_reg_ind));2222}22232224if (char2a == char2b)2225OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);2226else2227{2228bit2 = char2a ^ char2b;2229if (is_powerof2(bit2))2230{2231compare2_type = vector_compare_match1i;2232OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a | bit2);2233OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit2);2234}2235else2236{2237compare2_type = vector_compare_match2;2238bit2 = 0;2239OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);2240OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char2b);2241}2242}22432244/* VREPLGR2VR vd, rj */2245push_inst(compiler, VREPLGR2VR | VD(cmp2a_ind) | RJ_V(tmp1_reg_ind));22462247if (char2a != char2b)2248{2249/* VREPLGR2VR vd, rj */2250push_inst(compiler, VREPLGR2VR | VD(cmp2b_ind) | RJ_V(tmp2_reg_ind));2251}22522253#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 
322254restart = LABEL();2255#endif22562257OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);2258OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);2259OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);2260OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);22612262/* VLD vd, rj, si12 */2263push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));22642265jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);22662267/* VLD vd, rj, si12 */2268push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));2269jump[1] = JUMP(SLJIT_JUMP);22702271JUMPHERE(jump[0]);22722273/* VBSLL.V vd, vj, ui5 */2274push_inst(compiler, VBSLL_V | VD(data2_ind) | VJ(data1_ind) | IMM_UI5(diff));22752276JUMPHERE(jump[1]);22772278fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);2279fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);22802281/* VAND vd, vj, vk */2282push_inst(compiler, VOR_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));22832284/* VMSKLTZ.B vd, vj */2285push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));22862287/* VPICKVE2GR.WU rd, vj, ui2 */2288push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));22892290/* Ignore matches before the first STR_PTR. */2291OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);2292OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);22932294jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);22952296OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);22972298/* Main loop. 
*/2299start = LABEL();23002301OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);2302add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));23032304/* VLD vd, rj, si12 */2305push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));2306push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));23072308fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);2309fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);23102311/* VAND.V vd, vj, vk */2312push_inst(compiler, VAND_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));23132314/* VMSKLTZ.B vd, vj */2315push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));23162317/* VPICKVE2GR.WU rd, vj, ui2 */2318push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));23192320CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);23212322JUMPHERE(jump[0]);23232324/* CTZ.W rd, rj */2325push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));23262327OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);23282329add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));23302331#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 322332if (common->utf)2333{2334OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));23352336jump[0] = jump_if_utf_char_start(compiler, TMP1);23372338OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));2339CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);23402341add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));23422343JUMPHERE(jump[0]);2344}2345#endif23462347OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));23482349if (common->match_end_ptr != 0)2350OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);2351}23522353#endif /* SLJIT_CONFIG_LOONGARCH_64 */23542355#endif /* !SUPPORT_VALGRIND */235623572358