GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/pcre2/src/pcre2_jit_simd_inc.h
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

Written by Philip Hazel
This module by Zoltan Herczeg
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2019 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/

#if !(defined SUPPORT_VALGRIND)

#if ((defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
|| (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
|| (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) \
|| (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64))

typedef enum {
vector_compare_match1,
vector_compare_match1i,
vector_compare_match2,
} vector_compare_type;

#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
/* The AVX2 code path is currently disabled. */
/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 31 : 15; */
return 15;
#elif PCRE2_CODE_UNIT_WIDTH == 16
/* The AVX2 code path is currently disabled. */
/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 15 : 7; */
return 7;
#elif PCRE2_CODE_UNIT_WIDTH == 32
/* The AVX2 code path is currently disabled. */
/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 7 : 3; */
return 3;
#else
#error "Unsupported unit width"
#endif
}
#else /* !SLJIT_CONFIG_X86 */
static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
return 15;
#elif PCRE2_CODE_UNIT_WIDTH == 16
return 7;
#elif PCRE2_CODE_UNIT_WIDTH == 32
return 3;
#else
#error "Unsupported unit width"
#endif
}
#endif /* SLJIT_CONFIG_X86 */
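
/* A 128-bit vector holds 16, 8 or 4 code units depending on the unit width,
and the char-pair search keeps both character offsets inside a single vector
load, so the values above are one less than that count. A minimal sketch of
the arithmetic, compiled out; the enumerator name below is hypothetical: */
#if 0
enum { example_units_per_vector = 16 / (PCRE2_CODE_UNIT_WIDTH / 8) };
/* max_fast_forward_char_pair_offset() == example_units_per_vector - 1, i.e. 15, 7 or 3. */
#endif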

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
static struct sljit_jump *jump_if_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xc0);
return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0x80);
#elif PCRE2_CODE_UNIT_WIDTH == 16
OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xfc00);
return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0xdc00);
#else
#error "Unknown code width"
#endif
}
#endif
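
/* The helper above emits the scalar tests sketched below; they are used after
a SIMD hit to step forward until the pointer sits on a character boundary
rather than on a UTF-8 continuation byte or a UTF-16 low surrogate. A minimal
sketch, compiled out; the function name is hypothetical: */
#if 0
static int is_utf_char_start(unsigned int c)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
return (c & 0xc0) != 0x80;     /* not a 10xxxxxx continuation byte */
#elif PCRE2_CODE_UNIT_WIDTH == 16
return (c & 0xfc00) != 0xdc00; /* not a low surrogate (0xdc00-0xdfff) */
#endif
}
#endif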

#endif /* SLJIT_CONFIG_X86 || SLJIT_CONFIG_ARM_64 || SLJIT_CONFIG_S390X || SLJIT_CONFIG_LOONGARCH_64 */

#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)

static sljit_s32 character_to_int32(PCRE2_UCHAR chr)
{
sljit_u32 value = chr;
#if PCRE2_CODE_UNIT_WIDTH == 8
#define SIMD_COMPARE_TYPE_INDEX 0
return (sljit_s32)((value << 24) | (value << 16) | (value << 8) | value);
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define SIMD_COMPARE_TYPE_INDEX 1
return (sljit_s32)((value << 16) | value);
#elif PCRE2_CODE_UNIT_WIDTH == 32
#define SIMD_COMPARE_TYPE_INDEX 2
return (sljit_s32)(value);
#else
#error "Unsupported unit width"
#endif
}
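
/* character_to_int32() replicates one code unit across a 32-bit value; that
value is later inserted into lane 0 of a vector register and broadcast to
every lane. A minimal usage sketch, compiled out and assuming 8-bit code
units; the function name is hypothetical: */
#if 0
#include <assert.h>
static void character_to_int32_example(void)
{
/* 'a' (0x61) fills all four byte lanes of the 32-bit value. */
assert(character_to_int32('a') == (sljit_s32)0x61616161);
}
#endif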

static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
sljit_s32 reg_type, int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
sljit_u8 instruction[4];

if (reg_type == SLJIT_SIMD_REG_128)
{
instruction[0] = 0x66;
instruction[1] = 0x0f;
}
else
{
/* Two byte VEX prefix. */
instruction[0] = 0xc5;
instruction[1] = 0xfd;
}

SLJIT_ASSERT(step >= 0 && step <= 3);

if (compare_type != vector_compare_match2)
{
if (step == 0)
{
if (compare_type == vector_compare_match1i)
{
/* POR xmm1, xmm2/m128 */
if (reg_type == SLJIT_SIMD_REG_256)
instruction[1] ^= (dst_ind << 3);

/* Prefix is filled. */
instruction[2] = 0xeb;
instruction[3] = 0xc0 | (dst_ind << 3) | cmp2_ind;
sljit_emit_op_custom(compiler, instruction, 4);
}
return;
}

if (step != 2)
return;

/* PCMPEQB/W/D xmm1, xmm2/m128 */
if (reg_type == SLJIT_SIMD_REG_256)
instruction[1] ^= (dst_ind << 3);

/* Prefix is filled. */
instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
sljit_emit_op_custom(compiler, instruction, 4);
return;
}

if (reg_type == SLJIT_SIMD_REG_256)
{
if (step == 2)
return;

if (step == 0)
{
step = 2;
instruction[1] ^= (dst_ind << 3);
}
}

switch (step)
{
case 0:
SLJIT_ASSERT(reg_type == SLJIT_SIMD_REG_128);

/* MOVDQA xmm1, xmm2/m128 */
/* Prefix is filled. */
instruction[2] = 0x6f;
instruction[3] = 0xc0 | (tmp_ind << 3) | dst_ind;
sljit_emit_op_custom(compiler, instruction, 4);
return;

case 1:
/* PCMPEQB/W/D xmm1, xmm2/m128 */
if (reg_type == SLJIT_SIMD_REG_256)
instruction[1] ^= (dst_ind << 3);

/* Prefix is filled. */
instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
sljit_emit_op_custom(compiler, instruction, 4);
return;

case 2:
/* PCMPEQB/W/D xmm1, xmm2/m128 */
/* Prefix is filled. */
instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
instruction[3] = 0xc0 | (tmp_ind << 3) | cmp2_ind;
sljit_emit_op_custom(compiler, instruction, 4);
return;

case 3:
/* POR xmm1, xmm2/m128 */
if (reg_type == SLJIT_SIMD_REG_256)
instruction[1] ^= (dst_ind << 3);

/* Prefix is filled. */
instruction[2] = 0xeb;
instruction[3] = 0xc0 | (dst_ind << 3) | tmp_ind;
sljit_emit_op_custom(compiler, instruction, 4);
return;
}
}
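
/* The three compare strategies emitted above reduce to the scalar tests
sketched below (compiled out, illustrative only; the function names are
hypothetical). bit is char1 ^ char2; the match1i form applies when the two
characters differ in exactly one bit, so a single OR-then-compare suffices;
otherwise match2 compares against both characters and ORs the results. */
#if 0
static int matches_match1(unsigned int c, unsigned int char1)
{
return c == char1;
}
static int matches_match1i(unsigned int c, unsigned int char1, unsigned int bit)
{
return (c | bit) == (char1 | bit);
}
static int matches_match2(unsigned int c, unsigned int char1, unsigned int char2)
{
return c == char1 || c == char2;
}
#endif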

/* The AVX2 code path is currently disabled.
#define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
*/
#if defined(SLJIT_CONFIG_X86_64) && SLJIT_CONFIG_X86_64
#define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
#else
#define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_FPU))
#endif

static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
sljit_u8 instruction[8];
/* The AVX2 code path is currently disabled. */
/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
sljit_s32 reg_type = SLJIT_SIMD_REG_128;
sljit_s32 value;
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *quit;
struct sljit_jump *partial_quit[2];
vector_compare_type compare_type = vector_compare_match1;
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 data_ind = sljit_get_register_index(reg_type, SLJIT_VR0);
sljit_s32 cmp1_ind = sljit_get_register_index(reg_type, SLJIT_VR1);
sljit_s32 cmp2_ind = sljit_get_register_index(reg_type, SLJIT_VR2);
sljit_s32 tmp_ind = sljit_get_register_index(reg_type, SLJIT_VR3);
sljit_u32 bit = 0;
int i;

SLJIT_UNUSED_ARG(offset);

if (char1 != char2)
{
bit = char1 ^ char2;
compare_type = vector_compare_match1i;

if (!is_powerof2(bit))
{
bit = 0;
compare_type = vector_compare_match2;
}
}

partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
add_jump(compiler, &common->failed_match, partial_quit[0]);

/* First part (unaligned start) */
value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));

if (char1 != char2)
sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));

OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);

sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR1, SLJIT_VR1, 0);

if (char1 != char2)
sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR2, SLJIT_VR2, 0);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif

value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);

value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);

for (i = 0; i < 4; i++)
fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Second part (aligned) */
start = LABEL();

value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);

partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
add_jump(compiler, &common->failed_match, partial_quit[1]);

value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);
for (i = 0; i < 4; i++)
fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);
CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(quit);

SLJIT_ASSERT(tmp1_reg_ind < 8);
/* BSF r32, r/m32 */
instruction[0] = 0x0f;
instruction[1] = 0xbc;
instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
sljit_emit_op_custom(compiler, instruction, 3);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

if (common->mode != PCRE2_JIT_COMPLETE)
{
JUMPHERE(partial_quit[0]);
JUMPHERE(partial_quit[1]);
OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
}
else
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf && offset > 0)
{
SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);

OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));

quit = jump_if_utf_char_start(compiler, TMP1);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
JUMPTO(SLJIT_JUMP, restart);

JUMPHERE(quit);
}
#endif
}
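
/* A rough C/SSE2 equivalent of the code generated above for the
single-character, match1 case with 8-bit code units (compiled out,
illustrative only; find_char_sse2 is a hypothetical name). Like the JIT
code, the first load is aligned downwards and may read bytes before str
inside the same 16-byte block, matches before str are shifted out of the
mask, and the last load may read past end within its aligned block. */
#if 0
#include <emmintrin.h>
#include <stdint.h>
static const unsigned char *find_char_sse2(const unsigned char *str,
  const unsigned char *end, unsigned char c)
{
const __m128i needle = _mm_set1_epi8((char)c);
const unsigned char *aligned = (const unsigned char *)((uintptr_t)str & ~(uintptr_t)15);
unsigned int mask = (unsigned int)_mm_movemask_epi8(
  _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)aligned), needle));
mask >>= (unsigned int)(str - aligned);   /* drop matches before str */
while (mask == 0)
  {
  aligned += 16;
  if (aligned >= end) return NULL;
  mask = (unsigned int)_mm_movemask_epi8(
    _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)aligned), needle));
  str = aligned;
  }
str += __builtin_ctz(mask);   /* the JIT emits BSF for this step */
return str < end ? str : NULL;
}
#endif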

/* The AVX2 code path is currently disabled.
#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
*/
#if defined(SLJIT_CONFIG_X86_64) && SLJIT_CONFIG_X86_64
#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD 1
#else
#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_FPU))
#endif

static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
{
DEFINE_COMPILER;
sljit_u8 instruction[8];
/* The AVX2 code path is currently disabled. */
/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
sljit_s32 reg_type = SLJIT_SIMD_REG_128;
sljit_s32 value;
struct sljit_label *start;
struct sljit_jump *quit;
jump_list *not_found = NULL;
vector_compare_type compare_type = vector_compare_match1;
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 data_ind = sljit_get_register_index(reg_type, SLJIT_VR0);
sljit_s32 cmp1_ind = sljit_get_register_index(reg_type, SLJIT_VR1);
sljit_s32 cmp2_ind = sljit_get_register_index(reg_type, SLJIT_VR2);
sljit_s32 tmp_ind = sljit_get_register_index(reg_type, SLJIT_VR3);
sljit_u32 bit = 0;
int i;

if (char1 != char2)
{
bit = char1 ^ char2;
compare_type = vector_compare_match1i;

if (!is_powerof2(bit))
{
bit = 0;
compare_type = vector_compare_match2;
}
}

add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);

/* First part (unaligned start) */

value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));

if (char1 != char2)
sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));

OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);

sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR1, SLJIT_VR1, 0);

if (char1 != char2)
sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR2, SLJIT_VR2, 0);

value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);

value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);

for (i = 0; i < 4; i++)
fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Second part (aligned) */
start = LABEL();

value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);

add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);

for (i = 0; i < 4; i++)
fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);
CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(quit);

SLJIT_ASSERT(tmp1_reg_ind < 8);
/* BSF r32, r/m32 */
instruction[0] = 0x0f;
instruction[1] = 0xbc;
instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
sljit_emit_op_custom(compiler, instruction, 3);

OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));

OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
return not_found;
}

#ifndef _WIN64

/* The AVX2 code path is currently disabled.
#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
*/
#if defined(SLJIT_CONFIG_X86_64) && SLJIT_CONFIG_X86_64
#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
#else
#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_FPU))
#endif

static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
sljit_u8 instruction[8];
/* The AVX2 code path is currently disabled. */
/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
sljit_s32 reg_type = SLJIT_SIMD_REG_128;
sljit_s32 value;
vector_compare_type compare1_type = vector_compare_match1;
vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 data1_ind = sljit_get_register_index(reg_type, SLJIT_VR0);
sljit_s32 data2_ind = sljit_get_register_index(reg_type, SLJIT_VR1);
sljit_s32 cmp1a_ind = sljit_get_register_index(reg_type, SLJIT_VR2);
sljit_s32 cmp2a_ind = sljit_get_register_index(reg_type, SLJIT_VR3);
sljit_s32 cmp1b_ind = sljit_get_register_index(reg_type, SLJIT_VR4);
sljit_s32 cmp2b_ind = sljit_get_register_index(reg_type, SLJIT_VR5);
sljit_s32 tmp1_ind = sljit_get_register_index(reg_type, SLJIT_VR6);
sljit_s32 tmp2_ind = sljit_get_register_index(reg_type, SLJIT_TMP_DEST_VREG);
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *jump[2];
int i;

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2 && offs2 >= 0);
SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));

/* Initialize. */
if (common->match_end_ptr != 0)
{
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));

OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
}

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

if (char1a == char1b)
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
else
{
bit1 = char1a ^ char1b;
if (is_powerof2(bit1))
{
compare1_type = vector_compare_match1i;
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a | bit1));
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit1));
}
else
{
compare1_type = vector_compare_match2;
bit1 = 0;
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char1b));
}
}

value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR2, 0, TMP1, 0);

if (char1a != char1b)
sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR4, 0, TMP2, 0);

if (char2a == char2b)
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
else
{
bit2 = char2a ^ char2b;
if (is_powerof2(bit2))
{
compare2_type = vector_compare_match1i;
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a | bit2));
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit2));
}
else
{
compare2_type = vector_compare_match2;
bit2 = 0;
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char2b));
}
}

sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR3, 0, TMP1, 0);

if (char2a != char2b)
sljit_emit_simd_lane_mov(compiler, value, SLJIT_VR5, 0, TMP2, 0);

sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR2, SLJIT_VR2, 0);
if (char1a != char1b)
sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR4, SLJIT_VR4, 0);

sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR3, SLJIT_VR3, 0);
if (char2a != char2b)
sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_VR5, SLJIT_VR5, 0);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif

OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
value = (reg_type == SLJIT_SIMD_REG_256) ? ~0x1f : ~0xf;
OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);

value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);

jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);

sljit_emit_simd_mov(compiler, reg_type, SLJIT_VR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);
jump[1] = JUMP(SLJIT_JUMP);

JUMPHERE(jump[0]);

if (reg_type == SLJIT_SIMD_REG_256)
{
if (diff != 16)
{
/* PSLLDQ ymm1, ymm2, imm8 */
instruction[0] = 0xc5;
instruction[1] = (sljit_u8)(0xf9 ^ (data2_ind << 3));
instruction[2] = 0x73;
instruction[3] = 0xc0 | (7 << 3) | data1_ind;
instruction[4] = diff & 0xf;
sljit_emit_op_custom(compiler, instruction, 5);
}

instruction[0] = 0xc4;
instruction[1] = 0xe3;
if (diff < 16)
{
/* VINSERTI128 xmm1, xmm2, xmm3/m128 */
/* instruction[0] = 0xc4; */
/* instruction[1] = 0xe3; */
instruction[2] = (sljit_u8)(0x7d ^ (data2_ind << 3));
instruction[3] = 0x38;
SLJIT_ASSERT(sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR) <= 7);
instruction[4] = 0x40 | (data2_ind << 3) | sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
instruction[5] = (sljit_u8)(16 - diff);
instruction[6] = 1;
sljit_emit_op_custom(compiler, instruction, 7);
}
else
{
/* VPERM2I128 xmm1, xmm2, xmm3/m128 */
/* instruction[0] = 0xc4; */
/* instruction[1] = 0xe3; */
value = (diff == 16) ? data1_ind : data2_ind;
instruction[2] = (sljit_u8)(0x7d ^ (value << 3));
instruction[3] = 0x46;
instruction[4] = 0xc0 | (data2_ind << 3) | value;
instruction[5] = 0x08;
sljit_emit_op_custom(compiler, instruction, 6);
}
}
else
{
/* MOVDQA xmm1, xmm2/m128 */
instruction[0] = 0x66;
instruction[1] = 0x0f;
instruction[2] = 0x6f;
instruction[3] = 0xc0 | (data2_ind << 3) | data1_ind;
sljit_emit_op_custom(compiler, instruction, 4);

/* PSLLDQ xmm1, imm8 */
/* instruction[0] = 0x66; */
/* instruction[1] = 0x0f; */
instruction[2] = 0x73;
instruction[3] = 0xc0 | (7 << 3) | data2_ind;
instruction[4] = diff;
sljit_emit_op_custom(compiler, instruction, 5);
}

JUMPHERE(jump[1]);

value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);

for (i = 0; i < 4; i++)
{
fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
}

sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_VR0, SLJIT_VR0, SLJIT_VR1, 0);
sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);

/* Ignore matches before the first STR_PTR. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Main loop. */
start = LABEL();

value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);
sljit_emit_simd_mov(compiler, reg_type, SLJIT_VR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);

for (i = 0; i < 4; i++)
{
fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
}

sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_VR0, SLJIT_VR0, SLJIT_VR1, 0);
sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_VR0, TMP1, 0);

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(jump[0]);

SLJIT_ASSERT(tmp1_reg_ind < 8);
/* BSF r32, r/m32 */
instruction[0] = 0x0f;
instruction[1] = 0xbc;
instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
sljit_emit_op_custom(compiler, instruction, 3);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
{
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));

jump[0] = jump_if_utf_char_start(compiler, TMP1);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);

add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));

JUMPHERE(jump[0]);
}
#endif

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));

if (common->match_end_ptr != 0)
OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
}
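
/* A scalar model of the char-pair search above (compiled out, illustrative
only; find_pair_scalar is a hypothetical name). The SIMD version builds one
match mask for the character at offset offs1 and one for the character
diff units earlier (offset offs2), then ANDs the masks, which is equivalent
to the per-position test below. */
#if 0
static const PCRE2_UCHAR *find_pair_scalar(const PCRE2_UCHAR *str, const PCRE2_UCHAR *end,
  int offs1, PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, int offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
const PCRE2_UCHAR *p;
for (p = str + offs1; p < end; p++)
  {
  if ((p[0] == char1a || p[0] == char1b) &&
      (p[offs2 - offs1] == char2a || p[offs2 - offs1] == char2b))
    return p - offs1;   /* start of the candidate match */
  }
return NULL;
}
#endif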

#endif /* !_WIN64 */

#undef SIMD_COMPARE_TYPE_INDEX

#endif /* SLJIT_CONFIG_X86 */

#if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 && (defined __ARM_NEON || defined __ARM_NEON__))

#if PCRE2_CODE_UNIT_WIDTH == 8
#define PCRE2_REPLICATE_TYPE (SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_8)
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define PCRE2_REPLICATE_TYPE (SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_16)
#elif PCRE2_CODE_UNIT_WIDTH == 32
#define PCRE2_REPLICATE_TYPE (SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32)
#else
#error "Unsupported unit width"
#endif

#define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1

static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
int step, sljit_u32 dst_ind, sljit_u32 cmp1_ind, sljit_u32 cmp2_ind, sljit_u32 tmp_ind)
{
sljit_u32 instruction;
#if PCRE2_CODE_UNIT_WIDTH == 8
sljit_u32 size = 0 << 22;
#elif PCRE2_CODE_UNIT_WIDTH == 16
sljit_u32 size = 1 << 22;
#elif PCRE2_CODE_UNIT_WIDTH == 32
sljit_u32 size = 2 << 22;
#else
#error "Unsupported unit width"
#endif

SLJIT_ASSERT(step >= 0 && step <= 2);

if (step == 1)
{
/* CMEQ */
instruction = 0x6e208c00 | size | (cmp1_ind << 16) | (dst_ind << 5) | dst_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));
return;
}

if (compare_type != vector_compare_match2)
{
if (step == 0 && compare_type == vector_compare_match1i)
{
/* ORR */
instruction = 0x4ea01c00 | (cmp2_ind << 16) | (dst_ind << 5) | dst_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));
}
return;
}

switch (step)
{
case 0:
/* CMEQ */
instruction = 0x6e208c00 | size | (cmp2_ind << 16) | (dst_ind << 5) | tmp_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));
return;

case 2:
/* ORR */
instruction = 0x4ea01c00 | (tmp_ind << 16) | (dst_ind << 5) | dst_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));
return;
}
}

static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
sljit_u32 instruction;
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *quit;
struct sljit_jump *partial_quit[2];
vector_compare_type compare_type = vector_compare_match1;
sljit_u32 data_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR0);
sljit_u32 cmp1_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR1);
sljit_u32 cmp2_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR2);
sljit_u32 tmp_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR3);
sljit_u32 bit = 0;
int i;

SLJIT_UNUSED_ARG(offset);

if (char1 != char2)
{
bit = char1 ^ char2;
compare_type = vector_compare_match1i;

if (!is_powerof2(bit))
{
bit = 0;
compare_type = vector_compare_match2;
}
}

partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
add_jump(compiler, &common->failed_match, partial_quit[0]);

/* First part (unaligned start) */
if (char1 != char2)
sljit_emit_op1(compiler, SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit != 0 ? bit : char2);
sljit_emit_op1(compiler, SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);

if (char1 != char2)
sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR2, TMP2, 0);
sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR1, TMP1, 0);

OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif

OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~(sljit_sw)0xf);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);

sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_MEM_ALIGNED_128, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);

for (i = 0; i < 3; i++)
fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* SHRN */
instruction = 0x0f0c8400 | (data_ind << 5) | data_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));

sljit_emit_simd_lane_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_64, SLJIT_VR0, 0, TMP1, 0);

OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, TMP2, 0);

quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

/* Second part (aligned) */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);

partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
add_jump(compiler, &common->failed_match, partial_quit[1]);

sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_MEM_ALIGNED_128, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);
for (i = 0; i < 3; i++)
fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* SHRN */
instruction = 0x0f0c8400 | (data_ind << 5) | data_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));

sljit_emit_simd_lane_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_64, SLJIT_VR0, 0, TMP1, 0);
CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(quit);

sljit_emit_op1(compiler, SLJIT_CTZ, TMP1, 0, TMP1, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

if (common->mode != PCRE2_JIT_COMPLETE)
{
JUMPHERE(partial_quit[0]);
JUMPHERE(partial_quit[1]);
OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
}
else
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf && offset > 0)
{
SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);

OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));

quit = jump_if_utf_char_start(compiler, TMP1);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
JUMPTO(SLJIT_JUMP, restart);

JUMPHERE(quit);
}
#endif
}
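
/* NEON has no PMOVMSKB equivalent, so the code above narrows the 16 byte
compare results with SHRN into one 64-bit value in which every matching byte
occupies a 4-bit nibble; CTZ of that value divided by 4 is the byte offset,
which is why the result of SLJIT_CTZ is shifted right by 2. An intrinsics
sketch of the same trick (compiled out, illustrative only; the function name
is hypothetical): */
#if 0
#include <arm_neon.h>
static int first_match_offset(uint8x16_t data, uint8_t c)
{
uint8x16_t eq = vceqq_u8(data, vdupq_n_u8(c));
uint8x8_t nib = vshrn_n_u16(vreinterpretq_u16_u8(eq), 4);
uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(nib), 0);
if (mask == 0) return -1;
return __builtin_ctzll(mask) >> 2;   /* 4 mask bits per input byte */
}
#endif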

#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD 1

static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
{
DEFINE_COMPILER;
sljit_u32 instruction;
struct sljit_label *start;
struct sljit_jump *quit;
jump_list *not_found = NULL;
vector_compare_type compare_type = vector_compare_match1;
sljit_u32 data_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR0);
sljit_u32 cmp1_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR1);
sljit_u32 cmp2_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR2);
sljit_u32 tmp_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR3);
sljit_u32 bit = 0;
int i;

if (char1 != char2)
{
bit = char1 ^ char2;
compare_type = vector_compare_match1i;

if (!is_powerof2(bit))
{
bit = 0;
compare_type = vector_compare_match2;
}
}

add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));

/* First part (unaligned start) */

if (char1 != char2)
sljit_emit_op1(compiler, SLJIT_MOV, TMP3, 0, SLJIT_IMM, bit != 0 ? bit : char2);
sljit_emit_op1(compiler, SLJIT_MOV, TMP2, 0, SLJIT_IMM, char1 | bit);

if (char1 != char2)
sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR2, TMP3, 0);
sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR1, TMP2, 0);

OP2(SLJIT_AND, TMP3, 0, TMP1, 0, SLJIT_IMM, ~(sljit_sw)0xf);
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xf);

sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_MEM_ALIGNED_128, SLJIT_VR0, SLJIT_MEM1(TMP3), 0);

for (i = 0; i < 3; i++)
fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* SHRN */
instruction = 0x0f0c8400 | (data_ind << 5) | data_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));

sljit_emit_simd_lane_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_64, SLJIT_VR0, 0, TMP1, 0);

OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, TMP2, 0);

quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

/* Second part (aligned) */
start = LABEL();

OP2(SLJIT_ADD, TMP3, 0, TMP3, 0, SLJIT_IMM, 16);

add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP3, 0, STR_END, 0));

sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_MEM_ALIGNED_128, SLJIT_VR0, SLJIT_MEM1(TMP3), 0);

for (i = 0; i < 3; i++)
fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* SHRN */
instruction = 0x0f0c8400 | (data_ind << 5) | data_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));

sljit_emit_simd_lane_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_64, SLJIT_VR0, 0, TMP1, 0);
CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(quit);

sljit_emit_op1(compiler, SLJIT_CTZ, TMP1, 0, TMP1, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));

return not_found;
}

#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1

static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
sljit_u32 instruction;
vector_compare_type compare1_type = vector_compare_match1;
vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
sljit_u32 data1_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR0);
sljit_u32 data2_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR1);
sljit_u32 cmp1a_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR2);
sljit_u32 cmp2a_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR3);
sljit_u32 cmp1b_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR4);
sljit_u32 cmp2b_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR5);
sljit_u32 tmp1_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR6);
sljit_u32 tmp2_ind = (sljit_u32)sljit_get_register_index(SLJIT_SIMD_REG_128, SLJIT_VR7);
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *jump[2];
int i;

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2 && offs2 >= 0);
SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));

if (char1a != char1b)
{
bit1 = char1a ^ char1b;
compare1_type = vector_compare_match1i;

if (!is_powerof2(bit1))
{
bit1 = 0;
compare1_type = vector_compare_match2;
}
}

if (char2a != char2b)
{
bit2 = char2a ^ char2b;
compare2_type = vector_compare_match1i;

if (!is_powerof2(bit2))
{
bit2 = 0;
compare2_type = vector_compare_match2;
}
}

/* Initialize. */
if (common->match_end_ptr != 0)
{
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));

OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
}

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

sljit_emit_op1(compiler, SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a | bit1);
sljit_emit_op1(compiler, SLJIT_MOV, TMP2, 0, SLJIT_IMM, char2a | bit2);
sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR2, TMP1, 0);
sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR3, TMP2, 0);

if (char1a != char1b)
sljit_emit_op1(compiler, SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit1 != 0 ? bit1 : char1b);

if (char2a != char2b)
sljit_emit_op1(compiler, SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit2 != 0 ? bit2 : char2b);

if (char1a != char1b)
sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR4, TMP1, 0);

if (char2a != char2b)
sljit_emit_simd_replicate(compiler, PCRE2_REPLICATE_TYPE, SLJIT_VR5, TMP2, 0);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif

OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~(sljit_sw)0xf);

sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_MEM_ALIGNED_128, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);

jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);

sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128, SLJIT_VR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);
jump[1] = JUMP(SLJIT_JUMP);

JUMPHERE(jump[0]);

if (diff >= 8)
{
/* MOV (element) */
instruction = 0x6e180400 | (data1_ind << 5) | data2_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));

if (diff > 8)
{
/* SHL */
instruction = 0x4f405400 | ((diff - 8) << 19) | (data2_ind << 5) | data2_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));
}
}
else
{
/* MOV (element) */
instruction = 0x6e180400 | (data1_ind << 5) | tmp1_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));

/* SHL */
instruction = 0x4f405400 | (diff << 19) | (data1_ind << 5) | data2_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));

/* USHR */
instruction = 0x6f400400 | (diff << 19) | (tmp1_ind << 5) | tmp1_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));

sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_OR | SLJIT_SIMD_REG_128, SLJIT_VR1, SLJIT_VR1, SLJIT_VR6, 0);
}

JUMPHERE(jump[1]);

OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);

for (i = 0; i < 3; i++)
{
fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
}

sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | SLJIT_SIMD_REG_128, SLJIT_VR0, SLJIT_VR0, SLJIT_VR1, 0);

/* SHRN */
instruction = 0x0f0c8400 | (data1_ind << 5) | data1_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));

sljit_emit_simd_lane_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_64, SLJIT_VR0, 0, TMP1, 0);

/* Ignore matches before the first STR_PTR. */
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, TMP2, 0);

jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

/* Main loop. */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_MEM_ALIGNED_128, SLJIT_VR0, SLJIT_MEM1(STR_PTR), 0);
sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128, SLJIT_VR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);

for (i = 0; i < 3; i++)
{
fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
}

sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | SLJIT_SIMD_REG_128, SLJIT_VR0, SLJIT_VR0, SLJIT_VR1, 0);

/* SHRN */
instruction = 0x0f0c8400 | (data1_ind << 5) | data1_ind;
sljit_emit_op_custom(compiler, &instruction, sizeof(sljit_u32));

sljit_emit_simd_lane_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_64, SLJIT_VR0, 0, TMP1, 0);

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(jump[0]);

sljit_emit_op1(compiler, SLJIT_CTZ, TMP1, 0, TMP1, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
{
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));

jump[0] = jump_if_utf_char_start(compiler, TMP1);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);

add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));

JUMPHERE(jump[0]);
}
#endif

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));

if (common->match_end_ptr != 0)
OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
}

#undef PCRE2_REPLICATE_TYPE

#endif /* SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 */

#if (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)

#if PCRE2_CODE_UNIT_WIDTH == 8
#define VECTOR_ELEMENT_SIZE 0
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define VECTOR_ELEMENT_SIZE 1
#elif PCRE2_CODE_UNIT_WIDTH == 32
#define VECTOR_ELEMENT_SIZE 2
#else
#error "Unsupported unit width"
#endif

static void load_from_mem_vector(struct sljit_compiler *compiler, BOOL vlbb, sljit_s32 dst_vreg,
sljit_s32 base_reg, sljit_s32 index_reg)
{
sljit_u16 instruction[3];

instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | index_reg);
instruction[1] = (sljit_u16)(base_reg << 12);
instruction[2] = (sljit_u16)((0x8 << 8) | (vlbb ? 0x07 : 0x06));

sljit_emit_op_custom(compiler, instruction, 6);
}
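
/* When vlbb is TRUE the emitted instruction is VLBB (vector load to block
boundary), which only fetches bytes up to the next block boundary, so the
unaligned first load of a scan cannot fault past that boundary; with vlbb
FALSE a plain VL is emitted. A minimal sketch of the boundary arithmetic
(compiled out, illustrative only; the helper name is hypothetical): */
#if 0
#include <stddef.h>
#include <stdint.h>
static size_t bytes_before_boundary(const void *p, size_t block_size)
{
/* Bytes readable from p without crossing the next block_size-aligned
   boundary; block_size must be a power of two. */
return block_size - ((uintptr_t)p & (block_size - 1));
}
#endif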
1287
1288
#if PCRE2_CODE_UNIT_WIDTH == 32
1289
1290
static void replicate_imm_vector(struct sljit_compiler *compiler, int step, sljit_s32 dst_vreg,
1291
PCRE2_UCHAR chr, sljit_s32 tmp_general_reg)
1292
{
1293
sljit_u16 instruction[3];
1294
1295
SLJIT_ASSERT(step >= 0 && step <= 1);
1296
1297
if (chr < 0x7fff)
1298
{
1299
if (step == 1)
1300
return;
1301
1302
/* VREPI */
1303
instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4));
1304
instruction[1] = (sljit_u16)chr;
1305
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1306
sljit_emit_op_custom(compiler, instruction, 6);
1307
return;
1308
}
1309
1310
if (step == 0)
1311
{
1312
OP1(SLJIT_MOV, tmp_general_reg, 0, SLJIT_IMM, chr);
1313
1314
/* VLVG */
1315
instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(SLJIT_GP_REGISTER, tmp_general_reg));
1316
instruction[1] = 0;
1317
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x22);
1318
sljit_emit_op_custom(compiler, instruction, 6);
1319
return;
1320
}
1321
1322
/* VREP */
1323
instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | dst_vreg);
1324
instruction[1] = 0;
1325
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xc << 8) | 0x4d);
1326
sljit_emit_op_custom(compiler, instruction, 6);
1327
}
1328
1329
#endif
1330
1331
static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
1332
int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
1333
{
1334
sljit_u16 instruction[3];
1335
1336
SLJIT_ASSERT(step >= 0 && step <= 2);
1337
1338
if (step == 1)
1339
{
1340
/* VCEQ */
1341
instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
1342
instruction[1] = (sljit_u16)(cmp1_ind << 12);
1343
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
1344
sljit_emit_op_custom(compiler, instruction, 6);
1345
return;
1346
}
1347
1348
if (compare_type != vector_compare_match2)
1349
{
1350
if (step == 0 && compare_type == vector_compare_match1i)
1351
{
1352
/* VO */
1353
instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
1354
instruction[1] = (sljit_u16)(cmp2_ind << 12);
1355
instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);
1356
sljit_emit_op_custom(compiler, instruction, 6);
1357
}
1358
return;
1359
}
1360
1361
switch (step)
1362
{
1363
case 0:
1364
/* VCEQ */
1365
instruction[0] = (sljit_u16)(0xe700 | (tmp_ind << 4) | dst_ind);
1366
instruction[1] = (sljit_u16)(cmp2_ind << 12);
1367
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
1368
sljit_emit_op_custom(compiler, instruction, 6);
1369
return;
1370
1371
case 2:
1372
/* VO */
1373
instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
1374
instruction[1] = (sljit_u16)(tmp_ind << 12);
1375
instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);
1376
sljit_emit_op_custom(compiler, instruction, 6);
1377
return;
1378
}
1379
}
1380
1381
#define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
1382
1383
static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
1384
{
1385
DEFINE_COMPILER;
1386
sljit_u16 instruction[3];
1387
struct sljit_label *start;
1388
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1389
struct sljit_label *restart;
1390
#endif
1391
struct sljit_jump *quit;
1392
struct sljit_jump *partial_quit[2];
1393
vector_compare_type compare_type = vector_compare_match1;
1394
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
1395
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
1396
sljit_s32 data_ind = 0;
1397
sljit_s32 tmp_ind = 1;
1398
sljit_s32 cmp1_ind = 2;
1399
sljit_s32 cmp2_ind = 3;
1400
sljit_s32 zero_ind = 4;
1401
sljit_u32 bit = 0;
1402
int i;
1403
1404
SLJIT_UNUSED_ARG(offset);
1405
1406
if (char1 != char2)
1407
{
1408
bit = char1 ^ char2;
1409
compare_type = vector_compare_match1i;
1410
1411
if (!is_powerof2(bit))
1412
{
1413
bit = 0;
1414
compare_type = vector_compare_match2;
1415
}
1416
}
1417
1418
partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1419
if (common->mode == PCRE2_JIT_COMPLETE)
1420
add_jump(compiler, &common->failed_match, partial_quit[0]);
1421
1422
/* First part (unaligned start) */
1423
1424
OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
1425
1426
#if PCRE2_CODE_UNIT_WIDTH != 32
1427
1428
/* VREPI */
1429
instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1430
instruction[1] = (sljit_u16)(char1 | bit);
1431
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1432
sljit_emit_op_custom(compiler, instruction, 6);
1433
1434
if (char1 != char2)
1435
{
1436
/* VREPI */
1437
instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1438
instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1439
/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1440
sljit_emit_op_custom(compiler, instruction, 6);
1441
}
1442
1443
#else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1444
1445
for (i = 0; i < 2; i++)
1446
{
1447
replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP1);
1448
1449
if (char1 != char2)
1450
replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP1);
1451
}
1452
1453
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1454
1455
if (compare_type == vector_compare_match2)
1456
{
1457
/* VREPI */
1458
instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1459
instruction[1] = 0;
1460
instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1461
sljit_emit_op_custom(compiler, instruction, 6);
1462
}
1463
1464
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1465
restart = LABEL();
1466
#endif
1467
1468
load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
1469
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1470
1471
if (compare_type != vector_compare_match2)
1472
{
1473
if (compare_type == vector_compare_match1i)
1474
fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1475
1476
  /* VFEE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
  sljit_emit_op_custom(compiler, instruction, 6);
  }
else
  {
  for (i = 0; i < 3; i++)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFENE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
  sljit_emit_op_custom(compiler, instruction, 6);
  }

/* VLGVB */
instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
instruction[1] = 7;
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
sljit_emit_op_custom(compiler, instruction, 6);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);

OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);

/* Second part (aligned) */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);

partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit[1]);

load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);

if (compare_type != vector_compare_match2)
  {
  if (compare_type == vector_compare_match1i)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFEE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
  sljit_emit_op_custom(compiler, instruction, 6);
  }
else
  {
  for (i = 0; i < 3; i++)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFENE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
  sljit_emit_op_custom(compiler, instruction, 6);
  }

sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
JUMPTO(SLJIT_OVERFLOW, start);

/* VLGVB */
instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
instruction[1] = 7;
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
sljit_emit_op_custom(compiler, instruction, 6);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

JUMPHERE(quit);

if (common->mode != PCRE2_JIT_COMPLETE)
  {
  JUMPHERE(partial_quit[0]);
  JUMPHERE(partial_quit[1]);
  OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
  SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
  }
else
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf && offset > 0)
  {
  SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);

  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));

  quit = jump_if_utf_char_start(compiler, TMP1);

  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

  OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
  JUMPTO(SLJIT_JUMP, restart);

  JUMPHERE(quit);
  }
#endif
}

#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD 1
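
/* Scans forward from TMP1 for a code unit equal to char1 or char2. On
success TMP1 is left pointing at the match; the returned jump_list collects
the exits taken when STR_END is reached first. */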
static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
{
DEFINE_COMPILER;
sljit_u16 instruction[3];
struct sljit_label *start;
struct sljit_jump *quit;
jump_list *not_found = NULL;
vector_compare_type compare_type = vector_compare_match1;
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 tmp3_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP3);
sljit_s32 data_ind = 0;
sljit_s32 tmp_ind = 1;
sljit_s32 cmp1_ind = 2;
sljit_s32 cmp2_ind = 3;
sljit_s32 zero_ind = 4;
sljit_u32 bit = 0;
int i;
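
/* When char1 and char2 differ in exactly one bit (for example an ASCII
letter and its other-case form, which differ only in 0x20), that bit is ORed
into both the loaded data and the comparison value, so a single vector
compare matches either character (vector_compare_match1i). Otherwise two
separate compares are needed (vector_compare_match2). */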
1602
if (char1 != char2)
1603
{
1604
bit = char1 ^ char2;
1605
compare_type = vector_compare_match1i;
1606
1607
if (!is_powerof2(bit))
1608
{
1609
bit = 0;
1610
compare_type = vector_compare_match2;
1611
}
1612
}
1613
1614
add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1615
1616
/* First part (unaligned start) */
1617
1618
OP2(SLJIT_ADD, TMP2, 0, TMP1, 0, SLJIT_IMM, 16);
1619
1620
#if PCRE2_CODE_UNIT_WIDTH != 32
1621
1622
/* VREPI */
1623
instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1624
instruction[1] = (sljit_u16)(char1 | bit);
1625
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1626
sljit_emit_op_custom(compiler, instruction, 6);
1627
1628
if (char1 != char2)
1629
{
1630
/* VREPI */
1631
instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1632
instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1633
/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1634
sljit_emit_op_custom(compiler, instruction, 6);
1635
}
1636
1637
#else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1638
1639
for (i = 0; i < 2; i++)
1640
{
1641
replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP3);
1642
1643
if (char1 != char2)
1644
replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP3);
1645
}
1646
1647
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1648
1649
if (compare_type == vector_compare_match2)
1650
{
1651
/* VREPI */
1652
instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1653
instruction[1] = 0;
1654
instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1655
sljit_emit_op_custom(compiler, instruction, 6);
1656
}
1657
1658
load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1659
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1660
1661
if (compare_type != vector_compare_match2)
1662
{
1663
if (compare_type == vector_compare_match1i)
1664
fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1665
1666
/* VFEE */
1667
instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1668
instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1669
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1670
sljit_emit_op_custom(compiler, instruction, 6);
1671
}
1672
else
1673
{
1674
for (i = 0; i < 3; i++)
1675
fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1676
1677
/* VFENE */
1678
instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1679
instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1680
instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1681
sljit_emit_op_custom(compiler, instruction, 6);
1682
}
1683
1684
/* VLGVB */
1685
instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1686
instruction[1] = 7;
1687
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1688
sljit_emit_op_custom(compiler, instruction, 6);
1689
1690
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1691
quit = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
1692
1693
OP2(SLJIT_SUB, TMP1, 0, TMP2, 0, SLJIT_IMM, 16);
1694
1695
/* Second part (aligned) */
1696
start = LABEL();
1697
1698
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 16);
1699
1700
add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1701
1702
load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1703
1704
if (compare_type != vector_compare_match2)
1705
{
1706
if (compare_type == vector_compare_match1i)
1707
fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1708
1709
/* VFEE */
1710
instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1711
instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1712
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1713
sljit_emit_op_custom(compiler, instruction, 6);
1714
}
1715
else
1716
{
1717
for (i = 0; i < 3; i++)
1718
fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1719
1720
/* VFENE */
1721
instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1722
instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1723
instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1724
sljit_emit_op_custom(compiler, instruction, 6);
1725
}
1726
1727
sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1728
JUMPTO(SLJIT_OVERFLOW, start);
1729
1730
/* VLGVB */
1731
instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1732
instruction[1] = 7;
1733
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1734
sljit_emit_op_custom(compiler, instruction, 6);
1735
1736
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1737
1738
JUMPHERE(quit);
1739
add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1740
1741
return not_found;
1742
}
1743
1744
#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
1745
1746
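
/* Searches for the first position where char1a/char1b appears at offset
offs1 and char2a/char2b appears at offset offs2 of the same match attempt
(offs1 > offs2). STR_PTR is advanced to that candidate; when no such
position exists before STR_END, control continues on common->failed_match. */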
static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
sljit_u16 instruction[3];
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *quit;
struct sljit_jump *jump[2];
vector_compare_type compare1_type = vector_compare_match1;
vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
sljit_s32 diff = IN_UCHARS(offs2 - offs1);
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data1_ind = 0;
sljit_s32 data2_ind = 1;
sljit_s32 tmp1_ind = 2;
sljit_s32 tmp2_ind = 3;
sljit_s32 cmp1a_ind = 4;
sljit_s32 cmp1b_ind = 5;
sljit_s32 cmp2a_ind = 6;
sljit_s32 cmp2b_ind = 7;
sljit_s32 zero_ind = 8;
int i;

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
SLJIT_ASSERT(-diff <= (sljit_s32)IN_UCHARS(max_fast_forward_char_pair_offset()));
SLJIT_ASSERT(tmp1_reg_ind != 0 && tmp2_reg_ind != 0);

if (char1a != char1b)
  {
  bit1 = char1a ^ char1b;
  compare1_type = vector_compare_match1i;

  if (!is_powerof2(bit1))
    {
    bit1 = 0;
    compare1_type = vector_compare_match2;
    }
  }

if (char2a != char2b)
  {
  bit2 = char2a ^ char2b;
  compare2_type = vector_compare_match1i;

  if (!is_powerof2(bit2))
    {
    bit2 = 0;
    compare2_type = vector_compare_match2;
    }
  }

/* Initialize. */
if (common->match_end_ptr != 0)
  {
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));

  OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
  SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
  }

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);

#if PCRE2_CODE_UNIT_WIDTH != 32

OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);

/* VREPI */
instruction[0] = (sljit_u16)(0xe700 | (cmp1a_ind << 4));
instruction[1] = (sljit_u16)(char1a | bit1);
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
sljit_emit_op_custom(compiler, instruction, 6);

if (char1a != char1b)
  {
  /* VREPI */
  instruction[0] = (sljit_u16)(0xe700 | (cmp1b_ind << 4));
  instruction[1] = (sljit_u16)(bit1 != 0 ? bit1 : char1b);
  /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
  sljit_emit_op_custom(compiler, instruction, 6);
  }

/* VREPI */
instruction[0] = (sljit_u16)(0xe700 | (cmp2a_ind << 4));
instruction[1] = (sljit_u16)(char2a | bit2);
/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
sljit_emit_op_custom(compiler, instruction, 6);

if (char2a != char2b)
  {
  /* VREPI */
  instruction[0] = (sljit_u16)(0xe700 | (cmp2b_ind << 4));
  instruction[1] = (sljit_u16)(bit2 != 0 ? bit2 : char2b);
  /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
  sljit_emit_op_custom(compiler, instruction, 6);
  }

#else /* PCRE2_CODE_UNIT_WIDTH == 32 */

for (i = 0; i < 2; i++)
  {
  replicate_imm_vector(compiler, i, cmp1a_ind, char1a | bit1, TMP1);

  if (char1a != char1b)
    replicate_imm_vector(compiler, i, cmp1b_ind, bit1 != 0 ? bit1 : char1b, TMP1);

  replicate_imm_vector(compiler, i, cmp2a_ind, char2a | bit2, TMP1);

  if (char2a != char2b)
    replicate_imm_vector(compiler, i, cmp2b_ind, bit2 != 0 ? bit2 : char2b, TMP1);
  }

OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);

#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */

/* VREPI */
instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
instruction[1] = 0;
instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
sljit_emit_op_custom(compiler, instruction, 6);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif

jump[0] = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
load_from_mem_vector(compiler, TRUE, data2_ind, tmp1_reg_ind, 0);
jump[1] = JUMP(SLJIT_JUMP);
JUMPHERE(jump[0]);
load_from_mem_vector(compiler, FALSE, data2_ind, tmp1_reg_ind, 0);
JUMPHERE(jump[1]);

load_from_mem_vector(compiler, TRUE, data1_ind, str_ptr_reg_ind, 0);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 16);

for (i = 0; i < 3; i++)
  {
  fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
  fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
  }
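
/* VN (vector AND) keeps only those byte positions where both the char1 and
the char2 masks matched; VFENE against the zero vector then locates the
first such position within the 16 byte block. */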
/* VN */
instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
instruction[1] = (sljit_u16)(data2_ind << 12);
instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
sljit_emit_op_custom(compiler, instruction, 6);

/* VFENE */
instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
sljit_emit_op_custom(compiler, instruction, 6);

/* VLGVB */
instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data1_ind);
instruction[1] = 7;
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
sljit_emit_op_custom(compiler, instruction, 6);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);

OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, diff);

/* Main loop. */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

load_from_mem_vector(compiler, FALSE, data1_ind, str_ptr_reg_ind, 0);
load_from_mem_vector(compiler, FALSE, data2_ind, str_ptr_reg_ind, tmp1_reg_ind);

for (i = 0; i < 3; i++)
  {
  fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
  fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
  }

/* VN */
instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
instruction[1] = (sljit_u16)(data2_ind << 12);
instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
sljit_emit_op_custom(compiler, instruction, 6);

/* VFENE */
instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
sljit_emit_op_custom(compiler, instruction, 6);

sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
JUMPTO(SLJIT_OVERFLOW, start);

/* VLGVB */
instruction[0] = (sljit_u16)(0xe700 | (tmp2_reg_ind << 4) | data1_ind);
instruction[1] = 7;
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
sljit_emit_op_custom(compiler, instruction, 6);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

JUMPHERE(quit);

add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
  {
  SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);

  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));

  quit = jump_if_utf_char_start(compiler, TMP1);

  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

  /* TMP1 contains diff. */
  OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
  OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
  JUMPTO(SLJIT_JUMP, restart);

  JUMPHERE(quit);
  }
#endif

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));

if (common->match_end_ptr != 0)
  OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
}

#endif /* SLJIT_CONFIG_S390X */

#if (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64)

#ifdef __linux__
/* Using getauxval(AT_HWCAP) under Linux for detecting whether LSX is available */
#include <sys/auxv.h>
#define LOONGARCH_HWCAP_LSX (1 << 4)
#define HAS_LSX_SUPPORT ((getauxval(AT_HWCAP) & LOONGARCH_HWCAP_LSX) != 0)
#else
#define HAS_LSX_SUPPORT 0
#endif
typedef sljit_u32 sljit_ins;

#define SI12_IMM_MASK 0x003ffc00
#define UI5_IMM_MASK 0x00007c00
#define UI2_IMM_MASK 0x00000c00

#define VD(vd) ((sljit_ins)vd << 0)
#define VJ(vj) ((sljit_ins)vj << 5)
#define VK(vk) ((sljit_ins)vk << 10)
#define RD_V(rd) ((sljit_ins)rd << 0)
#define RJ_V(rj) ((sljit_ins)rj << 5)

#define IMM_SI12(imm) (((sljit_ins)(imm) << 10) & SI12_IMM_MASK)
#define IMM_UI5(imm) (((sljit_ins)(imm) << 10) & UI5_IMM_MASK)
#define IMM_UI2(imm) (((sljit_ins)(imm) << 10) & UI2_IMM_MASK)
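
/* The macros above pack LSX operands into fixed fields of the 32-bit
instruction word: vd/rd occupy bits 0-4, vj/rj bits 5-9, and vk or an
immediate starts at bit 10, masked to its field width. They are ORed into
the base opcodes defined below. */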
// LSX OPCODES:
#define VBSLL_V 0x728e0000
#define VMSKLTZ_B 0x729c4000
#define VPICKVE2GR_WU 0x72f3e000

#if PCRE2_CODE_UNIT_WIDTH == 8
#define VREPLGR2VR_X 0x729f0000
#define VSEQ 0x70000000
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define VREPLGR2VR_X 0x729f0400
#define VSEQ 0x70008000
#else
#define VREPLGR2VR_X 0x729f0800
#define VSEQ 0x70010000
#endif
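
/* Turns the data loaded into dst_ind into a per-element match mask (all-one
bits for matching elements): a single VSEQ for vector_compare_match1, an OR
with the distinguishing bit followed by one VSEQ for vector_compare_match1i,
and two VSEQs whose results are ORed together for vector_compare_match2. */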
static void fast_forward_char_pair_lsx_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
if (compare_type != vector_compare_match2)
  {
  if (compare_type == vector_compare_match1i)
    {
    /* VOR.V vd, vj, vk */
    push_inst(compiler, VOR_V | VD(dst_ind) | VJ(cmp2_ind) | VK(dst_ind));
    }

  /* VSEQ.B/H/W vd, vj, vk */
  push_inst(compiler, VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind));
  return;
  }

/* VBSLL.V vd, vj, ui5 */
push_inst(compiler, VBSLL_V | VD(tmp_ind) | VJ(dst_ind) | IMM_UI5(0));

/* VSEQ.B/H/W vd, vj, vk */
push_inst(compiler, VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind));

/* VSEQ.B/H/W vd, vj, vk */
push_inst(compiler, VSEQ | VD(tmp_ind) | VJ(tmp_ind) | VK(cmp2_ind));

/* VOR vd, vj, vk */
push_inst(compiler, VOR_V | VD(dst_ind) | VJ(tmp_ind) | VK(dst_ind));
return;
}

#define JIT_HAS_FAST_FORWARD_CHAR_SIMD HAS_LSX_SUPPORT

static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *quit;
struct sljit_jump *partial_quit[2];
vector_compare_type compare_type = vector_compare_match1;
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data_ind = 0;
sljit_s32 tmp_ind = 1;
sljit_s32 cmp1_ind = 2;
sljit_s32 cmp2_ind = 3;
sljit_u32 bit = 0;

SLJIT_UNUSED_ARG(offset);

if (char1 != char2)
  {
  bit = char1 ^ char2;
  compare_type = vector_compare_match1i;

  if (!is_powerof2(bit))
    {
    bit = 0;
    compare_type = vector_compare_match2;
    }
  }

partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit[0]);

/* First part (unaligned start) */

OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);

/* VREPLGR2VR.B/H/W vd, rj */
push_inst(compiler, VREPLGR2VR_X | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));

if (char1 != char2)
  {
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);

  /* VREPLGR2VR.B/H/W vd, rj */
  push_inst(compiler, VREPLGR2VR_X | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));
  }

OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif

OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
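
/* STR_PTR has been rounded down to a 16 byte boundary (TMP2 holds the
misalignment). The whole aligned block is loaded and compared; the resulting
bit mask is shifted right by TMP2 below so that matches in front of the
original STR_PTR are ignored. */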
/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Second part (aligned) */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);

partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit[1]);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(quit);
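
/* VMSKLTZ.B packed the sign bit of every mask byte into TMP1, so counting
the trailing zero bits gives the byte offset of the first match within the
block; adding it to STR_PTR lands on the matching character. */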
/* CTZ.W rd, rj */
push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

if (common->mode != PCRE2_JIT_COMPLETE)
  {
  JUMPHERE(partial_quit[0]);
  JUMPHERE(partial_quit[1]);
  OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
  SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
  }
else
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf && offset > 0)
  {
  SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);

  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));

  quit = jump_if_utf_char_start(compiler, TMP1);

  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
  OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
  JUMPTO(SLJIT_JUMP, restart);

  JUMPHERE(quit);
  }
#endif
}

#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD HAS_LSX_SUPPORT

static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
{
DEFINE_COMPILER;
struct sljit_label *start;
struct sljit_jump *quit;
jump_list *not_found = NULL;
vector_compare_type compare_type = vector_compare_match1;
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data_ind = 0;
sljit_s32 tmp_ind = 1;
sljit_s32 cmp1_ind = 2;
sljit_s32 cmp2_ind = 3;
sljit_u32 bit = 0;

if (char1 != char2)
  {
  bit = char1 ^ char2;
  compare_type = vector_compare_match1i;

  if (!is_powerof2(bit))
    {
    bit = 0;
    compare_type = vector_compare_match2;
    }
  }

add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);

/* First part (unaligned start) */

OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);

/* VREPLGR2VR.B/H/W vd, rj */
push_inst(compiler, VREPLGR2VR_X | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));

if (char1 != char2)
  {
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);
  /* VREPLGR2VR.B/H/W vd, rj */
  push_inst(compiler, VREPLGR2VR_X | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));
  }

OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Second part (aligned) */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);

add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(quit);

/* CTZ.W rd, rj */
push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));

OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));

OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
return not_found;
}

#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD HAS_LSX_SUPPORT
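
/* LSX version of the two-character fast forward: finds the first position
where char1a/char1b occurs at offset offs1 and char2a/char2b occurs at
offset offs2 (offs1 > offs2), leaving STR_PTR on that candidate or jumping
to common->failed_match. */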
static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
vector_compare_type compare1_type = vector_compare_match1;
vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data1_ind = 0;
sljit_s32 data2_ind = 1;
sljit_s32 tmp1_ind = 2;
sljit_s32 tmp2_ind = 3;
sljit_s32 cmp1a_ind = 4;
sljit_s32 cmp1b_ind = 5;
sljit_s32 cmp2a_ind = 6;
sljit_s32 cmp2b_ind = 7;
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *jump[2];

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));

/* Initialize. */
if (common->match_end_ptr != 0)
  {
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);

  OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
  SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
  }

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

if (char1a == char1b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);
else
  {
  bit1 = char1a ^ char1b;
  if (is_powerof2(bit1))
    {
    compare1_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a | bit1);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit1);
    }
  else
    {
    compare1_type = vector_compare_match2;
    bit1 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char1b);
    }
  }

/* VREPLGR2VR.B/H/W vd, rj */
push_inst(compiler, VREPLGR2VR_X | VD(cmp1a_ind) | RJ_V(tmp1_reg_ind));

if (char1a != char1b)
  {
  /* VREPLGR2VR.B/H/W vd, rj */
  push_inst(compiler, VREPLGR2VR_X | VD(cmp1b_ind) | RJ_V(tmp2_reg_ind));
  }

if (char2a == char2b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);
else
  {
  bit2 = char2a ^ char2b;
  if (is_powerof2(bit2))
    {
    compare2_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a | bit2);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit2);
    }
  else
    {
    compare2_type = vector_compare_match2;
    bit2 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char2b);
    }
  }

/* VREPLGR2VR.B/H/W vd, rj */
push_inst(compiler, VREPLGR2VR_X | VD(cmp2a_ind) | RJ_V(tmp1_reg_ind));

if (char2a != char2b)
  {
  /* VREPLGR2VR.B/H/W vd, rj */
  push_inst(compiler, VREPLGR2VR_X | VD(cmp2b_ind) | RJ_V(tmp2_reg_ind));
  }

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif

OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
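
/* data2 must describe the 16 bytes at STR_PTR - diff. Depending on where
the char2 position (TMP1) falls relative to the aligned block, it is either
loaded directly from STR_PTR - diff or synthesized from data1 with VBSLL.V,
which shifts the bytes up by diff and fills the gap with zeroes. */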
jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));
jump[1] = JUMP(SLJIT_JUMP);

JUMPHERE(jump[0]);

/* VBSLL.V vd, vj, ui5 */
push_inst(compiler, VBSLL_V | VD(data2_ind) | VJ(data1_ind) | IMM_UI5(diff));

JUMPHERE(jump[1]);

fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
/* VOR.V vd, vj, vk (combines the two match masks; the main loop below uses
VAND.V for the same step) */
push_inst(compiler, VOR_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));

/* Ignore matches before the first STR_PTR. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Main loop. */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));

fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);

/* VAND.V vd, vj, vk */
push_inst(compiler, VAND_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(jump[0]);

/* CTZ.W rd, rj */
push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
  {
  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));

  jump[0] = jump_if_utf_char_start(compiler, TMP1);

  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);

  add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));

  JUMPHERE(jump[0]);
  }
#endif

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));

if (common->match_end_ptr != 0)
  OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
}

#endif /* SLJIT_CONFIG_LOONGARCH_64 */

#endif /* !SUPPORT_VALGRIND */