Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/bionic-x86_64-string/ssse3-strcmp-slm.S
39475 views
1
/*
2
Copyright (c) 2014, Intel Corporation
3
All rights reserved.
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
* Redistributions of source code must retain the above copyright notice,
9
* this list of conditions and the following disclaimer.
10
11
* Redistributions in binary form must reproduce the above copyright notice,
12
* this list of conditions and the following disclaimer in the documentation
13
* and/or other materials provided with the distribution.
14
15
* Neither the name of Intel Corporation nor the names of its contributors
16
* may be used to endorse or promote products derived from this software
17
* without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
#ifdef USE_AS_STRNCMP
/*
 * UPDATE_STRNCMP_COUNTER -- charge the first, partial 16-byte block
 * against the strncmp byte budget.
 *
 * In:      %r11 = number of bytes still to compare (unsigned)
 *          %rcx = byte offset that was used to shift the comparison
 *                 mask of the first block
 * Out:     %r11 = %r11 - (16 - %rcx), when that value is non-zero and
 *                 did not wrap around
 * Clobber: %r9, flags
 *
 * Since the counter, %r11, is unsigned, we branch to L(strcmp_exitz)
 * if the new counter > the old one (the subtraction wrapped around,
 * i.e. the budget ended inside the first block) or if it is 0.
 */
#define UPDATE_STRNCMP_COUNTER				\
	/* calculate the number of bytes left to compare */	\
	lea	-16(%rcx, %r11), %r9;			\
	cmp	%r9, %r11;				\
	jb	L(strcmp_exitz);			\
	test	%r9, %r9;				\
	je	L(strcmp_exitz);			\
	mov	%r9, %r11

#else
/*
 * Plain strcmp build: there is no byte budget to maintain, so the
 * macro expands to nothing.  The entry point is named strcmp unless
 * the including file already chose a name.
 */
#define UPDATE_STRNCMP_COUNTER
#ifndef STRCMP
#define STRCMP strcmp
#endif
#endif
49
50
/*
 * L(label) pastes a ".L" prefix onto the label name, which makes it a
 * local label in GNU as (not exported to the symbol table).
 */
#ifndef L
# define L(label) .L##label
#endif
53
54
/* Fallback spelling of the CFI region opener, used by ENTRY(). */
#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif
57
58
/* Fallback spelling of the CFI region closer, used by END(). */
#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif
61
62
/*
 * ENTRY(name) -- open a function definition:
 *   - mark `name' as a function symbol and make it global,
 *   - align its first instruction to a 16-byte boundary,
 *   - emit the label and start its CFI region.
 * Must be paired with END(name).
 */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name, @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif
70
71
/*
 * END(name) -- close a function opened with ENTRY(name): end the CFI
 * region and record the symbol size as the distance from `name' to
 * the current location.
 */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif
76
/* Function-return instruction, spelled as a macro.
   NOTE(review): no use of RETURN is visible in this part of the file;
   presumably the exit paths further down use it -- confirm. */
#define RETURN ret
77
/* Emit the routine into its own section, .text.ssse3: "ax" marks the
   section allocatable and executable, @progbits says it holds data. */
.section .text.ssse3,"ax",@progbits
78
ENTRY (STRCMP)
79
/*
80
* This implementation uses SSE to compare up to 16 bytes at a time.
81
*/
82
#ifdef USE_AS_STRNCMP
83
test %rdx, %rdx
84
je L(strcmp_exitz)
85
cmp $1, %rdx
86
je L(Byte0)
87
mov %rdx, %r11
88
#endif
89
mov %esi, %ecx
90
mov %edi, %eax
91
/* Use 64bit AND here to avoid long NOP padding. */
92
and $0x3f, %rcx /* rsi alignment in cache line */
93
and $0x3f, %rax /* rdi alignment in cache line */
94
cmp $0x30, %ecx
95
ja L(crosscache) /* rsi: 16-byte load will cross cache line */
96
cmp $0x30, %eax
97
ja L(crosscache) /* rdi: 16-byte load will cross cache line */
98
movlpd (%rdi), %xmm1
99
movlpd (%rsi), %xmm2
100
movhpd 8(%rdi), %xmm1
101
movhpd 8(%rsi), %xmm2
102
pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
103
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
104
pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
105
psubb %xmm0, %xmm1 /* packed sub of comparison results*/
106
pmovmskb %xmm1, %edx
107
sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
108
jnz L(less16bytes) /* If not, find different value or null char */
109
#ifdef USE_AS_STRNCMP
110
sub $16, %r11
111
jbe L(strcmp_exitz) /* finish comparison */
112
#endif
113
add $16, %rsi /* prepare to search next 16 bytes */
114
add $16, %rdi /* prepare to search next 16 bytes */
115
116
/*
117
* Determine source and destination string offsets from 16-byte alignment.
118
* Use relative offset difference between the two to determine which case
119
* below to use.
120
*/
121
.p2align 4
122
L(crosscache):
123
and $0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */
124
and $0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */
125
mov $0xffff, %edx /* for equivalent offset */
126
xor %r8d, %r8d
127
and $0xf, %ecx /* offset of rsi */
128
and $0xf, %eax /* offset of rdi */
129
cmp %eax, %ecx
130
je L(ashr_0) /* rsi and rdi relative offset same */
131
ja L(bigger)
132
mov %edx, %r8d /* r8d is offset flag for exit tail */
133
xchg %ecx, %eax
134
xchg %rsi, %rdi
135
L(bigger):
136
lea 15(%rax), %r9
137
sub %rcx, %r9
138
lea L(unaligned_table)(%rip), %r10
139
movslq (%r10, %r9,4), %r9
140
lea (%r10, %r9), %r10
141
jmp *%r10 /* jump to corresponding case */
142
143
/*
144
* The following cases will be handled by ashr_0
145
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
146
* n(0~15) n(0~15) 15(15+ n-n) ashr_0
147
*/
148
.p2align 4
149
L(ashr_0):
150
151
movdqa (%rsi), %xmm1
152
pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
153
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
154
pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
155
psubb %xmm0, %xmm1 /* packed sub of comparison results*/
156
pmovmskb %xmm1, %r9d
157
shr %cl, %edx /* adjust 0xffff for offset */
158
shr %cl, %r9d /* adjust for 16-byte offset */
159
sub %r9d, %edx
160
/*
161
* edx equals r9d only if the remaining (16-rcx) bytes of the first
161
* block are equal in both strings and no null char was seen.
163
*/
164
jne L(less32bytes) /* mismatch or null char */
165
UPDATE_STRNCMP_COUNTER
166
mov $16, %rcx
167
mov $16, %r9
168
pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
169
170
/*
171
* Now both strings are aligned at 16-byte boundary. Loop over strings
172
* checking 32-bytes per iteration.
173
*/
174
.p2align 4
175
L(loop_ashr_0):
176
movdqa (%rsi, %rcx), %xmm1
177
movdqa (%rdi, %rcx), %xmm2
178
179
pcmpeqb %xmm1, %xmm0
180
pcmpeqb %xmm2, %xmm1
181
psubb %xmm0, %xmm1
182
pmovmskb %xmm1, %edx
183
sub $0xffff, %edx
184
jnz L(exit) /* mismatch or null char seen */
185
186
#ifdef USE_AS_STRNCMP
187
sub $16, %r11
188
jbe L(strcmp_exitz)
189
#endif
190
add $16, %rcx
191
movdqa (%rsi, %rcx), %xmm1
192
movdqa (%rdi, %rcx), %xmm2
193
194
pcmpeqb %xmm1, %xmm0
195
pcmpeqb %xmm2, %xmm1
196
psubb %xmm0, %xmm1
197
pmovmskb %xmm1, %edx
198
sub $0xffff, %edx
199
jnz L(exit)
200
#ifdef USE_AS_STRNCMP
201
sub $16, %r11
202
jbe L(strcmp_exitz)
203
#endif
204
add $16, %rcx
205
jmp L(loop_ashr_0)
206
207
/*
208
* The following cases will be handled by ashr_1
209
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
210
* n(15) n -15 0(15 +(n-15) - n) ashr_1
211
*/
212
.p2align 4
213
L(ashr_1):
214
pxor %xmm0, %xmm0
215
movdqa (%rdi), %xmm2
216
movdqa (%rsi), %xmm1
217
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
218
pslldq $15, %xmm2 /* shift first string to align with second */
219
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
220
psubb %xmm0, %xmm2 /* packed sub of comparison results*/
221
pmovmskb %xmm2, %r9d
222
shr %cl, %edx /* adjust 0xffff for offset */
223
shr %cl, %r9d /* adjust for 16-byte offset */
224
sub %r9d, %edx
225
jnz L(less32bytes) /* mismatch or null char seen */
226
movdqa (%rdi), %xmm3
227
UPDATE_STRNCMP_COUNTER
228
229
pxor %xmm0, %xmm0
230
mov $16, %rcx /* index for loads*/
231
mov $1, %r9d /* byte position left over from less32bytes case */
232
/*
233
* Set up %r10 so that we can detect crossing a page boundary.
234
* When %r10 goes positive we have crossed a page boundary and
235
* need to do a nibble.
236
*/
237
lea 1(%rdi), %r10
238
and $0xfff, %r10 /* offset into 4K page */
239
sub $0x1000, %r10 /* subtract 4K pagesize */
240
241
.p2align 4
242
L(loop_ashr_1):
243
add $16, %r10
244
jg L(nibble_ashr_1) /* cross page boundary */
245
246
L(gobble_ashr_1):
247
movdqa (%rsi, %rcx), %xmm1
248
movdqa (%rdi, %rcx), %xmm2
249
movdqa %xmm2, %xmm4 /* store for next cycle */
250
251
palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
252
253
pcmpeqb %xmm1, %xmm0
254
pcmpeqb %xmm2, %xmm1
255
psubb %xmm0, %xmm1
256
pmovmskb %xmm1, %edx
257
sub $0xffff, %edx
258
jnz L(exit)
259
260
#ifdef USE_AS_STRNCMP
261
sub $16, %r11
262
jbe L(strcmp_exitz)
263
#endif
264
add $16, %rcx
265
movdqa %xmm4, %xmm3
266
267
add $16, %r10
268
jg L(nibble_ashr_1) /* cross page boundary */
269
270
movdqa (%rsi, %rcx), %xmm1
271
movdqa (%rdi, %rcx), %xmm2
272
movdqa %xmm2, %xmm4 /* store for next cycle */
273
274
palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
275
276
pcmpeqb %xmm1, %xmm0
277
pcmpeqb %xmm2, %xmm1
278
psubb %xmm0, %xmm1
279
pmovmskb %xmm1, %edx
280
sub $0xffff, %edx
281
jnz L(exit)
282
283
#ifdef USE_AS_STRNCMP
284
sub $16, %r11
285
jbe L(strcmp_exitz)
286
#endif
287
add $16, %rcx
288
movdqa %xmm4, %xmm3
289
jmp L(loop_ashr_1)
290
291
/*
292
* Nibble avoids loads across page boundary. This is to avoid a potential
293
* access into unmapped memory.
294
*/
295
.p2align 4
296
L(nibble_ashr_1):
297
pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/
298
pmovmskb %xmm0, %edx
299
test $0xfffe, %edx
300
jnz L(ashr_1_exittail) /* find null char*/
301
302
#ifdef USE_AS_STRNCMP
303
cmp $14, %r11
304
jbe L(ashr_1_exittail)
305
#endif
306
307
pxor %xmm0, %xmm0
308
sub $0x1000, %r10 /* subtract 4K from %r10 */
309
jmp L(gobble_ashr_1)
310
311
/*
312
* Once a null char is found, determine if there is a string mismatch
313
* before the null char.
314
*/
315
.p2align 4
316
L(ashr_1_exittail):
317
movdqa (%rsi, %rcx), %xmm1
318
psrldq $1, %xmm0
319
psrldq $1, %xmm3
320
jmp L(aftertail)
321
322
/*
323
* The following cases will be handled by ashr_2
324
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
325
* n(14~15) n -14 1(15 +(n-14) - n) ashr_2
326
*/
327
.p2align 4
328
L(ashr_2):
329
pxor %xmm0, %xmm0
330
movdqa (%rdi), %xmm2
331
movdqa (%rsi), %xmm1
332
pcmpeqb %xmm1, %xmm0
333
pslldq $14, %xmm2
334
pcmpeqb %xmm1, %xmm2
335
psubb %xmm0, %xmm2
336
pmovmskb %xmm2, %r9d
337
shr %cl, %edx
338
shr %cl, %r9d
339
sub %r9d, %edx
340
jnz L(less32bytes)
341
movdqa (%rdi), %xmm3
342
UPDATE_STRNCMP_COUNTER
343
344
pxor %xmm0, %xmm0
345
mov $16, %rcx /* index for loads */
346
mov $2, %r9d /* byte position left over from less32bytes case */
347
/*
348
* Set up %r10 so that we can detect crossing a page boundary.
349
* When %r10 goes positive we have crossed a page boundary and
350
* need to do a nibble.
351
*/
352
lea 2(%rdi), %r10
353
and $0xfff, %r10 /* offset into 4K page */
354
sub $0x1000, %r10 /* subtract 4K pagesize */
355
356
.p2align 4
357
L(loop_ashr_2):
358
add $16, %r10
359
jg L(nibble_ashr_2)
360
361
L(gobble_ashr_2):
362
movdqa (%rsi, %rcx), %xmm1
363
movdqa (%rdi, %rcx), %xmm2
364
movdqa %xmm2, %xmm4
365
366
palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
367
368
pcmpeqb %xmm1, %xmm0
369
pcmpeqb %xmm2, %xmm1
370
psubb %xmm0, %xmm1
371
pmovmskb %xmm1, %edx
372
sub $0xffff, %edx
373
jnz L(exit)
374
375
#ifdef USE_AS_STRNCMP
376
sub $16, %r11
377
jbe L(strcmp_exitz)
378
#endif
379
380
add $16, %rcx
381
movdqa %xmm4, %xmm3
382
383
add $16, %r10
384
jg L(nibble_ashr_2) /* cross page boundary */
385
386
movdqa (%rsi, %rcx), %xmm1
387
movdqa (%rdi, %rcx), %xmm2
388
movdqa %xmm2, %xmm4
389
390
palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
391
392
pcmpeqb %xmm1, %xmm0
393
pcmpeqb %xmm2, %xmm1
394
psubb %xmm0, %xmm1
395
pmovmskb %xmm1, %edx
396
sub $0xffff, %edx
397
jnz L(exit)
398
399
#ifdef USE_AS_STRNCMP
400
sub $16, %r11
401
jbe L(strcmp_exitz)
402
#endif
403
404
add $16, %rcx
405
movdqa %xmm4, %xmm3
406
jmp L(loop_ashr_2)
407
408
.p2align 4
409
L(nibble_ashr_2):
410
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
411
pmovmskb %xmm0, %edx
412
test $0xfffc, %edx
413
jnz L(ashr_2_exittail)
414
415
#ifdef USE_AS_STRNCMP
416
cmp $13, %r11
417
jbe L(ashr_2_exittail)
418
#endif
419
420
pxor %xmm0, %xmm0
421
sub $0x1000, %r10
422
jmp L(gobble_ashr_2)
423
424
.p2align 4
425
L(ashr_2_exittail):
426
movdqa (%rsi, %rcx), %xmm1
427
psrldq $2, %xmm0
428
psrldq $2, %xmm3
429
jmp L(aftertail)
430
431
/*
432
* The following cases will be handled by ashr_3
433
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
434
* n(13~15) n -13 2(15 +(n-13) - n) ashr_3
435
*/
436
.p2align 4
437
L(ashr_3):
438
pxor %xmm0, %xmm0
439
movdqa (%rdi), %xmm2
440
movdqa (%rsi), %xmm1
441
pcmpeqb %xmm1, %xmm0
442
pslldq $13, %xmm2
443
pcmpeqb %xmm1, %xmm2
444
psubb %xmm0, %xmm2
445
pmovmskb %xmm2, %r9d
446
shr %cl, %edx
447
shr %cl, %r9d
448
sub %r9d, %edx
449
jnz L(less32bytes)
450
movdqa (%rdi), %xmm3
451
452
UPDATE_STRNCMP_COUNTER
453
454
pxor %xmm0, %xmm0
455
mov $16, %rcx /* index for loads */
456
mov $3, %r9d /* byte position left over from less32bytes case */
457
/*
458
* Set up %r10 so that we can detect crossing a page boundary.
459
* When %r10 goes positive we have crossed a page boundary and
460
* need to do a nibble.
461
*/
462
lea 3(%rdi), %r10
463
and $0xfff, %r10 /* offset into 4K page */
464
sub $0x1000, %r10 /* subtract 4K pagesize */
465
466
.p2align 4
467
L(loop_ashr_3):
468
add $16, %r10
469
jg L(nibble_ashr_3)
470
471
L(gobble_ashr_3):
472
movdqa (%rsi, %rcx), %xmm1
473
movdqa (%rdi, %rcx), %xmm2
474
movdqa %xmm2, %xmm4
475
476
palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
477
478
pcmpeqb %xmm1, %xmm0
479
pcmpeqb %xmm2, %xmm1
480
psubb %xmm0, %xmm1
481
pmovmskb %xmm1, %edx
482
sub $0xffff, %edx
483
jnz L(exit)
484
485
#ifdef USE_AS_STRNCMP
486
sub $16, %r11
487
jbe L(strcmp_exitz)
488
#endif
489
490
add $16, %rcx
491
movdqa %xmm4, %xmm3
492
493
add $16, %r10
494
jg L(nibble_ashr_3) /* cross page boundary */
495
496
movdqa (%rsi, %rcx), %xmm1
497
movdqa (%rdi, %rcx), %xmm2
498
movdqa %xmm2, %xmm4
499
500
palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
501
502
pcmpeqb %xmm1, %xmm0
503
pcmpeqb %xmm2, %xmm1
504
psubb %xmm0, %xmm1
505
pmovmskb %xmm1, %edx
506
sub $0xffff, %edx
507
jnz L(exit)
508
509
#ifdef USE_AS_STRNCMP
510
sub $16, %r11
511
jbe L(strcmp_exitz)
512
#endif
513
514
add $16, %rcx
515
movdqa %xmm4, %xmm3
516
jmp L(loop_ashr_3)
517
518
.p2align 4
519
L(nibble_ashr_3):
520
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
521
pmovmskb %xmm0, %edx
522
test $0xfff8, %edx
523
jnz L(ashr_3_exittail)
524
525
#ifdef USE_AS_STRNCMP
526
cmp $12, %r11
527
jbe L(ashr_3_exittail)
528
#endif
529
530
pxor %xmm0, %xmm0
531
sub $0x1000, %r10
532
jmp L(gobble_ashr_3)
533
534
.p2align 4
535
L(ashr_3_exittail):
536
movdqa (%rsi, %rcx), %xmm1
537
psrldq $3, %xmm0
538
psrldq $3, %xmm3
539
jmp L(aftertail)
540
541
/*
542
* The following cases will be handled by ashr_4
543
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
544
* n(12~15) n -12 3(15 +(n-12) - n) ashr_4
545
*/
546
.p2align 4
547
L(ashr_4):
548
pxor %xmm0, %xmm0
549
movdqa (%rdi), %xmm2
550
movdqa (%rsi), %xmm1
551
pcmpeqb %xmm1, %xmm0
552
pslldq $12, %xmm2
553
pcmpeqb %xmm1, %xmm2
554
psubb %xmm0, %xmm2
555
pmovmskb %xmm2, %r9d
556
shr %cl, %edx
557
shr %cl, %r9d
558
sub %r9d, %edx
559
jnz L(less32bytes)
560
movdqa (%rdi), %xmm3
561
562
UPDATE_STRNCMP_COUNTER
563
564
pxor %xmm0, %xmm0
565
mov $16, %rcx /* index for loads */
566
mov $4, %r9d /* byte position left over from less32bytes case */
567
/*
568
* Set up %r10 so that we can detect crossing a page boundary.
569
* When %r10 goes positive we have crossed a page boundary and
570
* need to do a nibble.
571
*/
572
lea 4(%rdi), %r10
573
and $0xfff, %r10 /* offset into 4K page */
574
sub $0x1000, %r10 /* subtract 4K pagesize */
575
576
.p2align 4
577
L(loop_ashr_4):
578
add $16, %r10
579
jg L(nibble_ashr_4)
580
581
L(gobble_ashr_4):
582
movdqa (%rsi, %rcx), %xmm1
583
movdqa (%rdi, %rcx), %xmm2
584
movdqa %xmm2, %xmm4
585
586
palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
587
588
pcmpeqb %xmm1, %xmm0
589
pcmpeqb %xmm2, %xmm1
590
psubb %xmm0, %xmm1
591
pmovmskb %xmm1, %edx
592
sub $0xffff, %edx
593
jnz L(exit)
594
595
#ifdef USE_AS_STRNCMP
596
sub $16, %r11
597
jbe L(strcmp_exitz)
598
#endif
599
600
add $16, %rcx
601
movdqa %xmm4, %xmm3
602
603
add $16, %r10
604
jg L(nibble_ashr_4) /* cross page boundary */
605
606
movdqa (%rsi, %rcx), %xmm1
607
movdqa (%rdi, %rcx), %xmm2
608
movdqa %xmm2, %xmm4
609
610
palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
611
612
pcmpeqb %xmm1, %xmm0
613
pcmpeqb %xmm2, %xmm1
614
psubb %xmm0, %xmm1
615
pmovmskb %xmm1, %edx
616
sub $0xffff, %edx
617
jnz L(exit)
618
619
#ifdef USE_AS_STRNCMP
620
sub $16, %r11
621
jbe L(strcmp_exitz)
622
#endif
623
624
add $16, %rcx
625
movdqa %xmm4, %xmm3
626
jmp L(loop_ashr_4)
627
628
.p2align 4
629
L(nibble_ashr_4):
630
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
631
pmovmskb %xmm0, %edx
632
test $0xfff0, %edx
633
jnz L(ashr_4_exittail)
634
635
#ifdef USE_AS_STRNCMP
636
cmp $11, %r11
637
jbe L(ashr_4_exittail)
638
#endif
639
640
pxor %xmm0, %xmm0
641
sub $0x1000, %r10
642
jmp L(gobble_ashr_4)
643
644
.p2align 4
645
L(ashr_4_exittail):
646
movdqa (%rsi, %rcx), %xmm1
647
psrldq $4, %xmm0
648
psrldq $4, %xmm3
649
jmp L(aftertail)
650
651
/*
652
* The following cases will be handled by ashr_5
653
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
654
* n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
655
*/
656
.p2align 4
657
L(ashr_5):
658
pxor %xmm0, %xmm0
659
movdqa (%rdi), %xmm2
660
movdqa (%rsi), %xmm1
661
pcmpeqb %xmm1, %xmm0
662
pslldq $11, %xmm2
663
pcmpeqb %xmm1, %xmm2
664
psubb %xmm0, %xmm2
665
pmovmskb %xmm2, %r9d
666
shr %cl, %edx
667
shr %cl, %r9d
668
sub %r9d, %edx
669
jnz L(less32bytes)
670
movdqa (%rdi), %xmm3
671
672
UPDATE_STRNCMP_COUNTER
673
674
pxor %xmm0, %xmm0
675
mov $16, %rcx /* index for loads */
676
mov $5, %r9d /* byte position left over from less32bytes case */
677
/*
678
* Set up %r10 so that we can detect crossing a page boundary.
679
* When %r10 goes positive we have crossed a page boundary and
680
* need to do a nibble.
681
*/
682
lea 5(%rdi), %r10
683
and $0xfff, %r10 /* offset into 4K page */
684
sub $0x1000, %r10 /* subtract 4K pagesize */
685
686
.p2align 4
687
L(loop_ashr_5):
688
add $16, %r10
689
jg L(nibble_ashr_5)
690
691
L(gobble_ashr_5):
692
movdqa (%rsi, %rcx), %xmm1
693
movdqa (%rdi, %rcx), %xmm2
694
movdqa %xmm2, %xmm4
695
696
palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
697
698
pcmpeqb %xmm1, %xmm0
699
pcmpeqb %xmm2, %xmm1
700
psubb %xmm0, %xmm1
701
pmovmskb %xmm1, %edx
702
sub $0xffff, %edx
703
jnz L(exit)
704
705
#ifdef USE_AS_STRNCMP
706
sub $16, %r11
707
jbe L(strcmp_exitz)
708
#endif
709
710
add $16, %rcx
711
movdqa %xmm4, %xmm3
712
713
add $16, %r10
714
jg L(nibble_ashr_5) /* cross page boundary */
715
716
movdqa (%rsi, %rcx), %xmm1
717
movdqa (%rdi, %rcx), %xmm2
718
movdqa %xmm2, %xmm4
719
720
palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
721
722
pcmpeqb %xmm1, %xmm0
723
pcmpeqb %xmm2, %xmm1
724
psubb %xmm0, %xmm1
725
pmovmskb %xmm1, %edx
726
sub $0xffff, %edx
727
jnz L(exit)
728
729
#ifdef USE_AS_STRNCMP
730
sub $16, %r11
731
jbe L(strcmp_exitz)
732
#endif
733
734
add $16, %rcx
735
movdqa %xmm4, %xmm3
736
jmp L(loop_ashr_5)
737
738
.p2align 4
739
L(nibble_ashr_5):
740
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
741
pmovmskb %xmm0, %edx
742
test $0xffe0, %edx
743
jnz L(ashr_5_exittail)
744
745
#ifdef USE_AS_STRNCMP
746
cmp $10, %r11
747
jbe L(ashr_5_exittail)
748
#endif
749
750
pxor %xmm0, %xmm0
751
sub $0x1000, %r10
752
jmp L(gobble_ashr_5)
753
754
.p2align 4
755
L(ashr_5_exittail):
756
movdqa (%rsi, %rcx), %xmm1
757
psrldq $5, %xmm0
758
psrldq $5, %xmm3
759
jmp L(aftertail)
760
761
/*
762
* The following cases will be handled by ashr_6
763
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
764
* n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
765
*/
766
.p2align 4
767
L(ashr_6):
768
pxor %xmm0, %xmm0
769
movdqa (%rdi), %xmm2
770
movdqa (%rsi), %xmm1
771
pcmpeqb %xmm1, %xmm0
772
pslldq $10, %xmm2
773
pcmpeqb %xmm1, %xmm2
774
psubb %xmm0, %xmm2
775
pmovmskb %xmm2, %r9d
776
shr %cl, %edx
777
shr %cl, %r9d
778
sub %r9d, %edx
779
jnz L(less32bytes)
780
movdqa (%rdi), %xmm3
781
782
UPDATE_STRNCMP_COUNTER
783
784
pxor %xmm0, %xmm0
785
mov $16, %rcx /* index for loads */
786
mov $6, %r9d /* byte position left over from less32bytes case */
787
/*
788
* Set up %r10 so that we can detect crossing a page boundary.
789
* When %r10 goes positive we have crossed a page boundary and
790
* need to do a nibble.
791
*/
792
lea 6(%rdi), %r10
793
and $0xfff, %r10 /* offset into 4K page */
794
sub $0x1000, %r10 /* subtract 4K pagesize */
795
796
.p2align 4
797
L(loop_ashr_6):
798
add $16, %r10
799
jg L(nibble_ashr_6)
800
801
L(gobble_ashr_6):
802
movdqa (%rsi, %rcx), %xmm1
803
movdqa (%rdi, %rcx), %xmm2
804
movdqa %xmm2, %xmm4
805
806
palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
807
808
pcmpeqb %xmm1, %xmm0
809
pcmpeqb %xmm2, %xmm1
810
psubb %xmm0, %xmm1
811
pmovmskb %xmm1, %edx
812
sub $0xffff, %edx
813
jnz L(exit)
814
815
#ifdef USE_AS_STRNCMP
816
sub $16, %r11
817
jbe L(strcmp_exitz)
818
#endif
819
820
add $16, %rcx
821
movdqa %xmm4, %xmm3
822
823
add $16, %r10
824
jg L(nibble_ashr_6) /* cross page boundary */
825
826
movdqa (%rsi, %rcx), %xmm1
827
movdqa (%rdi, %rcx), %xmm2
828
movdqa %xmm2, %xmm4
829
830
palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
831
832
pcmpeqb %xmm1, %xmm0
833
pcmpeqb %xmm2, %xmm1
834
psubb %xmm0, %xmm1
835
pmovmskb %xmm1, %edx
836
sub $0xffff, %edx
837
jnz L(exit)
838
839
#ifdef USE_AS_STRNCMP
840
sub $16, %r11
841
jbe L(strcmp_exitz)
842
#endif
843
844
add $16, %rcx
845
movdqa %xmm4, %xmm3
846
jmp L(loop_ashr_6)
847
848
.p2align 4
849
L(nibble_ashr_6):
850
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
851
pmovmskb %xmm0, %edx
852
test $0xffc0, %edx
853
jnz L(ashr_6_exittail)
854
855
#ifdef USE_AS_STRNCMP
856
cmp $9, %r11
857
jbe L(ashr_6_exittail)
858
#endif
859
860
pxor %xmm0, %xmm0
861
sub $0x1000, %r10
862
jmp L(gobble_ashr_6)
863
864
.p2align 4
865
L(ashr_6_exittail):
866
movdqa (%rsi, %rcx), %xmm1
867
psrldq $6, %xmm0
868
psrldq $6, %xmm3
869
jmp L(aftertail)
870
871
/*
872
* The following cases will be handled by ashr_7
873
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
874
* n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
875
*/
876
.p2align 4
877
L(ashr_7):
878
pxor %xmm0, %xmm0
879
movdqa (%rdi), %xmm2
880
movdqa (%rsi), %xmm1
881
pcmpeqb %xmm1, %xmm0
882
pslldq $9, %xmm2
883
pcmpeqb %xmm1, %xmm2
884
psubb %xmm0, %xmm2
885
pmovmskb %xmm2, %r9d
886
shr %cl, %edx
887
shr %cl, %r9d
888
sub %r9d, %edx
889
jnz L(less32bytes)
890
movdqa (%rdi), %xmm3
891
892
UPDATE_STRNCMP_COUNTER
893
894
pxor %xmm0, %xmm0
895
mov $16, %rcx /* index for loads */
896
mov $7, %r9d /* byte position left over from less32bytes case */
897
/*
898
* Set up %r10 so that we can detect crossing a page boundary.
899
* When %r10 goes positive we have crossed a page boundary and
900
* need to do a nibble.
901
*/
902
lea 7(%rdi), %r10
903
and $0xfff, %r10 /* offset into 4K page */
904
sub $0x1000, %r10 /* subtract 4K pagesize */
905
906
.p2align 4
907
L(loop_ashr_7):
908
add $16, %r10
909
jg L(nibble_ashr_7)
910
911
L(gobble_ashr_7):
912
movdqa (%rsi, %rcx), %xmm1
913
movdqa (%rdi, %rcx), %xmm2
914
movdqa %xmm2, %xmm4
915
916
palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
917
918
pcmpeqb %xmm1, %xmm0
919
pcmpeqb %xmm2, %xmm1
920
psubb %xmm0, %xmm1
921
pmovmskb %xmm1, %edx
922
sub $0xffff, %edx
923
jnz L(exit)
924
925
#ifdef USE_AS_STRNCMP
926
sub $16, %r11
927
jbe L(strcmp_exitz)
928
#endif
929
930
add $16, %rcx
931
movdqa %xmm4, %xmm3
932
933
add $16, %r10
934
jg L(nibble_ashr_7) /* cross page boundary */
935
936
movdqa (%rsi, %rcx), %xmm1
937
movdqa (%rdi, %rcx), %xmm2
938
movdqa %xmm2, %xmm4
939
940
palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
941
942
pcmpeqb %xmm1, %xmm0
943
pcmpeqb %xmm2, %xmm1
944
psubb %xmm0, %xmm1
945
pmovmskb %xmm1, %edx
946
sub $0xffff, %edx
947
jnz L(exit)
948
949
#ifdef USE_AS_STRNCMP
950
sub $16, %r11
951
jbe L(strcmp_exitz)
952
#endif
953
954
add $16, %rcx
955
movdqa %xmm4, %xmm3
956
jmp L(loop_ashr_7)
957
958
.p2align 4
959
L(nibble_ashr_7):
960
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
961
pmovmskb %xmm0, %edx
962
test $0xff80, %edx
963
jnz L(ashr_7_exittail)
964
965
#ifdef USE_AS_STRNCMP
966
cmp $8, %r11
967
jbe L(ashr_7_exittail)
968
#endif
969
970
pxor %xmm0, %xmm0
971
sub $0x1000, %r10
972
jmp L(gobble_ashr_7)
973
974
.p2align 4
975
L(ashr_7_exittail):
976
movdqa (%rsi, %rcx), %xmm1
977
psrldq $7, %xmm0
978
psrldq $7, %xmm3
979
jmp L(aftertail)
980
981
/*
982
* The following cases will be handled by ashr_8
983
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
984
* n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
985
*/
986
.p2align 4
987
L(ashr_8):
988
pxor %xmm0, %xmm0
989
movdqa (%rdi), %xmm2
990
movdqa (%rsi), %xmm1
991
pcmpeqb %xmm1, %xmm0
992
pslldq $8, %xmm2
993
pcmpeqb %xmm1, %xmm2
994
psubb %xmm0, %xmm2
995
pmovmskb %xmm2, %r9d
996
shr %cl, %edx
997
shr %cl, %r9d
998
sub %r9d, %edx
999
jnz L(less32bytes)
1000
movdqa (%rdi), %xmm3
1001
1002
UPDATE_STRNCMP_COUNTER
1003
1004
pxor %xmm0, %xmm0
1005
mov $16, %rcx /* index for loads */
1006
mov $8, %r9d /* byte position left over from less32bytes case */
1007
/*
1008
* Set up %r10 so that we can detect crossing a page boundary.
1009
* When %r10 goes positive we have crossed a page boundary and
1010
* need to do a nibble.
1011
*/
1012
lea 8(%rdi), %r10
1013
and $0xfff, %r10 /* offset into 4K page */
1014
sub $0x1000, %r10 /* subtract 4K pagesize */
1015
1016
.p2align 4
1017
L(loop_ashr_8):
1018
add $16, %r10
1019
jg L(nibble_ashr_8)
1020
1021
L(gobble_ashr_8):
1022
movdqa (%rsi, %rcx), %xmm1
1023
movdqa (%rdi, %rcx), %xmm2
1024
movdqa %xmm2, %xmm4
1025
1026
palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
1027
1028
pcmpeqb %xmm1, %xmm0
1029
pcmpeqb %xmm2, %xmm1
1030
psubb %xmm0, %xmm1
1031
pmovmskb %xmm1, %edx
1032
sub $0xffff, %edx
1033
jnz L(exit)
1034
1035
#ifdef USE_AS_STRNCMP
1036
sub $16, %r11
1037
jbe L(strcmp_exitz)
1038
#endif
1039
1040
add $16, %rcx
1041
movdqa %xmm4, %xmm3
1042
1043
add $16, %r10
1044
jg L(nibble_ashr_8) /* cross page boundary */
1045
1046
movdqa (%rsi, %rcx), %xmm1
1047
movdqa (%rdi, %rcx), %xmm2
1048
movdqa %xmm2, %xmm4
1049
1050
palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
1051
1052
pcmpeqb %xmm1, %xmm0
1053
pcmpeqb %xmm2, %xmm1
1054
psubb %xmm0, %xmm1
1055
pmovmskb %xmm1, %edx
1056
sub $0xffff, %edx
1057
jnz L(exit)
1058
1059
#ifdef USE_AS_STRNCMP
1060
sub $16, %r11
1061
jbe L(strcmp_exitz)
1062
#endif
1063
1064
add $16, %rcx
1065
movdqa %xmm4, %xmm3
1066
jmp L(loop_ashr_8)
1067
1068
.p2align 4
1069
L(nibble_ashr_8):
1070
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1071
pmovmskb %xmm0, %edx
1072
test $0xff00, %edx
1073
jnz L(ashr_8_exittail)
1074
1075
#ifdef USE_AS_STRNCMP
1076
cmp $7, %r11
1077
jbe L(ashr_8_exittail)
1078
#endif
1079
1080
pxor %xmm0, %xmm0
1081
sub $0x1000, %r10
1082
jmp L(gobble_ashr_8)
1083
1084
.p2align 4
1085
L(ashr_8_exittail):
1086
movdqa (%rsi, %rcx), %xmm1
1087
psrldq $8, %xmm0
1088
psrldq $8, %xmm3
1089
jmp L(aftertail)
1090
1091
/*
1092
* The following cases will be handled by ashr_9
1093
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1094
* n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1095
*/
1096
.p2align 4
1097
L(ashr_9):
1098
pxor %xmm0, %xmm0
1099
movdqa (%rdi), %xmm2
1100
movdqa (%rsi), %xmm1
1101
pcmpeqb %xmm1, %xmm0
1102
pslldq $7, %xmm2
1103
pcmpeqb %xmm1, %xmm2
1104
psubb %xmm0, %xmm2
1105
pmovmskb %xmm2, %r9d
1106
shr %cl, %edx
1107
shr %cl, %r9d
1108
sub %r9d, %edx
1109
jnz L(less32bytes)
1110
movdqa (%rdi), %xmm3
1111
1112
UPDATE_STRNCMP_COUNTER
1113
1114
pxor %xmm0, %xmm0
1115
mov $16, %rcx /* index for loads */
1116
mov $9, %r9d /* byte position left over from less32bytes case */
1117
/*
1118
* Set up %r10 so that we can detect crossing a page boundary.
1119
* When %r10 goes positive we have crossed a page boundary and
1120
* need to do a nibble.
1121
*/
1122
lea 9(%rdi), %r10
1123
and $0xfff, %r10 /* offset into 4K page */
1124
sub $0x1000, %r10 /* subtract 4K pagesize */
1125
1126
.p2align 4
1127
L(loop_ashr_9):
1128
add $16, %r10
1129
jg L(nibble_ashr_9)
1130
1131
L(gobble_ashr_9):
1132
movdqa (%rsi, %rcx), %xmm1
1133
movdqa (%rdi, %rcx), %xmm2
1134
movdqa %xmm2, %xmm4
1135
1136
palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
1137
1138
pcmpeqb %xmm1, %xmm0
1139
pcmpeqb %xmm2, %xmm1
1140
psubb %xmm0, %xmm1
1141
pmovmskb %xmm1, %edx
1142
sub $0xffff, %edx
1143
jnz L(exit)
1144
1145
#ifdef USE_AS_STRNCMP
1146
sub $16, %r11
1147
jbe L(strcmp_exitz)
1148
#endif
1149
1150
add $16, %rcx
1151
movdqa %xmm4, %xmm3
1152
1153
add $16, %r10
1154
jg L(nibble_ashr_9) /* cross page boundary */
1155
1156
movdqa (%rsi, %rcx), %xmm1
1157
movdqa (%rdi, %rcx), %xmm2
1158
movdqa %xmm2, %xmm4
1159
1160
palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
1161
1162
pcmpeqb %xmm1, %xmm0
1163
pcmpeqb %xmm2, %xmm1
1164
psubb %xmm0, %xmm1
1165
pmovmskb %xmm1, %edx
1166
sub $0xffff, %edx
1167
jnz L(exit)
1168
1169
#ifdef USE_AS_STRNCMP
1170
sub $16, %r11
1171
jbe L(strcmp_exitz)
1172
#endif
1173
1174
add $16, %rcx
1175
movdqa %xmm4, %xmm3 /* store for next cycle */
1176
jmp L(loop_ashr_9)
1177
1178
.p2align 4
1179
L(nibble_ashr_9):
1180
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1181
pmovmskb %xmm0, %edx
1182
test $0xfe00, %edx
1183
jnz L(ashr_9_exittail)
1184
1185
#ifdef USE_AS_STRNCMP
1186
cmp $6, %r11
1187
jbe L(ashr_9_exittail)
1188
#endif
1189
1190
pxor %xmm0, %xmm0
1191
sub $0x1000, %r10
1192
jmp L(gobble_ashr_9)
1193
1194
.p2align 4
1195
L(ashr_9_exittail):
1196
movdqa (%rsi, %rcx), %xmm1
1197
psrldq $9, %xmm0
1198
psrldq $9, %xmm3
1199
jmp L(aftertail)
1200
1201
/*
1202
* The following cases will be handled by ashr_10
1203
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1204
* n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1205
*/
1206
.p2align 4
1207
L(ashr_10):
1208
pxor %xmm0, %xmm0
1209
movdqa (%rdi), %xmm2
1210
movdqa (%rsi), %xmm1
1211
pcmpeqb %xmm1, %xmm0
1212
pslldq $6, %xmm2
1213
pcmpeqb %xmm1, %xmm2
1214
psubb %xmm0, %xmm2
1215
pmovmskb %xmm2, %r9d
1216
shr %cl, %edx
1217
shr %cl, %r9d
1218
sub %r9d, %edx
1219
jnz L(less32bytes)
1220
movdqa (%rdi), %xmm3
1221
1222
UPDATE_STRNCMP_COUNTER
1223
1224
pxor %xmm0, %xmm0
1225
mov $16, %rcx /* index for loads */
1226
mov $10, %r9d /* byte position left over from less32bytes case */
1227
/*
1228
* Set up %r10 so that we can detect crossing a page boundary.
1229
* When %r10 goes positive we have crossed a page boundary and
1230
* need to do a nibble.
1231
*/
1232
lea 10(%rdi), %r10
1233
and $0xfff, %r10 /* offset into 4K page */
1234
sub $0x1000, %r10 /* subtract 4K pagesize */
1235
1236
.p2align 4
1237
L(loop_ashr_10):
1238
add $16, %r10
1239
jg L(nibble_ashr_10)
1240
1241
L(gobble_ashr_10):
1242
movdqa (%rsi, %rcx), %xmm1
1243
movdqa (%rdi, %rcx), %xmm2
1244
movdqa %xmm2, %xmm4
1245
1246
palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
1247
1248
pcmpeqb %xmm1, %xmm0
1249
pcmpeqb %xmm2, %xmm1
1250
psubb %xmm0, %xmm1
1251
pmovmskb %xmm1, %edx
1252
sub $0xffff, %edx
1253
jnz L(exit)
1254
1255
#ifdef USE_AS_STRNCMP
1256
sub $16, %r11
1257
jbe L(strcmp_exitz)
1258
#endif
1259
1260
add $16, %rcx
1261
movdqa %xmm4, %xmm3
1262
1263
add $16, %r10
1264
jg L(nibble_ashr_10) /* cross page boundary */
1265
1266
movdqa (%rsi, %rcx), %xmm1
1267
movdqa (%rdi, %rcx), %xmm2
1268
movdqa %xmm2, %xmm4
1269
1270
palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
1271
1272
pcmpeqb %xmm1, %xmm0
1273
pcmpeqb %xmm2, %xmm1
1274
psubb %xmm0, %xmm1
1275
pmovmskb %xmm1, %edx
1276
sub $0xffff, %edx
1277
jnz L(exit)
1278
1279
#ifdef USE_AS_STRNCMP
1280
sub $16, %r11
1281
jbe L(strcmp_exitz)
1282
#endif
1283
1284
add $16, %rcx
1285
movdqa %xmm4, %xmm3
1286
jmp L(loop_ashr_10)
1287
1288
.p2align 4
1289
L(nibble_ashr_10):
1290
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1291
pmovmskb %xmm0, %edx
1292
test $0xfc00, %edx
1293
jnz L(ashr_10_exittail)
1294
1295
#ifdef USE_AS_STRNCMP
1296
cmp $5, %r11
1297
jbe L(ashr_10_exittail)
1298
#endif
1299
1300
pxor %xmm0, %xmm0
1301
sub $0x1000, %r10
1302
jmp L(gobble_ashr_10)
1303
1304
.p2align 4
1305
L(ashr_10_exittail):
1306
movdqa (%rsi, %rcx), %xmm1
1307
psrldq $10, %xmm0
1308
psrldq $10, %xmm3
1309
jmp L(aftertail)
1310
/*
 * The following cases will be handled by ashr_11
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(5~15)            n - 5         10(15 +(n - 5) - n)         ashr_11
 *
 * Register roles inside this case, as set up by the code below:
 *   %xmm0  all-zero between iterations; scratch for the null-byte mask
 *   %xmm3  previous aligned 16-byte block of %rdi (low input of palignr)
 *   %xmm4  raw copy of the current %rdi block; becomes the next %xmm3
 *   %rcx   index of the next 16-byte block loaded from both strings
 *   %r9d   constant 11; L(exit) uses it to rebuild the %rdi offset
 *   %r10   ((%rdi + 11) & 0xfff) - 0x1000, advanced by 16 per block
 *   %r11   remaining byte count (USE_AS_STRNCMP only)
 * %cl and %edx are inputs prepared by the code that dispatched here.
 */
	.p2align 4
L(ashr_11):
	/* Compare the first pair of aligned blocks. */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi block holds a null */
	pslldq	$5, %xmm2		/* shift the %rdi block up by 5 bytes */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal, non-null bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* discard the low %cl byte positions of both masks */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* non-zero: a remaining byte differs or is null */
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3		/* unshifted first block, needed by palignr */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$11, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Set up %r10 so that crossing a page boundary can be detected.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	/* Main loop, unrolled twice: 16 bytes are compared per half. */
	.p2align 4
L(loop_ashr_11):
	add	$16, %r10
	jg	L(nibble_ashr_11)	/* cross page boundary */

L(gobble_ashr_11):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw block for the next round */

	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* null bytes in the %rsi block */
	pcmpeqb	%xmm2, %xmm1		/* byte-wise equality */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes are equal and non-null */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* count exhausted: strings compare equal */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_11)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_11)

	/*
	 * Page-boundary check: before loading the next %rdi block, look
	 * for a null in the bytes of %xmm3 that have not been compared.
	 */
	.p2align 4
L(nibble_ashr_11):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf800, %edx		/* bytes 11..15 of the saved %rdi block */
	jnz	L(ashr_11_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$4, %r11
	jbe	L(ashr_11_exittail)	/* 4 or fewer bytes left to compare */
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* restart the count for the next 4K page */
	jmp	L(gobble_ashr_11)

	/* Finish on the tail of %xmm3 without loading another %rdi block. */
	.p2align 4
L(ashr_11_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$11, %xmm0		/* move bytes 11..15 down to 0..4 */
	psrldq	$11, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_12
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(4~15)            n - 4         11(15 +(n - 4) - n)         ashr_12
 *
 * Register roles inside this case, as set up by the code below:
 *   %xmm0  all-zero between iterations; scratch for the null-byte mask
 *   %xmm3  previous aligned 16-byte block of %rdi (low input of palignr)
 *   %xmm4  raw copy of the current %rdi block; becomes the next %xmm3
 *   %rcx   index of the next 16-byte block loaded from both strings
 *   %r9d   constant 12; L(exit) uses it to rebuild the %rdi offset
 *   %r10   ((%rdi + 12) & 0xfff) - 0x1000, advanced by 16 per block
 *   %r11   remaining byte count (USE_AS_STRNCMP only)
 * %cl and %edx are inputs prepared by the code that dispatched here.
 */
	.p2align 4
L(ashr_12):
	/* Compare the first pair of aligned blocks. */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi block holds a null */
	pslldq	$4, %xmm2		/* shift the %rdi block up by 4 bytes */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal, non-null bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* discard the low %cl byte positions of both masks */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* non-zero: a remaining byte differs or is null */
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3		/* unshifted first block, needed by palignr */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$12, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Set up %r10 so that crossing a page boundary can be detected.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	/* Main loop, unrolled twice: 16 bytes are compared per half. */
	.p2align 4
L(loop_ashr_12):
	add	$16, %r10
	jg	L(nibble_ashr_12)	/* cross page boundary */

L(gobble_ashr_12):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw block for the next round */

	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* null bytes in the %rsi block */
	pcmpeqb	%xmm2, %xmm1		/* byte-wise equality */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes are equal and non-null */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* count exhausted: strings compare equal */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_12)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_12)

	/*
	 * Page-boundary check: before loading the next %rdi block, look
	 * for a null in the bytes of %xmm3 that have not been compared.
	 */
	.p2align 4
L(nibble_ashr_12):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf000, %edx		/* bytes 12..15 of the saved %rdi block */
	jnz	L(ashr_12_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$3, %r11
	jbe	L(ashr_12_exittail)	/* 3 or fewer bytes left to compare */
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* restart the count for the next 4K page */
	jmp	L(gobble_ashr_12)

	/* Finish on the tail of %xmm3 without loading another %rdi block. */
	.p2align 4
L(ashr_12_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$12, %xmm0		/* move bytes 12..15 down to 0..3 */
	psrldq	$12, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_13
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(3~15)            n - 3         12(15 +(n - 3) - n)         ashr_13
 *
 * Register roles inside this case, as set up by the code below:
 *   %xmm0  all-zero between iterations; scratch for the null-byte mask
 *   %xmm3  previous aligned 16-byte block of %rdi (low input of palignr)
 *   %xmm4  raw copy of the current %rdi block; becomes the next %xmm3
 *   %rcx   index of the next 16-byte block loaded from both strings
 *   %r9d   constant 13; L(exit) uses it to rebuild the %rdi offset
 *   %r10   ((%rdi + 13) & 0xfff) - 0x1000, advanced by 16 per block
 *   %r11   remaining byte count (USE_AS_STRNCMP only)
 * %cl and %edx are inputs prepared by the code that dispatched here.
 */
	.p2align 4
L(ashr_13):
	/* Compare the first pair of aligned blocks. */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi block holds a null */
	pslldq	$3, %xmm2		/* shift the %rdi block up by 3 bytes */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal, non-null bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* discard the low %cl byte positions of both masks */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* non-zero: a remaining byte differs or is null */
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3		/* unshifted first block, needed by palignr */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$13, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Set up %r10 so that crossing a page boundary can be detected.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	/* Main loop, unrolled twice: 16 bytes are compared per half. */
	.p2align 4
L(loop_ashr_13):
	add	$16, %r10
	jg	L(nibble_ashr_13)	/* cross page boundary */

L(gobble_ashr_13):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw block for the next round */

	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* null bytes in the %rsi block */
	pcmpeqb	%xmm2, %xmm1		/* byte-wise equality */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes are equal and non-null */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* count exhausted: strings compare equal */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_13)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_13)

	/*
	 * Page-boundary check: before loading the next %rdi block, look
	 * for a null in the bytes of %xmm3 that have not been compared.
	 */
	.p2align 4
L(nibble_ashr_13):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xe000, %edx		/* bytes 13..15 of the saved %rdi block */
	jnz	L(ashr_13_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$2, %r11
	jbe	L(ashr_13_exittail)	/* 2 or fewer bytes left to compare */
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* restart the count for the next 4K page */
	jmp	L(gobble_ashr_13)

	/* Finish on the tail of %xmm3 without loading another %rdi block. */
	.p2align 4
L(ashr_13_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$13, %xmm0		/* move bytes 13..15 down to 0..2 */
	psrldq	$13, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_14
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(2~15)            n - 2         13(15 +(n - 2) - n)         ashr_14
 *
 * Register roles inside this case, as set up by the code below:
 *   %xmm0  all-zero between iterations; scratch for the null-byte mask
 *   %xmm3  previous aligned 16-byte block of %rdi (low input of palignr)
 *   %xmm4  raw copy of the current %rdi block; becomes the next %xmm3
 *   %rcx   index of the next 16-byte block loaded from both strings
 *   %r9d   constant 14; L(exit) uses it to rebuild the %rdi offset
 *   %r10   ((%rdi + 14) & 0xfff) - 0x1000, advanced by 16 per block
 *   %r11   remaining byte count (USE_AS_STRNCMP only)
 * %cl and %edx are inputs prepared by the code that dispatched here.
 */
	.p2align 4
L(ashr_14):
	/* Compare the first pair of aligned blocks. */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi block holds a null */
	pslldq	$2, %xmm2		/* shift the %rdi block up by 2 bytes */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal, non-null bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* discard the low %cl byte positions of both masks */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* non-zero: a remaining byte differs or is null */
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3		/* unshifted first block, needed by palignr */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$14, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Set up %r10 so that crossing a page boundary can be detected.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	/* Main loop, unrolled twice: 16 bytes are compared per half. */
	.p2align 4
L(loop_ashr_14):
	add	$16, %r10
	jg	L(nibble_ashr_14)	/* cross page boundary */

L(gobble_ashr_14):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw block for the next round */

	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* null bytes in the %rsi block */
	pcmpeqb	%xmm2, %xmm1		/* byte-wise equality */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes are equal and non-null */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* count exhausted: strings compare equal */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_14)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_14)

	/*
	 * Page-boundary check: before loading the next %rdi block, look
	 * for a null in the bytes of %xmm3 that have not been compared.
	 */
	.p2align 4
L(nibble_ashr_14):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xc000, %edx		/* bytes 14..15 of the saved %rdi block */
	jnz	L(ashr_14_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$1, %r11
	jbe	L(ashr_14_exittail)	/* 1 or fewer bytes left to compare */
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* restart the count for the next 4K page */
	jmp	L(gobble_ashr_14)

	/* Finish on the tail of %xmm3 without loading another %rdi block. */
	.p2align 4
L(ashr_14_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$14, %xmm0		/* move bytes 14..15 down to 0..1 */
	psrldq	$14, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_15
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(1~15)            n - 1         14(15 +(n - 1) - n)         ashr_15
 *
 * Register roles inside this case, as set up by the code below:
 *   %xmm0  all-zero between iterations; scratch for the null-byte mask
 *   %xmm3  previous aligned 16-byte block of %rdi (low input of palignr)
 *   %xmm4  raw copy of the current %rdi block; becomes the next %xmm3
 *   %rcx   index of the next 16-byte block loaded from both strings
 *   %r9d   constant 15; L(exit) uses it to rebuild the %rdi offset
 *   %r10   ((%rdi + 15) & 0xfff) - 0x1000, advanced by 16 per block
 *   %r11   remaining byte count (USE_AS_STRNCMP only)
 * %cl and %edx are inputs prepared by the code that dispatched here.
 */
	.p2align 4
L(ashr_15):
	/* Compare the first pair of aligned blocks. */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi block holds a null */
	pslldq	$1, %xmm2		/* shift the %rdi block up by 1 byte */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal, non-null bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* discard the low %cl byte positions of both masks */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* non-zero: a remaining byte differs or is null */
	jnz	L(less32bytes)

	movdqa	(%rdi), %xmm3		/* unshifted first block, needed by palignr */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$15, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Set up %r10 so that crossing a page boundary can be detected.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */

	sub	$0x1000, %r10		/* subtract 4K pagesize */

	/* Main loop, unrolled twice: 16 bytes are compared per half. */
	.p2align 4
L(loop_ashr_15):
	add	$16, %r10
	jg	L(nibble_ashr_15)	/* cross page boundary */

L(gobble_ashr_15):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw block for the next round */

	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* null bytes in the %rsi block */
	pcmpeqb	%xmm2, %xmm1		/* byte-wise equality */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes are equal and non-null */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* count exhausted: strings compare equal */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_15)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_15)

	/*
	 * Page-boundary check: before loading the next %rdi block, look
	 * for a null in the byte of %xmm3 that has not been compared.
	 */
	.p2align 4
L(nibble_ashr_15):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0x8000, %edx		/* byte 15 of the saved %rdi block */
	jnz	L(ashr_15_exittail)

#ifdef USE_AS_STRNCMP
	test	%r11, %r11
	je	L(ashr_15_exittail)	/* no bytes left to compare */
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* restart the count for the next 4K page */
	jmp	L(gobble_ashr_15)

	/*
	 * Finish on the tail of %xmm3 without loading another %rdi block.
	 * There is no jump here: execution falls through to L(aftertail).
	 */
	.p2align 4
L(ashr_15_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$15, %xmm3		/* move byte 15 down to byte 0 */
	psrldq	$15, %xmm0

/*
 * Common exit paths.
 *
 * L(aftertail)     entered from the ashr_N exittail stubs with the
 *                  shifted %rdi tail in %xmm3, its shifted null mask in
 *                  %xmm0 and the current %rsi block in %xmm1; builds the
 *                  difference mask in %edx and falls into L(exit).
 * L(exit)          rebuilds the %rdi offset in %rax from %r9 and %rcx.
 * L(less32bytes)   adds the offsets in %rax / %rcx to %rdi / %rsi and
 *                  swaps the pointers back when %r8d is non-zero.
 * L(ret)           locates the first set bit of %rdx and returns the
 *                  difference of the two bytes at that index in %eax.
 * L(strcmp_exitz)  returns 0.
 * L(Byte0)         returns the difference of the first bytes at %rdi
 *                  and %rsi.
 */
	.p2align 4
L(aftertail):
	pcmpeqb	%xmm3, %xmm1		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm1		/* clear the sign bit where %xmm0 flags a null */
	pmovmskb %xmm1, %edx
	not	%edx			/* set bits now mark bytes that differ or terminate */

	.p2align 4
L(exit):
	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
L(less32bytes):
	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	test	%r8d, %r8d
	jz	L(ret)
	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */

	.p2align 4
L(ret):
L(less16bytes):
	bsf	%rdx, %rdx		/* find and store bit index in %rdx */

#ifdef USE_AS_STRNCMP
	sub	%rdx, %r11
	jbe	L(strcmp_exitz)		/* index is at or beyond the remaining count */
#endif
	movzbl	(%rsi, %rdx), %ecx
	movzbl	(%rdi, %rdx), %eax

	sub	%ecx, %eax		/* result = byte at %rdi - byte at %rsi */
	ret

L(strcmp_exitz):
	xor	%eax, %eax		/* return 0 */
	ret

	.p2align 4
L(Byte0):
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax

	sub	%ecx, %eax
	ret
END (STRCMP)

/*
 * Read-only table of 32-bit offsets, each the distance from
 * L(unaligned_table) to one of the L(ashr_N) handlers.  Entries 0..14
 * refer to ashr_1..ashr_15 and entry 15 refers to ashr_0.  Storing
 * label differences rather than absolute addresses keeps the table
 * free of load-time relocations.
 * NOTE(review): the indexing code lives earlier in the file; confirm
 * the index computation there matches this ordering.
 */
	.section .rodata,"a",@progbits
	.p2align 3
L(unaligned_table):
	.int	L(ashr_1) - L(unaligned_table)
	.int	L(ashr_2) - L(unaligned_table)
	.int	L(ashr_3) - L(unaligned_table)
	.int	L(ashr_4) - L(unaligned_table)
	.int	L(ashr_5) - L(unaligned_table)
	.int	L(ashr_6) - L(unaligned_table)
	.int	L(ashr_7) - L(unaligned_table)
	.int	L(ashr_8) - L(unaligned_table)
	.int	L(ashr_9) - L(unaligned_table)
	.int	L(ashr_10) - L(unaligned_table)
	.int	L(ashr_11) - L(unaligned_table)
	.int	L(ashr_12) - L(unaligned_table)
	.int	L(ashr_13) - L(unaligned_table)
	.int	L(ashr_14) - L(unaligned_table)
	.int	L(ashr_15) - L(unaligned_table)
	.int	L(ashr_0) - L(unaligned_table)