GitHub Repository: torvalds/linux
Path: blob/master/arch/powerpc/lib/memcmp_64.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <[email protected]>
 * Copyright 2015 IBM Corporation.
 */
#include <linux/export.h>
#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31
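/*
 * rA..rC use volatile registers; rD..rH use the non-volatile r27-r31,
 * which the .Llong path saves to and restores from the stack.
 */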

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif
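/*
 * On little-endian the byte-reversed loads (lhbrx/lwbrx/ldbrx) put the
 * bytes into the register in memory order, so an unsigned cmpld orders
 * the operands the same way a byte-by-byte memcmp would.
 */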

#define VMX_THRESH 4096
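/*
 * ENTER_VMX_OPS saves the live memcmp arguments (r3/r4/r5), calls
 * enter_vmx_ops() and compares its return value into cr1; callers branch
 * on cr1 back to the scalar path when VMX cannot be used.
 * EXIT_VMX_OPS does the same around exit_vmx_ops().
 */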
#define ENTER_VMX_OPS	\
	mflr	r0;	\
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	CFUNC(enter_vmx_ops); \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	CFUNC(exit_vmx_ops); \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 * |  y y y y  y y y y  y y y y  y 0 1 2 | 3 4 5 6  7 8 9 a  b c d e  f z z z |
 * ^                                     ^                                    ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                                 ^
 *                                 _vaddr
 *
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset relative to an 8-byte boundary. The
 *    handlers are named like .Lsameoffset_xxxx.
 * 2) src/dst have different offsets relative to an 8-byte boundary. The
 *    handlers are named like .Ldiffoffset_xxxx.
 */
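/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * r3 = s1, r4 = s2, r5 = n. Returns 0 in r3 if the buffers are equal,
 * otherwise a negative/positive value according to the first differing
 * byte compared as unsigned char.
 */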
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Use the short loop if the src/dst addresses do not have the
	 * same offset relative to an 8-byte alignment boundary.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop when comparing fewer than 8 bytes,
	 * even at aligned addresses.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

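	/* Byte-at-a-time compare, unrolled 4x; CTR counts the remaining
	 * bytes and a mismatch returns the byte difference via .Lnon_zero.
	 */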
.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* Attempt to compare the bytes that are not 8-byte aligned, so that
	 * the rest of the comparison can run on an 8-byte alignment.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word which is not 8 bytes aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate bits before the comparison.
	 */
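	/* rlwinm computes r6 = (r3 & 7) * 8, the number of bits taken up by
	 * the bytes that precede the buffer within the aligned doubleword;
	 * sld shifts them out of both operands, and srwi later turns r6
	 * back into a byte count.
	 */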
	rlwinm	r6,r3,3,26,28
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are aligned to 8 bytes.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
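	/* r0 (from srdi above) is the doubleword count for CTR; clrldi keeps
	 * r5 & 7, the tail bytes handled by .Lcmp_rest_lt8bytes afterwards.
	 */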
	clrldi	r5,r5,61
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

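	/* r6 = (8 - r5) * 8: shifting both doublewords right by r6 bits
	 * discards the trailing bytes that are beyond the requested length,
	 * leaving only the r5 valid bytes to compare.
	 */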
	subfic	r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

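/*
 * .Llong compares 32 bytes per iteration, software-pipelined across
 * cr0/cr1/cr6/cr7 so the loads for the next block overlap the compares
 * and branches for the previous one.
 */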
.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use the vmx loop if the length is equal to or greater than 4K */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least the s1 addr is aligned to 8 bytes */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31
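	/* CTR now holds the number of 32-byte blocks; r5 keeps the 0..31
	 * leftover bytes, finished off through .Ltail and .Lshort.
	 */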

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

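	/* One of the 8-byte compares recorded a mismatch in its CR field;
	 * each stub below turns that unsigned (memory-order) comparison
	 * into a 1 or -1 return value.
	 */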
.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Entered with src/dst addresses that have the same offset relative
	 * to an 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to fail early, within the first 32 bytes.
	 * Before applying VMX instructions, which incur a 32x128-bit VMX
	 * register load/restore penalty, we compare the first 32 bytes
	 * so that we can catch the ~80% of cases that fail there.
	 */
	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp

3:
	/* Check whether r4 has the same offset as r3 relative to a
	 * 16-byte boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* The length is no less than 4KB. Align further to 16 bytes. */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
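	/* A mismatch was found, but EXIT_VMX_OPS calls C code that may
	 * clobber cr0, whose gt bit .LcmpAB_lightweight tests; stash cr0 in
	 * r5 (the length is no longer needed) across the call.
	 */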
	/* save and restore cr0 */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes for each loop */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59
	li	off16,16

	.balign	16
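	/* VCMPEQUD_RC is the record form vcmpequd.; it sets the "all
	 * elements equal" bit in cr6, so bnl cr6 (branch if that bit is
	 * clear) exits the loop on the first 16-byte block that differs.
	 */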
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* The difference is within the 16 bytes at r3/r4; recompare them
	 * with scalar loads to produce the result.
	 */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 with 8 bytes */
	rlwinm	r6,r3,3,26,28
	beq	.Ldiffoffset_align_s1_8bytes

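	/* Compare the first 8 - (r3 & 7) bytes: rA is the aligned doubleword
	 * with its leading bytes cleared (sld then srd by r6 bits), while rB
	 * is an unaligned load from r4 shifted right to line up with it.
	 * r6 is then converted back to a byte count to advance r4 and
	 * shrink r5.
	 */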
	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4	/* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned with 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size is equal to or greater than 4K bytes */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before
	 * enabling VMX operations.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First, try to align r3 to 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

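	/* lvx ignores the low 4 address bits, so it returns the aligned
	 * quadword containing the address. LVS (lvsl/lvsr) builds a permute
	 * mask from the misalignment, and LD_VSR_CROSS16B uses it with the
	 * next aligned quadword to reassemble the unaligned 16 bytes.
	 */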
	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned with 16 bytes */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5	/* loop for 32 bytes each */
	clrldi	r5,r5,59
	mtctr	r6

	.balign	16
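	/* Each iteration compares 32 bytes. v6 carries the last aligned
	 * quadword of r4 forward (vor v6,v8,v8 is a register copy), so only
	 * one new aligned load from r4 is needed per 16-byte step.
	 */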
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* the difference is guaranteed to be within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)