/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <[email protected]>
 * Copyright 2015 IBM Corporation.
 */
#include <linux/export.h>
#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

#define VMX_THRESH 4096
#define ENTER_VMX_OPS	\
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	CFUNC(enter_vmx_ops); \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	CFUNC(exit_vmx_ops); \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                                 ^
 *                                 _vaddr
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset from the 8-byte boundary. The handlers
 *    are named like .Lsameoffset_xxxx
 * 2) src/dst have different offsets from the 8-byte boundary. The handlers
 *    are named like .Ldiffoffset_xxxx
 */
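/*
 * Illustrative C sketch of the entry dispatch below. This is not generated
 * code, and byte_loop()/sameoffset()/diffoffset() are just names for the
 * labelled assembly paths in this file, not real functions:
 *
 *	int memcmp(const void *s1, const void *s2, size_t n)
 *	{
 *		if (n == 0)
 *			return 0;				// .Lzero
 *		if (n <= 7)
 *			return byte_loop(s1, s2, n);		// .Lshort
 *		if (((unsigned long)s1 ^ (unsigned long)s2) & 7)
 *			return diffoffset(s1, s2, n);		// .Ldiffoffset_*
 *		return sameoffset(s1, s2, n);			// .Lsameoffset_*
 *	}
 */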
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Check (in cr0) whether the src/dst addresses have the same
	 * offset from an 8-byte alignment boundary; the result is
	 * consumed at .Lno_short below.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Use the byte-by-byte short loop when comparing fewer than
	 * 8 bytes.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start

.Lsameoffset_8bytes_make_align_start:
	/* Compare the leading bytes that are not 8-byte aligned so that
	 * the rest of the comparison can run on 8-byte aligned data.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word which is not 8 bytes aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate bits before the comparison.
	 */
	rlwinm	r6,r3,3,26,28	/* r6 = (r3 & 7) << 3 */
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are aligned to 8 bytes.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi	r5,r5,61	/* r5 = r5 & 7 */
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

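/*
 * The 32-bytes-per-iteration loop below is software pipelined: rA..rH hold
 * four doubleword pairs, and the four compares are spread across cr0, cr1,
 * cr6 and cr7 so that the loads for the next iteration can be issued before
 * the current results are consumed.  .Lfirst32 and .Lsecond32 drain the
 * pipeline when the counter runs out after the first or second load block,
 * and .Ltail restores the non-volatile GPRs and falls back to .Lshort for
 * the remaining 0-31 bytes.
 */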
.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use the vmx loop if the length is 4K or more */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least the s1 addr is aligned to 8 bytes */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31	/* r5 = bytes left after the 32B loop */

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip the NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter with src/dst addrs that have the same offset from an
	 * 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to fail early, within the first 32 bytes.
	 * Before using VMX instructions, which incur the penalty of
	 * saving/restoring 32 x 128-bit VMX registers, compare the first
	 * 32 bytes so that the ~80% of calls that fail early are caught.
	 */

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp

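	/*
	 * ENTER_VMX_OPS compares the return value of enter_vmx_ops()
	 * against zero in cr1; a zero return means VMX cannot be used in
	 * the current context, so the beq above falls back to the integer
	 * .Llong_novmx_cmp path.  Otherwise the code below aligns r3 to
	 * 16 bytes and then compares 32 bytes per iteration with
	 * lvx/vcmpequd.
	 */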
3:
	/* need to check whether r4 has the same offset as r3
	 * from a 16-byte boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* len is at least 4KB. Need to further align to 16 bytes.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes for each loop */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59	/* r5 = r5 & 31 */
	li	off16,16

.balign	16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* diff the last 16 bytes */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 to 8 bytes */
	rlwinm	r6,r3,3,26,28	/* r6 = (r3 & 7) << 3 */
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4		/* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned to 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size is 4K bytes or more */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif

	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before
	 * enabling VMX operations.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First try to align r3 to 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned to 16 bytes */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5		/* loop for 32 bytes each */
	clrldi	r5,r5,59
	mtctr	r6

.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* in any case, the difference lies within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)