GitHub Repository: torvalds/linux
Path: blob/master/arch/powerpc/lib/memcpy_power7.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <[email protected]>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE 0
#endif

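/*
 * LVS/VPERM hide the endian difference for the unaligned VMX path:
 * big endian uses lvsl and vperm with the sources in natural order,
 * little endian uses lvsr and swaps the two source vectors.
 */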
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB) lvsl VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB) lvsr VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC
#endif

_GLOBAL(memcpy_power7)
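	/*
	 * r3 = dest, r4 = src, r5 = len.  memcpy must return dest, so save
	 * r3 now and reload it just before returning.
	 */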
	cmpldi r5,16
	cmpldi cr1,r5,4096
	std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blt .Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
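	/*
	 * r6 = (-src) & 7 = number of bytes needed to reach 8B alignment.
	 * mtocrf copies the low bits of -src into CR7, so each bf below
	 * tests one bit and copies 1, 2 and/or 4 bytes as required.
	 */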
	neg r6,r4
	mtocrf 0x01,r6
	clrldi r6,r6,(64-3)

	bf cr7*4+3,1f
	lbz r0,0(r4)
	addi r4,r4,1
	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

3:	sub r5,r5,r6
	cmpldi r5,128
	blt 5f
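	/*
	 * At least 128B remain: set up a stack frame and save LR and the
	 * non-volatile GPRs r14-r22 so the unrolled loop below can use them.
	 */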

	mflr r0
	stdu r1,-STACKFRAMESIZE(r1)
	std r14,STK_REG(R14)(r1)
	std r15,STK_REG(R15)(r1)
	std r16,STK_REG(R16)(r1)
	std r17,STK_REG(R17)(r1)
	std r18,STK_REG(R18)(r1)
	std r19,STK_REG(R19)(r1)
	std r20,STK_REG(R20)(r1)
	std r21,STK_REG(R21)(r1)
	std r22,STK_REG(R22)(r1)
	std r0,STACKFRAMESIZE+16(r1)

	srdi r6,r5,7
	mtctr r6

	/* Now do cacheline (128B) sized loads and stores. */
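	/*
	 * ctr = len / 128.  Each iteration loads one full 128B cacheline
	 * into 16 GPRs and stores it out again.
	 */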
	.align 5
4:
	ld r0,0(r4)
	ld r6,8(r4)
	ld r7,16(r4)
	ld r8,24(r4)
	ld r9,32(r4)
	ld r10,40(r4)
	ld r11,48(r4)
	ld r12,56(r4)
	ld r14,64(r4)
	ld r15,72(r4)
	ld r16,80(r4)
	ld r17,88(r4)
	ld r18,96(r4)
	ld r19,104(r4)
	ld r20,112(r4)
	ld r21,120(r4)
	addi r4,r4,128
	std r0,0(r3)
	std r6,8(r3)
	std r7,16(r3)
	std r8,24(r3)
	std r9,32(r3)
	std r10,40(r3)
	std r11,48(r3)
	std r12,56(r3)
	std r14,64(r3)
	std r15,72(r3)
	std r16,80(r3)
	std r17,88(r3)
	std r18,96(r3)
	std r19,104(r3)
	std r20,112(r3)
	std r21,120(r3)
	addi r3,r3,128
	bdnz 4b

	clrldi r5,r5,(64-7)

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)
	ld r17,STK_REG(R17)(r1)
	ld r18,STK_REG(R18)(r1)
	ld r19,STK_REG(R19)(r1)
	ld r20,STK_REG(R20)(r1)
	ld r21,STK_REG(R21)(r1)
	ld r22,STK_REG(R22)(r1)
	addi r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
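	/*
	 * Only the low 7 bits of the length are left in r5.  Bits 6:4 are
	 * moved into CR7, and each bf below copies a 64B, 32B or 16B block
	 * when its bit is set.
	 */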
5:	srdi r6,r5,4
	mtocrf 0x01,r6

6:	bf cr7*4+1,7f
	ld r0,0(r4)
	ld r6,8(r4)
	ld r7,16(r4)
	ld r8,24(r4)
	ld r9,32(r4)
	ld r10,40(r4)
	ld r11,48(r4)
	ld r12,56(r4)
	addi r4,r4,64
	std r0,0(r3)
	std r6,8(r3)
	std r7,16(r3)
	std r8,24(r3)
	std r9,32(r3)
	std r10,40(r3)
	std r11,48(r3)
	std r12,56(r3)
	addi r3,r3,64

	/* Up to 63B to go */
7:	bf cr7*4+2,8f
	ld r0,0(r4)
	ld r6,8(r4)
	ld r7,16(r4)
	ld r8,24(r4)
	addi r4,r4,32
	std r0,0(r3)
	std r6,8(r3)
	std r7,16(r3)
	std r8,24(r3)
	addi r3,r3,32

	/* Up to 31B to go */
8:	bf cr7*4+3,9f
	ld r0,0(r4)
	ld r6,8(r4)
	addi r4,r4,16
	std r0,0(r3)
	std r6,8(r3)
	addi r3,r3,16

9:	clrldi r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
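	/*
	 * r5 now holds the final 0-15 bytes.  Its low four bits go into CR7
	 * and select an 8, 4, 2 and/or 1 byte move.
	 */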
	mtocrf 0x01,r5
	bf cr7*4+0,12f
	lwz r0,0(r4) /* Less chance of a reject with word ops */
	lwz r6,4(r4)
	addi r4,r4,8
	stw r0,0(r3)
	stw r6,4(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
	lbz r0,0(r4)
	stb r0,0(r3)

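	/* Reload the original dest saved at entry; memcpy returns it. */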
15:	ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blr

.Lunwind_stack_nonvmx_copy:
	addi r1,r1,STACKFRAMESIZE
	b .Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
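	/*
	 * enter_vmx_ops() reports whether VMX may be used here; save the
	 * arguments, call it, then restore them.  If it returned 0 (cr1.eq,
	 * tested after the prefetch setup below), pop the frame and fall
	 * back to the scalar copy.
	 */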
	mflr r0
	std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std r0,16(r1)
	stdu r1,-STACKFRAMESIZE(r1)
	bl CFUNC(enter_vmx_ops)
	cmpwi cr1,r3,0
	ld r0,STACKFRAMESIZE+16(r1)
	ld r3,STK_REG(R31)(r1)
	ld r4,STK_REG(R30)(r1)
	ld r5,STK_REG(R29)(r1)
	mtlr r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
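	/*
	 * r6/r9 get the cacheline-aligned start of the source and
	 * destination streams, r7/r10 the encoded unit count (capped at
	 * 0x3FF cachelines), depth and stream ID.  DCBT_SETUP_STREAMS
	 * issues the enhanced-touch setup for both streams, with r8 as a
	 * temporary.
	 */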
	clrrdi r6,r4,7
	clrrdi r9,r3,7
	ori r9,r9,1 /* stream=1 */

	srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */
	cmpldi r7,0x3FF
	ble 1f
	li r7,0x3FF
1:	lis r0,0x0E00 /* depth=7 */
	sldi r7,r7,7
	or r7,r7,r0
	ori r10,r7,1 /* stream=1 */

	DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)

	beq cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
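	/*
	 * "Relatively aligned" means (src ^ dest) is a multiple of 16, so
	 * once the destination is 16B aligned the source is too and plain
	 * lvx/stvx can be used; otherwise take the vperm-based path.
	 */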
	xor r6,r4,r3
	rldicl. r6,r6,0,(64-4)
	bne .Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg r6,r3
	mtocrf 0x01,r6
	clrldi r6,r6,(64-4)

	bf cr7*4+3,1f
	lbz r0,0(r4)
	addi r4,r4,1
	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

3:	bf cr7*4+0,4f
	ld r0,0(r4)
	addi r4,r4,8
	std r0,0(r3)
	addi r3,r3,8

4:	sub r5,r5,r6

	/* Get the destination 128B aligned */
	neg r6,r3
	srdi r7,r6,4
	mtocrf 0x01,r7
	clrldi r6,r6,(64-7)

	li r9,16
	li r10,32
	li r11,48

	bf cr7*4+3,5f
	lvx v1,0,r4
	addi r4,r4,16
	stvx v1,0,r3
	addi r3,r3,16

5:	bf cr7*4+2,6f
	lvx v1,0,r4
	lvx v0,r4,r9
	addi r4,r4,32
	stvx v1,0,r3
	stvx v0,r3,r9
	addi r3,r3,32

6:	bf cr7*4+1,7f
	lvx v3,0,r4
	lvx v2,r4,r9
	lvx v1,r4,r10
	lvx v0,r4,r11
	addi r4,r4,64
	stvx v3,0,r3
	stvx v2,r3,r9
	stvx v1,r3,r10
	stvx v0,r3,r11
	addi r3,r3,64

7:	sub r5,r5,r6
	srdi r6,r5,7

	std r14,STK_REG(R14)(r1)
	std r15,STK_REG(R15)(r1)
	std r16,STK_REG(R16)(r1)

	li r12,64
	li r14,80
	li r15,96
	li r16,112

	mtctr r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
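	/*
	 * Eight 16B vector loads and eight stores move one 128B cacheline
	 * per iteration; r9-r12 and r14-r16 hold the constant offsets
	 * 16,32,...,112.
	 */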
	.align 5
8:
	lvx v7,0,r4
	lvx v6,r4,r9
	lvx v5,r4,r10
	lvx v4,r4,r11
	lvx v3,r4,r12
	lvx v2,r4,r14
	lvx v1,r4,r15
	lvx v0,r4,r16
	addi r4,r4,128
	stvx v7,0,r3
	stvx v6,r3,r9
	stvx v5,r3,r10
	stvx v4,r3,r11
	stvx v3,r3,r12
	stvx v2,r3,r14
	stvx v1,r3,r15
	stvx v0,r3,r16
	addi r3,r3,128
	bdnz 8b

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi r5,r5,(64-7)
	srdi r6,r5,4
	mtocrf 0x01,r6

	bf cr7*4+1,9f
	lvx v3,0,r4
	lvx v2,r4,r9
	lvx v1,r4,r10
	lvx v0,r4,r11
	addi r4,r4,64
	stvx v3,0,r3
	stvx v2,r3,r9
	stvx v1,r3,r10
	stvx v0,r3,r11
	addi r3,r3,64

9:	bf cr7*4+2,10f
	lvx v1,0,r4
	lvx v0,r4,r9
	addi r4,r4,32
	stvx v1,0,r3
	stvx v0,r3,r9
	addi r3,r3,32

10:	bf cr7*4+3,11f
	lvx v1,0,r4
	addi r4,r4,16
	stvx v1,0,r3
	addi r3,r3,16

	/* Up to 15B to go */
11:	clrldi r5,r5,(64-4)
	mtocrf 0x01,r5
	bf cr7*4+0,12f
	ld r0,0(r4)
	addi r4,r4,8
	std r0,0(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
	lbz r0,0(r4)
	stb r0,0(r3)

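	/*
	 * Pop the frame, reload the original dest and tail-call
	 * exit_vmx_ops() (per the "tail call optimise" note) so that it
	 * hands VMX back and returns dest to the caller.
	 */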
15:	addi r1,r1,STACKFRAMESIZE
	ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b CFUNC(exit_vmx_ops) /* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg r6,r3
	mtocrf 0x01,r6
	clrldi r6,r6,(64-4)

	bf cr7*4+3,1f
	lbz r0,0(r4)
	addi r4,r4,1
	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

3:	bf cr7*4+0,4f
	lwz r0,0(r4) /* Less chance of a reject with word ops */
	lwz r7,4(r4)
	addi r4,r4,8
	stw r0,0(r3)
	stw r7,4(r3)
	addi r3,r3,8

4:	sub r5,r5,r6

	/* Get the destination 128B aligned */
	neg r6,r3
	srdi r7,r6,4
	mtocrf 0x01,r7
	clrldi r6,r6,(64-7)

	li r9,16
	li r10,32
	li r11,48

	LVS(v16,0,r4) /* Setup permute control vector */
	lvx v0,0,r4
	addi r4,r4,16
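	/*
	 * Software-pipelined realignment: v16 is the permute control for
	 * the source misalignment, v0 always holds the most recent 16B
	 * load, and each VPERM combines the previous and current loads
	 * into one aligned 16B store.  The extra 16B read-ahead is undone
	 * at label 11 below (addi r4,r4,-16).
	 */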
477
478
bf cr7*4+3,5f
479
lvx v1,0,r4
480
VPERM(v8,v0,v1,v16)
481
addi r4,r4,16
482
stvx v8,0,r3
483
addi r3,r3,16
484
vor v0,v1,v1
485
486
5: bf cr7*4+2,6f
487
lvx v1,0,r4
488
VPERM(v8,v0,v1,v16)
489
lvx v0,r4,r9
490
VPERM(v9,v1,v0,v16)
491
addi r4,r4,32
492
stvx v8,0,r3
493
stvx v9,r3,r9
494
addi r3,r3,32
495
496
6: bf cr7*4+1,7f
497
lvx v3,0,r4
498
VPERM(v8,v0,v3,v16)
499
lvx v2,r4,r9
500
VPERM(v9,v3,v2,v16)
501
lvx v1,r4,r10
502
VPERM(v10,v2,v1,v16)
503
lvx v0,r4,r11
504
VPERM(v11,v1,v0,v16)
505
addi r4,r4,64
506
stvx v8,0,r3
507
stvx v9,r3,r9
508
stvx v10,r3,r10
509
stvx v11,r3,r11
510
addi r3,r3,64
511
512
7: sub r5,r5,r6
513
srdi r6,r5,7
514
515
std r14,STK_REG(R14)(r1)
516
std r15,STK_REG(R15)(r1)
517
std r16,STK_REG(R16)(r1)
518
519
li r12,64
520
li r14,80
521
li r15,96
522
li r16,112
523
524
mtctr r6
525
526
/*
527
* Now do cacheline sized loads and stores. By this stage the
528
* cacheline stores are also cacheline aligned.
529
*/
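	/*
	 * Same structure as the aligned loop, but every store value is
	 * produced by VPERM from two consecutive source vectors; the last
	 * load of each iteration is left in v0 for the next one.
	 */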
	.align 5
8:
	lvx v7,0,r4
	VPERM(v8,v0,v7,v16)
	lvx v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi r4,r4,128
	stvx v8,0,r3
	stvx v9,r3,r9
	stvx v10,r3,r10
	stvx v11,r3,r11
	stvx v12,r3,r12
	stvx v13,r3,r14
	stvx v14,r3,r15
	stvx v15,r3,r16
	addi r3,r3,128
	bdnz 8b

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi r5,r5,(64-7)
	srdi r6,r5,4
	mtocrf 0x01,r6

	bf cr7*4+1,9f
	lvx v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi r4,r4,64
	stvx v8,0,r3
	stvx v9,r3,r9
	stvx v10,r3,r10
	stvx v11,r3,r11
	addi r3,r3,64

9:	bf cr7*4+2,10f
	lvx v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi r4,r4,32
	stvx v8,0,r3
	stvx v9,r3,r9
	addi r3,r3,32

10:	bf cr7*4+3,11f
	lvx v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi r4,r4,16
	stvx v8,0,r3
	addi r3,r3,16

	/* Up to 15B to go */
11:	clrldi r5,r5,(64-4)
	addi r4,r4,-16 /* Unwind the +16 load offset */
	mtocrf 0x01,r5
	bf cr7*4+0,12f
	lwz r0,0(r4) /* Less chance of a reject with word ops */
	lwz r6,4(r4)
	addi r4,r4,8
	stw r0,0(r3)
	stw r6,4(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
	lwz r0,0(r4)
	addi r4,r4,4
	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
	lhz r0,0(r4)
	addi r4,r4,2
	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
	lbz r0,0(r4)
	stb r0,0(r3)

15:	addi r1,r1,STACKFRAMESIZE
	ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b CFUNC(exit_vmx_ops) /* tail call optimise */
#endif /* CONFIG_ALTIVEC */