/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <[email protected]>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

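/*
 * The unaligned VMX copy below builds each aligned destination quadword by
 * permuting two adjacent source quadwords.  On little-endian the permute
 * control vector and the vperm operand order both have to be mirrored, so
 * LVS()/VPERM() hide the lvsl/lvsr and operand-swap difference here.
 */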
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

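/*
 * err1..err4 label the user-space access that follows them and emit an
 * exception table entry for it, so a fault in that access branches to the
 * matching .Ldo_errN fixup below.  The number reflects how much register
 * and stack state has to be unwound at that point in the copy.
 */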
.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
.endm

.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
.endm

#ifdef CONFIG_ALTIVEC
.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
.endm

.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
.endm


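/*
 * Fault fixups.  Each level restores the non-volatile registers it had
 * saved and the VMX levels drop back out of VMX first; .Ldo_err1 then
 * reloads the original dest/src/len saved at entry and falls back to
 * __copy_tofrom_user_base, whose own fixups produce the bytes-not-copied
 * return value.
 */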
.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	CFUNC(exit_vmx_usercopy)
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


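/*
 * __copy_tofrom_user_power7(r3 = dest, r4 = src, r5 = len) returns the
 * number of bytes not copied (0 on success).  Copies under 16 bytes go
 * straight to .Lshort_copy; copies over 3328 bytes take the VMX path when
 * Altivec is available and usable in this context.
 */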
_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,3328

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
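	/*
	 * r6 = -src: its low bits are the number of bytes needed to reach
	 * 8B alignment.  mtocrf moves the low nibble into cr7 so the bf
	 * tests below pick the 1/2/4 byte steps, and clrldi keeps the count
	 * in r6 for the length adjustment at 3: below.
	 */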
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
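	/* Sixteen 8B loads then sixteen 8B stores per iteration; ctr = len / 128. */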
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
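	/*
	 * cr7 is loaded from r5 >> 4, so its bits select the remaining 64B,
	 * 32B and 16B blocks; .Lshort_copy then finishes the last 0-15 bytes
	 * from the low four bits of r5.
	 */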
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

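/*
 * VMX copy: save LR, allocate a stack frame and call enter_vmx_usercopy().
 * If it returns 0 (tested via cr1 further down) VMX cannot be used in this
 * context and we unwind the frame and fall back to the scalar copy.
 */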
.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	CFUNC(enter_vmx_usercopy)
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
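	/*
	 * r6/r9: cacheline-aligned source/dest, with the stream ID in the
	 * low bits.  r7/r10: the unit count (length in cachelines, capped
	 * at 0x3FF) plus a depth of 7, again tagged with the stream ID.
	 * r8 is a scratch register for DCBT_SETUP_STREAMS.
	 */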
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
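	/*
	 * "Relatively aligned" means source and destination have the same
	 * offset within a 16B quadword: the xor/rldicl. below tests the low
	 * four bits of src ^ dest.
	 */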
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
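	/*
	 * Eight 16B vector loads then eight vector stores per iteration;
	 * r9-r12 and r14-r16 hold the 16..112 byte offsets.
	 */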
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	CFUNC(exit_vmx_usercopy)	/* tail call optimise */

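/*
 * Source and destination differ in their offset within a quadword.  Align
 * the destination, keep the most recent source quadword in v0, and use the
 * LVS-generated control vector v16 to VPERM each pair of adjacent source
 * quadwords into an aligned destination store.  The source is read one
 * quadword ahead throughout (see the -16 fixup before the scalar tail).
 */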
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
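	/* (the permute loop above always reads one quadword ahead of what it stores) */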
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	CFUNC(exit_vmx_usercopy)	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */