GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/powerpc/lib/copy_32.S
/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text
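/*
 * Note on the token pasting above: COPY_16_BYTES_WITHEX(n) labels each
 * load 8n0..8n3 and each store 8n4..8n7, and COPY_16_BYTES_EXCODE(n)
 * emits __ex_table pairs sending load faults to 9n0 and store faults
 * to 9n1.  For example, COPY_16_BYTES_WITHEX(2) expands to labels
 * 820: through 827:, whose fixups 920:/921: adjust the residual count
 * in r5 before branching to the common fault handlers at 104/105.
 */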

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 */
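/*
 * Roughly equivalent C for the fast path (an illustrative sketch,
 * not part of the build; the assembly below also aligns the pointer
 * and handles the sub-cacheline head and tail):
 *
 *	char *p = dst;
 *	while (n >= L1_CACHE_BYTES) {
 *		dcbz(p);		// zero one whole line in the cache
 *		p += L1_CACHE_BYTES;
 *		n -= L1_CACHE_BYTES;
 *	}
 *	// then store the remaining words, then the remaining bytes
 */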
_GLOBAL(cacheable_memzero)
	mr	r5,r4
	li	r4,0
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4
2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15
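	/*
	 * The two rlwimi ops above splat the fill byte into all four
	 * byte lanes: 0x000000ab -> 0x0000abab -> 0xabababab, so the
	 * main loop can store whole words at a time.
	 */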
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 */
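/*
 * Roughly equivalent C for the entry check (illustrative sketch):
 *
 *	// fall back to plain memcpy when [src, src+n) and
 *	// [dst, dst+n) overlap, because dcbz on the destination
 *	// could clobber source bytes that haven't been read yet
 *	if (src < dst + n && dst < src + n)
 *		return memcpy(dst, src, n);
 */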
_GLOBAL(cacheable_memcpy)
	add	r7,r3,r5	/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4		/* cr0.lt &= cr1.lt */
	blt	memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0		/* is this more than total to do? */
	blt	63f		/* if not much to do */
	andi.	r8,r0,3		/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)	/* do some bytes */
	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)	/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	blr

_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */
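	/*
	 * When dst > src the regions may overlap so that a forward
	 * copy would overwrite source bytes before they are read;
	 * backwards_memcpy handles that case.  Otherwise the forward
	 * memcpy below is safe even for overlapping regions.
	 */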

_GLOBAL(memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f	/* if less than 8 bytes to do */
	andi.	r0,r6,3	/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31	/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0		/* is this more than total to do? */
	blt	63f		/* if not much to do */
	andi.	r8,r0,3		/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)	/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)	/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */
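	/*
	 * Worked example (the actual values come from the platform
	 * headers): if MAX_COPY_PREFETCH were 4 with 32-byte cache
	 * lines, a large copy primes r7 = 4 lines of prefetch here,
	 * and each dcbt in the loop below then touches data
	 * 4*32 = 128 bytes ahead of the line currently being copied.
	 */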

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
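/*
 * For example, a load fault in COPY_16_BYTES_WITHEX(2) is redirected
 * to label 920:, which subtracts from r5 the 16*2 bytes of the current
 * cacheline already copied by blocks 0 and 1 (the interrupted line is
 * still counted in full by ctr) and branches to the common read-fault
 * path at 104.
 */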
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
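/*
 * Example: a fault in the final word loop arrives via 108/109 with
 * r3 = 2, so with ctr = 5 words still to go and r5 = 3 trailing
 * bytes, the code at 99: computes (5 << 2) + 3 = 23 bytes not copied.
 */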
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f	/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text