GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/sparc/lib/NG2memcpy.S

/* NG2memcpy.S: Niagara-2 optimized memcpy.
 *
 * Copyright (C) 2007 David S. Miller ([email protected])
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_PNF 0x82
#define ASI_BLK_P 0xf0
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#define GLOBAL_SPARE	%g5
#endif
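
/* When building outside the kernel, VISEntryHalf/VISExitHalf simply
 * toggle the FPU enable bit: the entry macro saves the caller's %fprs
 * in %o5 and sets FPRS_FEF, and the exit macro writes the saved value
 * back.  The MEMCPY_DEBUG variant additionally clears the scratch
 * registers and condition codes so stale state is caught early.
 */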

#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif
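
/* STORE_INIT uses the block-init ASI.  On Niagara chips an initializing
 * store to the first word of an L2 line allocates the line without first
 * reading its old contents from memory, which is the right thing when
 * all 64 bytes are about to be overwritten anyway.  The simulation knob
 * falls back to ordinary ASI_P stores for non-Niagara parts.
 */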

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif
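
/* EX_LD/EX_ST default to the bare access.  The copy_from_user and
 * copy_to_user wrappers that include this file redefine them to attach
 * exception table entries to each user-space access, and redefine
 * EX_RETVAL so those variants return 0 (no bytes left) on success
 * instead of the destination pointer.
 */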

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef LOAD_BLK
#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
#define STORE(type,src,addr)	type##a src, [addr] 0x80
#endif
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NG2memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
	faligndata	%x0, %x1, %f0; \
	faligndata	%x1, %x2, %f2; \
	faligndata	%x2, %x3, %f4; \
	faligndata	%x3, %x4, %f6; \
	faligndata	%x4, %x5, %f8; \
	faligndata	%x5, %x6, %f10; \
	faligndata	%x6, %x7, %f12; \
	faligndata	%x7, %x8, %f14;
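
/* faligndata concatenates a pair of doublewords and extracts eight
 * bytes starting at the offset that alignaddr latched into %gsr, so
 * feeding FREG_FROB nine consecutive source doublewords yields one
 * fully aligned 64-byte block in %f0-%f14.
 */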

#define FREG_MOVE_1(x0) \
	fmovd		%x0, %f0;
#define FREG_MOVE_2(x0, x1) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2;
#define FREG_MOVE_3(x0, x1, x2) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4;
#define FREG_MOVE_4(x0, x1, x2, x3) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6;
#define FREG_MOVE_5(x0, x1, x2, x3, x4) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6; \
	fmovd		%x4, %f8;
#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6; \
	fmovd		%x4, %f8; \
	fmovd		%x5, %f10;
#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6; \
	fmovd		%x4, %f8; \
	fmovd		%x5, %f10; \
	fmovd		%x6, %f12;
#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6; \
	fmovd		%x4, %f8; \
	fmovd		%x5, %f10; \
	fmovd		%x6, %f12; \
	fmovd		%x7, %f14;
#define FREG_LOAD_1(base, x0) \
	EX_LD(LOAD(ldd, base + 0x00, %x0))
#define FREG_LOAD_2(base, x0, x1) \
	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
	EX_LD(LOAD(ldd, base + 0x08, %x1));
#define FREG_LOAD_3(base, x0, x1, x2) \
	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
	EX_LD(LOAD(ldd, base + 0x10, %x2));
#define FREG_LOAD_4(base, x0, x1, x2, x3) \
	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
	EX_LD(LOAD(ldd, base + 0x10, %x2)); \
	EX_LD(LOAD(ldd, base + 0x18, %x3));
#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
	EX_LD(LOAD(ldd, base + 0x10, %x2)); \
	EX_LD(LOAD(ldd, base + 0x18, %x3)); \
	EX_LD(LOAD(ldd, base + 0x20, %x4));
#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
	EX_LD(LOAD(ldd, base + 0x10, %x2)); \
	EX_LD(LOAD(ldd, base + 0x18, %x3)); \
	EX_LD(LOAD(ldd, base + 0x20, %x4)); \
	EX_LD(LOAD(ldd, base + 0x28, %x5));
#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
	EX_LD(LOAD(ldd, base + 0x10, %x2)); \
	EX_LD(LOAD(ldd, base + 0x18, %x3)); \
	EX_LD(LOAD(ldd, base + 0x20, %x4)); \
	EX_LD(LOAD(ldd, base + 0x28, %x5)); \
	EX_LD(LOAD(ldd, base + 0x30, %x6));
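
/* Helpers for the partial leading block: depending on src & 63 the
 * first aligned pass still needs some doublewords fetched from before
 * the first 64-byte source boundary.  The loop variants at 110-180
 * below use FREG_LOAD_N to preload exactly those, and FREG_MOVE_N to
 * shuffle the surviving registers down after each iteration.
 */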

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, GLOBAL_SPARE
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 *
	 * However, the cut-off point, performance wise, is around
	 * 4 64-byte blocks.
	 */
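	/* Worst case the alignment step below eats 63 bytes, and
	 * 128 - 63 = 65 still rounds down to one whole 64-byte block,
	 * so the block copy loop is guaranteed to iterate at least once.
	 */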
	cmp		%o2, (4 * 64)
	blu,pt		%XCC, 75f
	 andcc		%o3, 0x7, %g0

	/* %o0: dst
	 * %o1: src
	 * %o2: len  (known to be >= 128)
	 *
	 * The block copy loops can use %o4, %g2, %g3 as
	 * temporaries while copying the data.  %o5 must
	 * be preserved between VISEntryHalf and VISExitHalf
	 */

	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	LOAD(prefetch, %o1 + 0x080, #one_read)
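
	/* #one_read hints that the data will be read once and not
	 * reused, letting the chip stream it in without displacing
	 * lines that are still useful in the cache.
	 */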

	/* Align destination on 64-byte boundary. */
	andcc		%o0, (64 - 1), %o4
	be,pt		%XCC, 2f
	 sub		%o4, 64, %o4
	sub		%g0, %o4, %o4	! bytes to align dst
	sub		%o2, %o4, %o2
1:	subcc		%o4, 1, %o4
	EX_LD(LOAD(ldub, %o1, %g1))
	EX_ST(STORE(stb, %g1, %o0))
	add		%o1, 1, %o1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0

2:
	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
	VISEntryHalf

	alignaddr	%o1, %g0, %g0

	add		%o1, (64 - 1), %o4
	andn		%o4, (64 - 1), %o4
	andn		%o2, (64 - 1), %g1
	sub		%o2, %g1, %o2

	and		%o1, (64 - 1), %g2
	add		%o1, %g1, %o1
	sub		%o0, %o4, %g3
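
	/* At this point:
	 *   %o4 = src rounded up to the next 64-byte boundary
	 *   %g1 = bytes to be moved in whole 64-byte blocks
	 *   %g2 = src & 63, which picks the loop variant below
	 *   %g3 = dst - aligned src, so stores go to %o4 + %g3
	 */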
	brz,pt		%g2, 190f
	 cmp		%g2, 32
	blu,a		5f
	 cmp		%g2, 16
	cmp		%g2, 48
	blu,a		4f
	 cmp		%g2, 40
	cmp		%g2, 56
	blu		170f
	 nop
	ba,a,pt		%xcc, 180f

4:	/* 32 <= low bits < 48 */
	blu		150f
	 nop
	ba,a,pt		%xcc, 160f
5:	/* 0 < low bits < 32 */
	blu,a		6f
	 cmp		%g2, 8
	cmp		%g2, 24
	blu		130f
	 nop
	ba,a,pt		%xcc, 140f
6:	/* 0 < low bits < 16 */
	bgeu		120f
	 nop
	/* fall through for 0 < low bits < 8 */
110:	sub		%o4, 64, %g2
	EX_LD(LOAD_BLK(%g2, %f0))
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop
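
	/* All of the 1xx variants run the same software pipeline:
	 * prime the destination line with STORE_INIT, block-load the
	 * next 64 source bytes into %f16-%f30, align the current data
	 * into %f0-%f14 with FREG_FROB, block-store it, then shift the
	 * leftover doublewords down for the next round while
	 * prefetching further ahead.
	 */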

120:	sub		%o4, 56, %g2
	FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

130:	sub		%o4, 48, %g2
	FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

140:	sub		%o4, 40, %g2
	FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_5(f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

150:	sub		%o4, 32, %g2
	FREG_LOAD_4(%g2, f0, f2, f4, f6)
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_4(f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

160:	sub		%o4, 24, %g2
	FREG_LOAD_3(%g2, f0, f2, f4)
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_3(f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

170:	sub		%o4, 16, %g2
	FREG_LOAD_2(%g2, f0, f2)
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_2(f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

180:	sub		%o4, 8, %g2
	FREG_LOAD_1(%g2, f0)
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_1(f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

190:
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	subcc		%g1, 64, %g1
	EX_LD(LOAD_BLK(%o4, %f0))
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
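
	/* 190 is the fully aligned case: src & 63 was zero, so blocks
	 * can be streamed straight through %f0-%f14 with no faligndata
	 * pass at all.
	 */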

195:
	add		%o4, %g3, %o0
	membar		#Sync
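
	/* Block loads and stores are only weakly ordered with respect
	 * to other memory operations, so the membar #Sync above makes
	 * sure they have all completed before the FPU state is restored
	 * and the tail bytes are copied with ordinary instructions.
	 */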

	VISExitHalf

	/* %o2 contains any final bytes still needed to be copied
	 * over.  If anything is left, we copy it one byte at a time.
	 */
	brz,pt		%o2, 85f
	 sub		%o0, %o1, %o3
	ba,a,pt		%XCC, 90f

	.align		64
75: /* 16 <= len < 256 */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

72:
	andn		%o2, 0xf, %o4
	and		%o2, 0xf, %o2
1:	subcc		%o4, 0x10, %o4
	EX_LD(LOAD(ldx, %o1, %o5))
	add		%o1, 0x08, %o1
	EX_LD(LOAD(ldx, %o1, %g1))
	sub		%o1, 0x08, %o1
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3))
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	EX_LD(LOAD(ldx, %o1, %o5))
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	EX_LD(LOAD(lduw, %o1, %o5))
	EX_ST(STORE(stw, %o5, %o1 + %o3))
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %o1, %o5))
	EX_ST(STORE(stb, %o5, %o1 + %o3))
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2))
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, %o4
	sllx		%g2, %g1, %g2
1:	add		%o1, 0x8, %o1
	EX_LD(LOAD(ldx, %o1, %g3))
	subcc		%o4, 0x8, %o4
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0))
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2
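
	/* The loop above handles a doubleword-aligned destination with
	 * a misaligned source without using VIS: %g1 is the source byte
	 * offset expressed in bits, %o3 is 64 - %g1, and each iteration
	 * splices the tail of one source doubleword onto the head of
	 * the next with sllx/srlx/or before storing 8 bytes.
	 */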

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
80:	/* 0 < len <= 16 */
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:
	subcc		%o2, 4, %o2
	EX_LD(LOAD(lduw, %o1, %g1))
	EX_ST(STORE(stw, %g1, %o1 + %o3))
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		EX_RETVAL(GLOBAL_SPARE), %o0

	.align		32
90:
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %g1))
	EX_ST(STORE(stb, %g1, %o1 + %o3))
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(GLOBAL_SPARE), %o0

	.size		FUNC_NAME, .-FUNC_NAME