GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/sparc/lib/NGmemcpy.S
/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006, 2007 David S. Miller ([email protected])
 */

#ifdef __KERNEL__
#include <asm/asi.h>
#include <asm/thread_info.h>
#define GLOBAL_SPARE	%g7
#define RESTORE_ASI(TMP)	\
	ldub	[%g6 + TI_CURRENT_DS], TMP; \
	wr	TMP, 0x0, %asi;
#else
#define GLOBAL_SPARE	%g5
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif

#ifdef __sparc_v9__
#define SAVE_AMOUNT	128
#else
#define SAVE_AMOUNT	64
#endif

#ifndef STORE_ASI
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)	type [addr], dest
#else
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#endif
#endif

#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_INIT
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#else
#define STORE_INIT(src,addr)	stx src, [addr + 0x00]
#endif
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NGmemcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align	64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %i0=dst, %i1=src, %i2=len */
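	/*
	 * In C terms the contract is roughly (a sketch for orientation,
	 * not a declaration taken from this file):
	 *
	 *	void *NGmemcpy(void *dst, const void *src, size_t len);
	 *
	 * %i0 (dst) is preserved as the return value, and the
	 * srlx/cmp/tne sequence below traps if len is 2^31 or larger.
	 */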
	PREAMBLE
	save		%sp, -SAVE_AMOUNT, %sp
	srlx		%i2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	mov		%i0, %o0
	cmp		%i2, 0
	be,pn		%XCC, 85f
	or		%o0, %i1, %i3
	cmp		%i2, 16
	blu,a,pn	%XCC, 80f
	or		%i3, %i2, %i3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with. We need to ensure that we'll iterate at least
	 * once in the block copy loop. At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
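	/*
	 * A minimal C sketch of that bound (hypothetical names, not
	 * part of this file):
	 *
	 *	pad = dst & 63 ? 64 - (dst & 63) : 0;
	 *
	 * pad never exceeds 63, so len >= 128 leaves at least 65
	 * bytes, i.e. the 64-byte block loop runs at least once.
	 */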
	cmp		%i2, (2 * 64)
	blu,pt		%XCC, 70f
	andcc		%i3, 0x7, %g0

	/* %o0: dst
	 * %i1: src
	 * %i2: len  (known to be >= 128)
	 *
	 * The block copy loops will use %i4/%i5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %i1, #one_read)
	wr		%g0, STORE_ASI, %asi
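	/*
	 * #one_read marks the data as read-once to the prefetch
	 * hardware, and the %asi write selects the block-init ASI
	 * that STORE_INIT uses below.
	 */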

	/* Align destination on 64-byte boundary. */
	andcc		%o0, (64 - 1), %i4
	be,pt		%XCC, 2f
	sub		%i4, 64, %i4
	sub		%g0, %i4, %i4	! bytes to align dst
	sub		%i2, %i4, %i2
1:	subcc		%i4, 1, %i4
	EX_LD(LOAD(ldub, %i1, %g1))
	EX_ST(STORE(stb, %g1, %o0))
	add		%i1, 1, %i1
	bne,pt		%XCC, 1b
	add		%o0, 1, %o0

	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop. If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The careful bit with init stores is that if we store
	 * to any part of the cache line we have to store the whole
	 * cacheline else we can end up with corrupt L2 cache line
	 * contents. Since the loop works on 64-bytes of 64-byte
	 * aligned store data at a time, this is easy to ensure.
	 */
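	/*
	 * Dispatch sketch (C-like, for orientation only; the numeric
	 * labels are the ones used below):
	 *
	 *	off = src & 15;
	 *	if (off == 0)      goto 50;	direct twin-load loop
	 *	else if (off == 8) goto 10;	twin loads at src - 8
	 *	else if (off < 8)  goto 8;	shift-and-mask
	 *	else               goto 9;	shift-and-mask
	 */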
2:
	andcc		%i1, (16 - 1), %i4
	andn		%i2, (64 - 1), %g1	! block copy loop iterator
	be,pt		%XCC, 50f
	sub		%i2, %g1, %i2		! final sub-block copy bytes

	cmp		%i4, 8
	be,pt		%XCC, 10f
	sub		%i1, %i4, %i1

	/* Neither 8-byte nor 16-byte aligned, shift and mask. */
	and		%i4, 0x7, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE
	mov		64, %i5
	EX_LD(LOAD_TWIN(%i1, %g2, %g3))
	sub		%i5, GLOBAL_SPARE, %i5
	mov		16, %o4
	mov		32, %o5
	mov		48, %o7
	mov		64, %i3

	bg,pn		%XCC, 9f
	nop

#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
	sllx		WORD1, POST_SHIFT, WORD1; \
	srlx		WORD2, PRE_SHIFT, TMP; \
	sllx		WORD2, POST_SHIFT, WORD2; \
	or		WORD1, TMP, WORD1; \
	srlx		WORD3, PRE_SHIFT, TMP; \
	or		WORD2, TMP, WORD2;
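	/*
	 * Hypothetical C model of MIX_THREE_WORDS on this big-endian,
	 * 64-bit machine (names invented here; a sketch, not code from
	 * this file):
	 *
	 *	static inline void mix_three_words(uint64_t w1, uint64_t w2,
	 *					   uint64_t w3, unsigned pre,
	 *					   unsigned post, uint64_t out[2])
	 *	{
	 *		out[0] = (w1 << post) | (w2 >> pre);
	 *		out[1] = (w2 << post) | (w3 >> pre);
	 *	}
	 *
	 * With post = (src & 7) * 8 and pre = 64 - post, each output
	 * word is built from the tail of one misaligned word and the
	 * head of the next: the integer faligndata mentioned above.
	 */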

8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3))
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g2, %o0 + 0x00))
	EX_ST(STORE_INIT(%g3, %o0 + 0x08))

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3))
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x10))
	EX_ST(STORE_INIT(%o3, %o0 + 0x18))

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g2, %o0 + 0x20))
	EX_ST(STORE_INIT(%g3, %o0 + 0x28))

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3))
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x30))
	EX_ST(STORE_INIT(%o3, %o0 + 0x38))

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 8b
	add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	add		%i1, %i4, %i1

9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3))
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g3, %o0 + 0x00))
	EX_ST(STORE_INIT(%o2, %o0 + 0x08))

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3))
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x10))
	EX_ST(STORE_INIT(%g2, %o0 + 0x18))

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g3, %o0 + 0x20))
	EX_ST(STORE_INIT(%o2, %o0 + 0x28))

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3))
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x30))
	EX_ST(STORE_INIT(%g2, %o0 + 0x38))

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 9b
	add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	add		%i1, %i4, %i1

10:	/* Destination is 64-byte aligned, source was only 8-byte
	 * aligned but it has been subtracted by 8 and we perform
	 * one twin load ahead, then add 8 back into source when
	 * we finish the loop.
	 */
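	/*
	 * Hypothetical C model of the pipelined loop below (invented
	 * names, big-endian 64-bit; ld_twin() stands in for the
	 * 16-byte LDDA):
	 *
	 *	uint64_t junk, carry, a0, a1, b0, b1, c0, c1;
	 *	ld_twin(s, &junk, &carry);	s = src - 8, 16-byte aligned
	 *	while (blocks--) {
	 *		ld_twin(s + 2, &a0, &a1);
	 *		d[0] = carry; d[1] = a0; d[2] = a1;
	 *		ld_twin(s + 4, &b0, &b1);
	 *		d[3] = b0; d[4] = b1;
	 *		ld_twin(s + 6, &c0, &c1);
	 *		d[5] = c0; d[6] = c1;
	 *		ld_twin(s + 8, &a0, &carry);
	 *		d[7] = a0;
	 *		s += 8; d += 8;
	 *	}
	 *
	 * One twin load is always in flight, and the init stores are
	 * offset by one register relative to the loads.
	 */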
	EX_LD(LOAD_TWIN(%i1, %o4, %o5))
	mov		16, %o7
	mov		32, %g2
	mov		48, %g3
	mov		64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o5, %o0 + 0x00))	! initializes cache line
	EX_ST(STORE_INIT(%o2, %o0 + 0x08))
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5))
	EX_ST(STORE_INIT(%o3, %o0 + 0x10))
	EX_ST(STORE_INIT(%o4, %o0 + 0x18))
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3))
	EX_ST(STORE_INIT(%o5, %o0 + 0x20))
	EX_ST(STORE_INIT(%o2, %o0 + 0x28))
	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5))
	add		%i1, 64, %i1
	EX_ST(STORE_INIT(%o3, %o0 + 0x30))
	EX_ST(STORE_INIT(%o4, %o0 + 0x38))
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	add		%i1, 0x8, %i1

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
	mov		16, %o7
	mov		32, %g2
	mov		48, %g3
	mov		64, %o1
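	/*
	 * Note: LOAD_TWIN expands to an ldda with an explicit ASI,
	 * which requires the register+register address form, so %g0
	 * supplies the zero offset in the first load below.
	 */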
1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5))
	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o4, %o0 + 0x00))	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08))
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5))
	EX_ST(STORE_INIT(%o2, %o0 + 0x10))
	EX_ST(STORE_INIT(%o3, %o0 + 0x18))
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3))
	add		%i1, 64, %i1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20))
	EX_ST(STORE_INIT(%o5, %o0 + 0x28))
	EX_ST(STORE_INIT(%o2, %o0 + 0x30))
	EX_ST(STORE_INIT(%o3, %o0 + 0x38))
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	add		%o0, 64, %o0
	/* fall through */

60:
	membar		#Sync
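	/*
	 * The init stores above bypass the ordinary store path, so
	 * the #Sync is what guarantees they are globally visible
	 * before the tail copy below goes back to normal stores.
	 */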

	/* %i2 contains any final bytes still needed to be copied
	 * over. If anything is left, we copy it one byte at a time.
	 */
	RESTORE_ASI(%i3)
	brz,pt		%i2, 85f
	sub		%o0, %i1, %i3
	ba,a,pt		%XCC, 90f

	.align		64
70:	/* 16 <= len < 128 */
	bne,pn		%XCC, 75f
	sub		%o0, %i1, %i3

72:
	andn		%i2, 0xf, %i4
	and		%i2, 0xf, %i2
1:	subcc		%i4, 0x10, %i4
	EX_LD(LOAD(ldx, %i1, %o4))
	add		%i1, 0x08, %i1
	EX_LD(LOAD(ldx, %i1, %g1))
	sub		%i1, 0x08, %i1
	EX_ST(STORE(stx, %o4, %i1 + %i3))
	add		%i1, 0x8, %i1
	EX_ST(STORE(stx, %g1, %i1 + %i3))
	bgu,pt		%XCC, 1b
	add		%i1, 0x8, %i1
73:	andcc		%i2, 0x8, %g0
	be,pt		%XCC, 1f
	nop
	sub		%i2, 0x8, %i2
	EX_LD(LOAD(ldx, %i1, %o4))
	EX_ST(STORE(stx, %o4, %i1 + %i3))
	add		%i1, 0x8, %i1
1:	andcc		%i2, 0x4, %g0
	be,pt		%XCC, 1f
	nop
	sub		%i2, 0x4, %i2
	EX_LD(LOAD(lduw, %i1, %i5))
	EX_ST(STORE(stw, %i5, %i1 + %i3))
	add		%i1, 0x4, %i1
1:	cmp		%i2, 0
	be,pt		%XCC, 85f
	nop
	ba,pt		%xcc, 90f
	nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	sub		%g0, %g1, %g1
	sub		%i2, %g1, %i2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %i1, %i5))
	EX_ST(STORE(stb, %i5, %i1 + %i3))
	bgu,pt		%icc, 1b
	add		%i1, 1, %i1

2:	add		%i1, %i3, %o0
	andcc		%i1, 0x7, %g1
	bne,pt		%icc, 8f
	sll		%g1, 3, %g1

	cmp		%i2, 16
	bgeu,pt		%icc, 72b
	nop
	ba,a,pt		%xcc, 73b
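	/*
	 * The loop at 8: below is a two-word variant of the same
	 * shift-and-mask trick (C-like sketch with invented names,
	 * big-endian):
	 *
	 *	out = (prev << g1) | (next >> (64 - g1));
	 *
	 * where %g1 holds the source misalignment in bits.
	 */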

8:	mov		64, %i3
	andn		%i1, 0x7, %i1
	EX_LD(LOAD(ldx, %i1, %g2))
	sub		%i3, %g1, %i3
	andn		%i2, 0x7, %i4
	sllx		%g2, %g1, %g2
1:	add		%i1, 0x8, %i1
	EX_LD(LOAD(ldx, %i1, %g3))
	subcc		%i4, 0x8, %i4
	srlx		%g3, %i3, %i5
	or		%i5, %g2, %i5
	EX_ST(STORE(stx, %i5, %o0))
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%i2, 0x7, %i2
	be,pn		%icc, 85f
	add		%i1, %g1, %i1
	ba,pt		%xcc, 90f
	sub		%o0, %i1, %i3

	.align		64
80:	/* 0 < len < 16 */
	andcc		%i3, 0x3, %g0
	bne,pn		%XCC, 90f
	sub		%o0, %i1, %i3

1:
	subcc		%i2, 4, %i2
	EX_LD(LOAD(lduw, %i1, %g1))
	EX_ST(STORE(stw, %g1, %i1 + %i3))
	bgu,pt		%XCC, 1b
	add		%i1, 4, %i1

85:	ret
	restore		EX_RETVAL(%i0), %g0, %o0

	.align		32
90:
	subcc		%i2, 1, %i2
	EX_LD(LOAD(ldub, %i1, %g1))
	EX_ST(STORE(stb, %g1, %i1 + %i3))
	bgu,pt		%XCC, 90b
	add		%i1, 1, %i1
	ret
	restore		EX_RETVAL(%i0), %g0, %o0

	.size		FUNC_NAME, .-FUNC_NAME