Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/sparc/lib/NG2memcpy.S
26444 views
1
/* SPDX-License-Identifier: GPL-2.0 */
2
/* NG2memcpy.S: Niagara-2 optimized memcpy.
3
*
4
* Copyright (C) 2007 David S. Miller ([email protected])
5
*/
6
7
#ifdef __KERNEL__
8
#include <linux/linkage.h>
9
#include <asm/visasm.h>
10
#include <asm/asi.h>
11
#define GLOBAL_SPARE %g7
12
#else
13
#define ASI_PNF 0x82
14
#define ASI_BLK_P 0xf0
15
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
16
#define FPRS_FEF 0x04
17
#ifdef MEMCPY_DEBUG
18
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
19
clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
20
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
21
#else
22
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
23
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
24
#endif
25
#define GLOBAL_SPARE %g5
26
#endif
27
28
#ifndef STORE_ASI
29
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
30
#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
31
#else
32
#define STORE_ASI 0x80 /* ASI_P */
33
#endif
34
#endif
35
36
#ifndef EX_LD
37
#define EX_LD(x,y) x
38
#endif
39
#ifndef EX_LD_FP
40
#define EX_LD_FP(x,y) x
41
#endif
42
43
#ifndef EX_ST
44
#define EX_ST(x,y) x
45
#endif
46
#ifndef EX_ST_FP
47
#define EX_ST_FP(x,y) x
48
#endif
49
50
#ifndef LOAD
51
#define LOAD(type,addr,dest) type [addr], dest
52
#endif
53
54
#ifndef LOAD_BLK
55
#define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_P, dest
56
#endif
57
58
#ifndef STORE
59
#ifndef MEMCPY_DEBUG
60
#define STORE(type,src,addr) type src, [addr]
61
#else
62
#define STORE(type,src,addr) type##a src, [addr] 0x80
63
#endif
64
#endif
65
66
#ifndef STORE_BLK
67
#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P
68
#endif
69
70
#ifndef STORE_INIT
71
#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
72
#endif
73
74
#ifndef FUNC_NAME
75
#define FUNC_NAME NG2memcpy
76
#endif
77
78
#ifndef PREAMBLE
79
#define PREAMBLE
80
#endif
81
82
#ifndef XCC
83
#define XCC xcc
84
#endif
85
86
#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
87
faligndata %x0, %x1, %f0; \
88
faligndata %x1, %x2, %f2; \
89
faligndata %x2, %x3, %f4; \
90
faligndata %x3, %x4, %f6; \
91
faligndata %x4, %x5, %f8; \
92
faligndata %x5, %x6, %f10; \
93
faligndata %x6, %x7, %f12; \
94
faligndata %x7, %x8, %f14;
95
96
#define FREG_MOVE_1(x0) \
97
fsrc2 %x0, %f0;
98
#define FREG_MOVE_2(x0, x1) \
99
fsrc2 %x0, %f0; \
100
fsrc2 %x1, %f2;
101
#define FREG_MOVE_3(x0, x1, x2) \
102
fsrc2 %x0, %f0; \
103
fsrc2 %x1, %f2; \
104
fsrc2 %x2, %f4;
105
#define FREG_MOVE_4(x0, x1, x2, x3) \
106
fsrc2 %x0, %f0; \
107
fsrc2 %x1, %f2; \
108
fsrc2 %x2, %f4; \
109
fsrc2 %x3, %f6;
110
#define FREG_MOVE_5(x0, x1, x2, x3, x4) \
111
fsrc2 %x0, %f0; \
112
fsrc2 %x1, %f2; \
113
fsrc2 %x2, %f4; \
114
fsrc2 %x3, %f6; \
115
fsrc2 %x4, %f8;
116
#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
117
fsrc2 %x0, %f0; \
118
fsrc2 %x1, %f2; \
119
fsrc2 %x2, %f4; \
120
fsrc2 %x3, %f6; \
121
fsrc2 %x4, %f8; \
122
fsrc2 %x5, %f10;
123
#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
124
fsrc2 %x0, %f0; \
125
fsrc2 %x1, %f2; \
126
fsrc2 %x2, %f4; \
127
fsrc2 %x3, %f6; \
128
fsrc2 %x4, %f8; \
129
fsrc2 %x5, %f10; \
130
fsrc2 %x6, %f12;
131
#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
132
fsrc2 %x0, %f0; \
133
fsrc2 %x1, %f2; \
134
fsrc2 %x2, %f4; \
135
fsrc2 %x3, %f6; \
136
fsrc2 %x4, %f8; \
137
fsrc2 %x5, %f10; \
138
fsrc2 %x6, %f12; \
139
fsrc2 %x7, %f14;
140
#define FREG_LOAD_1(base, x0) \
141
EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1)
142
#define FREG_LOAD_2(base, x0, x1) \
143
EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
144
EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1);
145
#define FREG_LOAD_3(base, x0, x1, x2) \
146
EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
147
EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
148
EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1);
149
#define FREG_LOAD_4(base, x0, x1, x2, x3) \
150
EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
151
EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
152
EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
153
EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1);
154
#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
155
EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
156
EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
157
EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
158
EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
159
EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1);
160
#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
161
EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
162
EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
163
EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
164
EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
165
EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1); \
166
EX_LD_FP(LOAD(ldd, base + 0x28, %x5), NG2_retl_o2_plus_g1);
167
#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
168
EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
169
EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
170
EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
171
EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
172
EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1); \
173
EX_LD_FP(LOAD(ldd, base + 0x28, %x5), NG2_retl_o2_plus_g1); \
174
EX_LD_FP(LOAD(ldd, base + 0x30, %x6), NG2_retl_o2_plus_g1);
175
176
.register %g2,#scratch
177
.register %g3,#scratch
178
179
.text
180
#ifndef EX_RETVAL
181
#define EX_RETVAL(x) x
182
__restore_fp:
183
VISExitHalf
184
__restore_asi:
185
retl
186
wr %g0, ASI_AIUS, %asi
187
ENTRY(NG2_retl_o2)
188
ba,pt %xcc, __restore_asi
189
mov %o2, %o0
190
ENDPROC(NG2_retl_o2)
191
ENTRY(NG2_retl_o2_plus_1)
192
ba,pt %xcc, __restore_asi
193
add %o2, 1, %o0
194
ENDPROC(NG2_retl_o2_plus_1)
195
ENTRY(NG2_retl_o2_plus_4)
196
ba,pt %xcc, __restore_asi
197
add %o2, 4, %o0
198
ENDPROC(NG2_retl_o2_plus_4)
199
ENTRY(NG2_retl_o2_plus_8)
200
ba,pt %xcc, __restore_asi
201
add %o2, 8, %o0
202
ENDPROC(NG2_retl_o2_plus_8)
203
ENTRY(NG2_retl_o2_plus_o4_plus_1)
204
add %o4, 1, %o4
205
ba,pt %xcc, __restore_asi
206
add %o2, %o4, %o0
207
ENDPROC(NG2_retl_o2_plus_o4_plus_1)
208
ENTRY(NG2_retl_o2_plus_o4_plus_8)
209
add %o4, 8, %o4
210
ba,pt %xcc, __restore_asi
211
add %o2, %o4, %o0
212
ENDPROC(NG2_retl_o2_plus_o4_plus_8)
213
ENTRY(NG2_retl_o2_plus_o4_plus_16)
214
add %o4, 16, %o4
215
ba,pt %xcc, __restore_asi
216
add %o2, %o4, %o0
217
ENDPROC(NG2_retl_o2_plus_o4_plus_16)
218
ENTRY(NG2_retl_o2_plus_g1_fp)
219
ba,pt %xcc, __restore_fp
220
add %o2, %g1, %o0
221
ENDPROC(NG2_retl_o2_plus_g1_fp)
222
ENTRY(NG2_retl_o2_plus_g1_plus_64_fp)
223
add %g1, 64, %g1
224
ba,pt %xcc, __restore_fp
225
add %o2, %g1, %o0
226
ENDPROC(NG2_retl_o2_plus_g1_plus_64_fp)
227
ENTRY(NG2_retl_o2_plus_g1_plus_1)
228
add %g1, 1, %g1
229
ba,pt %xcc, __restore_asi
230
add %o2, %g1, %o0
231
ENDPROC(NG2_retl_o2_plus_g1_plus_1)
232
ENTRY(NG2_retl_o2_and_7_plus_o4)
233
and %o2, 7, %o2
234
ba,pt %xcc, __restore_asi
235
add %o2, %o4, %o0
236
ENDPROC(NG2_retl_o2_and_7_plus_o4)
237
ENTRY(NG2_retl_o2_and_7_plus_o4_plus_8)
238
and %o2, 7, %o2
239
add %o4, 8, %o4
240
ba,pt %xcc, __restore_asi
241
add %o2, %o4, %o0
242
ENDPROC(NG2_retl_o2_and_7_plus_o4_plus_8)
243
#endif
244
245
.align 64
246
247
.globl FUNC_NAME
248
.type FUNC_NAME,#function
249
FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
250
srlx %o2, 31, %g2
251
cmp %g2, 0
252
tne %xcc, 5
253
PREAMBLE
254
mov %o0, %o3
255
cmp %o2, 0
256
be,pn %XCC, 85f
257
or %o0, %o1, GLOBAL_SPARE
258
cmp %o2, 16
259
blu,a,pn %XCC, 80f
260
or GLOBAL_SPARE, %o2, GLOBAL_SPARE
261
262
/* 2 blocks (128 bytes) is the minimum we can do the block
263
* copy with. We need to ensure that we'll iterate at least
264
* once in the block copy loop. At worst we'll need to align
265
* the destination to a 64-byte boundary which can chew up
266
* to (64 - 1) bytes from the length before we perform the
267
* block copy loop.
268
*
269
* However, the cut-off point, performance wise, is around
270
* 4 64-byte blocks.
271
*/
272
cmp %o2, (4 * 64)
273
blu,pt %XCC, 75f
274
andcc GLOBAL_SPARE, 0x7, %g0
275
276
/* %o0: dst
277
* %o1: src
278
* %o2: len (known to be >= 128)
279
*
280
* The block copy loops can use %o4, %g2, %g3 as
281
* temporaries while copying the data. %o5 must
282
* be preserved between VISEntryHalf and VISExitHalf
283
*/
284
285
LOAD(prefetch, %o1 + 0x000, #one_read)
286
LOAD(prefetch, %o1 + 0x040, #one_read)
287
LOAD(prefetch, %o1 + 0x080, #one_read)
288
289
/* Align destination on 64-byte boundary. */
290
andcc %o0, (64 - 1), %o4
291
be,pt %XCC, 2f
292
sub %o4, 64, %o4
293
sub %g0, %o4, %o4 ! bytes to align dst
294
sub %o2, %o4, %o2
295
1: subcc %o4, 1, %o4
296
EX_LD(LOAD(ldub, %o1, %g1), NG2_retl_o2_plus_o4_plus_1)
297
EX_ST(STORE(stb, %g1, %o0), NG2_retl_o2_plus_o4_plus_1)
298
add %o1, 1, %o1
299
bne,pt %XCC, 1b
300
add %o0, 1, %o0
301
302
2:
303
/* Clobbers o5/g1/g2/g3/g7/icc/xcc. We must preserve
304
* o5 from here until we hit VISExitHalf.
305
*/
306
VISEntryHalf
307
308
membar #Sync
309
alignaddr %o1, %g0, %g0
310
311
add %o1, (64 - 1), %o4
312
andn %o4, (64 - 1), %o4
313
andn %o2, (64 - 1), %g1
314
sub %o2, %g1, %o2
315
316
and %o1, (64 - 1), %g2
317
add %o1, %g1, %o1
318
sub %o0, %o4, %g3
319
brz,pt %g2, 190f
320
cmp %g2, 32
321
blu,a 5f
322
cmp %g2, 16
323
cmp %g2, 48
324
blu,a 4f
325
cmp %g2, 40
326
cmp %g2, 56
327
blu 170f
328
nop
329
ba,a,pt %xcc, 180f
330
nop
331
332
4: /* 32 <= low bits < 48 */
333
blu 150f
334
nop
335
ba,a,pt %xcc, 160f
336
nop
337
5: /* 0 < low bits < 32 */
338
blu,a 6f
339
cmp %g2, 8
340
cmp %g2, 24
341
blu 130f
342
nop
343
ba,a,pt %xcc, 140f
344
nop
345
6: /* 0 < low bits < 16 */
346
bgeu 120f
347
nop
348
/* fall through for 0 < low bits < 8 */
349
110: sub %o4, 64, %g2
350
EX_LD_FP(LOAD_BLK(%g2, %f0), NG2_retl_o2_plus_g1)
351
1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
352
EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
353
FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
354
EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
355
FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
356
subcc %g1, 64, %g1
357
add %o4, 64, %o4
358
bne,pt %xcc, 1b
359
LOAD(prefetch, %o4 + 64, #one_read)
360
ba,pt %xcc, 195f
361
nop
362
363
120: sub %o4, 56, %g2
364
FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
365
1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
366
EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
367
FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
368
EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
369
FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
370
subcc %g1, 64, %g1
371
add %o4, 64, %o4
372
bne,pt %xcc, 1b
373
LOAD(prefetch, %o4 + 64, #one_read)
374
ba,pt %xcc, 195f
375
nop
376
377
130: sub %o4, 48, %g2
378
FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
379
1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
380
EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
381
FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
382
EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
383
FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
384
subcc %g1, 64, %g1
385
add %o4, 64, %o4
386
bne,pt %xcc, 1b
387
LOAD(prefetch, %o4 + 64, #one_read)
388
ba,pt %xcc, 195f
389
nop
390
391
140: sub %o4, 40, %g2
392
FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
393
1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
394
EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
395
FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
396
EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
397
FREG_MOVE_5(f22, f24, f26, f28, f30)
398
subcc %g1, 64, %g1
399
add %o4, 64, %o4
400
bne,pt %xcc, 1b
401
LOAD(prefetch, %o4 + 64, #one_read)
402
ba,pt %xcc, 195f
403
nop
404
405
150: sub %o4, 32, %g2
406
FREG_LOAD_4(%g2, f0, f2, f4, f6)
407
1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
408
EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
409
FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
410
EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
411
FREG_MOVE_4(f24, f26, f28, f30)
412
subcc %g1, 64, %g1
413
add %o4, 64, %o4
414
bne,pt %xcc, 1b
415
LOAD(prefetch, %o4 + 64, #one_read)
416
ba,pt %xcc, 195f
417
nop
418
419
160: sub %o4, 24, %g2
420
FREG_LOAD_3(%g2, f0, f2, f4)
421
1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
422
EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
423
FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
424
EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
425
FREG_MOVE_3(f26, f28, f30)
426
subcc %g1, 64, %g1
427
add %o4, 64, %o4
428
bne,pt %xcc, 1b
429
LOAD(prefetch, %o4 + 64, #one_read)
430
ba,pt %xcc, 195f
431
nop
432
433
170: sub %o4, 16, %g2
434
FREG_LOAD_2(%g2, f0, f2)
435
1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
436
EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
437
FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
438
EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
439
FREG_MOVE_2(f28, f30)
440
subcc %g1, 64, %g1
441
add %o4, 64, %o4
442
bne,pt %xcc, 1b
443
LOAD(prefetch, %o4 + 64, #one_read)
444
ba,pt %xcc, 195f
445
nop
446
447
180: sub %o4, 8, %g2
448
FREG_LOAD_1(%g2, f0)
449
1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
450
EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
451
FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
452
EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
453
FREG_MOVE_1(f30)
454
subcc %g1, 64, %g1
455
add %o4, 64, %o4
456
bne,pt %xcc, 1b
457
LOAD(prefetch, %o4 + 64, #one_read)
458
ba,pt %xcc, 195f
459
nop
460
461
190:
462
1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
463
subcc %g1, 64, %g1
464
EX_LD_FP(LOAD_BLK(%o4, %f0), NG2_retl_o2_plus_g1_plus_64)
465
EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1_plus_64)
466
add %o4, 64, %o4
467
bne,pt %xcc, 1b
468
LOAD(prefetch, %o4 + 64, #one_read)
469
470
195:
471
add %o4, %g3, %o0
472
membar #Sync
473
474
VISExitHalf
475
476
/* %o2 contains any final bytes still needed to be copied
477
* over. If anything is left, we copy it one byte at a time.
478
*/
479
brz,pt %o2, 85f
480
sub %o0, %o1, GLOBAL_SPARE
481
ba,a,pt %XCC, 90f
482
nop
483
484
.align 64
485
75: /* 16 < len <= 64 */
486
bne,pn %XCC, 75f
487
sub %o0, %o1, GLOBAL_SPARE
488
489
72:
490
andn %o2, 0xf, %o4
491
and %o2, 0xf, %o2
492
1: subcc %o4, 0x10, %o4
493
EX_LD(LOAD(ldx, %o1, %o5), NG2_retl_o2_plus_o4_plus_16)
494
add %o1, 0x08, %o1
495
EX_LD(LOAD(ldx, %o1, %g1), NG2_retl_o2_plus_o4_plus_16)
496
sub %o1, 0x08, %o1
497
EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_o4_plus_16)
498
add %o1, 0x8, %o1
499
EX_ST(STORE(stx, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_o4_plus_8)
500
bgu,pt %XCC, 1b
501
add %o1, 0x8, %o1
502
73: andcc %o2, 0x8, %g0
503
be,pt %XCC, 1f
504
nop
505
sub %o2, 0x8, %o2
506
EX_LD(LOAD(ldx, %o1, %o5), NG2_retl_o2_plus_8)
507
EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_8)
508
add %o1, 0x8, %o1
509
1: andcc %o2, 0x4, %g0
510
be,pt %XCC, 1f
511
nop
512
sub %o2, 0x4, %o2
513
EX_LD(LOAD(lduw, %o1, %o5), NG2_retl_o2_plus_4)
514
EX_ST(STORE(stw, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_4)
515
add %o1, 0x4, %o1
516
1: cmp %o2, 0
517
be,pt %XCC, 85f
518
nop
519
ba,pt %xcc, 90f
520
nop
521
522
75:
523
andcc %o0, 0x7, %g1
524
sub %g1, 0x8, %g1
525
be,pn %icc, 2f
526
sub %g0, %g1, %g1
527
sub %o2, %g1, %o2
528
529
1: subcc %g1, 1, %g1
530
EX_LD(LOAD(ldub, %o1, %o5), NG2_retl_o2_plus_g1_plus_1)
531
EX_ST(STORE(stb, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_g1_plus_1)
532
bgu,pt %icc, 1b
533
add %o1, 1, %o1
534
535
2: add %o1, GLOBAL_SPARE, %o0
536
andcc %o1, 0x7, %g1
537
bne,pt %icc, 8f
538
sll %g1, 3, %g1
539
540
cmp %o2, 16
541
bgeu,pt %icc, 72b
542
nop
543
ba,a,pt %xcc, 73b
544
545
8: mov 64, GLOBAL_SPARE
546
andn %o1, 0x7, %o1
547
EX_LD(LOAD(ldx, %o1, %g2), NG2_retl_o2)
548
sub GLOBAL_SPARE, %g1, GLOBAL_SPARE
549
andn %o2, 0x7, %o4
550
sllx %g2, %g1, %g2
551
1: add %o1, 0x8, %o1
552
EX_LD(LOAD(ldx, %o1, %g3), NG2_retl_o2_and_7_plus_o4)
553
subcc %o4, 0x8, %o4
554
srlx %g3, GLOBAL_SPARE, %o5
555
or %o5, %g2, %o5
556
EX_ST(STORE(stx, %o5, %o0), NG2_retl_o2_and_7_plus_o4_plus_8)
557
add %o0, 0x8, %o0
558
bgu,pt %icc, 1b
559
sllx %g3, %g1, %g2
560
561
srl %g1, 3, %g1
562
andcc %o2, 0x7, %o2
563
be,pn %icc, 85f
564
add %o1, %g1, %o1
565
ba,pt %xcc, 90f
566
sub %o0, %o1, GLOBAL_SPARE
567
568
.align 64
569
80: /* 0 < len <= 16 */
570
andcc GLOBAL_SPARE, 0x3, %g0
571
bne,pn %XCC, 90f
572
sub %o0, %o1, GLOBAL_SPARE
573
574
1:
575
subcc %o2, 4, %o2
576
EX_LD(LOAD(lduw, %o1, %g1), NG2_retl_o2_plus_4)
577
EX_ST(STORE(stw, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_4)
578
bgu,pt %XCC, 1b
579
add %o1, 4, %o1
580
581
85: retl
582
mov EX_RETVAL(%o3), %o0
583
584
.align 32
585
90:
586
subcc %o2, 1, %o2
587
EX_LD(LOAD(ldub, %o1, %g1), NG2_retl_o2_plus_1)
588
EX_ST(STORE(stb, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_1)
589
bgu,pt %XCC, 90b
590
add %o1, 1, %o1
591
retl
592
mov EX_RETVAL(%o3), %o0
593
594
.size FUNC_NAME, .-FUNC_NAME
595
596