/* SPDX-License-Identifier: GPL-2.0 */
/* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy.
 *
 * Copyright (C) 1997, 2004 David S. Miller ([email protected])
 * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek ([email protected])
 */

#ifdef __KERNEL__
#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	g7
#else
#define GLOBAL_SPARE	g5
#define ASI_BLK_P	0xf0
#define FPRS_FEF	0x04
#ifdef MEMCPY_DEBUG
#define VISEntry	rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
			clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExit		and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntry	rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExit		and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#endif
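
/* Note on the non-kernel build above: VISEntry saves the current %fprs
 * in %o5 and sets the FEF (FPU-enable) bit for the duration of the
 * copy; VISExit writes back only the saved FEF bit.  Kernel builds get
 * full-featured versions of these macros from <asm/visasm.h>.
 */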

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif
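
/* When this file is assembled as plain memcpy, the EX_* wrappers
 * reduce to the bare instruction 'x'.  The user-copy variants that
 * include this file redefine them so that each wrapped load/store
 * gets an exception table entry whose fixup is the U1_* stub named
 * by the second argument.
 */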

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef LOAD_BLK
#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

#define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9)	\
	faligndata		%f1, %f2, %f48;		\
	faligndata		%f2, %f3, %f50;		\
	faligndata		%f3, %f4, %f52;		\
	faligndata		%f4, %f5, %f54;		\
	faligndata		%f5, %f6, %f56;		\
	faligndata		%f6, %f7, %f58;		\
	faligndata		%f7, %f8, %f60;		\
	faligndata		%f8, %f9, %f62;
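
/* FREG_FROB takes nine consecutive double-precision registers holding
 * 72 bytes of raw source data and produces one realigned 64-byte
 * output block in %f48-%f62: each faligndata concatenates two inputs
 * and extracts 8 bytes at the byte offset previously programmed into
 * %gsr by the alignaddr instruction.
 */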

#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, jmptgt)	\
	EX_LD_FP(LOAD_BLK(%src, %fdest), U1_gs_80_fp);	\
	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_80_fp);	\
	add			%src, 0x40, %src;	\
	subcc			%GLOBAL_SPARE, 0x40, %GLOBAL_SPARE; \
	be,pn			%xcc, jmptgt;		\
	 add			%dest, 0x40, %dest;	\

#define LOOP_CHUNK1(src, dest, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f0,  f48, branch_dest)
#define LOOP_CHUNK2(src, dest, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f16, f48, branch_dest)
#define LOOP_CHUNK3(src, dest, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f32, f48, branch_dest)
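
/* Each LOOP_CHUNKn issues one 64-byte block load into the next input
 * register group (f0/f16/f32) while storing the previous realigned
 * block from %f48-%f62, then counts the main-loop byte counter in
 * %GLOBAL_SPARE down by 0x40 and leaves the loop through
 * 'branch_dest' when it reaches zero.
 */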

#define DO_SYNC			membar	#Sync;
#define STORE_SYNC(dest, fsrc)				\
	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_80_fp);	\
	add			%dest, 0x40, %dest;	\
	DO_SYNC

#define STORE_JUMP(dest, fsrc, target)			\
	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_40_fp);	\
	add			%dest, 0x40, %dest;	\
	ba,pt			%xcc, target;		\
	 nop;
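
/* STORE_SYNC issues the block store and drains it with membar #Sync,
 * presumably so that %f48-%f62 can be safely regenerated afterwards;
 * STORE_JUMP issues the final block store of a loop instance and
 * branches to the matching tail handler (40f-63f below).
 */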

#define FINISH_VISCHUNK(dest, f0, f1)			\
	subcc			%g3, 8, %g3;		\
	bl,pn			%xcc, 95f;		\
	 faligndata		%f0, %f1, %f48;		\
	EX_ST_FP(STORE(std, %f48, %dest), U1_g3_8_fp);	\
	add			%dest, 8, %dest;

#define UNEVEN_VISCHUNK_LAST(dest, f0, f1)		\
	subcc			%g3, 8, %g3;		\
	bl,pn			%xcc, 95f;		\
	 fsrc2			%f0, %f1;

#define UNEVEN_VISCHUNK(dest, f0, f1)			\
	UNEVEN_VISCHUNK_LAST(dest, f0, f1)		\
	ba,a,pt			%xcc, 93f;
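
/* The VISCHUNK macros drain whatever realigned data is still live in
 * the FP registers when the main loop exits: FINISH_VISCHUNK stores
 * one more 8-byte piece while at least 8 bytes remain in %g3, and the
 * UNEVEN variants move the last live input register into %f0 and hand
 * off to the generic 93f/95f tail loops.
 */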

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
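
/* Exception fixup stubs.  On a fault, the stub named in the EX_*
 * wrapper runs and reconstructs how many bytes remained uncopied from
 * the registers its name encodes (e.g. U1_g2_8 returns %g2 plus 8
 * plus the residual in %o2) and returns that count in %o0.  The _fp
 * variants also unwind the VIS state with VISExitHalf.
 */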
ENTRY(U1_g1_1_fp)
	VISExitHalf
	add		%g1, 1, %g1
	add		%g1, %g2, %g1
	retl
	 add		%g1, %o2, %o0
ENDPROC(U1_g1_1_fp)
ENTRY(U1_g2_0_fp)
	VISExitHalf
	retl
	 add		%g2, %o2, %o0
ENDPROC(U1_g2_0_fp)
ENTRY(U1_g2_8_fp)
	VISExitHalf
	add		%g2, 8, %g2
	retl
	 add		%g2, %o2, %o0
ENDPROC(U1_g2_8_fp)
ENTRY(U1_gs_0_fp)
	VISExitHalf
	add		%GLOBAL_SPARE, %g3, %o0
	retl
	 add		%o0, %o2, %o0
ENDPROC(U1_gs_0_fp)
ENTRY(U1_gs_80_fp)
	VISExitHalf
	add		%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
	add		%GLOBAL_SPARE, %g3, %o0
	retl
	 add		%o0, %o2, %o0
ENDPROC(U1_gs_80_fp)
ENTRY(U1_gs_40_fp)
	VISExitHalf
	add		%GLOBAL_SPARE, 0x40, %GLOBAL_SPARE
	add		%GLOBAL_SPARE, %g3, %o0
	retl
	 add		%o0, %o2, %o0
ENDPROC(U1_gs_40_fp)
ENTRY(U1_g3_0_fp)
	VISExitHalf
	retl
	 add		%g3, %o2, %o0
ENDPROC(U1_g3_0_fp)
ENTRY(U1_g3_8_fp)
	VISExitHalf
	add		%g3, 8, %g3
	retl
	 add		%g3, %o2, %o0
ENDPROC(U1_g3_8_fp)
ENTRY(U1_o2_0_fp)
	VISExitHalf
	retl
	 mov		%o2, %o0
ENDPROC(U1_o2_0_fp)
ENTRY(U1_o2_1_fp)
	VISExitHalf
	retl
	 add		%o2, 1, %o0
ENDPROC(U1_o2_1_fp)
ENTRY(U1_gs_0)
	VISExitHalf
	retl
	 add		%GLOBAL_SPARE, %o2, %o0
ENDPROC(U1_gs_0)
ENTRY(U1_gs_8)
	VISExitHalf
	add		%GLOBAL_SPARE, %o2, %GLOBAL_SPARE
	retl
	 add		%GLOBAL_SPARE, 0x8, %o0
ENDPROC(U1_gs_8)
ENTRY(U1_gs_10)
	VISExitHalf
	add		%GLOBAL_SPARE, %o2, %GLOBAL_SPARE
	retl
	 add		%GLOBAL_SPARE, 0x10, %o0
ENDPROC(U1_gs_10)
ENTRY(U1_o2_0)
	retl
	 mov		%o2, %o0
ENDPROC(U1_o2_0)
ENTRY(U1_o2_8)
	retl
	 add		%o2, 8, %o0
ENDPROC(U1_o2_8)
ENTRY(U1_o2_4)
	retl
	 add		%o2, 4, %o0
ENDPROC(U1_o2_4)
ENTRY(U1_o2_1)
	retl
	 add		%o2, 1, %o0
ENDPROC(U1_o2_1)
ENTRY(U1_g1_0)
	retl
	 add		%g1, %o2, %o0
ENDPROC(U1_g1_0)
ENTRY(U1_g1_1)
	add		%g1, 1, %g1
	retl
	 add		%g1, %o2, %o0
ENDPROC(U1_g1_1)
ENTRY(U1_gs_0_o2_adj)
	and		%o2, 7, %o2
	retl
	 add		%GLOBAL_SPARE, %o2, %o0
ENDPROC(U1_gs_0_o2_adj)
ENTRY(U1_gs_8_o2_adj)
	and		%o2, 7, %o2
	add		%GLOBAL_SPARE, 8, %GLOBAL_SPARE
	retl
	 add		%GLOBAL_SPARE, %o2, %o0
ENDPROC(U1_gs_8_o2_adj)
#endif

	.align		64

	.globl		FUNC_NAME
	.type		FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
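	/* The three instructions above are a sanity trap: if any of
	 * bits 63:31 of 'len' are set (i.e. len >= 2GB), the routine
	 * takes software trap 5 instead of copying.
	 */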
	PREAMBLE
	mov		%o0, %o4
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	cmp		%o2, (5 * 64)
	blu,pt		%XCC, 70f
	 andcc		%o3, 0x7, %g0

	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  */
	VISEntry

	/* Is 'dst' already aligned on a 64-byte boundary? */
	andcc		%o0, 0x3f, %g2
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
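	/* Worked example: if dst & 0x3f == 0x08, then
	 * %g2 = abs(0x08 - 0x40) = 0x38; the byte loop below copies
	 * %g2 & 7 = 0 single bytes, and the 8-byte loop moves the
	 * remaining %g2 & 0x38 = 0x38 bytes.
	 */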
	sub		%o0, %o1, %GLOBAL_SPARE
	sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2
	andcc		%g2, 0x7, %g1
	be,pt		%icc, 2f
	 and		%g2, 0x38, %g2

1:	subcc		%g1, 0x1, %g1
	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U1_g1_1_fp)
	EX_ST_FP(STORE(stb, %o3, %o1 + %GLOBAL_SPARE), U1_g1_1_fp)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x1, %o1

	add		%o1, %GLOBAL_SPARE, %o0

2:	cmp		%g2, 0x0
	and		%o1, 0x7, %g1
	be,pt		%icc, 3f
	 alignaddr	%o1, %g0, %o1

	EX_LD_FP(LOAD(ldd, %o1, %f4), U1_g2_0_fp)
1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U1_g2_0_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f4, %f6, %f0
	EX_ST_FP(STORE(std, %f0, %o0), U1_g2_8_fp)
	be,pn		%icc, 3f
	 add		%o0, 0x8, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U1_g2_0_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f6, %f4, %f0
	EX_ST_FP(STORE(std, %f0, %o0), U1_g2_8_fp)
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0

	/* Destination is 64-byte aligned.  */
3:
	membar		  #LoadStore | #StoreStore | #StoreLoad
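	/* Block-copy setup.  Roughly: %GLOBAL_SPARE becomes the byte
	 * count handled by the unrolled 64-byte block loop, %g3 the
	 * count covered by the 8-byte 93f/95f tail loops, and %o2 the
	 * final sub-8-byte remainder; %g2 ends up as (src >> 3) & 7,
	 * selecting one of the eight loop instances below.
	 */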

	subcc		%o2, 0x40, %GLOBAL_SPARE
	add		%o1, %g1, %g1
	andncc		%GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE
	srl		%g1, 3, %g2
	sub		%o2, %GLOBAL_SPARE, %g3
	andn		%o1, (0x40 - 1), %o1
	and		%g2, 7, %g2
	andncc		%g3, 0x7, %g3
	fsrc2		%f0, %f2
	sub		%g3, 0x8, %g3
	sub		%o2, %GLOBAL_SPARE, %o2

	add		%g1, %GLOBAL_SPARE, %g1
	subcc		%o2, %g3, %o2

	EX_LD_FP(LOAD_BLK(%o1, %f0), U1_gs_0_fp)
	add		%o1, 0x40, %o1
	add		%g1, %g3, %g1
	EX_LD_FP(LOAD_BLK(%o1, %f16), U1_gs_0_fp)
	add		%o1, 0x40, %o1
	sub		%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
	EX_LD_FP(LOAD_BLK(%o1, %f32), U1_gs_80_fp)
	add		%o1, 0x40, %o1

	/* There are 8 instances of the unrolled loop,
	 * one for each possible alignment of the
	 * source buffer.  Each loop instance is 452
	 * bytes.
	 */
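	/* The shift-and-add sequence below computes
	 * %g2 = %g2 * 452 (((g2 * 8 - g2) * 16 + g2) * 4 = g2 * 452),
	 * the byte offset of the selected loop instance; the jmpl then
	 * enters it PC-relative to the rd %pc result.
	 */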
	sll		%g2, 3, %o3
	sub		%o3, %g2, %o3
	sllx		%o3, 4, %o3
	add		%o3, %g2, %o3
	sllx		%o3, 2, %g2
1:	rd		%pc, %o3
	add		%o3, %lo(1f - 1b), %o3
	jmpl		%o3 + %g2, %g0
	 nop

	.align		64
1:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f0, %f2, %f48
1:	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	STORE_SYNC(o0, f48)
	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	STORE_JUMP(o0, f48, 40f)
2:	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	STORE_SYNC(o0, f48)
	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	STORE_JUMP(o0, f48, 48f)
3:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	STORE_SYNC(o0, f48)
	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	STORE_JUMP(o0, f48, 56f)

1:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f2, %f4, %f48
1:	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	STORE_SYNC(o0, f48)
	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	STORE_JUMP(o0, f48, 41f)
2:	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	STORE_SYNC(o0, f48)
	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	STORE_JUMP(o0, f48, 49f)
3:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	STORE_SYNC(o0, f48)
	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	STORE_JUMP(o0, f48, 57f)

1:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f4, %f6, %f48
1:	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	STORE_SYNC(o0, f48)
	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	STORE_JUMP(o0, f48, 42f)
2:	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	STORE_SYNC(o0, f48)
	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	STORE_JUMP(o0, f48, 50f)
3:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	STORE_SYNC(o0, f48)
	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	STORE_JUMP(o0, f48, 58f)

1:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f6, %f8, %f48
1:	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	STORE_SYNC(o0, f48)
	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	STORE_JUMP(o0, f48, 43f)
2:	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	STORE_SYNC(o0, f48)
	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	STORE_JUMP(o0, f48, 51f)
3:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	STORE_SYNC(o0, f48)
	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	STORE_JUMP(o0, f48, 59f)

1:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f8, %f10, %f48
1:	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	STORE_SYNC(o0, f48)
	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	STORE_JUMP(o0, f48, 44f)
2:	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	STORE_SYNC(o0, f48)
	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	STORE_JUMP(o0, f48, 52f)
3:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	STORE_SYNC(o0, f48)
	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	STORE_JUMP(o0, f48, 60f)

1:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f10, %f12, %f48
1:	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	STORE_SYNC(o0, f48)
	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	STORE_JUMP(o0, f48, 45f)
2:	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	STORE_SYNC(o0, f48)
	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	STORE_JUMP(o0, f48, 53f)
3:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	STORE_SYNC(o0, f48)
	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	STORE_JUMP(o0, f48, 61f)

1:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f12, %f14, %f48
1:	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	STORE_SYNC(o0, f48)
	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	STORE_JUMP(o0, f48, 46f)
2:	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	STORE_SYNC(o0, f48)
	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	STORE_JUMP(o0, f48, 54f)
3:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	STORE_SYNC(o0, f48)
	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	STORE_JUMP(o0, f48, 62f)

1:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f14, %f16, %f48
1:	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	STORE_SYNC(o0, f48)
	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	STORE_JUMP(o0, f48, 47f)
2:	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	STORE_SYNC(o0, f48)
	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	STORE_JUMP(o0, f48, 55f)
3:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	STORE_SYNC(o0, f48)
	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	STORE_JUMP(o0, f48, 63f)

40:	FINISH_VISCHUNK(o0, f0,  f2)
41:	FINISH_VISCHUNK(o0, f2,  f4)
42:	FINISH_VISCHUNK(o0, f4,  f6)
43:	FINISH_VISCHUNK(o0, f6,  f8)
44:	FINISH_VISCHUNK(o0, f8,  f10)
45:	FINISH_VISCHUNK(o0, f10, f12)
46:	FINISH_VISCHUNK(o0, f12, f14)
47:	UNEVEN_VISCHUNK(o0, f14, f0)
48:	FINISH_VISCHUNK(o0, f16, f18)
49:	FINISH_VISCHUNK(o0, f18, f20)
50:	FINISH_VISCHUNK(o0, f20, f22)
51:	FINISH_VISCHUNK(o0, f22, f24)
52:	FINISH_VISCHUNK(o0, f24, f26)
53:	FINISH_VISCHUNK(o0, f26, f28)
54:	FINISH_VISCHUNK(o0, f28, f30)
55:	UNEVEN_VISCHUNK(o0, f30, f0)
56:	FINISH_VISCHUNK(o0, f32, f34)
57:	FINISH_VISCHUNK(o0, f34, f36)
58:	FINISH_VISCHUNK(o0, f36, f38)
59:	FINISH_VISCHUNK(o0, f38, f40)
60:	FINISH_VISCHUNK(o0, f40, f42)
61:	FINISH_VISCHUNK(o0, f42, f44)
62:	FINISH_VISCHUNK(o0, f44, f46)
63:	UNEVEN_VISCHUNK_LAST(o0, f46, f0)

93:	EX_LD_FP(LOAD(ldd, %o1, %f2), U1_g3_0_fp)
	add		%o1, 8, %o1
	subcc		%g3, 8, %g3
	faligndata	%f0, %f2, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U1_g3_8_fp)
	bl,pn		%xcc, 95f
	 add		%o0, 8, %o0
	EX_LD_FP(LOAD(ldd, %o1, %f0), U1_g3_0_fp)
	add		%o1, 8, %o1
	subcc		%g3, 8, %g3
	faligndata	%f2, %f0, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U1_g3_8_fp)
	bge,pt		%xcc, 93b
	 add		%o0, 8, %o0

95:	brz,pt		%o2, 2f
	 mov		%g1, %o1

1:	EX_LD_FP(LOAD(ldub, %o1, %o3), U1_o2_0_fp)
	add		%o1, 1, %o1
	subcc		%o2, 1, %o2
	EX_ST_FP(STORE(stb, %o3, %o0), U1_o2_1_fp)
	bne,pt		%xcc, 1b
	 add		%o0, 1, %o0

2:	membar		#StoreLoad | #StoreStore
	VISExit
	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		64
70:	/* 16 < len <= (5 * 64) */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3
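	/* %o3 now holds (dst - src), so the loads below walk %o1
	 * through the source while each STORE targets "%o1 + %o3",
	 * the matching destination address.
	 */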

72:	andn		%o2, 0xf, %GLOBAL_SPARE
	and		%o2, 0xf, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U1_gs_0)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U1_gs_0)
	subcc		%GLOBAL_SPARE, 0x10, %GLOBAL_SPARE
	EX_ST(STORE(stx, %o5, %o1 + %o3), U1_gs_10)
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3), U1_gs_8)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	EX_LD(LOAD(ldx, %o1, %o5), U1_o2_0)
	sub		%o2, 0x8, %o2
	EX_ST(STORE(stx, %o5, %o1 + %o3), U1_o2_8)
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	EX_LD(LOAD(lduw, %o1, %o5), U1_o2_0)
	sub		%o2, 0x4, %o2
	EX_ST(STORE(stw, %o5, %o1 + %o3), U1_o2_4)
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1, %o5), U1_g1_0)
	subcc		%g1, 1, %g1
	EX_ST(STORE(stb, %o5, %o1 + %o3), U1_g1_1)
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b
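
	/* Source and destination differ in their 8-byte alignment:
	 * read aligned doublewords and splice each output word out of
	 * two neighbours using sllx/srlx.  %g1 is the left-shift in
	 * bits (source misalignment * 8, from the delay slot above)
	 * and %o3 = 64 - %g1 is the matching right-shift.
	 */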

8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2), U1_o2_0)
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, %GLOBAL_SPARE
	sllx		%g2, %g1, %g2
1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U1_gs_0_o2_adj)
	subcc		%GLOBAL_SPARE, 0x8, %GLOBAL_SPARE
	add		%o1, 0x8, %o1
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0), U1_gs_8_o2_adj)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
80:	/* 0 < len <= 16 */
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:	EX_LD(LOAD(lduw, %o1, %g1), U1_o2_0)
	subcc		%o2, 4, %o2
	EX_ST(STORE(stw, %g1, %o1 + %o3), U1_o2_4)
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		32
90:	EX_LD(LOAD(ldub, %o1, %g1), U1_o2_0)
	subcc		%o2, 1, %o2
	EX_ST(STORE(stb, %g1, %o1 + %o3), U1_o2_1)
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o4), %o0

	.size		FUNC_NAME, .-FUNC_NAME
	EXPORT_SYMBOL(FUNC_NAME)