GitHub Repository: torvalds/linux
Path: blob/master/arch/sparc/lib/M7memcpy.S
/*
* M7memcpy: Optimized SPARC M7 memcpy
*
* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
*/

.file "M7memcpy.S"

/*
* memcpy(s1, s2, len)
*
* Copy s2 to s1, always copy n bytes.
* Note: this C code does not work for overlapped copies.
*
* Fast assembler language version of the following C-program for memcpy
* which represents the `standard' for the C-library.
*
* void *
* memcpy(void *s, const void *s0, size_t n)
* {
* if (n != 0) {
* char *s1 = s;
* const char *s2 = s0;
* do {
* *s1++ = *s2++;
* } while (--n != 0);
* }
* return (s);
* }
*
*
* SPARC T7/M7 Flow :
*
* if (count < SMALL_MAX) {
* if count < SHORTCOPY (SHORTCOPY=3)
* copy bytes; exit with dst addr
* if src & dst aligned on word boundary but not long word boundary,
* copy with ldw/stw; branch to finish_up
* if src & dst aligned on long word boundary
* copy with ldx/stx; branch to finish_up
* if src & dst not aligned and length <= SHORTCHECK (SHORTCHECK=14)
* copy bytes; exit with dst addr
* move enough bytes to get src to word boundary
* if dst now on word boundary
* move_words:
* copy words; branch to finish_up
* if dst now on half word boundary
* load words, shift half words, store words; branch to finish_up
* if dst on byte 1
* load words, shift 3 bytes, store words; branch to finish_up
* if dst on byte 3
* load words, shift 1 byte, store words; branch to finish_up
* finish_up:
* copy bytes; exit with dst addr
* } else { More than SMALL_MAX bytes
* move bytes until dst is on long word boundary
* if( src is on long word boundary ) {
* if (count < MED_MAX) {
* finish_long: src/dst aligned on 8 bytes
* copy with ldx/stx in 8-way unrolled loop;
* copy final 0-63 bytes; exit with dst addr
* } else { src/dst aligned; count > MED_MAX
* align dst on 64 byte boundary; for main data movement:
* prefetch src data to L2 cache; let HW prefetch move data to L1 cache
* Use BIS (block initializing store) to avoid copying store cache
* lines from memory. But pre-store first element of each cache line
* ST_CHUNK lines in advance of the rest of that cache line. That
* gives time for replacement cache lines to be written back without
* excess STQ and Miss Buffer filling. Repeat until near the end,
* then finish up storing before going to finish_long.
* }
* } else { src/dst not aligned on 8 bytes
* if src is word aligned and count < MED_WMAX
* move words in 8-way unrolled loop
* move final 0-31 bytes; exit with dst addr
* if count < MED_UMAX
* use alignaddr/faligndata combined with ldd/std in 8-way
* unrolled loop to move data.
* go to unalign_done
* else
* setup alignaddr for faligndata instructions
* align dst on 64 byte boundary; prefetch src data to L1 cache
* loadx8, falign, block-store, prefetch loop
* (only use block-init-store when src/dst on 8 byte boundaries.)
* unalign_done:
* move remaining bytes for unaligned cases. exit with dst addr.
* }
*
*/
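/*
 * Illustrative C-level sketch of the entry dispatch implemented just
 * below (not part of the build; label names refer to code in this file):
 *
 *	if (len >= (1UL << 31))  trap            srlx/tne length sanity check
 *	if (len == 0)            return dst      brz  -> .Lsmallx
 *	if (len <= 3)            goto .Ltiny_cp
 *	if (len <= 19)           goto .Lsmall_cp
 *	if (len < SMALL_MAX)     goto .Lmedium_cp  integer copy, no FP/VIS
 *	else                     fall into .Lmedium (large copy paths)
 */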

#include <asm/visasm.h>
#include <asm/asi.h>

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y) x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y) x
#endif

#ifndef EX_ST
#define EX_ST(x,y) x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y) x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x) x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest) type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr) type src, [addr]
#endif
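/*
 * Illustrative expansion of the wrappers above, shown for the plain
 * in-kernel memcpy case where EX_LD/EX_ST default to the bare access
 * (a sketch; the user-copy variants are expected to supply versions
 * that attach the named exception handler instead):
 *
 *	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
 *		-> ldx [%o1], %o4
 *	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2)
 *		-> stx %o4, [%o0]
 */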

/*
* ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
* line as "least recently used" which means if many threads are
* active, it has a high probability of being pushed out of the cache
* between the first initializing store and the final stores.
* Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
* marks the cache line as "most recently used" for all
* but the last cache line
*/
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI 0x80 /* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI 0x80 /* ASI_P */
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
#endif

#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr) stxa src, [addr] STORE_MRU_ASI
#endif
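/*
 * For illustration, with the defaults above the block-init store macros
 * expand to (ASI names come from asm/asi.h):
 *
 *	STORE_INIT(%o4, %o0)     -> stxa %o4, [%o0] ASI_BLK_INIT_QUAD_LDD_P
 *	STORE_INIT_MRU(%o4, %o0) -> stxa %o4, [%o0] ASI_ST_BLKINIT_MRU_P
 *
 * The MRU form is used for the leading stores to each cache line and the
 * plain (LRU-marking) form for the final store, as described above.
 */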

#ifndef FUNC_NAME
#define FUNC_NAME M7memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#define BLOCK_SIZE 64
#define SHORTCOPY 3
#define SHORTCHECK 14
#define SHORT_LONG 64 /* max copy for short longword-aligned case */
/* must be at least 64 */
#define SMALL_MAX 128
#define MED_UMAX 1024 /* max copy for medium un-aligned case */
#define MED_WMAX 1024 /* max copy for medium word-aligned case */
#define MED_MAX 1024 /* max copy for medium longword-aligned case */
#define ST_CHUNK 24 /* ST_CHUNK - block of values for BIS Store */
#define ALIGN_PRE 24 /* distance for aligned prefetch loop */
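/*
 * Worked numbers for the constants above, with BLOCK_SIZE = 64 (one
 * cache line per block):
 *	ST_CHUNK  * BLOCK_SIZE = 24 * 64 = 1536 bytes covered by one batch
 *		of initializing BIS stores in the aligned large-copy loop
 *	ALIGN_PRE * BLOCK_SIZE = 24 * 64 = 1536 bytes of prefetch lookahead
 *		used by that same loop
 */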

.register %g2,#scratch

.section ".text"
.global FUNC_NAME
.type FUNC_NAME, #function
.align 16
FUNC_NAME:
srlx %o2, 31, %g2
cmp %g2, 0
tne %xcc, 5
PREAMBLE
mov %o0, %g1 ! save %o0
brz,pn %o2, .Lsmallx
cmp %o2, 3
ble,pn %icc, .Ltiny_cp
cmp %o2, 19
ble,pn %icc, .Lsmall_cp
or %o0, %o1, %g2
cmp %o2, SMALL_MAX
bl,pn %icc, .Lmedium_cp
nop

.Lmedium:
neg %o0, %o5
andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned
brz,pt %o5, .Ldst_aligned_on_8

! %o5 has the bytes to be written in partial store.
sub %o2, %o5, %o2
sub %o1, %o0, %o1 ! %o1 gets the difference
7: ! dst aligning loop
add %o1, %o0, %o4
EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5) ! load one byte
subcc %o5, 1, %o5
EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
bgu,pt %xcc, 7b
add %o0, 1, %o0 ! advance dst
add %o1, %o0, %o1 ! restore %o1
.Ldst_aligned_on_8:
andcc %o1, 7, %o5
brnz,pt %o5, .Lsrc_dst_unaligned_on_8
nop

.Lsrc_dst_aligned_on_8:
! check if we are copying MED_MAX or more bytes
set MED_MAX, %o3
cmp %o2, %o3 ! limit to store buffer size
bgu,pn %xcc, .Llarge_align8_copy
nop

/*
* Special case for handling when src and dest are both long word aligned
* and total data to move is less than MED_MAX bytes
*/
.Lmedlong:
subcc %o2, 63, %o2 ! adjust length to allow cc test
ble,pn %xcc, .Lmedl63 ! skip big loop if less than 64 bytes
nop
.Lmedl64:
EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63) ! load
subcc %o2, 64, %o2 ! decrement length count
EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64) ! and store
EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56) ! a block of 64
EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store
EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64
add %o1, 64, %o1 ! increase src ptr by 64
EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
add %o0, 64, %o0 ! increase dst ptr by 64
EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
bgu,pt %xcc, .Lmedl64 ! repeat if at least 64 bytes left
EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
.Lmedl63:
addcc %o2, 32, %o2 ! adjust remaining count
ble,pt %xcc, .Lmedl31 ! to skip if 31 or fewer bytes left
nop
EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31) ! load
sub %o2, 32, %o2 ! decrement length count
EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32) ! and store
EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24) ! a block of 32
add %o1, 32, %o1 ! increase src ptr by 32
EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
add %o0, 32, %o0 ! increase dst ptr by 32
EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
addcc %o2, 16, %o2 ! adjust remaining count
ble,pt %xcc, .Lmedl15 ! skip if 15 or fewer bytes left
nop !
EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
add %o1, 16, %o1 ! increase src ptr by 16
EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
sub %o2, 16, %o2 ! decrease count by 16
EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
add %o0, 16, %o0 ! increase dst ptr by 16
EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
addcc %o2, 15, %o2 ! restore count
bz,pt %xcc, .Lsmallx ! exit if finished
cmp %o2, 8
blt,pt %xcc, .Lmedw7 ! skip if 7 or fewer bytes left
tst %o2
EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) ! load 8 bytes
add %o1, 8, %o1 ! increase src ptr by 8
add %o0, 8, %o0 ! increase dst ptr by 8
subcc %o2, 8, %o2 ! decrease count by 8
bnz,pn %xcc, .Lmedw7
EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8) ! and store 8
retl
mov EX_RETVAL(%g1), %o0 ! restore %o0

.align 16
.Lsrc_dst_unaligned_on_8:
! DST is 8-byte aligned, src is not
2:
andcc %o1, 0x3, %o5 ! test word alignment
bnz,pt %xcc, .Lunalignsetup ! branch to skip if not word aligned
nop

/*
* Handle all cases where src and dest are aligned on word
* boundaries. Use unrolled loops for better performance.
* This option wins over standard large data move when
* source and destination are in cache for .Lmedium
* to short data moves.
*/
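/*
 * For illustration, the word-merge pattern used by .Lmedw32 below packs
 * two 32-bit loads into one 64-bit store; on big-endian SPARC this keeps
 * the bytes in copy order:
 *
 *	ld	[%o1], %o4	! first (lower-address) word
 *	sllx	%o4, 32, %o5	! place it in the upper half
 *	ld	[%o1+4], %o4	! second word
 *	or	%o4, %o5, %o5	! combine: word0 in bits 63-32, word1 in 31-0
 *	stx	%o5, [%o0]	! one 8-byte store
 */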
set MED_WMAX, %o3
cmp %o2, %o3 ! limit to store buffer size
bge,pt %xcc, .Lunalignrejoin ! otherwise rejoin main loop
nop

subcc %o2, 31, %o2 ! adjust length to allow cc test
! for end of loop
ble,pt %xcc, .Lmedw31 ! skip big loop if less than 32
.Lmedw32:
EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32
sllx %o4, 32, %o5
EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
or %o4, %o5, %o5
EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
subcc %o2, 32, %o2 ! decrement length count
EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
sllx %o4, 32, %o5
EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
or %o4, %o5, %o5
EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
add %o1, 32, %o1 ! increase src ptr by 32
EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
sllx %o4, 32, %o5
EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
or %o4, %o5, %o5
EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
add %o0, 32, %o0 ! increase dst ptr by 32
EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
sllx %o4, 32, %o5
EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
or %o4, %o5, %o5
bgu,pt %xcc, .Lmedw32 ! repeat if at least 32 bytes left
EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedw31:
addcc %o2, 31, %o2 ! restore count

bz,pt %xcc, .Lsmallx ! exit if finished
nop
cmp %o2, 16
blt,pt %xcc, .Lmedw15
nop
EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes
sllx %o4, 32, %o5
subcc %o2, 16, %o2 ! decrement length count
EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
or %o4, %o5, %o5
EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
add %o1, 16, %o1 ! increase src ptr by 16
EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
add %o0, 16, %o0 ! increase dst ptr by 16
sllx %o4, 32, %o5
EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
or %o4, %o5, %o5
EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
bz,pt %xcc, .Lsmallx ! exit if finished
cmp %o2, 8
blt,pn %xcc, .Lmedw7 ! skip if 7 or fewer bytes left
tst %o2
EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2) ! load 4 bytes
subcc %o2, 8, %o2 ! decrease count by 8
EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes
add %o1, 8, %o1 ! increase src ptr by 8
EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4) ! load 4 bytes
add %o0, 8, %o0 ! increase dst ptr by 8
EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
bz,pt %xcc, .Lsmallx ! exit if finished
.Lmedw7: ! count is ge 1, less than 8
cmp %o2, 4 ! check for 4 bytes left
blt,pn %xcc, .Lsmallleft3 ! skip if 3 or fewer bytes left
nop !
EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2) ! load 4 bytes
add %o1, 4, %o1 ! increase src ptr by 4
add %o0, 4, %o0 ! increase dst ptr by 4
subcc %o2, 4, %o2 ! decrease count by 4
bnz .Lsmallleft3
EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
retl
mov EX_RETVAL(%g1), %o0

.align 16
.Llarge_align8_copy: ! Src and dst share 8 byte alignment
! align dst to 64 byte boundary
andcc %o0, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
brz,pn %o3, .Laligned_to_64
andcc %o0, 8, %o3 ! odd long words to move?
brz,pt %o3, .Laligned_to_16
nop
EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
sub %o2, 8, %o2
add %o1, 8, %o1 ! increment src ptr
add %o0, 8, %o0 ! increment dst ptr
EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
andcc %o0, 16, %o3 ! pair of long words to move?
brz,pt %o3, .Laligned_to_32
nop
EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
sub %o2, 16, %o2
EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
add %o1, 16, %o1 ! increment src ptr
EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
add %o0, 16, %o0 ! increment dst ptr
EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
andcc %o0, 32, %o3 ! four long words to move?
brz,pt %o3, .Laligned_to_64
nop
EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
sub %o2, 32, %o2
EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
add %o1, 32, %o1 ! increment src ptr
EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
add %o0, 32, %o0 ! increment dst ptr
EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_64:
!
! Using block init store (BIS) instructions to avoid fetching cache
! lines from memory. Use ST_CHUNK stores to first element of each cache
! line (similar to prefetching) to avoid overfilling STQ or miss buffers.
! Gives existing cache lines time to be moved out of L1/L2/L3 cache.
! Initial stores using MRU version of BIS to keep cache line in
! cache until we are ready to store final element of cache line.
! Then store last element using the LRU version of BIS.
!
andn %o2, 0x3f, %o5 ! %o5 is multiple of block size
and %o2, 0x3f, %o2 ! residue bytes in %o2
!
! We use STORE_MRU_ASI for the first seven stores to each cache line
! followed by STORE_ASI (mark as LRU) for the last store. That
! mixed approach reduces the probability that the cache line is removed
! before we finish setting it, while minimizing the effects on
! other cached values during a large memcpy
!
! ST_CHUNK batches up initial BIS operations for several cache lines
! to allow multiple requests to not be blocked by overflowing the
! store miss buffer. Then the matching stores for all those
! BIS operations are executed.
!
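!
! Illustrative shape of the BIS copy below (a sketch, not extra code):
!   while (remaining >= ST_CHUNK*64) {
!     .Lalign_loop_start: for ST_CHUNK cache lines, prefetch ahead and
!         BIS-store (MRU) the first 8 bytes of each line
!     .Lalign_loop_rest:  revisit those ST_CHUNK lines and store their
!         remaining 56 bytes, the last store of each line marking it LRU
!   }
!   .Lalign_loop_fin: copy any remaining full 64-byte blocks with plain stx
!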

sub %o0, 8, %o0 ! adjust %o0 for ASI alignment
.Lalign_loop:
cmp %o5, ST_CHUNK*64
blu,pt %xcc, .Lalign_loop_fin
mov ST_CHUNK,%o3
.Lalign_loop_start:
prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
subcc %o3, 1, %o3
EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
add %o1, 64, %o1
add %o0, 8, %o0
EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
bgu %xcc,.Lalign_loop_start
add %o0, 56, %o0

mov ST_CHUNK,%o3
sllx %o3, 6, %o4 ! ST_CHUNK*64
sub %o1, %o4, %o1 ! reset %o1
sub %o0, %o4, %o0 ! reset %o0

.Lalign_loop_rest:
EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
add %o0, 16, %o0
EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
add %o0, 8, %o0
EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
subcc %o3, 1, %o3
EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
add %o0, 8, %o0
EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
add %o0, 8, %o0
EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
add %o0, 8, %o0
EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
add %o1, 64, %o1
add %o0, 8, %o0
EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
add %o0, 8, %o0
EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
sub %o5, 64, %o5
bgu %xcc,.Lalign_loop_rest
! mark cache line as LRU
EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

cmp %o5, ST_CHUNK*64
bgu,pt %xcc, .Lalign_loop_start
mov ST_CHUNK,%o3

cmp %o5, 0
beq .Lalign_done
nop
.Lalign_loop_fin:
EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
subcc %o5, 64, %o5
EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
add %o1, 64, %o1
EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
add %o0, 64, %o0
EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
bgu %xcc,.Lalign_loop_fin
EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)

.Lalign_done:
add %o0, 8, %o0 ! restore %o0 from ASI alignment
membar #StoreStore
sub %o2, 63, %o2 ! adjust length to allow cc test
ba .Lmedl63 ! in .Lmedl63
nop

.align 16
! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.Lunalignsetup:
.Lunalignrejoin:
mov %g1, %o3 ! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
VISEntryHalf
#endif
mov %o3, %g1 ! restore %g1

set MED_UMAX, %o3
cmp %o2, %o3 ! check for .Lmedium unaligned limit
bge,pt %xcc,.Lunalign_large
prefetch [%o1 + (4 * BLOCK_SIZE)], 20
andn %o2, 0x3f, %o5 ! %o5 is multiple of block size
and %o2, 0x3f, %o2 ! residue bytes in %o2
cmp %o2, 8 ! Insure we do not load beyond
bgt .Lunalign_adjust ! end of source buffer
andn %o1, 0x7, %o4 ! %o4 has long word aligned src address
add %o2, 64, %o2 ! adjust to leave loop
sub %o5, 64, %o5 ! early if necessary
.Lunalign_adjust:
alignaddr %o1, %g0, %g0 ! generate %gsr
add %o1, %o5, %o1 ! advance %o1 to after blocks
EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
.Lunalign_loop:
EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
faligndata %f0, %f2, %f16
EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
subcc %o5, BLOCK_SIZE, %o5
EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
faligndata %f2, %f4, %f18
EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
faligndata %f4, %f6, %f20
EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
faligndata %f6, %f8, %f22
EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
faligndata %f8, %f10, %f24
EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
faligndata %f10, %f12, %f26
EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
add %o4, BLOCK_SIZE, %o4
EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
faligndata %f12, %f14, %f28
EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
faligndata %f14, %f0, %f30
EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
add %o0, BLOCK_SIZE, %o0
bgu,pt %xcc, .Lunalign_loop
prefetch [%o4 + (5 * BLOCK_SIZE)], 20
ba .Lunalign_done
nop

.Lunalign_large:
andcc %o0, 0x3f, %o3 ! is dst 64-byte block aligned?
bz %xcc, .Lunalignsrc
sub %o3, 64, %o3 ! %o3 will be multiple of 8
neg %o3 ! bytes until dest is 64 byte aligned
sub %o2, %o3, %o2 ! update cnt with bytes to be moved
! Move bytes according to source alignment
andcc %o1, 0x1, %o5
bnz %xcc, .Lunalignbyte ! check for byte alignment
nop
andcc %o1, 2, %o5 ! check for half word alignment
bnz %xcc, .Lunalignhalf
nop
! Src is word aligned
.Lunalignword:
EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3) ! load 4 bytes
add %o1, 8, %o1 ! increase src ptr by 8
EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3) ! and store 4
subcc %o3, 8, %o3 ! decrease count by 8
EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4
add %o0, 8, %o0 ! increase dst ptr by 8
bnz %xcc, .Lunalignword
EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
ba .Lunalignsrc
nop

! Src is half-word aligned
.Lunalignhalf:
EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3) ! load 2 bytes
sllx %o4, 32, %o5 ! shift left
EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
or %o4, %o5, %o5
sllx %o5, 16, %o5
EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
or %o4, %o5, %o5
EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
add %o1, 8, %o1
subcc %o3, 8, %o3
bnz %xcc, .Lunalignhalf
add %o0, 8, %o0
ba .Lunalignsrc
nop

! Src is Byte aligned
.Lunalignbyte:
sub %o0, %o1, %o0 ! share pointer advance
.Lunalignbyte_loop:
EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
sllx %o4, 56, %o5
EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
sllx %o4, 40, %o4
or %o4, %o5, %o5
EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
sllx %o4, 24, %o4
or %o4, %o5, %o5
EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
sllx %o4, 8, %o4
or %o4, %o5, %o5
EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
or %o4, %o5, %o5
add %o0, %o1, %o0
EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
sub %o0, %o1, %o0
subcc %o3, 8, %o3
bnz %xcc, .Lunalignbyte_loop
add %o1, 8, %o1
add %o0,%o1, %o0 ! restore pointer

! Destination is now block (64 byte aligned)
.Lunalignsrc:
andn %o2, 0x3f, %o5 ! %o5 is multiple of block size
and %o2, 0x3f, %o2 ! residue bytes in %o2
add %o2, 64, %o2 ! Insure we do not load beyond
sub %o5, 64, %o5 ! end of source buffer

andn %o1, 0x7, %o4 ! %o4 has long word aligned src address
alignaddr %o1, %g0, %g0 ! generate %gsr
add %o1, %o5, %o1 ! advance %o1 to after blocks

EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
add %o4, 8, %o4
.Lunalign_sloop:
EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
faligndata %f14, %f16, %f0
EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
faligndata %f16, %f18, %f2
EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
faligndata %f18, %f20, %f4
EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
subcc %o5, 64, %o5
EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
faligndata %f20, %f22, %f6
EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
faligndata %f22, %f24, %f8
EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
faligndata %f24, %f26, %f10
EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
faligndata %f26, %f28, %f12
EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
add %o4, 64, %o4
EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
faligndata %f28, %f30, %f14
EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
add %o0, 64, %o0
EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
fsrc2 %f30, %f14
bgu,pt %xcc, .Lunalign_sloop
prefetch [%o4 + (8 * BLOCK_SIZE)], 20

.Lunalign_done:
! Handle trailing bytes, 64 to 127
! Dest long word aligned, Src not long word aligned
cmp %o2, 15
bleu %xcc, .Lunalign_short

andn %o2, 0x7, %o5 ! %o5 is multiple of 8
and %o2, 0x7, %o2 ! residue bytes in %o2
add %o2, 8, %o2
sub %o5, 8, %o5 ! insure we do not load past end of src
andn %o1, 0x7, %o4 ! %o4 has long word aligned src address
add %o1, %o5, %o1 ! advance %o1 to after multiple of 8
EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! fetch partialword
.Lunalign_by8:
EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
add %o4, 8, %o4
faligndata %f0, %f2, %f16
subcc %o5, 8, %o5
EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
fsrc2 %f2, %f0
bgu,pt %xcc, .Lunalign_by8
add %o0, 8, %o0

.Lunalign_short:
#ifdef NON_USER_COPY
VISExitHalfFast
#else
VISExitHalf
#endif
ba .Lsmallrest
nop

/*
* This is a special case of nested memcpy. This can happen when kernel
* calls unaligned memcpy back to back without saving FP registers. We need
* traps(context switch) to save/restore FP registers. If the kernel calls
* memcpy without this trap sequence we will hit FP corruption. Let's use
* the normal integer load/store method in this case.
*/

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
or %o0, %o1, %g2
#endif
.Lmedium_cp:
LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
andcc %g2, 0x7, %g0
bne,pn %xcc, .Lmedium_unaligned_cp
nop

.Lmedium_noprefetch_cp:
andncc %o2, 0x20 - 1, %o5
be,pn %xcc, 2f
sub %o2, %o5, %o2
1: EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
add %o1, 0x20, %o1
subcc %o5, 0x20, %o5
EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
bne,pt %xcc, 1b
add %o0, 0x20, %o0
2: andcc %o2, 0x18, %o5
be,pt %xcc, 3f
sub %o2, %o5, %o2
1: EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
add %o1, 0x08, %o1
add %o0, 0x08, %o0
subcc %o5, 0x08, %o5
bne,pt %xcc, 1b
EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3: brz,pt %o2, .Lexit_cp
cmp %o2, 0x04
bl,pn %xcc, .Ltiny_cp
nop
EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
add %o1, 0x04, %o1
add %o0, 0x04, %o0
subcc %o2, 0x04, %o2
bne,pn %xcc, .Ltiny_cp
EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
ba,a,pt %xcc, .Lexit_cp

.Lmedium_unaligned_cp:
/* First get dest 8 byte aligned. */
sub %g0, %o0, %o3
and %o3, 0x7, %o3
brz,pt %o3, 2f
sub %o2, %o3, %o2

1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
add %o1, 1, %o1
subcc %o3, 1, %o3
add %o0, 1, %o0
bne,pt %xcc, 1b
EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
and %o1, 0x7, %o3
brz,pn %o3, .Lmedium_noprefetch_cp
sll %o3, 3, %o3
mov 64, %g2
sub %g2, %o3, %g2
andn %o1, 0x7, %o1
EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
sllx %o4, %o3, %o4
andn %o2, 0x08 - 1, %o5
sub %o2, %o5, %o2
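/*
 * Worked example for the shift/merge loop below: if the source is,
 * say, 3 bytes past an 8-byte boundary, then %o3 = 3*8 = 24 and
 * %g2 = 64 - 24 = 40, so each stored doubleword is assembled as
 *	(previous aligned word << 24) | (next aligned word >> 40)
 * i.e. the tail of one aligned load joined to the head of the next.
 */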

1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
add %o1, 0x08, %o1
subcc %o5, 0x08, %o5
srlx %g3, %g2, %g7
or %g7, %o4, %g7
EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
add %o0, 0x08, %o0
bne,pt %xcc, 1b
sllx %g3, %o3, %o4
srl %o3, 3, %o3
add %o1, %o3, %o1
brz,pn %o2, .Lexit_cp
nop
ba,pt %xcc, .Lsmall_unaligned_cp

.Ltiny_cp:
EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
subcc %o2, 1, %o2
be,pn %xcc, .Lexit_cp
EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
subcc %o2, 1, %o2
be,pn %xcc, .Lexit_cp
EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
ba,pt %xcc, .Lexit_cp
EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

.Lsmall_cp:
andcc %g2, 0x3, %g0
bne,pn %xcc, .Lsmall_unaligned_cp
andn %o2, 0x4 - 1, %o5
sub %o2, %o5, %o2
1:
EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
add %o1, 0x04, %o1
subcc %o5, 0x04, %o5
add %o0, 0x04, %o0
bne,pt %xcc, 1b
EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
brz,pt %o2, .Lexit_cp
nop
ba,a,pt %xcc, .Ltiny_cp

.Lsmall_unaligned_cp:
1: EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
add %o1, 1, %o1
add %o0, 1, %o0
subcc %o2, 1, %o2
bne,pt %xcc, 1b
EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
ba,a,pt %xcc, .Lexit_cp

.Lsmallrest:
tst %o2
bz,pt %xcc, .Lsmallx
cmp %o2, 4
blt,pn %xcc, .Lsmallleft3
nop
sub %o2, 3, %o2
.Lsmallnotalign4:
EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)! read byte
subcc %o2, 4, %o2 ! reduce count by 4
EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)! write byte & repeat
EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)! for total of 4
add %o1, 4, %o1 ! advance SRC by 4
EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
add %o0, 4, %o0 ! advance DST by 4
EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
bgu,pt %xcc, .Lsmallnotalign4 ! loop til 3 or fewer bytes remain
EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
addcc %o2, 3, %o2 ! restore count
bz,pt %xcc, .Lsmallx
.Lsmallleft3: ! 1, 2, or 3 bytes remain
subcc %o2, 1, %o2
EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1) ! load one byte
bz,pt %xcc, .Lsmallx
EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1) ! store one byte
EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2) ! load second byte
subcc %o2, 1, %o2
bz,pt %xcc, .Lsmallx
EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte
EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2) ! load third byte
EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2) ! store third byte
.Lsmallx:
retl
mov EX_RETVAL(%g1), %o0
.Lsmallfin:
tst %o2
bnz,pn %xcc, .Lsmallleft3
nop
retl
mov EX_RETVAL(%g1), %o0 ! restore %o0
.Lexit_cp:
retl
mov EX_RETVAL(%g1), %o0
.size FUNC_NAME, .-FUNC_NAME