GitHub Repository: torvalds/linux
Path: blob/master/arch/xtensa/lib/memcopy.S
/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
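
/*
 * For reference only (not part of the original Tensilica code): a rough C
 * sketch of the general-case flow described above.  It assumes the
 * aligned-source path, ignores the short-copy cutoffs and the IRAM/IROM
 * caveat, and uses "u32" to stand for a 32-bit unsigned type.
 *
 *	void *memcpy(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		if (((unsigned long)d & 1) && len) {	// make dst 2-byte aligned
 *			*d++ = *s++;
 *			len--;
 *		}
 *		if (((unsigned long)d & 2) && len >= 2) { // make dst 4-byte aligned
 *			*d++ = *s++;
 *			*d++ = *s++;
 *			len -= 2;
 *		}
 *		while (len >= 16) {			// main loop: 4 words/iteration
 *			((u32 *)d)[0] = ((const u32 *)s)[0];
 *			((u32 *)d)[1] = ((const u32 *)s)[1];
 *			((u32 *)d)[2] = ((const u32 *)s)[2];
 *			((u32 *)d)[3] = ((const u32 *)s)[3];
 *			d += 16; s += 16; len -= 16;
 *		}
 *		if (len & 8) {				// then 8, 4, 2 and 1 bytes,
 *			((u32 *)d)[0] = ((const u32 *)s)[0];	// keyed off the
 *			((u32 *)d)[1] = ((const u32 *)s)[1];	// low bits of len
 *			d += 8; s += 8;
 *		}
 *		if (len & 4) {
 *			*(u32 *)d = *(const u32 *)s;
 *			d += 4; s += 4;
 *		}
 *		if (len & 2) {
 *			d[0] = s[0]; d[1] = s[1];
 *			d += 2; s += 2;
 *		}
 *		if (len & 1)
 *			*d = *s;
 *		return dst;
 *	}
 */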

	.text

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3,  0
	addi	a3, a3,  1
	addi	a4, a4, -1
	s8i	a6, a5,  0
	addi	a5, a5,  1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	addi	a4, a4, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

ENTRY(__memcpy)
WEAK(memcpy)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
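	# The loads and stores in .Loop1 are interleaved so that each s32i
	# stores a value loaded two instructions earlier, which helps hide
	# load-use latency on in-order pipelines.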
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1 # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L3:
	# copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L4:
	# copy 2 bytes
	l16ui	a6, a3,  0
	addi	a3, a3,  2
	s16i	a6, a5,  0
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L5:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */
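
/*
 * Not part of the original code: a rough little-endian C sketch of the
 * shifting copy used below.  __ssa8 sets the hardware shift amount from
 * the low address bits and __src_b then extracts an aligned 32-bit window
 * from two adjacent source words, so the source is only ever read with
 * aligned word loads.  "u32" stands for a 32-bit unsigned type; the shift
 * is nonzero because this path is only taken for an unaligned source.
 *
 *	unsigned shift = 8 * ((unsigned long)s & 3);
 *	const u32 *sa = (const u32 *)((unsigned long)s & ~3UL);
 *	u32 w0 = *sa++;				// "load first word"
 *
 *	while (len >= 4) {
 *		u32 w1 = *sa++;
 *		*(u32 *)d = (w0 >> shift) | (w1 << (32 - shift));
 *		d += 4;
 *		len -= 4;
 *		w0 = w1;
 *	}
 *	// any remaining 1-3 bytes are copied byte by byte from the original,
 *	// unaligned source address (a3 is readjusted via a11 below)
 */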

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset

/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT	1
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16
	__src_b	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	__src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	abi_ret_default
.L14:
	# copy 2 bytes
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L15
	abi_ret_default
.L15:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

ENDPROC(__memcpy)
EXPORT_SYMBOL(__memcpy)
EXPORT_SYMBOL(memcpy)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If end of source doesn't overlap destination then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
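
/*
 * Not part of the original code: a rough C sketch of the dispatch
 * described above.  A single unsigned comparison covers both the
 * "no overlap" and the "dst below src" cases, so only copies where dst
 * lands inside [src, src + len) take the backward path.
 *
 *	void *memmove(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		if ((unsigned long)dst - (unsigned long)src >= len)
 *			return memcpy(dst, src, len);	// forward copy is safe
 *
 *		// dst overlaps the tail of src: copy from the highest address
 *		// down (the code below uses the same alignment handling and
 *		// 16/8/4/2/1 scheme as memcpy, just reversed)
 *		d += len;
 *		s += len;
 *		while (len--)
 *			*--d = *--s;
 *		return dst;
 *	}
 */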

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte # continue loop if
				       # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */

	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

ENTRY(__memmove)
WEAK(memmove)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
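	# (unsigned)(dst - src) >= len means the buffers either do not overlap
	# or dst lies below src, so the forward __memcpy path never overwrites
	# source bytes it has not yet read; only copies with dst inside
	# [src, src + len) fall through to the backward code below.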
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon

	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3,  8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3,  4
	s32i	a6, a5,  8
	l32i	a6, a3,  0
	s32i	a7, a5,  4
	s32i	a6, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .LbackLoop1 # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1done:
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a5, a5, -8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3,  0
	addi	a5, a5, -4
	s32i	a6, a5,  0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3,  0
	addi	a5, a5, -2
	s16i	a6, a5,  0
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */

	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
					 * the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3,  8
	addi	a5, a5, -16
	__src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  8
	l32i	a6, a3,  0
	__src_b	a8, a9, a8
	s32i	a8, a5,  4
	__src_b	a9, a6, a9
	s32i	a9, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .LbackLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3,  4
	l32i	a8, a3,  0
	addi	a5, a5, -8
	__src_b	a6, a7, a6
	s32i	a6, a5,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  0
	mov	a6, a8
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3,  0
	addi	a5, a5, -4
	__src_b	a6, a7, a6
	s32i	a6, a5,  0
	mov	a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	abi_ret_default
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	bbsi.l	a4, 0, .Lback15
	abi_ret_default
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

ENDPROC(__memmove)
EXPORT_SYMBOL(__memmove)
EXPORT_SYMBOL(memmove)