GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/arm/memcpy.S
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */
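
/* Calling convention (AAPCS): r0 = dstin (destination), r1 = src,
   r2 = count.  The original destination pointer is returned in r0, as the
   standard memcpy contract requires.  */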

#include "asmdefs.h"

        .syntax unified
        /* This implementation requires ARM state.  */
        .arm

#ifdef __ARM_NEON__

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE 4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE 32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE 32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif
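
/* The second argument to ALIGN is an address-alignment hint in bits:
   ALIGN (dst, 64) marks dst as 64-bit aligned in the NEON vst1 stores used
   by the unaligned-copy path below.  */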

#define PC_OFFSET 8     /* PC pipeline compensation.  */
#define INSN_SIZE 4
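
/* In ARM state, reading the PC as an operand yields the address of the
   current instruction plus 8 (PC_OFFSET).  The computed jumps below
   ("add pc, pc, ...") rely on this, together with INSN_SIZE, to branch part
   way into the unrolled copy ladders.  */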

/* Call parameters.  */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals.  */
#define tmp1    r3
#define dst     ip
#define tmp2    r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define A_l     r2      /* Call-clobbered.  */
#define A_h     r3      /* Call-clobbered.  */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
#define D_l     r8
#define D_h     r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines  5
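
/* cpy_line_vfp copies one 64-byte line using d0-d2 plus the \vreg argument,
   and refills \vreg from prefetch_lines * 64 - 32 bytes ahead of the line
   being copied, so its loads double as a software prefetch.  cpy_tail_vfp is
   the same store/load pattern without the look-ahead load and is used to
   drain the pipelined registers.  */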

#ifdef USE_VFP
        .macro  cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm

        .macro  cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
#endif

ENTRY (__memcpy_arm)

        mov     dst, dstin      /* Preserve dstin, we need to return it.  */
        cmp     count, #64
        bhs     L(cpy_not_short)
        /* Deal with small copies quickly by dropping straight into the
           exit block.  */

L(tail63unaligned):
#ifdef USE_NEON
        and     tmp1, count, #0x38
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
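        /* The PC reads as the address of the add plus 8, so the jump lands
           (60 - (count & 0x38)) bytes past the add: with count & 0x38 == 56
           it hits the first vld1 below and all seven 8-byte chunks are
           copied; with 0 it skips the whole ladder.  Each vld1/vst1 pair is
           two 4-byte instructions, so no scaling is needed.  */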
        vld1.8  {d0}, [src]!    /* 14 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 12 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 10 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 8 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 6 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 4 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 2 words to go.  */
        vst1.8  {d0}, [dst]!

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data.  May not be aligned.  */
        /* Cannot use VFP for unaligned data.  */
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset.  */
        add     pc, pc, tmp1, lsl #1
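        /* Same computed-jump trick as above, but each word is copied by an
           ldr/str pair, i.e. 8 bytes of code per 4 bytes of data, so the
           offset must be doubled: PC_OFFSET and INSN_SIZE are pre-halved and
           the lsl #1 scales everything at once.  count & 0x3c == 60 lands on
           the first ldr below (15 words copied); 0 lands just past the
           ladder.  */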

        ldr     tmp1, [src, #-60]       /* 15 words to go.  */
        str     tmp1, [dst, #-60]

        ldr     tmp1, [src, #-56]       /* 14 words to go.  */
        str     tmp1, [dst, #-56]
        ldr     tmp1, [src, #-52]
        str     tmp1, [dst, #-52]

        ldr     tmp1, [src, #-48]       /* 12 words to go.  */
        str     tmp1, [dst, #-48]
        ldr     tmp1, [src, #-44]
        str     tmp1, [dst, #-44]

        ldr     tmp1, [src, #-40]       /* 10 words to go.  */
        str     tmp1, [dst, #-40]
        ldr     tmp1, [src, #-36]
        str     tmp1, [dst, #-36]

        ldr     tmp1, [src, #-32]       /* 8 words to go.  */
        str     tmp1, [dst, #-32]
        ldr     tmp1, [src, #-28]
        str     tmp1, [dst, #-28]

        ldr     tmp1, [src, #-24]       /* 6 words to go.  */
        str     tmp1, [dst, #-24]
        ldr     tmp1, [src, #-20]
        str     tmp1, [dst, #-20]

        ldr     tmp1, [src, #-16]       /* 4 words to go.  */
        str     tmp1, [dst, #-16]
        ldr     tmp1, [src, #-12]
        str     tmp1, [dst, #-12]

        ldr     tmp1, [src, #-8]        /* 2 words to go.  */
        str     tmp1, [dst, #-8]
        ldr     tmp1, [src, #-4]
        str     tmp1, [dst, #-4]
#endif

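        /* At most three bytes remain.  Shifting count left by 31 moves bit 1
           into the carry flag and leaves bit 0 as the only possibly-set bit
           of the result, so CS selects the halfword copy and NE the byte
           copy, with no extra compares.  */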
        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]      /* Src is dead, use as a scratch.  */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr

L(cpy_not_short):
        /* At least 64 bytes to copy, but don't know the alignment yet.  */
        str     tmp2, [sp, #-FRAME_SIZE]!
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     L(cpy_notaligned)

#ifdef USE_VFP
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */
        vmov.f32        s0, s0
#endif

        /* SRC and DST have the same mutual 64-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring SRC and DST into full 64-bit alignment.  */
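        /* dst << 29 puts the low three address bits at the top of tmp2 and
           sets Z if dst is already 8-byte aligned.  After negation, MI means
           at least a word is needed, and after the further shift CS and NE
           select an optional halfword and byte, exactly as in the tail code
           above.  */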
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1

1:
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        blo     L(tail63aligned)

        cmp     tmp2, #512
        bhs     L(cpy_body_long)

L(cpy_body_medium):                     /* Count in tmp2.  */
#ifdef USE_VFP
1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
        vldr    d1, [src, #8]
        vstr    d0, [dst, #0]
        vldr    d0, [src, #16]
        vstr    d1, [dst, #8]
        vldr    d1, [src, #24]
        vstr    d0, [dst, #16]
        vldr    d0, [src, #32]
        vstr    d1, [dst, #24]
        vldr    d1, [src, #40]
        vstr    d0, [dst, #32]
        vldr    d0, [src, #48]
        vstr    d1, [dst, #40]
        vldr    d1, [src, #56]
        vstr    d0, [dst, #48]
        add     src, src, #64
        vstr    d1, [dst, #56]
        add     dst, dst, #64
        bhs     1b
        tst     tmp2, #0x3f
        beq     L(done)

L(tail63aligned):                       /* Count in tmp2.  */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1

        vldr    d0, [src, #-56]         /* 14 words to go.  */
        vstr    d0, [dst, #-56]
        vldr    d0, [src, #-48]         /* 12 words to go.  */
        vstr    d0, [dst, #-48]
        vldr    d0, [src, #-40]         /* 10 words to go.  */
        vstr    d0, [dst, #-40]
        vldr    d0, [src, #-32]         /* 8 words to go.  */
        vstr    d0, [dst, #-32]
        vldr    d0, [src, #-24]         /* 6 words to go.  */
        vstr    d0, [dst, #-24]
        vldr    d0, [src, #-16]         /* 4 words to go.  */
        vstr    d0, [dst, #-16]
        vldr    d0, [src, #-8]          /* 2 words to go.  */
        vstr    d0, [dst, #-8]
#else
        sub     src, src, #8
        sub     dst, dst, #8
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bhs     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
1:
        add     src, src, #8
        add     dst, dst, #8

L(tail63aligned):                       /* Count in tmp2.  */
        /* Copy up to 7 d-words of data.  Similar to L(tail63unaligned), but
           we know that the src and dest are 64-bit aligned so we can use
           LDRD/STRD to improve efficiency.  */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy.  */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        ldrd    A_l, A_h, [src, #-56]   /* 14 words to go.  */
        strd    A_l, A_h, [dst, #-56]
        ldrd    A_l, A_h, [src, #-48]   /* 12 words to go.  */
        strd    A_l, A_h, [dst, #-48]
        ldrd    A_l, A_h, [src, #-40]   /* 10 words to go.  */
        strd    A_l, A_h, [dst, #-40]
        ldrd    A_l, A_h, [src, #-32]   /* 8 words to go.  */
        strd    A_l, A_h, [dst, #-32]
        ldrd    A_l, A_h, [src, #-24]   /* 6 words to go.  */
        strd    A_l, A_h, [dst, #-24]
        ldrd    A_l, A_h, [src, #-16]   /* 4 words to go.  */
        strd    A_l, A_h, [dst, #-16]
        ldrd    A_l, A_h, [src, #-8]    /* 2 words to go.  */
        strd    A_l, A_h, [dst, #-8]

#endif
        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead.  */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst]

L(done):
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr

L(cpy_body_long):                       /* Count in tmp2.  */

        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */
#ifdef USE_VFP
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */
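        /* d3-d7 are pre-loaded with the first doubleword of the current
           64-byte line and of the four lines after it; cpy_line_vfp stores
           each one and reloads it from further ahead as the copy advances.  */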

        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blo     2f
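
        /* The loop below copies prefetch_lines * 64 bytes per iteration.
           tmp2 is biased down by two batches here; the drain sequence at 2:
           copies one final batch and adds it back to tmp2 before falling
           into L(cpy_body_medium).  */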
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bhs     1b

2:
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        add     src, src, #96
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     dst, dst, #128
        add     tmp2, tmp2, #prefetch_lines * 64
        b       L(cpy_body_medium)
#else
        /* Long copy.  Use an SMS style loop to maximize the I/O
           bandwidth of the core.  We don't have enough spare registers
           to synthesise prefetching, so use PLD operations.  */
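        /* A_l/A_h alias the call-clobbered r2/r3, but B, C and D live in the
           callee-saved r4-r9, so they are spilled into the stack frame
           allocated at L(cpy_not_short) and restored from [sp, #8..#24] once
           the loop finishes.  In the steady state each strd below writes
           data that was loaded on the previous iteration.  */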
        /* Pre-bias src and dst.  */
        sub     src, src, #8
        sub     dst, dst, #8
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #232]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bcs     2b
        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #40
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        tst     tmp2, #0x3f
        bne     L(tail63aligned)
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
#endif

L(cpy_notaligned):
        pld     [src]
        pld     [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment.  */
        /* Bring DST to 64-bit alignment.  */
        lsls    tmp2, dst, #29
        pld     [src, #(2 * 64)]
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
1:
        pld     [src, #(3 * 64)]
        subs    count, count, #64
        ldrlo   tmp2, [sp], #FRAME_SIZE
        blo     L(tail63unaligned)
        pld     [src, #(4 * 64)]

#ifdef USE_NEON
        vld1.8  {d0-d3}, [src]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        blo     2f
1:
        pld     [src, #(4 * 64)]
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8  {d0-d3}, [src]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bhs     1b
2:
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        ands    count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth.  */
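        /* Here dst is 64-bit aligned but src has arbitrary alignment, so the
           stores can use STRD while the loads are done as pairs of plain
           LDRs, which (unlike LDRD) tolerate unaligned addresses when
           unaligned access support is enabled.  */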
        sub     src, src, #4
        sub     dst, dst, #8
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #(5 * 64) - (32 - 4)]
        strd    A_l, A_h, [dst, #40]
        ldr     A_l, [src, #36]
        ldr     A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldr     B_l, [src, #44]
        ldr     B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldr     C_l, [src, #52]
        ldr     C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldr     D_l, [src, #60]
        ldr     D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]
        bcs     2b

        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #36
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        ands    count, tmp2, #0x3f
#endif
        ldr     tmp2, [sp], #FRAME_SIZE
        bne     L(tail63unaligned)
        bx      lr

END (__memcpy_arm)