GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_lw    w10
#define tmp1    x14

#define A_q     q0
#define B_q     q1
#define C_q     q2
#define D_q     q3
#define E_q     q4
#define F_q     q5
#define G_q     q6
#define H_q     q7

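/* The x/w aliases above are 64-/32-bit views of general-purpose registers
   used for the small scalar copies; A_q..H_q are 128-bit SIMD registers,
   so one ldp/stp of a q-register pair moves 32 bytes.  */
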
/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

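/* Illustrative note (not part of the original source): the 16..32 byte case
   below, and the head/tail handling in the larger cases, rely on two
   fixed-size copies, one from the start and one from the end of the buffer,
   which may overlap in the middle yet together cover every byte.  A rough C
   sketch of the 16..32 byte case, with a hypothetical helper name and
   assuming <string.h>, could look like:

        // assumes 16 <= n && n <= 32
        static void copy16_32 (unsigned char *dst, const unsigned char *src,
                               size_t n)
        {
          unsigned char head[16], tail[16];
          memcpy (head, src, 16);           // first 16 bytes
          memcpy (tail, src + n - 16, 16);  // last 16 bytes, may overlap head
          memcpy (dst, head, 16);
          memcpy (dst + n - 16, tail, 16);  // together these cover 0..n-1
        }
*/
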
ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
        add     srcend, src, count
        cmp     count, 128
        b.hi    L(copy_long)
        add     dstend, dstin, count
        cmp     count, 32
        b.hi    L(copy32_128)
        nop

        /* Small copies: 0..32 bytes. */
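        /* Both 16-byte loads for the 16..32 byte case are issued before
           either store, so the sequence is also safe when the source and
           destination overlap (memmove).  */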
        cmp     count, 16
        b.lo    L(copy16)
        ldr     A_q, [src]
        ldr     B_q, [srcend, -16]
        str     A_q, [dstin]
        str     B_q, [dstend, -16]
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes. */
L(copy32_128):
        ldp     A_q, B_q, [src]
        ldp     C_q, D_q, [srcend, -32]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_q, B_q, [dstin]
        stp     C_q, D_q, [dstend, -32]
        ret

        .p2align 4
        /* Copy 8-15 bytes. */
L(copy16):
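        /* count is 0..15 here; bit 3 of count is set exactly when count >= 8,
           so the tbz falls through for 8..15 and branches to L(copy8) for
           0..7.  */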
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret

        /* Copy 4-7 bytes. */
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend, -4]
        ret

        /* Copy 65..128 bytes. */
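        /* A_q..D_q already hold the first 32 and the last 32 bytes; E_q/F_q
           add bytes 32..63, which is enough for counts up to 96, and G_q/H_q
           cover the 64 bytes before the end for counts of 97..128.  */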
L(copy128):
        ldp     E_q, F_q, [src, 32]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_q, H_q, [srcend, -64]
        stp     G_q, H_q, [dstend, -64]
L(copy96):
        stp     A_q, B_q, [dstin]
        stp     E_q, F_q, [dstin, 32]
        stp     C_q, D_q, [dstend, -32]
        ret

        /* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
        cbz     count, L(copy0)
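        /* count is 1..3 here.  Write the first byte, the byte at count/2 and
           the last byte: for count 1 all three land on byte 0, for count 2 on
           bytes 0, 1, 1, and for count 3 on bytes 0, 1, 2.  All loads are done
           before any store, so overlapping buffers are handled.  */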
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend, -1]
L(copy0):
        ret

        .p2align 3
        /* Copy more than 128 bytes. */
L(copy_long):
        add     dstend, dstin, count

        /* Use backwards copy if there is an overlap. */
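        /* tmp1 = dstin - src, compared as unsigned, is below count exactly
           when dst lies inside [src, src + count), i.e. when a forward copy
           would overwrite source bytes that have not been read yet.  dst ==
           src also takes the backwards path, which returns immediately.  */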
        sub     tmp1, dstin, src
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align src to 16-byte alignment. */
        ldr     D_q, [src]
        and     tmp1, src, 15
        bic     src, src, 15
        sub     dst, dstin, tmp1
        add     count, count, tmp1      /* Count is now 16 too large. */
        ldp     A_q, B_q, [src, 16]
        str     D_q, [dstin]
        ldp     C_q, D_q, [src, 48]
        subs    count, count, 128 + 16  /* Test and readjust count. */
        b.ls    L(copy64_from_end)
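        /* Software-pipelined loop: each iteration stores the 64 bytes loaded
           on the previous one while the loads for the next 64 bytes are
           already in flight.  */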
L(loop64):
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [src, 80]
        stp     C_q, D_q, [dst, 48]
        ldp     C_q, D_q, [src, 112]
        add     src, src, 64
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
        ldp     E_q, F_q, [srcend, -64]
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [srcend, -32]
        stp     C_q, D_q, [dst, 48]
        stp     E_q, F_q, [dstend, -64]
        stp     A_q, B_q, [dstend, -32]
        ret

        .p2align 4
        nop

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
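        /* tmp1 still holds dstin - src; zero means src == dst and there is
           nothing to copy.  */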
        cbz     tmp1, L(copy0)
        ldr     D_q, [srcend, -16]
        and     tmp1, srcend, 15
        bic     srcend, srcend, 15
        sub     count, count, tmp1
        ldp     A_q, B_q, [srcend, -32]
        str     D_q, [dstend, -16]
        ldp     C_q, D_q, [srcend, -64]
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
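        /* Stores run from high to low addresses; the pre-indexed store to
           [dstend, -64]! also steps dstend down by 64 for the next iteration,
           while srcend is decremented explicitly.  */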
        str     B_q, [dstend, -16]
        str     A_q, [dstend, -32]
        ldp     A_q, B_q, [srcend, -96]
        str     D_q, [dstend, -48]
        str     C_q, [dstend, -64]!
        ldp     C_q, D_q, [srcend, -128]
        sub     srcend, srcend, 64
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
        ldp     E_q, F_q, [src, 32]
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [src]
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstin, 32]
        stp     A_q, B_q, [dstin]
        ret

END (__memcpy_aarch64_simd)