GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Routines taken from libc/AOR_v20.02/string/aarch64

#include "../assembly.h"

#ifdef __aarch64__

#define L(l) .L ## l

//
// __arm_sc_memcpy / __arm_sc_memmove
//

#define dstin    x0
#define src      x1
#define count    x2
#define dst      x3
#define srcend1  x4
#define dstend1  x5
#define A_l      x6
#define A_lw     w6
#define A_h      x7
#define B_l      x8
#define B_lw     w8
#define B_h      x9
#define C_l      x10
#define C_lw     w10
#define C_h      x11
#define D_l      x12
#define D_h      x13
#define E_l      x14
#define E_h      x15
#define F_l      x16
#define F_h      x17
#define G_l      count
#define G_h      dst
#define H_l      src
#define H_h      srcend1
#define tmp1     x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration. The destination pointer is 16-byte aligned to minimize
   unaligned accesses. The loop tail is handled by always copying 64 bytes
   from the end.
*/

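/* For orientation, a minimal C-level sketch of the case split described
   above. This is an illustration only, not part of the upstream routine:
   the function name is hypothetical and plain byte loops stand in for the
   unrolled LDP/STP sequences.

     #include <stddef.h>

     static void *sc_memcpy_sketch(void *dstin, const void *srcin, size_t count) {
       unsigned char *dst = dstin;
       const unsigned char *src = srcin;
       if (count <= 128) {
         // Small (0..32) and medium (33..128) copies: no overlap check needed.
         for (size_t i = 0; i < count; i++)
           dst[i] = src[i];
       } else if ((size_t)(dst - src) < count) {
         // Destination overlaps the source from above: copy backwards,
         // as L(copy_long_backwards) does below.
         for (size_t i = count; i-- > 0;)
           dst[i] = src[i];
       } else {
         // Large, non-overlapping (or dst below src) copy: forwards loop.
         for (size_t i = 0; i < count; i++)
           dst[i] = src[i];
       }
       return dstin;
     }

   The pointer difference mirrors the "sub tmp1, dstin, src" overlap test
   below; in strict C it is only defined for pointers into the same object. */
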
DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
        add     srcend1, src, count
        add     dstend1, dstin, count
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes. */
        cmp     count, 16
        b.lo    L(copy16)
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend1, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend1, -16]
        ret

        /* Copy 8-15 bytes. */
L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend1, -8]
        str     A_l, [dstin]
        str     A_h, [dstend1, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes. */
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend1, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend1, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1          /* tmp1 is 0 for 1 byte, 1 for 2-3 bytes. */
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend1, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend1, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes. */
L(copy32_128):
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend1, -32]
        ldp     D_l, D_h, [srcend1, -16]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes. */
L(copy128):
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_l, G_h, [srcend1, -64]
        ldp     H_l, H_h, [srcend1, -48]
        stp     G_l, G_h, [dstend1, -64]
        stp     H_l, H_h, [dstend1, -48]
L(copy96):
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes. */
L(copy_long):
        /* Use backwards copy if there is an overlap. */
        sub     tmp1, dstin, src
        cbz     tmp1, L(copy0)
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align dst to 16-byte alignment. */

        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large. */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count. */
        b.ls    L(copy64_from_end)
L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
        ldp     E_l, E_h, [srcend1, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend1, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend1, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend1, -64]
        stp     A_l, A_h, [dstend1, -48]
        stp     B_l, B_h, [dstend1, -32]
        stp     C_l, C_h, [dstend1, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dst to 16-byte alignment. */
L(copy_long_backwards):
        ldp     D_l, D_h, [srcend1, -16]
        and     tmp1, dstend1, 15
        sub     srcend1, srcend1, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend1, -16]
        stp     D_l, D_h, [dstend1, -16]
        ldp     B_l, B_h, [srcend1, -32]
        ldp     C_l, C_h, [srcend1, -48]
        ldp     D_l, D_h, [srcend1, -64]!
        sub     dstend1, dstend1, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [srcend1, -16]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [srcend1, -48]
        stp     D_l, D_h, [dstend1, -64]!
        ldp     D_l, D_h, [srcend1, -64]!
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend1, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret
END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)

DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)


//
// __arm_sc_memset
//

#define dstin    x0
#define val      x1
#define valw     w1
#define count    x2
#define dst      x3
#define dstend2  x4
#define zva_val  x5

DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
#ifdef __ARM_FEATURE_SVE
        mov     z0.b, valw
#else
        bfi     valw, valw, #8, #8
        bfi     valw, valw, #16, #16
        bfi     val, val, #32, #32
        fmov    d0, val
        fmov    v0.d[1], val
#endif
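/* Both paths above splat the low byte of val across all 16 bytes of v0: with
   SVE, "mov z0.b, valw" broadcasts the byte to every lane of z0 (whose low
   128 bits are v0); otherwise the two BFIs on valw replicate the byte across
   32 bits, the BFI on val extends that to 64 bits, and the two FMOVs copy the
   64-bit pattern into both halves of v0. */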
        add     dstend2, dstin, count

        cmp     count, 96
        b.hi    L(set_long)
        cmp     count, 16
        b.hs    L(set_medium)
        mov     val, v0.D[0]

        /* Set 0..15 bytes. */
        tbz     count, 3, 1f
        str     val, [dstin]
        str     val, [dstend2, -8]
        ret
        nop
1:      tbz     count, 2, 2f
        str     valw, [dstin]
        str     valw, [dstend2, -4]
        ret
2:      cbz     count, 3f
        strb    valw, [dstin]
        tbz     count, 1, 3f
        strh    valw, [dstend2, -2]
3:      ret

        /* Set 16..96 bytes. */
L(set_medium):
        str     q0, [dstin]
        tbnz    count, 6, L(set96)
        str     q0, [dstend2, -16]
        tbz     count, 5, 1f
        str     q0, [dstin, 16]
        str     q0, [dstend2, -32]
1:      ret

        .p2align 4
        /* Set 64..96 bytes. Write 64 bytes from the start and
           32 bytes from the end. */
L(set96):
        str     q0, [dstin, 16]
        stp     q0, q0, [dstin, 32]
        stp     q0, q0, [dstend2, -32]
        ret

        .p2align 4
L(set_long):
        and     valw, valw, 255
        bic     dst, dstin, 15
        str     q0, [dstin]
        cmp     count, 160
        ccmp    valw, 0, 0, hs
        b.ne    L(no_zva)

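/* Zeroing fast path: DC ZVA clears a whole 64-byte block per instruction, so
   it is used only for zero fills of at least 160 bytes (the CCMP/B.NE above
   branch to L(no_zva) when count < 160 or the fill value is non-zero), and,
   unless SKIP_ZVA_CHECK is defined, only after DCZID_EL0 confirms a 64-byte
   block size with DC ZVA permitted. */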
#ifndef SKIP_ZVA_CHECK
        mrs     zva_val, dczid_el0
        and     zva_val, zva_val, 31
        cmp     zva_val, 4              /* ZVA size is 64 bytes. */
        b.ne    L(no_zva)
#endif
        str     q0, [dst, 16]
        stp     q0, q0, [dst, 32]
        bic     dst, dst, 63
        sub     count, dstend2, dst     /* Count is now 64 too large. */
        sub     count, count, 128       /* Adjust count and bias for loop. */

        .p2align 4
L(zva_loop):
        add     dst, dst, 64
        dc      zva, dst
        subs    count, count, 64
        b.hi    L(zva_loop)
        stp     q0, q0, [dstend2, -64]
        stp     q0, q0, [dstend2, -32]
        ret

L(no_zva):
        sub     count, dstend2, dst     /* Count is 16 too large. */
        sub     dst, dst, 16            /* Dst is biased by -32. */
        sub     count, count, 64 + 16   /* Adjust count and bias for loop. */
L(no_zva_loop):
        stp     q0, q0, [dst, 32]
        stp     q0, q0, [dst, 64]!
        subs    count, count, 64
        b.hi    L(no_zva_loop)
        stp     q0, q0, [dstend2, -64]
        stp     q0, q0, [dstend2, -32]
        ret
END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)

#endif // __aarch64__