/*
 * Source: GitHub repository freebsd/freebsd-src
 * Path:   lib/libc/aarch64/string/strlcpy.S (branch main)
 */
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <[email protected]>
 */

#include <machine/asm.h>
/*
 * The implementation is assembled under the name __strlcpy; the public
 * name strlcpy is a weak alias of it.
 */
	.weak	strlcpy
	.set	strlcpy, __strlcpy
	.text

/*
 * size_t strlcpy(char *dst, const char *src, size_t dstsize)
 *
 * In:    x0 = dst, x1 = src, x2 = dstsize
 * Out:   x0 = length of the string at src
 * Clobb: x1, x2, x4-x12, x16, x17, v1-v3, flags
 *
 * Register roles after the prologue:
 *   x2  = dstsize - 1, i.e. the maximum number of bytes to copy ("limit")
 *   x9  = original dst
 *   x10 = src rounded down to a 16-byte boundary
 *   x11 = src & 0xf (offset of src inside its first aligned chunk)
 *
 * NUL detection: cmeq yields 0xff per NUL byte, shrn #4 narrows that to one
 * nibble per byte in a 64-bit mask, and rbit + clz + lsr #2 turns the lowest
 * set nibble into a byte index.
 *
 * Short copies (at most 32 bytes) are done by the .L1732/.L0816/.L0407/
 * .L0203/.L0001 tails using two possibly overlapping accesses, one anchored
 * at the start and one at the end of the region.
 */
ENTRY(__strlcpy)
	subs	x2, x2, #1		// x2 = limit = dstsize - 1
	b.lo	.L0			// dstsize == 0: nothing to store

	mov	x9, x0			// stash copy of dst pointer
	bic	x10, x1, #0xf		// src aligned
	and	x11, x1, #0xf		// src offset

	ldr	q1, [x10]
	cmeq	v1.16b, v1.16b, #0	// NUL found in head?

	mov	x8, #-1			// fill register with 0xfff..fff
	lsl	x12, x11, #2		// 4 mask bits per byte
	lsl	x8, x8, x12		// mask of bytes in the string

	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	ands	x5, x5, x8		// ignore matches in front of src
	b.ne	.Lhead_nul

	ldr	q3, [x10, #16]		// load second string chunk
	ldr	q2, [x1]		// load true head
	mov	x8, #32
	sub	x8, x8, x11		// x8 = string bytes in first two chunks

	cmeq	v1.16b, v3.16b, #0	// NUL found in second chunk?

	subs	x2, x2, x8
	b.ls	.Lhead_buf_end		// limit reached within first two chunks

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1
	cbnz	x5, .Lsecond_nul

	/* string didn't end in second chunk and neither did buffer */
	ldr	q1, [x10, #32]		// load next string chunk
	str	q2, [x0]		// deposit head into buffer
	sub	x0, x0, x11		// adjust x0
	str	q3, [x0, #16]		// deposit second chunk
	add	x10, x10, #32		// advance src
	add	x0, x0, #32		// advance dst
	subs	x2, x2, #16		// enough left for another round?
	b.ls	1f

	/* main loop unrolled twice */
	.p2align 4
0:
	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, 3f

	str	q1, [x0]
	ldr	q1, [x10, #16]		// load next chunk

	cmp	x2, #16			// more than a full chunk left?
	b.ls	2f

	add	x10, x10, #32		// advance pointers
	add	x0, x0, #32

	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, 4f			// process chunk if match

	str	q1, [x0, #-16]
	ldr	q1, [x10]		// load next chunk

	subs	x2, x2, #32
	b.hi	0b

1:
	sub	x10, x10, #16		// undo second advancement
	add	x2, x2, #16
	sub	x0, x0, #16

	/* 1--16 bytes left in the buffer but string has not ended yet */
2:
	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2

	mov	x6, #0xf
	mov	x7, x4			// keep the real NUL mask for the length

	lsl	x5, x2, #2		// shift 0xf to the limits position
	lsl	x5, x6, x5
	cmp	x2, #16			// dont induce match if limit >=16
	csel	x5, x5, xzr, lo
	orr	x8, x4, x5		// treat limit as if terminator present

	rbit	x8, x8			// simulate x86 tzcnt
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	add	x0, x0, x8

	ldr	q1, [x10, x8]		// load tail
	str	q1, [x0]		// store tail
	strb	wzr, [x0, #16]		// terminate dst

	/* continue to find the end of the string */
	cbnz	x7, 1f

	/* we opt for a simpler strlen than the one in libc as the
	 * cmeq, shrn approach is faster for shorter strings.
	 */
	.p2align 4
0:
	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, 2f

	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, 0b

1:
	sub	x10, x10, #16
2:
	rbit	x8, x7
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	sub	x10, x10, x1
	add	x0, x10, #32
	add	x0, x0, x8		// return value: length of src

	ret

4:
	sub	x10, x10, #16		// undo second advancement
	sub	x0, x0, #16		// undo second advancement

	/* string has ended but buffer has not */
3:
	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	add	x0, x0, x8		// restore dst pointer
	add	x10, x10, x8		// x10 = address of the terminating NUL

	ldr	q1, [x10, #-15]		// last 16 bytes, NUL included
	str	q1, [x0, #-15]
	sub	x0, x10, x1		// return value: length of src

	ret

	/* limit is reached within the first two chunks */
.Lhead_buf_end:
	shrn	v1.8b, v1.8h, #4
	fmov	x8, d1			// NUL mask of second chunk

	add	x2, x2, #32		// restore limit

	mov	x7, x8			// keep the NUL mask for the length

	cmp	x2, #16			// should we induce a match or not
	b.lo	0f

	rbit	x8, x8
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2
	add	x8, x8, #16

	cmp	x8, x2
	csel	x8, x8, x2, lo		// copy min(buflen, srclen) bytes
	b	1f
0:
	mov	x8, x2			// limit lies in the head chunk
1:

	sub	x8, x8, x11		// x8 = number of bytes to copy
	strb	wzr, [x9, x8]		// terminate dst

	/* continue to find the end of the string */
	cbnz	x7, 1f

	/* we opt for a simpler strlen than the one in libc as the
	 * cmeq, shrn approach is faster for shorter strings.
	 */
	.p2align 4
0:
	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, 2f

	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, 0b

1:
	sub	x10, x10, #16
2:
	rbit	x6, x7
	clz	x6, x6			// index of mismatch
	lsr	x6, x6, #2

	sub	x10, x10, x1
	add	x0, x10, #32
	add	x0, x0, x6		// return value: length of src

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	b	.L1732

	/* string ends in the second chunk */
.Lsecond_nul:
	add	x2, x2, x8		// restore limit

	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x5, x8, #2

	sub	x8, x11, #16
	sub	x0, x5, x8		// string length

	cmp	x0, x2			// did we match or hit limit first?
	csel	x8, x2, x0, hi

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	strb	wzr, [x4]		// terminate dst

	/* copy 16-32 bytes */
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	ldp	x16, x17, [x1]
	ldp	x12, x1, [x5, #-16]
	stp	x16, x17, [x9]
	stp	x12, x1, [x4, #-16]
	ret

	/* string ends in the head chunk */
.Lhead_nul:
	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	sub	x0, x8, x11		// string length
	cmp	x0, x2			// did we match or hit limit first?
	csel	x8, x2, x0, hi

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	strb	wzr, [x4]		// terminate dst

	/* Copy 8-15 bytes */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]
	ldr	x17, [x5, #-8]
	str	x16, [x9]
	str	x17, [x4, #-8]
	ret

	/* Copy 4-7 bytes */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.ls	.L0203
	ldr	w16, [x1]
	ldr	w17, [x5, #-4]		// x17, not the platform register x18
	str	w16, [x9]
	str	w17, [x4, #-4]
	ret

	/* Copy 2-3 bytes */
.L0203:
	tbz	x8, #1, .L0001
	ldrh	w16, [x1]
	ldrh	w17, [x5, #-2]
	strh	w16, [x9]
	strh	w17, [x4, #-2]
	ret

	/* Copy 0-1 bytes; the NUL store fixes up dst[0] when cnt is 0 */
.L0001:
	ldrb	w16, [x1]
	strb	w16, [x9]
	strb	wzr, [x4]
	ret

	/* dstsize == 0: store nothing, tail call strlen(src) */
.L0:
	mov	x0, x1
	b	strlen
END(__strlcpy)
