Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/lib/libc/aarch64/string/memccpy.S
48261 views
1
/*-
2
* SPDX-License-Identifier: BSD-2-Clause
3
*
4
* Copyright (c) 2024 Getz Mikalsen <[email protected]>
5
*/
6
7
#include <machine/asm.h>
8
9
/*
 * void *memccpy(void *restrict dst, const void *restrict src, int c,
 *     size_t len);
 *
 * AArch64 (AAPCS64) Advanced SIMD implementation.
 *
 * In:     x0 = dst, x1 = src, w2 = c, x3 = len
 * Out:    x0 = address of the byte following the copy of c in dst, or
 *              NULL if c was not found within the first len bytes of src
 * Clobb:  x3-x17, v0-v4, flags (x18, the platform register, is not used)
 *
 * The source is scanned in 16-byte chunks loaded from 16-byte aligned
 * addresses, so the first load may also fetch bytes that precede src
 * within the same aligned chunk; those are masked out of the match mask.
 * Each cmeq result is narrowed with shrn #4 into a 64-bit mask holding
 * one nibble per source byte, hence the recurring "rbit; clz; lsr #2"
 * sequence to turn the mask into the byte index of the first match.
 *
 * Long-lived registers:
 *   x6  = 0xf, a one-nibble mask that gets shifted into position
 *   x9  = original dst
 *   x10 = src rounded down to a 16-byte boundary
 *   x11 = src & 0xf (offset of src inside its first aligned chunk)
 *   v0  = c replicated into all 16 byte lanes
 */
	.weak	memccpy
	.set	memccpy, __memccpy
	.text

ENTRY(__memccpy)
	subs	x3, x3, #1		// x3 = len - 1
	b.lo	.L0			// len == 0: return NULL

	dup	v0.16b, w2		// replicate c into every byte lane

	mov	x9, x0			// stash copy of dst pointer
	bic	x10, x1, #0xf		// src aligned
	and	x11, x1, #0xf		// src offset

	ldr	q1, [x10]		// first aligned chunk
	cmeq	v1.16b, v1.16b, v0.16b	// bytewise compare against c

	mov	x8, #-1			// prepare a 0xfff..fff register
	mov	x6, #0xf

	lsl	x12, x11, #2
	lsl	x8, x8, x12		// mask of bytes in the string

	shrn	v1.8b, v1.8h, #4	// one nibble per byte
	fmov	x5, d1			// x5 = match mask of first chunk

	sub	x12, x11, #32
	adds	x12, x12, x3		// x12 = offset + len - 1 - 32
	b.cc	.Lrunt			// buffer ends within the first two chunks

	ands	x8, x8, x5		// drop matches located before src
	b.eq	0f

	/* match in first chunk */
	rbit	x8, x8
	clz	x8, x8			// 4 * index of match
	lsr	x8, x8, #2

	sub	x8, x8, x11		// ... from beginning of the string

	add	x0, x0, x8
	add	x4, x9, x8		// dst + index of match
	add	x5, x1, x8		// src + index of match
	add	x0, x0, #1		// return value: one past the match

	b	.L0816			// index < 16 here

0:
	ldr	q3, [x10, #16]		// load second string chunk
	ldr	q2, [x1]		// load true head
	cmeq	v1.16b, v3.16b, v0.16b	// char found in second chunk?

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	cbz	x5, 0f

	/* match in second chunk */
	rbit	x8, x5
	clz	x8, x8			// 4 * index of match
	lsr	x8, x8, #2

	sub	x11, x11, #16
	sub	x8, x8, x11		// adjust for alignment offset
	add	x0, x0, x8		// return value
	add	x0, x0, #1

	add	x4, x9, x8		// dst + index of match
	add	x5, x1, x8		// src + index of match
	b	.L1732

0:
	/* string didn't end in second chunk and neither did buffer */
	ldr	q1, [x10, #32]		// load next string chunk
	str	q2, [x0]		// deposit head into buffer
	sub	x0, x0, x11		// make x0 track x10 (dst - offset)
	mov	x3, x12			// x3 = index of last byte past first 32
	str	q3, [x0, #16]		// deposit second chunk

	add	x10, x10, #32		// advance src
	add	x0, x0, #32		// advance dst
	subs	x3, x3, #16		// enough left for another round?
	b.lo	1f

	/* main loop unrolled twice */
	.p2align 4
0:
	cmeq	v2.16b, v1.16b, v0.16b	// char found in current chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, 3f

	str	q1, [x0]
	ldr	q1, [x10, #16]		// load next chunk

	cmp	x3, #16			// more than a full chunk left?
	b.lo	2f

	add	x10, x10, #32		// advance pointers
	add	x0, x0, #32

	cmeq	v2.16b, v1.16b, v0.16b	// char found in second chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, 4f			// process chunk if match

	str	q1, [x0, #-16]
	ldr	q1, [x10]		// load next chunk

	subs	x3, x3, #32
	b.hs	0b

1:
	sub	x10, x10, #16		// undo second advancement
	add	x3, x3, #16
	sub	x0, x0, #16

	/*
	 * 1--16 bytes left in the buffer but string has not ended yet.
	 * q1 holds the chunk at x10 + 16 and x3 is the index of the last
	 * buffer byte within that chunk.
	 */
2:
	cmeq	v2.16b, v1.16b, v0.16b	// char found in final chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2			// x4 = genuine match mask

	lsl	x5, x3, #2		// shift 0xf to the limits position
	lsl	x5, x6, x5
	orr	x8, x4, x5		// insert match in mask at limit

	rbit	x8, x8			// simulate x86 tzcnt
	clz	x7, x8			// 4 * index of match or limit
	lsr	x8, x7, #2

	lsl	x5, x6, x7		// simulate x86 bt with shifted 0xf

	add	x8, x8, #1
	add	x0, x0, x8

	ldr	q1, [x10, x8]		// load tail: 16 bytes ending at the stop byte
	str	q1, [x0]		// store tail

	add	x0, x0, #16		// one past the stop byte in dst

	tst	x4, x5			// terminator encountered inside buffer?
	csel	x0, x0, xzr, ne		// if yes, return pointer, else NULL
	ret

4:
	sub	x10, x10, #16		// undo second advancement
	sub	x0, x0, #16		// undo second advancement

	/* match in the chunk at x10; x5 holds its match mask */
3:
	rbit	x8, x5
	clz	x8, x8			// 4 * index of match
	lsr	x3, x8, #2

	add	x0, x0, x3		// dst position of the match
	add	x10, x10, x3		// src position of the match
	ldr	q1, [x10, #-15]		// 16 bytes ending at the match
	str	q1, [x0, #-15]
	add	x0, x0, #1		// return value: one past the match
	ret

	/*
	 * The buffer ends within the first two aligned chunks.  A fake
	 * match is planted at the position of the last buffer byte so
	 * that one scan finds whichever of c or the limit comes first.
	 */
.Lrunt:
	add	x13, x11, x3		// index of last buffer byte from x10

	mov	x7, x5			// keep a copy of original match mask

	lsl	x4, x12, #2		// shift 0xf to the limits position
	lsl	x4, x6, x4		// (register shift amounts wrap mod 64)

	cmp	x13, #16		// dont induce match if limit >=16
	csel	x4, x4, xzr, lo
	orr	x5, x5, x4		// insert match in mask at limit

	ands	x8, x8, x5		// if match always fall through
	b.ne	0f

	ldr	q4, [x10, #16]		// load second string chunk
	cmeq	v1.16b, v4.16b, v0.16b	// char found in second chunk?

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x8, d1
	mov	x7, x8			// genuine matches of second chunk

	lsl	x4, x12, #2
	lsl	x4, x6, x4
	orr	x8, x8, x4		// induce match in upper bytes of mask

	rbit	x8, x8
	clz	x4, x8			// 4 * index of match or limit
	lsr	x8, x4, #2
	add	x8, x8, #16		// no match in first chunk
	b	1f

0:
	rbit	x8, x8
	clz	x4, x8			// 4 * index of match or limit
	lsr	x8, x4, #2
1:
	add	x0, x0, x8		// return value if terminator found
	sub	x0, x0, x11
	add	x0, x0, #1

	/* check if we encountered a match or the limit first */
	lsl	x5, x6, x4
	ands	x7, x7, x5		// was the terminator present?
	csel	x0, xzr, x0, eq		// NULL if we only hit the limit

	sub	x8, x8, x11		// index of stop byte from src
	add	x4, x9, x8		// dst + index
	add	x5, x1, x8		// src + index

	/*
	 * Small copies.  On entry to each of the labels below:
	 *   x8 = index of the last byte to copy (byte count - 1)
	 *   x4 = dst + x8, x5 = src + x8, x1 = src, x9 = dst
	 * Each size class copies a head and a possibly overlapping tail.
	 */

	/* copy 17-32 bytes */
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	add	x5, x5, #1		// ldp offsets are powers of 2
	add	x4, x4, #1
	ldp	x16, x17, [x1]
	ldp	x12, x13, [x5, #-16]
	stp	x16, x17, [x9]
	stp	x12, x13, [x4, #-16]
	ret

	/* Copy 9-16 bytes (x8 in 8..15) */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]
	ldr	x17, [x5, #-7]
	str	x16, [x9]
	str	x17, [x4, #-7]
	ret

	/* Copy 4-8 bytes (x8 in 3..7) */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.lo	.L0103
	ldr	w16, [x1]
	ldr	w17, [x5, #-3]		// w17 rather than w18: x18 is reserved
	str	w16, [x9]
	str	w17, [x4, #-3]
	ret

	/* Copy 1-3 bytes (x8 in 0..2) */
	.p2align 4
.L0103:
	lsr	x14, x8, #1		// index of the middle byte (0 or 1)
	ldrb	w16, [x1]
	ldrb	w15, [x5]
	ldrb	w17, [x1, x14]		// w17 rather than w18: x18 is reserved
	strb	w16, [x9]
	strb	w17, [x9, x14]
	strb	w15, [x4]
	ret

	/* len == 0 */
.L0:
	mov	x0, xzr			// return NULL
	ret

END(__memccpy)
272
273