Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/lib/libc/aarch64/string/strcmp.S
48261 views
1
/*-
2
* SPDX-License-Identifier: BSD-2-Clause
3
*
4
* Copyright (c) 2024 Getz Mikalsen <[email protected]>
5
*/
6
7
#include <machine/asm.h>
8
#include <machine/param.h>
9
10
.weak strcmp
11
.set strcmp, __strcmp
12
.text
13
14
ENTRY(__strcmp)
15
16
bic x8, x0, #0xf // x0 aligned to the boundary
17
and x9, x0, #0xf // x9 is the offset
18
bic x10, x1, #0xf // x1 aligned to the boundary
19
and x11, x1, #0xf // x11 is the offset
20
21
mov x13, #-1
22
23
/*
24
* Check if either string is located at end of page to avoid crossing
25
* into unmapped page. If so, we load 16 bytes from the nearest
26
* alignment boundary and shift based on the offset.
27
*/
28
29
add x3, x0, #16 // end of head
30
add x4, x1, #16
31
eor x3, x3, x0
32
eor x4, x4, x1 // bits that changed
33
orr x3, x3, x4 // in either str1 or str2
34
tbz w3, #PAGE_SHIFT, .Lbegin
35
36
ldr q0, [x8] // load aligned head
37
ldr q2, [x10]
38
39
lsl x14, x9, #2
40
lsl x15, x11, #2
41
lsl x3, x13, x14 // string head
42
lsl x4, x13, x15
43
44
cmeq v5.16b, v0.16b, #0
45
cmeq v6.16b, v2.16b, #0
46
47
shrn v5.8b, v5.8h, #4
48
shrn v6.8b, v6.8h, #4
49
fmov x5, d5
50
fmov x6, d6
51
52
adrp x2, shift_data
53
add x2, x2, :lo12:shift_data
54
55
/* heads may cross page boundary, avoid unmapped loads */
56
tst x5, x3
57
b.eq 0f
58
59
ldr q4, [x2, x9] // load permutation table
60
tbl v0.16b, {v0.16b}, v4.16b
61
62
b 1f
63
.p2align 4
64
0:
65
ldr q0, [x0] // load true head
66
1:
67
tst x6, x4
68
b.eq 0f
69
70
ldr q4, [x2, x11]
71
tbl v4.16b, {v2.16b}, v4.16b
72
73
b 1f
74
75
.p2align 4
76
.Lbegin:
77
ldr q0, [x0] // load true heads
78
0:
79
ldr q4, [x1]
80
1:
81
82
cmeq v2.16b, v0.16b, #0 // NUL byte present?
83
cmeq v4.16b, v0.16b, v4.16b // which bytes match?
84
85
orn v2.16b, v2.16b, v4.16b // mismatch or NUL byte?
86
87
shrn v2.8b, v2.8h, #4
88
fmov x5, d2
89
90
cbnz x5, .Lhead_mismatch
91
92
ldr q2, [x8, #16] // load second chunk
93
ldr q3, [x10, #16]
94
subs x9, x9, x11 // is a&0xf >= b&0xf
95
b.lo .Lswapped // if not swap operands
96
sub x12, x10, x9
97
ldr q0, [x12, #16]!
98
sub x10, x10, x8
99
sub x11, x10, x9
100
101
cmeq v1.16b, v3.16b, #0
102
cmeq v0.16b, v0.16b, v2.16b
103
add x8, x8, #16
104
shrn v1.8b, v1.8h, #4
105
fmov x6, d1
106
shrn v0.8b, v0.8h, #4
107
fmov x5, d0
108
cbnz x6, .Lnulfound
109
mvn x5, x5
110
cbnz x5, .Lmismatch
111
add x8, x8, #16 // advance aligned pointers
112
113
/*
114
* During the main loop, the layout of the two strings is something like:
115
*
116
* v ------1------ v ------2------ v
117
* X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
118
* X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
119
*
120
* where v indicates the alignment boundaries and corresponding chunks
121
* of the strings have the same letters. Chunk A has been checked in
122
* the previous iteration. This iteration, we first check that string
123
* X1 doesn't end within region 2, then we compare chunk B between the
124
* two strings. As X1 is known not to hold a NUL byte in regions 1
125
* and 2 at this point, this also ensures that x0 has not ended yet.
126
*/
127
.p2align 4
128
0:
129
ldr q0, [x8, x11]
130
ldr q1, [x8, x10]
131
ldr q2, [x8]
132
133
cmeq v1.16b, v1.16b, #0 // end of string?
134
cmeq v0.16b, v0.16b, v2.16b // do the chunks match?
135
136
shrn v1.8b, v1.8h, #4
137
fmov x6, d1
138
shrn v0.8b, v0.8h, #4
139
fmov x5, d0
140
cbnz x6, .Lnulfound
141
mvn x5, x5 // any mismatches?
142
cbnz x5, .Lmismatch
143
144
add x8, x8, #16
145
146
ldr q0, [x8, x11]
147
ldr q1, [x8, x10]
148
ldr q2, [x8]
149
150
add x8, x8, #16
151
cmeq v1.16b, v1.16b, #0
152
cmeq v0.16b, v0.16b, v2.16b
153
154
shrn v1.8b, v1.8h, #4
155
fmov x6, d1
156
shrn v0.8b, v0.8h, #4
157
fmov x5, d0
158
cbnz x6, .Lnulfound2
159
mvn x5, x5
160
cbz x5, 0b
161
162
sub x8, x8, #16 // roll back second increment
163
.Lmismatch:
164
rbit x2, x5
165
clz x2, x2 // index of mismatch
166
lsr x2, x2, #2
167
add x11, x8, x11
168
169
ldrb w4, [x8, x2]
170
ldrb w5, [x11, x2]
171
sub w0, w4, w5 // byte difference
172
ret
173
174
.p2align 4
175
.Lnulfound2:
176
sub x8, x8, #16
177
178
.Lnulfound:
179
mov x7, x9
180
mov x4, x6
181
182
ubfiz x7, x7, #2, #4 // x7 = (x7 & 0xf) << 2
183
lsl x6, x6, x7 // adjust NUL mask to indices
184
orn x5, x6, x5
185
cbnz x5, .Lmismatch
186
187
/*
188
* (x0) == (x1) and NUL is past the string.
189
* Compare (x1) with the corresponding part
190
* of the other string until the NUL byte.
191
*/
192
ldr q0, [x8, x9]
193
ldr q1, [x8, x10]
194
195
cmeq v1.16b, v0.16b, v1.16b
196
shrn v1.8b, v1.8h, #4
197
fmov x5, d1
198
199
orn x5, x4, x5
200
201
rbit x2, x5
202
clz x2, x2
203
lsr x5, x2, #2
204
205
add x10, x10, x8 // restore x10 pointer
206
add x8, x8, x9 // point to corresponding chunk
207
208
ldrb w4, [x8, x5]
209
ldrb w5, [x10, x5]
210
sub w0, w4, w5
211
ret
212
213
.p2align 4
214
.Lhead_mismatch:
215
rbit x2, x5
216
clz x2, x2 // index of mismatch
217
lsr x2, x2, #2
218
ldrb w4, [x0, x2]
219
ldrb w5, [x1, x2]
220
sub w0, w4, w5
221
ret
222
223
/*
224
* If (a&0xf) < (b&0xf), we do the same thing but with swapped
225
* operands. I found that this performs slightly better than
226
* using conditional moves to do the swap branchless.
227
*/
228
.p2align 4
229
.Lswapped:
230
add x12, x8, x9
231
ldr q0, [x12, #16]!
232
sub x8, x8, x10
233
add x11, x8, x9
234
neg x9, x9
235
236
cmeq v1.16b, v2.16b, #0
237
cmeq v0.16b, v0.16b, v3.16b
238
add x10, x10, #16
239
shrn v1.8b, v1.8h, #4
240
fmov x6, d1
241
shrn v0.8b, v0.8h, #4
242
fmov x5, d0
243
cbnz x6, .Lnulfounds
244
mvn x5, x5
245
cbnz x5, .Lmismatchs
246
add x10, x10, #16
247
248
/*
249
* During the main loop, the layout of the two strings is something like:
250
*
251
* v ------1------ v ------2------ v
252
* X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
253
* X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
254
*
255
* where v indicates the alignment boundaries and corresponding chunks
256
* of the strings have the same letters. Chunk A has been checked in
257
* the previous iteration. This iteration, we first check that string
258
* X0 doesn't end within region 2, then we compare chunk B between the
259
* two strings. As X0 is known not to hold a NUL byte in regions 1
260
* and 2 at this point, this also ensures that X1 has not ended yet.
261
*/
262
.p2align 4
263
0:
264
ldr q0, [x10, x11]
265
ldr q1, [x10, x8]
266
ldr q2, [x10]
267
268
cmeq v1.16b, v1.16b, #0
269
cmeq v0.16b, v0.16b, v2.16b
270
271
shrn v1.8b, v1.8h, #4
272
fmov x6, d1
273
shrn v0.8b, v0.8h, #4
274
fmov x5, d0
275
cbnz x6, .Lnulfounds
276
mvn x5, x5
277
cbnz x5, .Lmismatchs
278
279
add x10, x10, #16
280
281
ldr q0, [x10, x11]
282
ldr q1, [x10, x8]
283
ldr q2, [x10]
284
285
add x10, x10, #16
286
cmeq v1.16b, v1.16b, #0
287
cmeq v0.16b, v0.16b, v2.16b
288
289
shrn v1.8b, v1.8h, #4
290
fmov x6, d1
291
shrn v0.8b, v0.8h, #4
292
fmov x5, d0
293
cbnz x6, .Lnulfound2s
294
mvn x5, x5
295
cbz x5, 0b
296
297
sub x10, x10, #16
298
299
.Lmismatchs:
300
rbit x2, x5
301
clz x2, x2
302
lsr x2, x2, #2
303
add x11, x10, x11
304
305
ldrb w4, [x10, x2]
306
ldrb w5, [x11, x2]
307
sub w0, w5, w4
308
ret
309
310
.p2align 4
311
.Lnulfound2s:
312
sub x10, x10, #16
313
.Lnulfounds:
314
mov x7, x9
315
mov x4, x6
316
317
ubfiz x7, x7, #2, #4
318
lsl x6, x6, x7
319
orn x5, x6, x5
320
cbnz x5, .Lmismatchs
321
322
ldr q0, [x10, x9]
323
ldr q1, [x10, x8]
324
325
cmeq v1.16b, v0.16b, v1.16b
326
shrn v1.8b, v1.8h, #4
327
fmov x5, d1
328
329
orn x5, x4, x5
330
331
rbit x2, x5
332
clz x2, x2
333
lsr x5, x2, #2
334
335
add x11, x10, x8
336
add x10, x10, x9
337
338
ldrb w4, [x10, x5]
339
ldrb w5, [x11, x5]
340
sub w0, w5, w4
341
ret
342
343
END(__strcmp)
344
345
.section .rodata
346
.p2align 4
347
shift_data:
348
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
349
.fill 16, 1, -1
350
.size shift_data, .-shift_data
351
352