CoCalc -- strcmp.S

GitHub Repository: freebsd/freebsd-src
Path: blob/main/lib/libc/aarch64/string/strcmp.S
¹⁰²⁴⁹⁰ views
1
/*-
2
 * SPDX-License-Identifier: BSD-2-Clause
3
 *
4
 * Copyright (c) 2024 Getz Mikalsen <[email protected]>
5
*/
6

7
#include <machine/asm.h>
8
#include <machine/param.h>
9

10
	.weak	strcmp
11
	.set	strcmp, __strcmp
12
	.text
13

14
ENTRY(__strcmp)
15

16
	bic	x8, x0, #0xf			// x0 aligned to the boundary
17
	and	x9, x0, #0xf			// x9 is the offset
18
	bic	x10, x1, #0xf			// x1 aligned to the boundary
19
	and	x11, x1, #0xf			// x11 is the offset
20

21
	mov	x13, #-1
22

23
	/*
24
	 * Check if either string is located at end of page to avoid crossing
25
	 * into unmapped page. If so, we load 16 bytes from the nearest
26
	 * alignment boundary and shift based on the offset.
27
	 */
28

29
	add	x3, x0, #16			// end of head
30
	add	x4, x1, #16
31
	eor	x3, x3, x0
32
	eor	x4, x4, x1			// bits that changed
33
	orr	x3, x3, x4			// in either str1 or str2
34
	tbz	w3, #PAGE_SHIFT, .Lbegin
35

36
	ldr	q0, [x8]			// load aligned head
37
	ldr	q2, [x10]
38

39
	lsl	x14, x9, #2
40
	lsl	x15, x11, #2
41
	lsl	x3, x13, x14			// string head
42
	lsl	x4, x13, x15
43

44
	cmeq	v5.16b, v0.16b, #0
45
	cmeq	v6.16b, v2.16b, #0
46

47
	shrn	v5.8b, v5.8h, #4
48
	shrn	v6.8b, v6.8h, #4
49
	fmov	x5, d5
50
	fmov	x6, d6
51

52
	adrp	x2, shift_data
53
	add	x2, x2, :lo12:shift_data
54

55
	/* heads may cross page boundary, avoid unmapped loads */
56
	tst	x5, x3
57
	b.eq	0f
58

59
	ldr	q4, [x2, x9]			// load permutation table
60
	tbl	v0.16b, {v0.16b}, v4.16b
61

62
	b		1f
63
	.p2align 4
64
0:
65
	ldr	q0, [x0]			// load true head
66
1:
67
	tst	x6, x4
68
	b.eq	0f
69

70
	ldr	q4, [x2, x11]
71
	tbl	v4.16b, {v2.16b}, v4.16b
72

73
	b 1f
74

75
	.p2align 4
76
.Lbegin:
77
	ldr	q0, [x0]			// load true heads
78
0:
79
	ldr	q4, [x1]
80
1:
81

82
	cmeq	v2.16b, v0.16b, #0		// NUL byte present?
83
	cmeq	v4.16b, v0.16b, v4.16b		// which bytes match?
84

85
	orn	v2.16b, v2.16b, v4.16b		// mismatch or NUL byte?
86

87
	shrn	v2.8b, v2.8h, #4
88
	fmov	x5, d2
89

90
	cbnz	x5, .Lhead_mismatch
91

92
	ldr	q2, [x8, #16]			// load second chunk
93
	ldr	q3, [x10, #16]
94
	subs	x9, x9, x11			// is a&0xf >= b&0xf
95
	b.lo	.Lswapped			// if not swap operands
96
	sub	x12, x10, x9
97
	ldr	q0, [x12, #16]!
98
	sub	x10, x10, x8
99
	sub	x11, x10, x9
100

101
	cmeq	v1.16b, v3.16b, #0
102
	cmeq	v0.16b, v0.16b, v2.16b
103
	add	x8, x8, #16
104
	shrn	v1.8b, v1.8h, #4
105
	fmov	x6, d1
106
	shrn	v0.8b, v0.8h, #4
107
	fmov	x5, d0
108
	cbnz	x6, .Lnulfound
109
	mvn	x5, x5
110
	cbnz	x5, .Lmismatch
111
	add	x8, x8, #16			// advance aligned pointers
112

113
	/*
114
	 * During the main loop, the layout of the two strings is something like:
115
	 *
116
	 *          v ------1------ v ------2------ v
117
	 *      X0:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
118
	 *      X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
119
	 *
120
	 * where v indicates the alignment boundaries and corresponding chunks
121
	 * of the strings have the same letters.  Chunk A has been checked in
122
	 * the previous iteration.  This iteration, we first check that string
123
	 * X1 doesn't end within region 2, then we compare chunk B between the
124
	 * two strings.  As X1 is known not to hold a NUL byte in regions 1
125
	 * and 2 at this point, this also ensures that x0 has not ended yet.
126
	 */
127
	.p2align 4
128
0:
129
	ldr	q0, [x8, x11]
130
	ldr	q1, [x8, x10]
131
	ldr	q2, [x8]
132

133
	cmeq	v1.16b, v1.16b, #0		// end of string?
134
	cmeq	v0.16b, v0.16b, v2.16b		// do the chunks match?
135

136
	shrn	v1.8b, v1.8h, #4
137
	fmov	x6, d1
138
	shrn	v0.8b, v0.8h, #4
139
	fmov	x5, d0
140
	cbnz	x6, .Lnulfound
141
	mvn	x5, x5				// any mismatches?
142
	cbnz	x5, .Lmismatch
143

144
	add	x8, x8, #16
145

146
	ldr	q0, [x8, x11]
147
	ldr	q1, [x8, x10]
148
	ldr	q2, [x8]
149

150
	add	x8, x8, #16
151
	cmeq	v1.16b, v1.16b, #0
152
	cmeq	v0.16b, v0.16b, v2.16b
153

154
	shrn	v1.8b, v1.8h, #4
155
	fmov	x6, d1
156
	shrn	v0.8b, v0.8h, #4
157
	fmov	x5, d0
158
	cbnz	x6, .Lnulfound2
159
	mvn	x5, x5
160
	cbz	x5, 0b
161

162
	sub	x8, x8, #16			// roll back second increment
163
.Lmismatch:
164
	rbit	x2, x5
165
	clz	x2, x2				// index of mismatch
166
	lsr	x2, x2, #2
167
	add	x11, x8, x11
168

169
	ldrb	w4, [x8, x2]
170
	ldrb	w5, [x11, x2]
171
	sub	w0, w4, w5			// byte difference
172
	ret
173

174
	.p2align 4
175
.Lnulfound2:
176
	sub	x8, x8, #16
177

178
.Lnulfound:
179
	mov	x7, x9
180
	mov	x4, x6
181

182
	ubfiz	x7, x7, #2, #4			// x7 = (x7 & 0xf) << 2
183
	lsl	x6, x6, x7			// adjust NUL mask to indices
184
	orn	x5, x6, x5
185
	cbnz	x5, .Lmismatch
186

187
	/*
188
	 * (x0) == (x1) and NUL is past the string.
189
	 * Compare (x1) with the corresponding part
190
	 * of the other string until the NUL byte.
191
	 */
192
	ldr	q0, [x8, x9]
193
	ldr	q1, [x8, x10]
194

195
	cmeq	v1.16b, v0.16b, v1.16b
196
	shrn	v1.8b, v1.8h, #4
197
	fmov	x5, d1
198

199
	orn	x5, x4, x5
200

201
	rbit	x2, x5
202
	clz	x2, x2
203
	lsr	x5, x2, #2
204

205
	add	x10, x10, x8			// restore x10 pointer
206
	add	x8, x8, x9			// point to corresponding chunk
207

208
	ldrb	w4, [x8, x5]
209
	ldrb	w5, [x10, x5]
210
	sub	w0, w4, w5
211
	ret
212

213
	.p2align 4
214
.Lhead_mismatch:
215
	rbit	x2, x5
216
	clz	x2, x2				// index of mismatch
217
	lsr	x2, x2, #2
218
	ldrb	w4, [x0, x2]
219
	ldrb	w5, [x1, x2]
220
	sub	w0, w4, w5
221
	ret
222

223
	/*
224
	 * If (a&0xf) < (b&0xf), we do the same thing but with swapped
225
	 * operands.  I found that this performs slightly better than
226
	 * using conditional moves to do the swap branchless.
227
	 */
228
	.p2align 4
229
.Lswapped:
230
	add	x12, x8, x9
231
	ldr	q0, [x12, #16]!
232
	sub	x8, x8, x10
233
	add	x11, x8, x9
234
	neg	x9, x9
235

236
	cmeq	v1.16b, v2.16b, #0
237
	cmeq	v0.16b, v0.16b, v3.16b
238
	add	x10, x10, #16
239
	shrn	v1.8b, v1.8h, #4
240
	fmov	x6, d1
241
	shrn	v0.8b, v0.8h, #4
242
	fmov	x5, d0
243
	cbnz	x6, .Lnulfounds
244
	mvn	x5, x5
245
	cbnz	x5, .Lmismatchs
246
	add	x10, x10, #16
247

248
	/*
249
	 * During the main loop, the layout of the two strings is something like:
250
	 *
251
	 *          v ------1------ v ------2------ v
252
	 *      X1:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
253
	 *      X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
254
	 *
255
	 * where v indicates the alignment boundaries and corresponding chunks
256
	 * of the strings have the same letters.  Chunk A has been checked in
257
	 * the previous iteration.  This iteration, we first check that string
258
	 * X0 doesn't end within region 2, then we compare chunk B between the
259
	 * two strings.  As X0 is known not to hold a NUL byte in regions 1
260
	 * and 2 at this point, this also ensures that X1 has not ended yet.
261
	 */
262
	.p2align 4
263
0:
264
	ldr	q0, [x10, x11]
265
	ldr	q1, [x10, x8]
266
	ldr	q2, [x10]
267

268
	cmeq	v1.16b, v1.16b, #0
269
	cmeq	v0.16b, v0.16b, v2.16b
270

271
	shrn	v1.8b, v1.8h, #4
272
	fmov	x6, d1
273
	shrn	v0.8b, v0.8h, #4
274
	fmov	x5, d0
275
	cbnz	x6, .Lnulfounds
276
	mvn	x5, x5
277
	cbnz	x5, .Lmismatchs
278

279
	add	x10, x10, #16
280

281
	ldr	q0, [x10, x11]
282
	ldr	q1, [x10, x8]
283
	ldr	q2, [x10]
284

285
	add	x10, x10, #16
286
	cmeq	v1.16b, v1.16b, #0
287
	cmeq	v0.16b, v0.16b, v2.16b
288

289
	shrn	v1.8b, v1.8h, #4
290
	fmov	x6, d1
291
	shrn	v0.8b, v0.8h, #4
292
	fmov	x5, d0
293
	cbnz	x6, .Lnulfound2s
294
	mvn	x5, x5
295
	cbz	x5, 0b
296

297
	sub	x10, x10, #16
298

299
.Lmismatchs:
300
	rbit	x2, x5
301
	clz	x2, x2
302
	lsr	x2, x2, #2
303
	add	x11, x10, x11
304

305
	ldrb	w4, [x10, x2]
306
	ldrb	w5, [x11, x2]
307
	sub	w0, w5, w4
308
	ret
309

310
	.p2align 4
311
.Lnulfound2s:
312
	sub	x10, x10, #16
313
.Lnulfounds:
314
	mov	x7, x9
315
	mov	x4, x6
316

317
	ubfiz	x7, x7, #2, #4
318
	lsl	x6, x6, x7
319
	orn	x5, x6, x5
320
	cbnz	x5, .Lmismatchs
321

322
	ldr	q0, [x10, x9]
323
	ldr	q1, [x10, x8]
324

325
	cmeq	v1.16b, v0.16b, v1.16b
326
	shrn	v1.8b, v1.8h, #4
327
	fmov	x5, d1
328

329
	orn	x5, x4, x5
330

331
	rbit	x2, x5
332
	clz	x2, x2
333
	lsr	x5, x2, #2
334

335
	add	x11, x10, x8
336
	add	x10, x10, x9
337

338
	ldrb	w4, [x10, x5]
339
	ldrb	w5, [x11, x5]
340
	sub	w0, w5, w4
341
	ret
342

343
END(__strcmp)
344

345
	.section .rodata
346
	.p2align 4
347
shift_data:
348
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
349
	.fill 16, 1, -1
350
	.size shift_data, .-shift_data
351

352
Product

Resources

Company