CoCalc -- strncmp.S

GitHub Repository: freebsd/freebsd-src
Path: blob/main/lib/libc/aarch64/string/strncmp.S
⁴⁸²⁶² views
1
/*-
2
 * SPDX-License-Identifier: BSD-2-Clause
3
 *
4
 * Copyright (c) 2024 Getz Mikalsen <[email protected]>
5
*/
6

7
#include <machine/asm.h>
8
#include <machine/param.h>
9

10
	.weak	strncmp
11
	.set	strncmp, __strncmp
12
	.text
13

14
ENTRY(__strncmp)
15

16
	bic	x8, x0, #0xf			// x0 aligned to the boundary
17
	and	x9, x0, #0xf			// x9 is the offset
18
	bic	x10, x1, #0xf			// x1 aligned to the boundary
19
	and	x11, x1, #0xf			// x11 is the offset
20

21
	subs	x2, x2, #1
22
	b.lo	.Lempty
23

24
	mov	x13, #-1			// save constants for later
25
	mov	x16, #0xf
26

27
	/*
28
	 * Check if either string is located at end of page to avoid crossing
29
	 * into unmapped page. If so, we load 16 bytes from the nearest
30
	 * alignment boundary and shift based on the offset.
31
	 */
32

33
	add	x3, x0, #16			// end of head
34
	add	x4, x1, #16
35
	eor	x3, x3, x0
36
	eor	x4, x4, x1			// bits that changed
37
	orr	x3, x3, x4			// in either str1 or str2
38
	cmp	x2,#16
39
	b.lo	.Llt16
40
	tbz	w3, #PAGE_SHIFT, .Lbegin
41

42
	ldr	q0, [x8]			// load aligned head
43
	ldr	q1, [x10]
44

45
	lsl	x14, x9, #2
46
	lsl	x15, x11, #2
47
	lsl	x3, x13, x14			// string head
48
	lsl	x4, x13, x15
49

50
	cmeq	v5.16b, v0.16b, #0
51
	cmeq	v6.16b, v1.16b, #0
52

53
	shrn	v5.8b, v5.8h, #4
54
	shrn	v6.8b, v6.8h, #4
55
	fmov	x5, d5
56
	fmov	x6, d6
57

58
	adrp	x14, shift_data
59
	add	x14, x14, :lo12:shift_data
60

61
	/* heads may cross page boundary, avoid unmapped loads */
62
	tst	x5, x3
63
	b.eq	0f
64

65
	ldr	q4, [x14, x9]			// load permutation table
66
	tbl	v0.16b, {v0.16b}, v4.16b
67

68
	b	1f
69
	.p2align 4
70
0:
71
	ldr	q0, [x0]			// load true head
72
1:
73
	tst	x6, x4
74
	b.eq	0f
75

76
	ldr	q4, [x14, x11]
77
	tbl	v4.16b, {v1.16b}, v4.16b
78

79
	b 1f
80

81
	.p2align 4
82
.Lbegin:
83
	ldr	q0, [x0]			// load true heads
84
0:
85
	ldr	q4, [x1]
86
1:
87
	cmeq	v2.16b, v0.16b, #0		// NUL byte present?
88
	cmeq	v4.16b, v0.16b, v4.16b		// which bytes match?
89

90
	orn	v2.16b, v2.16b, v4.16b		// mismatch or NUL byte?
91

92
	shrn	v2.8b, v2.8h, #4
93
	fmov	x5, d2
94

95
	cbnz	x5, .Lhead_mismatch
96
	/* load head and second chunk */
97
	ldr	q2, [x8, #16]			// load second chunk
98
	ldr	q3, [x10, #16]
99

100
	add	x2, x2, x11
101
	sub	x2, x2, #16
102

103
	subs	x9, x9, x11			// is a&0xf >= b&0xf
104
	b.lo	.Lswapped			// if not swap operands
105
	b	.Lnormal
106

107
	.p2align 4
108
.Llt16:
109
	/*
110
	 * Check if either string is located at end of page to avoid crossing
111
	 * into unmapped page. If so, we load 16 bytes from the nearest
112
	 * alignment boundary and shift based on the offset.
113
	 */
114
	tbz	w3, #PAGE_SHIFT, 2f
115

116
	ldr	q0, [x8]			// load aligned head
117
	ldr	q1, [x10]
118

119
	lsl	x14, x9, #2
120
	lsl	x15, x11, #2
121
	lsl	x3, x13, x14			// string head
122
	lsl	x4, x13, x15
123

124
	/* Introduce a null byte match if the limit is within the aligned chunk */
125
	add	x14, x2, x9
126
	add	x15, x2, x11
127
	lsl	x14, x14, #2
128
	lsl	x15, x15, #2
129
	lsl	x14, x16, x14
130
	lsl	x15, x16, x15
131

132
	cmeq	v5.16b, v0.16b, #0
133
	cmeq	v6.16b, v1.16b, #0
134

135
	shrn	v5.8b, v5.8h, #4
136
	shrn	v6.8b, v6.8h, #4
137
	fmov	x5, d5
138
	fmov	x6, d6
139

140
	orr	x5, x5, x14			// insert match at limit
141
	orr	x6, x6, x15
142

143
	adrp	x14, shift_data
144
	add	x14, x14, :lo12:shift_data
145

146
	/* heads may cross page boundary, avoid unmapped loads */
147
	tst	x5, x3
148
	b.eq	0f
149

150
	ldr	q4, [x14, x9]			// load permutation table
151
	tbl	v0.16b, {v0.16b}, v4.16b
152

153
	b	1f
154
	.p2align 4
155
0:
156
	ldr	q0, [x0]			// load true head
157
1:
158
	tst	x6, x4
159
	b.eq	0f
160

161
	ldr	q4, [x14, x11]
162
	tbl	v4.16b, {v1.16b}, v4.16b
163

164
	b 1f
165

166
	.p2align 4
167
2:
168
	ldr	q0, [x0]			// load true heads
169
0:
170
	ldr	q4, [x1]
171
1:
172

173
	cmeq	v2.16b, v0.16b, #0		// NUL byte present?
174
	cmeq	v4.16b, v0.16b, v4.16b		// which bytes match?
175

176
	bic	v2.16b, v4.16b, v2.16b		// match and not NUL byte
177

178
	shrn	v2.8b, v2.8h, #4
179
	fmov	x5, d2
180
	lsl	x4, x2, #2
181
	lsl	x4, x13, x4
182
	orn	x5, x4, x5			// mismatch or NUL byte?
183

184
.Lhead_mismatch:
185
	rbit	x3, x5
186
	clz	x3, x3				// index of mismatch
187
	lsr	x3, x3, #2
188
	ldrb	w4, [x0, x3]
189
	ldrb	w5, [x1, x3]
190
	sub	w0, w4, w5
191
	ret
192

193
	.p2align 4
194
.Lnormal:
195
	sub	x12, x10, x9
196
	ldr	q0, [x12, #16]!
197
	sub	x10, x10, x8
198
	sub	x11, x10, x9
199

200
	cmeq	v1.16b, v3.16b, #0		// NUL present?
201
	cmeq	v0.16b, v0.16b, v2.16b		// Mismatch between chunks?
202
	shrn	v1.8b, v1.8h, #4
203
	shrn	v0.8b, v0.8h, #4
204
	fmov	x6, d1
205
	fmov	x5, d0
206

207
	add	x8, x8, #32			// advance to next iteration
208

209
	lsl	x4, x2, #2
210
	lsl	x4, x13, x4
211
	orr	x3, x6, x4			// introduce a null byte match
212
	cmp	x2, #16				// does the buffer end within x2
213
	csel	x6, x3, x6, lo
214
	cbnz	x6, .Lnulfound2			// NUL or end of buffer found?
215
	mvn	x5, x5
216
	cbnz	x5, .Lmismatch2
217
	sub	x2, x2, #16
218
	cmp	x2, #32				// end of buffer?
219
	b.lo	.Ltail
220
	/*
221
	 * During the main loop, the layout of the two strings is something like:
222
	 *
223
	 *          v ------1------ v ------2------ v
224
	 *      X0:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
225
	 *      X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
226
	 *
227
	 * where v indicates the alignment boundaries and corresponding chunks
228
	 * of the strings have the same letters.  Chunk A has been checked in
229
	 * the previous iteration.  This iteration, we first check that string
230
	 * X1 doesn't end within region 2, then we compare chunk B between the
231
	 * two strings.  As X1 is known not to hold a NUL byte in regions 1
232
	 * and 2 at this point, this also ensures that x0 has not ended yet.
233
	 */
234
	.p2align 4
235
0:
236
	ldr	q0, [x8, x11]
237
	ldr	q1, [x8, x10]
238
	ldr	q2, [x8]
239

240
	cmeq	v1.16b, v1.16b, #0		// end of string?
241
	cmeq	v0.16b, v0.16b, v2.16b		// do the chunks match?
242

243
	shrn	v1.8b, v1.8h, #4
244
	shrn	v0.8b, v0.8h, #4
245
	fmov	x6, d1
246
	fmov	x5, d0
247
	cbnz	x6, .Lnulfound
248
	mvn	x5, x5				// any mismatches?
249
	cbnz	x5, .Lmismatch
250

251
	add	x8, x8, #16
252

253
	/* main loop unrolled twice */
254
	ldr	q0, [x8, x11]
255
	ldr	q1, [x8, x10]
256
	ldr	q2, [x8]
257

258
	add	x8, x8, #16
259
	cmeq	v1.16b, v1.16b, #0
260
	cmeq	v0.16b, v0.16b, v2.16b
261

262
	shrn	v1.8b, v1.8h, #4
263
	shrn	v0.8b, v0.8h, #4
264
	fmov	x6, d1
265
	fmov	x5, d0
266
	cbnz	x6, .Lnulfound2
267
	mvn	x5, x5
268
	cbnz	x5, .Lmismatch2
269
	sub	x2, x2, #32
270
	cmp	x2, #32				// end of buffer?
271
	b.hs	0b				// if yes, process tail
272

273
	/* end of buffer will occur in next 32 bytes */
274
.Ltail:
275
	ldr	q0, [x8, x11]
276
	ldr	q1, [x8, x10]
277
	ldr	q2, [x8]
278

279
	cmeq	v1.16b, v1.16b, #0		// end of string?
280
	cmeq	v0.16b, v0.16b, v2.16b		// do the chunks match?
281

282
	shrn	v1.8b, v1.8h, #4
283
	shrn	v0.8b, v0.8h, #4
284
	fmov	x6, d1
285
	fmov	x5, d0
286

287
	/*
288
	 * If x2 <= 16 then we introduce a NUL byte in the
289
	 * result from CMEQ to avoid comparing further!
290
	 */
291

292
	lsl	x4, x2, #2
293
	lsl	x4, x13, x4
294
	orr	x3, x6, x4			// introduce a null byte match
295
	cmp	x2, #16				// does the buffer end within x2
296
	csel	x6, x3, x6, lo
297

298
	cbnz	x6, .Lnulfound			// NUL or end of string found
299
	mvn	x5, x5
300
	cbnz	x5, .Lmismatch
301

302
	add	x8, x8, #16
303

304
	/* main loop unrolled twice */
305
	ldr	q0, [x8, x11]
306
	ldr	q1, [x8, x10]
307
	ldr	q2, [x8]
308

309
	add	x8, x8, #16
310
	cmeq	v1.16b, v1.16b, #0
311
	cmeq	v0.16b, v0.16b, v2.16b
312

313
	shrn	v1.8b, v1.8h, #4
314
	shrn	v0.8b, v0.8h, #4
315
	fmov	x6, d1
316
	fmov	x5, d0
317

318
	ubfiz	x4, x2, #2, #4	// (x2 - 16) << 2
319
	lsl	x4, x13, x4			// take first half into account
320
	orr	x6, x6, x4			// introduce a null byte match
321

322
.Lnulfound2:
323
	sub	x8, x8, #16
324

325
.Lnulfound:
326
	mov	x4, x6
327

328
	ubfiz	x7, x9, #2, #4
329
	lsl	x6, x6, x7			// adjust NUL mask to indices
330

331
	orn	x5, x6, x5
332
	cbnz	x5, .Lmismatch
333

334
	/*
335
	 * (x0) == (x1) and NUL is past the string.
336
	 * Compare (x1) with the corresponding part
337
	 * of the other string until the NUL byte.
338
	 */
339
	ldr	q0, [x8, x9]
340
	ldr	q1, [x8, x10]
341

342
	cmeq	v1.16b, v0.16b, v1.16b
343
	shrn	v1.8b, v1.8h, #4
344
	fmov	x5, d1
345

346
	orn	x5, x4, x5
347

348
	rbit	x3, x5
349
	clz	x3, x3
350
	lsr	x5, x3, #2
351

352
	add	x10, x10, x8			// restore x10 pointer
353
	add	x8, x8, x9			// point to corresponding chunk
354

355
	ldrb	w4, [x8, x5]
356
	ldrb	w5, [x10, x5]
357
	sub	w0, w4, w5
358
	ret
359

360
	.p2align 4
361
.Lmismatch2:
362
	sub	x8, x8, #16			// roll back second increment
363
.Lmismatch:
364
	rbit	x3, x5
365
	clz	x3, x3				// index of mismatch
366
	lsr	x3, x3, #2
367
	add	x11, x8, x11
368

369
	ldrb	w4, [x8, x3]
370
	ldrb	w5, [x11, x3]
371
	sub	w0, w4, w5			// byte difference
372
	ret
373

374
	/*
375
	 * If (a&0xf) < (b&0xf), we do the same thing but with swapped
376
	 * operands.  I found that this performs slightly better than
377
	 * using conditional moves to do the swap branchless.
378
	 */
379
	.p2align 4
380
.Lswapped:
381
	add	x12, x8, x9
382
	ldr	q0, [x12, #16]!
383
	sub	x8, x8, x10
384
	add	x11, x8, x9
385
	add	x2,x2,x9
386
	neg	x9, x9
387

388
	cmeq	v1.16b, v2.16b, #0
389
	cmeq	v0.16b, v0.16b, v3.16b
390
	shrn	v1.8b, v1.8h, #4
391
	shrn	v0.8b, v0.8h, #4
392
	fmov	x6, d1
393
	fmov	x5, d0
394

395
	add	x10, x10, #32
396

397
	lsl	x4, x2, #2
398
	lsl	x4, x13, x4
399
	orr	x3,x6,x4			// introduce a null byte match
400
	cmp	x2,#16
401
	csel	x6, x3, x6, lo
402
	cbnz	x6, .Lnulfound2s
403
	mvn	x5, x5
404
	cbnz	x5, .Lmismatch2s
405

406
	sub	x2, x2, #16
407
	cmp	x2, #32
408
	b.lo	.Ltails
409

410
	/*
411
	 * During the main loop, the layout of the two strings is something like:
412
	 *
413
	 *          v ------1------ v ------2------ v
414
	 *      X1:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
415
	 *      X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
416
	 *
417
	 * where v indicates the alignment boundaries and corresponding chunks
418
	 * of the strings have the same letters.  Chunk A has been checked in
419
	 * the previous iteration.  This iteration, we first check that string
420
	 * X0 doesn't end within region 2, then we compare chunk B between the
421
	 * two strings.  As X0 is known not to hold a NUL byte in regions 1
422
	 * and 2 at this point, this also ensures that X1 has not ended yet.
423
	 */
424
	.p2align 4
425
0:
426
	ldr	q0, [x10, x11]
427
	ldr	q1, [x10, x8]
428
	ldr	q2, [x10]
429

430
	cmeq	v1.16b, v1.16b, #0
431
	cmeq	v0.16b, v0.16b, v2.16b
432

433
	shrn	v1.8b, v1.8h, #4
434
	shrn	v0.8b, v0.8h, #4
435
	fmov	x6, d1
436
	fmov	x5, d0
437
	cbnz	x6, .Lnulfounds
438
	mvn	x5, x5
439
	cbnz	x5, .Lmismatchs
440

441
	add	x10, x10, #16
442

443
	/* main loop unrolled twice */
444
	ldr	q0, [x10, x11]
445
	ldr	q1, [x10, x8]
446
	ldr	q2, [x10]
447

448
	add	x10, x10, #16
449
	cmeq	v1.16b, v1.16b, #0
450
	cmeq	v0.16b, v0.16b, v2.16b
451

452
	shrn	v1.8b, v1.8h, #4
453
	shrn	v0.8b, v0.8h, #4
454
	fmov	x6, d1
455
	fmov	x5, d0
456
	cbnz	x6, .Lnulfound2s
457
	mvn	x5, x5
458
	cbnz	x5, .Lmismatch2s
459
	sub	x2, x2, #32
460
	cmp	x2, #32
461
	b.hs	0b
462

463
.Ltails:
464
	ldr	q0, [x10, x11]
465
	ldr	q1, [x10, x8]
466
	ldr	q2, [x10]
467

468
	cmeq	v1.16b, v1.16b, #0
469
	cmeq	v0.16b, v0.16b, v2.16b
470

471
	shrn	v1.8b, v1.8h, #4
472
	shrn	v0.8b, v0.8h, #4
473
	fmov	x6, d1
474
	fmov	x5, d0
475

476
	/*
477
	 * If x2 <= 16 then we introduce a NUL byte in the
478
	 * result from CMEQ to avoid comparing further!
479
	 */
480

481
	lsl	x4, x2, #2
482
	lsl	x4, x13, x4
483
	orr	x3, x6, x4			// introduce a null byte match
484
	cmp	x2, #16
485
	csel	x6, x3, x6, lo
486

487
	cbnz	x6, .Lnulfounds
488
	mvn	x5, x5
489
	cbnz	x5, .Lmismatchs
490

491
	add	x10, x10, #16
492

493
	ldr	q0, [x10, x11]
494
	ldr	q1, [x10, x8]
495
	ldr	q2, [x10]
496

497
	add	x10, x10, #16
498
	cmeq	v1.16b, v1.16b, #0
499
	cmeq	v0.16b, v0.16b, v2.16b
500

501
	shrn	v1.8b, v1.8h, #4
502
	shrn	v0.8b, v0.8h, #4
503
	fmov	x6, d1
504
	fmov	x5, d0
505

506
	ubfiz	x4, x2, #2, #4
507
	lsl	x4, x13, x4
508
	orr	x6, x6, x4			// introduce a null byte match
509

510
.Lnulfound2s:
511
	sub	x10, x10, #16
512
.Lnulfounds:
513
	mov	x4, x6
514

515
	ubfiz	x7, x9, #2, #4
516
	lsl	x6, x6, x7
517

518
	orn	x5, x6, x5
519

520
	cbnz	x5, .Lmismatchs
521

522
	ldr	q0, [x10, x9]
523
	ldr	q1, [x10, x8]
524

525
	cmeq	v1.16b, v0.16b, v1.16b
526
	shrn	v1.8b, v1.8h, #4
527
	fmov	x5, d1
528

529
	orn	x5, x4, x5
530

531
	rbit	x3, x5
532
	clz	x3, x3
533
	lsr	x5, x3, #2
534

535
	add	x11, x10, x8
536
	add	x10, x10, x9
537

538
	ldrb	w4, [x10, x5]
539
	ldrb	w5, [x11, x5]
540
	sub	w0, w5, w4
541
	ret
542

543
	.p2align 4
544
.Lmismatch2s:
545
	sub	x10, x10, #16
546
.Lmismatchs:
547
	rbit	x3, x5
548
	clz	x3, x3
549
	lsr	x3, x3, #2
550
	add	x11, x10, x11
551

552
	ldrb	w4, [x10, x3]
553
	ldrb	w5, [x11, x3]
554
	sub	w0, w5, w4
555
	ret
556

557
	.p2align 4
558
.Lempty:
559
	eor	x0, x0, x0
560
	ret
561

562
END(__strncmp)
563

564
	.section .rodata
565
	.p2align 4
566
shift_data:
567
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
568
	.fill 16, 1, -1
569
	.size shift_data, .-shift_data
570

571
Product

Resources

Company