Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/lib/libc/aarch64/string/strncmp.S
48262 views
1
/*-
2
* SPDX-License-Identifier: BSD-2-Clause
3
*
4
* Copyright (c) 2024 Getz Mikalsen <[email protected]>
5
*/
6
7
#include <machine/asm.h>
8
#include <machine/param.h>
9
10
.weak strncmp
11
.set strncmp, __strncmp
12
.text
13
14
ENTRY(__strncmp)
15
16
bic x8, x0, #0xf // x0 aligned to the boundary
17
and x9, x0, #0xf // x9 is the offset
18
bic x10, x1, #0xf // x1 aligned to the boundary
19
and x11, x1, #0xf // x11 is the offset
20
21
subs x2, x2, #1
22
b.lo .Lempty
23
24
mov x13, #-1 // save constants for later
25
mov x16, #0xf
26
27
/*
28
* Check if either string is located at end of page to avoid crossing
29
* into unmapped page. If so, we load 16 bytes from the nearest
30
* alignment boundary and shift based on the offset.
31
*/
32
33
add x3, x0, #16 // end of head
34
add x4, x1, #16
35
eor x3, x3, x0
36
eor x4, x4, x1 // bits that changed
37
orr x3, x3, x4 // in either str1 or str2
38
cmp x2,#16
39
b.lo .Llt16
40
tbz w3, #PAGE_SHIFT, .Lbegin
41
42
ldr q0, [x8] // load aligned head
43
ldr q1, [x10]
44
45
lsl x14, x9, #2
46
lsl x15, x11, #2
47
lsl x3, x13, x14 // string head
48
lsl x4, x13, x15
49
50
cmeq v5.16b, v0.16b, #0
51
cmeq v6.16b, v1.16b, #0
52
53
shrn v5.8b, v5.8h, #4
54
shrn v6.8b, v6.8h, #4
55
fmov x5, d5
56
fmov x6, d6
57
58
adrp x14, shift_data
59
add x14, x14, :lo12:shift_data
60
61
/* heads may cross page boundary, avoid unmapped loads */
62
tst x5, x3
63
b.eq 0f
64
65
ldr q4, [x14, x9] // load permutation table
66
tbl v0.16b, {v0.16b}, v4.16b
67
68
b 1f
69
.p2align 4
70
0:
71
ldr q0, [x0] // load true head
72
1:
73
tst x6, x4
74
b.eq 0f
75
76
ldr q4, [x14, x11]
77
tbl v4.16b, {v1.16b}, v4.16b
78
79
b 1f
80
81
.p2align 4
82
.Lbegin:
83
ldr q0, [x0] // load true heads
84
0:
85
ldr q4, [x1]
86
1:
87
cmeq v2.16b, v0.16b, #0 // NUL byte present?
88
cmeq v4.16b, v0.16b, v4.16b // which bytes match?
89
90
orn v2.16b, v2.16b, v4.16b // mismatch or NUL byte?
91
92
shrn v2.8b, v2.8h, #4
93
fmov x5, d2
94
95
cbnz x5, .Lhead_mismatch
96
/* load head and second chunk */
97
ldr q2, [x8, #16] // load second chunk
98
ldr q3, [x10, #16]
99
100
add x2, x2, x11
101
sub x2, x2, #16
102
103
subs x9, x9, x11 // is a&0xf >= b&0xf
104
b.lo .Lswapped // if not swap operands
105
b .Lnormal
106
107
.p2align 4
108
.Llt16:
109
/*
110
* Check if either string is located at end of page to avoid crossing
111
* into unmapped page. If so, we load 16 bytes from the nearest
112
* alignment boundary and shift based on the offset.
113
*/
114
tbz w3, #PAGE_SHIFT, 2f
115
116
ldr q0, [x8] // load aligned head
117
ldr q1, [x10]
118
119
lsl x14, x9, #2
120
lsl x15, x11, #2
121
lsl x3, x13, x14 // string head
122
lsl x4, x13, x15
123
124
/* Introduce a null byte match if the limit is within the aligned chunk */
125
add x14, x2, x9
126
add x15, x2, x11
127
lsl x14, x14, #2
128
lsl x15, x15, #2
129
lsl x14, x16, x14
130
lsl x15, x16, x15
131
132
cmeq v5.16b, v0.16b, #0
133
cmeq v6.16b, v1.16b, #0
134
135
shrn v5.8b, v5.8h, #4
136
shrn v6.8b, v6.8h, #4
137
fmov x5, d5
138
fmov x6, d6
139
140
orr x5, x5, x14 // insert match at limit
141
orr x6, x6, x15
142
143
adrp x14, shift_data
144
add x14, x14, :lo12:shift_data
145
146
/* heads may cross page boundary, avoid unmapped loads */
147
tst x5, x3
148
b.eq 0f
149
150
ldr q4, [x14, x9] // load permutation table
151
tbl v0.16b, {v0.16b}, v4.16b
152
153
b 1f
154
.p2align 4
155
0:
156
ldr q0, [x0] // load true head
157
1:
158
tst x6, x4
159
b.eq 0f
160
161
ldr q4, [x14, x11]
162
tbl v4.16b, {v1.16b}, v4.16b
163
164
b 1f
165
166
.p2align 4
167
2:
168
ldr q0, [x0] // load true heads
169
0:
170
ldr q4, [x1]
171
1:
172
173
cmeq v2.16b, v0.16b, #0 // NUL byte present?
174
cmeq v4.16b, v0.16b, v4.16b // which bytes match?
175
176
bic v2.16b, v4.16b, v2.16b // match and not NUL byte
177
178
shrn v2.8b, v2.8h, #4
179
fmov x5, d2
180
lsl x4, x2, #2
181
lsl x4, x13, x4
182
orn x5, x4, x5 // mismatch or NUL byte?
183
184
.Lhead_mismatch:
185
rbit x3, x5
186
clz x3, x3 // index of mismatch
187
lsr x3, x3, #2
188
ldrb w4, [x0, x3]
189
ldrb w5, [x1, x3]
190
sub w0, w4, w5
191
ret
192
193
.p2align 4
194
.Lnormal:
195
sub x12, x10, x9
196
ldr q0, [x12, #16]!
197
sub x10, x10, x8
198
sub x11, x10, x9
199
200
cmeq v1.16b, v3.16b, #0 // NUL present?
201
cmeq v0.16b, v0.16b, v2.16b // Mismatch between chunks?
202
shrn v1.8b, v1.8h, #4
203
shrn v0.8b, v0.8h, #4
204
fmov x6, d1
205
fmov x5, d0
206
207
add x8, x8, #32 // advance to next iteration
208
209
lsl x4, x2, #2
210
lsl x4, x13, x4
211
orr x3, x6, x4 // introduce a null byte match
212
cmp x2, #16 // does the buffer end within x2
213
csel x6, x3, x6, lo
214
cbnz x6, .Lnulfound2 // NUL or end of buffer found?
215
mvn x5, x5
216
cbnz x5, .Lmismatch2
217
sub x2, x2, #16
218
cmp x2, #32 // end of buffer?
219
b.lo .Ltail
220
/*
221
* During the main loop, the layout of the two strings is something like:
222
*
223
* v ------1------ v ------2------ v
224
* X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
225
* X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
226
*
227
* where v indicates the alignment boundaries and corresponding chunks
228
* of the strings have the same letters. Chunk A has been checked in
229
* the previous iteration. This iteration, we first check that string
230
* X1 doesn't end within region 2, then we compare chunk B between the
231
* two strings. As X1 is known not to hold a NUL byte in regions 1
232
* and 2 at this point, this also ensures that x0 has not ended yet.
233
*/
234
.p2align 4
235
0:
236
ldr q0, [x8, x11]
237
ldr q1, [x8, x10]
238
ldr q2, [x8]
239
240
cmeq v1.16b, v1.16b, #0 // end of string?
241
cmeq v0.16b, v0.16b, v2.16b // do the chunks match?
242
243
shrn v1.8b, v1.8h, #4
244
shrn v0.8b, v0.8h, #4
245
fmov x6, d1
246
fmov x5, d0
247
cbnz x6, .Lnulfound
248
mvn x5, x5 // any mismatches?
249
cbnz x5, .Lmismatch
250
251
add x8, x8, #16
252
253
/* main loop unrolled twice */
254
ldr q0, [x8, x11]
255
ldr q1, [x8, x10]
256
ldr q2, [x8]
257
258
add x8, x8, #16
259
cmeq v1.16b, v1.16b, #0
260
cmeq v0.16b, v0.16b, v2.16b
261
262
shrn v1.8b, v1.8h, #4
263
shrn v0.8b, v0.8h, #4
264
fmov x6, d1
265
fmov x5, d0
266
cbnz x6, .Lnulfound2
267
mvn x5, x5
268
cbnz x5, .Lmismatch2
269
sub x2, x2, #32
270
cmp x2, #32 // end of buffer?
271
b.hs 0b // if yes, process tail
272
273
/* end of buffer will occur in next 32 bytes */
274
.Ltail:
275
ldr q0, [x8, x11]
276
ldr q1, [x8, x10]
277
ldr q2, [x8]
278
279
cmeq v1.16b, v1.16b, #0 // end of string?
280
cmeq v0.16b, v0.16b, v2.16b // do the chunks match?
281
282
shrn v1.8b, v1.8h, #4
283
shrn v0.8b, v0.8h, #4
284
fmov x6, d1
285
fmov x5, d0
286
287
/*
288
* If x2 <= 16 then we introduce a NUL byte in the
289
* result from CMEQ to avoid comparing further!
290
*/
291
292
lsl x4, x2, #2
293
lsl x4, x13, x4
294
orr x3, x6, x4 // introduce a null byte match
295
cmp x2, #16 // does the buffer end within x2
296
csel x6, x3, x6, lo
297
298
cbnz x6, .Lnulfound // NUL or end of string found
299
mvn x5, x5
300
cbnz x5, .Lmismatch
301
302
add x8, x8, #16
303
304
/* main loop unrolled twice */
305
ldr q0, [x8, x11]
306
ldr q1, [x8, x10]
307
ldr q2, [x8]
308
309
add x8, x8, #16
310
cmeq v1.16b, v1.16b, #0
311
cmeq v0.16b, v0.16b, v2.16b
312
313
shrn v1.8b, v1.8h, #4
314
shrn v0.8b, v0.8h, #4
315
fmov x6, d1
316
fmov x5, d0
317
318
ubfiz x4, x2, #2, #4 // (x2 - 16) << 2
319
lsl x4, x13, x4 // take first half into account
320
orr x6, x6, x4 // introduce a null byte match
321
322
.Lnulfound2:
323
sub x8, x8, #16
324
325
.Lnulfound:
326
mov x4, x6
327
328
ubfiz x7, x9, #2, #4
329
lsl x6, x6, x7 // adjust NUL mask to indices
330
331
orn x5, x6, x5
332
cbnz x5, .Lmismatch
333
334
/*
335
* (x0) == (x1) and NUL is past the string.
336
* Compare (x1) with the corresponding part
337
* of the other string until the NUL byte.
338
*/
339
ldr q0, [x8, x9]
340
ldr q1, [x8, x10]
341
342
cmeq v1.16b, v0.16b, v1.16b
343
shrn v1.8b, v1.8h, #4
344
fmov x5, d1
345
346
orn x5, x4, x5
347
348
rbit x3, x5
349
clz x3, x3
350
lsr x5, x3, #2
351
352
add x10, x10, x8 // restore x10 pointer
353
add x8, x8, x9 // point to corresponding chunk
354
355
ldrb w4, [x8, x5]
356
ldrb w5, [x10, x5]
357
sub w0, w4, w5
358
ret
359
360
.p2align 4
361
.Lmismatch2:
362
sub x8, x8, #16 // roll back second increment
363
.Lmismatch:
364
rbit x3, x5
365
clz x3, x3 // index of mismatch
366
lsr x3, x3, #2
367
add x11, x8, x11
368
369
ldrb w4, [x8, x3]
370
ldrb w5, [x11, x3]
371
sub w0, w4, w5 // byte difference
372
ret
373
374
/*
375
* If (a&0xf) < (b&0xf), we do the same thing but with swapped
376
* operands. I found that this performs slightly better than
377
* using conditional moves to do the swap branchless.
378
*/
379
.p2align 4
380
.Lswapped:
381
add x12, x8, x9
382
ldr q0, [x12, #16]!
383
sub x8, x8, x10
384
add x11, x8, x9
385
add x2,x2,x9
386
neg x9, x9
387
388
cmeq v1.16b, v2.16b, #0
389
cmeq v0.16b, v0.16b, v3.16b
390
shrn v1.8b, v1.8h, #4
391
shrn v0.8b, v0.8h, #4
392
fmov x6, d1
393
fmov x5, d0
394
395
add x10, x10, #32
396
397
lsl x4, x2, #2
398
lsl x4, x13, x4
399
orr x3,x6,x4 // introduce a null byte match
400
cmp x2,#16
401
csel x6, x3, x6, lo
402
cbnz x6, .Lnulfound2s
403
mvn x5, x5
404
cbnz x5, .Lmismatch2s
405
406
sub x2, x2, #16
407
cmp x2, #32
408
b.lo .Ltails
409
410
/*
411
* During the main loop, the layout of the two strings is something like:
412
*
413
* v ------1------ v ------2------ v
414
* X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
415
* X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
416
*
417
* where v indicates the alignment boundaries and corresponding chunks
418
* of the strings have the same letters. Chunk A has been checked in
419
* the previous iteration. This iteration, we first check that string
420
* X0 doesn't end within region 2, then we compare chunk B between the
421
* two strings. As X0 is known not to hold a NUL byte in regions 1
422
* and 2 at this point, this also ensures that X1 has not ended yet.
423
*/
424
.p2align 4
425
0:
426
ldr q0, [x10, x11]
427
ldr q1, [x10, x8]
428
ldr q2, [x10]
429
430
cmeq v1.16b, v1.16b, #0
431
cmeq v0.16b, v0.16b, v2.16b
432
433
shrn v1.8b, v1.8h, #4
434
shrn v0.8b, v0.8h, #4
435
fmov x6, d1
436
fmov x5, d0
437
cbnz x6, .Lnulfounds
438
mvn x5, x5
439
cbnz x5, .Lmismatchs
440
441
add x10, x10, #16
442
443
/* main loop unrolled twice */
444
ldr q0, [x10, x11]
445
ldr q1, [x10, x8]
446
ldr q2, [x10]
447
448
add x10, x10, #16
449
cmeq v1.16b, v1.16b, #0
450
cmeq v0.16b, v0.16b, v2.16b
451
452
shrn v1.8b, v1.8h, #4
453
shrn v0.8b, v0.8h, #4
454
fmov x6, d1
455
fmov x5, d0
456
cbnz x6, .Lnulfound2s
457
mvn x5, x5
458
cbnz x5, .Lmismatch2s
459
sub x2, x2, #32
460
cmp x2, #32
461
b.hs 0b
462
463
.Ltails:
464
ldr q0, [x10, x11]
465
ldr q1, [x10, x8]
466
ldr q2, [x10]
467
468
cmeq v1.16b, v1.16b, #0
469
cmeq v0.16b, v0.16b, v2.16b
470
471
shrn v1.8b, v1.8h, #4
472
shrn v0.8b, v0.8h, #4
473
fmov x6, d1
474
fmov x5, d0
475
476
/*
477
* If x2 <= 16 then we introduce a NUL byte in the
478
* result from CMEQ to avoid comparing further!
479
*/
480
481
lsl x4, x2, #2
482
lsl x4, x13, x4
483
orr x3, x6, x4 // introduce a null byte match
484
cmp x2, #16
485
csel x6, x3, x6, lo
486
487
cbnz x6, .Lnulfounds
488
mvn x5, x5
489
cbnz x5, .Lmismatchs
490
491
add x10, x10, #16
492
493
ldr q0, [x10, x11]
494
ldr q1, [x10, x8]
495
ldr q2, [x10]
496
497
add x10, x10, #16
498
cmeq v1.16b, v1.16b, #0
499
cmeq v0.16b, v0.16b, v2.16b
500
501
shrn v1.8b, v1.8h, #4
502
shrn v0.8b, v0.8h, #4
503
fmov x6, d1
504
fmov x5, d0
505
506
ubfiz x4, x2, #2, #4
507
lsl x4, x13, x4
508
orr x6, x6, x4 // introduce a null byte match
509
510
.Lnulfound2s:
511
sub x10, x10, #16
512
.Lnulfounds:
513
mov x4, x6
514
515
ubfiz x7, x9, #2, #4
516
lsl x6, x6, x7
517
518
orn x5, x6, x5
519
520
cbnz x5, .Lmismatchs
521
522
ldr q0, [x10, x9]
523
ldr q1, [x10, x8]
524
525
cmeq v1.16b, v0.16b, v1.16b
526
shrn v1.8b, v1.8h, #4
527
fmov x5, d1
528
529
orn x5, x4, x5
530
531
rbit x3, x5
532
clz x3, x3
533
lsr x5, x3, #2
534
535
add x11, x10, x8
536
add x10, x10, x9
537
538
ldrb w4, [x10, x5]
539
ldrb w5, [x11, x5]
540
sub w0, w5, w4
541
ret
542
543
.p2align 4
544
.Lmismatch2s:
545
sub x10, x10, #16
546
.Lmismatchs:
547
rbit x3, x5
548
clz x3, x3
549
lsr x3, x3, #2
550
add x11, x10, x11
551
552
ldrb w4, [x10, x3]
553
ldrb w5, [x11, x3]
554
sub w0, w5, w4
555
ret
556
557
.p2align 4
558
.Lempty:
559
eor x0, x0, x0
560
ret
561
562
END(__strncmp)
563
564
.section .rodata
565
.p2align 4
566
shift_data:
567
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
568
.fill 16, 1, -1
569
.size shift_data, .-shift_data
570
571