GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/arm/strcmp.S
/*
 * strcmp for ARMv7
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */

#include "asmdefs.h"

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (eg because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

#define STRCMP_NO_PRECHECK	0
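
/* With the pre-check enabled (STRCMP_NO_PRECHECK == 0), the entry sequence
   behaves roughly like this C sketch, with src1/src2 denoting the incoming
   pointers:

	int c1 = *(const unsigned char *) src1;
	int c2 = *(const unsigned char *) src2;
	if (c1 == 0 || c1 != c2)
		return c1 - c2;

   so a mismatch or NUL in the very first byte returns before r4-r7 are
   spilled to the stack.  */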

/* Ensure the .cantunwind directive is prepended to .fnend.
   Leaf functions cannot throw exceptions - EHABI only supports
   synchronous exceptions.  */
#define IS_LEAF

/* This version uses Thumb-2 code.  */
	.thumb
	.syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO	lsl
#define S2LOEQ	lsleq
#define S2HI	lsr
#define MSB	0x000000ff
#define LSB	0xff000000
#define BYTE0_OFFSET	24
#define BYTE1_OFFSET	16
#define BYTE2_OFFSET	8
#define BYTE3_OFFSET	0
#else /* not __ARM_BIG_ENDIAN */
#define S2LO	lsr
#define S2LOEQ	lsreq
#define S2HI	lsl
#define BYTE0_OFFSET	0
#define BYTE1_OFFSET	8
#define BYTE2_OFFSET	16
#define BYTE3_OFFSET	24
#define MSB	0xff000000
#define LSB	0x000000ff
#endif /* not __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	cases.  */
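	/* A rough C sketch of the little-endian branch of this macro, using
	   GCC-style builtins (the syndrome is non-zero whenever the macro is
	   invoked; byte i of it is 0xff where byte i of d1 is NUL, otherwise
	   the eor of the two data bytes):

		unsigned sh = __builtin_clz (__builtin_bswap32 (synd)) & ~7u;
		return ((d1 >> sh) & 0xff) - ((d2 >> sh) & 0xff);

	   sh is the bit offset of the lowest-addressed flagged byte, so the
	   return value is the byte difference at the first NUL or mismatch.  */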
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach.  */
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1

	epilogue push_ip=HAVE_PAC_LEAF
#endif
	.endm

ENTRY(__strcmp_arm)
	prologue push_ip=HAVE_PAC_LEAF
#if STRCMP_NO_PRECHECK == 0
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	L(fastpath_exit)
#endif
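	/* In the pre-check above, "cmp r2, #1" clears the carry flag only
	   when the first byte of src1 is NUL, so the conditional compare is
	   skipped in that case and Z is still clear from the subtraction;
	   the bne therefore fires on a NUL first byte as well as on a
	   first-byte mismatch - effectively (c1 == 0 || c1 != c2).  */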
	strd	r4, r5, [sp, #-16]!
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	mvn	const_m1, #0
	lsl	r2, tmp1, #29
	cbz	r2, L(loop_aligned8)
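	/* The orr/lsl pair above keeps only the low three bits of
	   (src1 | src2): roughly, if (((uintptr_t) src1 | (uintptr_t) src2)
	   & 7) == 0 both pointers are already doubleword aligned and we can
	   enter the 16-bytes-per-pass loop directly.  */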

L(not_aligned):
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	L(misaligned8)

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	L(start_realigned8)
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	L(start_realigned8)
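
	/* Both strings share the same offset o = src1 & 7 here (src1 ^ src2
	   has its low three bits clear), so on little-endian the masking
	   above amounts to this rough C sketch, where mask covers the low
	   (o & 3) bytes:

		unsigned mask = ~(~0u << (8 * (o & 3)));
		if (o < 4) {
			data1a |= mask;  data2a |= mask;
		} else {
			data1a = data2a = ~0u;
			data1b |= mask;  data2b |= mask;
		}

	   The bytes ahead of the true string start become 0xff in both
	   inputs, so they can neither mismatch nor look like a NUL.  */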

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
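	/* The zero/difference test in this loop uses the uadd8/sel pair:
	   adding 0xff to a byte carries out (sets that byte's GE flag)
	   exactly when the byte is non-zero, and sel then picks, per byte,
	   the eor result for non-zero bytes of data1 and 0xff for its zero
	   bytes.  Roughly, per 32-bit word:

		unsigned syndrome = 0;
		for (int i = 0; i < 4; i++) {
			unsigned b1 = (data1 >> (8 * i)) & 0xff;
			unsigned b2 = (data2 >> (8 * i)) & 0xff;
			syndrome |= (b1 ? (b1 ^ b2) : 0xffu) << (8 * i);
		}

	   so a non-zero syndrome means a mismatch or a NUL somewhere in the
	   word, and a zero syndrome lets the loop continue.  */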
L(loop_aligned8):
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
L(start_realigned8):
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, L(diff_in_a)
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, L(diff_in_b)

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	beq	L(loop_aligned8)

L(diff_found):
	cbnz	syndrome_a, L(diff_in_a)

L(diff_in_b):
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

L(diff_in_a):
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
L(misaligned8):
	tst	tmp1, #3
	bne	L(misaligned4)
	ands	tmp1, src1, #3
	bne	L(mutual_align4)

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
L(loop_aligned4):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned4):
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, L(aligned4_done)
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	L(loop_aligned4)

L(aligned4_done):
	strcmp_epilogue_aligned syndrome, data1, data2, 0

L(mutual_align4):
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	L(start_realigned4)

L(misaligned4):
	ands	tmp1, src1, #3
	beq	L(src1_aligned)
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	L(aligned_m2)
	bcs	L(aligned_m1)
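
	/* "lsls tmp1, tmp1, #31" moves the two low bits of (src1 & 3) into
	   the flags: Z is set when bit 0 is clear and C when bit 1 is set,
	   so offset 1 falls through (three tail bytes left in data1),
	   offset 2 takes L(aligned_m2) (two tail bytes) and offset 3 takes
	   L(aligned_m1) (one tail byte).  With the pre-check enabled the
	   first string byte has already been compared, so in the block below
	   each entry point starts one byte further on, and L(aligned_m1) is
	   placed next to L(fastpath_exit), where it only advances src2.  */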

#if STRCMP_NO_PRECHECK == 1
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m1):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	add	src2, src2, #4
	cbnz	data2, L(src1_aligned)
#else  /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbnz	data2, L(aligned_m1)
#endif

L(misaligned_exit):
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

#if STRCMP_NO_PRECHECK == 0
L(fastpath_exit):
	.cfi_restore_state
	.cfi_remember_state
	sub	r0, r2, r3
	epilogue push_ip=HAVE_PAC_LEAF

L(aligned_m1):
	.cfi_restore_state
	.cfi_remember_state
	add	src2, src2, #4
#endif
L(src1_aligned):
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	L(overlap1)		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	L(overlap2)		/* C=1, Z=1 => src2[1:0] = 0b10.  */
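
	/* src1 is now word aligned while src2 sits 1, 2 or 3 bytes past a
	   word boundary.  L(overlapN) names the number of bytes of the
	   current (rounded-down) src2 word that overlap the current src1
	   word: offset 1 gives three overlapping bytes, offset 2 two, and
	   offset 3 one.  Each iteration first checks the overlapping bytes
	   with the usual uadd8/sel syndrome, then checks src1's remaining
	   bytes against the leading bytes of the next src2 word, so both
	   strings are still read with whole-word loads.  */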

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
L(overlap3):
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap3)
4:
	S2LO	data2, data2, #8
	b	L(strcmp_tail)

5:
	bics	syndrome, syndrome, #MSB
	bne	L(strcmp_done_equal)

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	neg	result, result
	epilogue push_ip=HAVE_PAC_LEAF
6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap2):
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap2)
4:
	S2LO	data2, data2, #16
	b	L(strcmp_tail)
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	L(strcmp_done_equal)

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	L(strcmp_tail)

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	L(strcmp_tail)

.p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
423
L(overlap1):
424
and tmp1, data1, #LSB
425
uadd8 syndrome, data1, const_m1
426
eors syndrome, tmp1, data2, S2LO #24
427
sel syndrome, syndrome, const_m1
428
bne 4f
429
cbnz syndrome, 5f
430
ldr data2, [src2], #4
431
eor tmp1, tmp1, data1
432
cmp tmp1, data2, S2HI #8
433
bne 6f
434
ldr data1, [src1], #4
435
b L(overlap1)
436
4:
437
S2LO data2, data2, #24
438
b L(strcmp_tail)
439
5:
440
tst syndrome, #LSB
441
bne L(strcmp_done_equal)
442
ldr data2, [src2]
443
6:
444
S2LO data1, data1, #8
445
bic data2, data2, #MSB
446
b L(strcmp_tail)
447
448
L(strcmp_done_equal):
449
mov result, #0
450
.cfi_remember_state
451
ldrd r4, r5, [sp], #16
452
.cfi_restore 4
453
.cfi_restore 5
454
/* R6/7 not used in this sequence. */
455
.cfi_restore 6
456
.cfi_restore 7
457
.cfi_adjust_cfa_offset -16
458
epilogue push_ip=HAVE_PAC_LEAF
459
460
L(strcmp_tail):
461
.cfi_restore_state
462
#ifndef __ARM_BIG_ENDIAN
463
rev data1, data1
464
rev data2, data2
465
/* Now everything looks big-endian... */
466
#endif
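	/* With both words now in big-endian byte order, once the uadd8/eor/sel
	   below have rebuilt the syndrome (non-zero here, because a
	   difference or a NUL is known to be present), the remainder of the
	   tail is roughly this C sketch:

		unsigned sh = __builtin_clz (syndrome);
		data1 <<= sh;
		data2 <<= sh;
		return (data1 >> 24) - (data2 >> 24);

	   As explained for strcmp_epilogue_aligned, the extracted 8-bit
	   windows need not be byte aligned for the difference to have the
	   right sign.  */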
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	sub	result, result, data2, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF

END (__strcmp_arm)

#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */