GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/strncmp.S
/*
 * strncmp - compare two strings
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#include "asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result. */
#define src1       x0
#define src2       x1
#define limit      x2
#define result     x0

/* Internal variables. */
#define data1      x3
#define data1w     w3
#define data2      x4
#define data2w     w4
#define has_nul    x5
#define diff       x6
#define syndrome   x7
#define tmp1       x8
#define tmp2       x9
#define tmp3       x10
#define zeroones   x11
#define pos        x12
#define mask       x13
#define endloop    x14
#define count      mask
#define offset     pos
#define neg_offset x15

/* Define endian dependent shift operations.
   On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif

ENTRY (__strncmp_aarch64)
        cbz     limit, L(ret0)
        eor     tmp1, src1, src2
        mov     zeroones, #REP8_01
        tst     tmp1, #7
        and     count, src1, #7
        b.ne    L(misaligned8)
        cbnz    count, L(mutual_align)

        /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
           (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
           can be done in parallel across the entire word. */
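        /* Illustrative sketch, not part of the upstream source: the same
           zero-byte test written as C, assuming uint64_t words (REP8_80 is
           a hypothetical name for 0x8080808080808080):

               uint64_t has_zero_byte (uint64_t x)
               {
                   return (x - REP8_01) & ~x & REP8_80;
               }

           The loop below uses the equivalent (x - REP8_01) & ~(x | REP8_7f)
           form so that a single BICS both combines the terms and sets the
           flags consumed by the following CCMP. */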
        .p2align 4
L(loop_aligned):
        ldr     data1, [src1], #8
        ldr     data2, [src2], #8
L(start_realigned):
        subs    limit, limit, #8
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        eor     diff, data1, data2      /* Non-zero if differences found. */
        csinv   endloop, diff, xzr, hi  /* Last Dword or differences. */
        bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator. */
        ccmp    endloop, #0, #0, eq
        b.eq    L(loop_aligned)
        /* End of main loop */
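        /* Illustrative C model of one iteration above (hypothetical helper
           load64; not part of the source):

               data1 = load64 (src1); src1 += 8;
               data2 = load64 (src2); src2 += 8;
               int last = limit <= 8;          // "hi" flags from the subs
               limit -= 8;
               diff = data1 ^ data2;
               endloop = last ? ~0ull : diff;  // csinv
               has_nul = (data1 - REP8_01) & ~(data1 | REP8_7f);  // bics
               if (has_nul == 0 && endloop == 0)
                   continue;                   // ccmp + b.eq
        */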

L(full_check):
#ifndef __AARCH64EB__
        orr     syndrome, diff, has_nul
        add     limit, limit, 8 /* Rewind limit to before last subs. */
L(syndrome_check):
        /* Limit was reached. Check if the NUL byte or the difference
           is before the limit. */
        rev     syndrome, syndrome
        rev     data1, data1
        clz     pos, syndrome
        rev     data2, data2
        lsl     data1, data1, pos
        cmp     limit, pos, lsr #3
        lsl     data2, data2, pos
        /* But we need to zero-extend (char is unsigned) the value and then
           perform a signed 32-bit subtraction. */
        lsr     data1, data1, #56
        sub     result, data1, data2, lsr #56
        csel    result, result, xzr, hi
        ret
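        /* Illustrative C model of the little-endian tail above (hypothetical
           helpers byterev/clz64; not part of the source):

               syndrome = byterev (syndrome);
               pos = clz64 (syndrome);        // bit index of first marker
               if (limit <= pos >> 3)         // marker lies past the limit
                   return 0;
               c1 = (byterev (data1) << pos) >> 56;
               c2 = (byterev (data2) << pos) >> 56;
               return c1 - c2;
        */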
#else
        /* Not reached the limit, must have found the end or a diff. */
        tbz     limit, #63, L(not_limit)
        add     tmp1, limit, 8
        cbz     tmp1, L(not_limit)

        lsl     limit, tmp1, #3 /* Bytes -> bits. */
        mov     mask, #~0
        lsr     mask, mask, limit
        bic     data1, data1, mask
        bic     data2, data2, mask

        /* Make sure that the NUL byte is marked in the syndrome. */
        orr     has_nul, has_nul, mask
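        /* Illustrative: with rem = tmp1 = bytes still inside the limit
           (1..7), the sequence above is roughly this C (big-endian, so the
           later bytes sit at the least-significant end):

               mask = ~0ull >> (rem * 8);   // ones over bytes past the limit
               data1 &= ~mask;
               data2 &= ~mask;
               has_nul |= mask;             // byte after the limit acts as NUL
        */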

L(not_limit):
        /* For big-endian we cannot use the trick with the syndrome value
           as carry-propagation can corrupt the upper bits if the trailing
           bytes in the string contain 0x01. */
        /* However, if there is no NUL byte in the dword, we can generate
           the result directly. We can't just subtract the bytes as the
           MSB might be significant. */
        cbnz    has_nul, 1f
        cmp     data1, data2
        cset    result, ne
        cneg    result, result, lo
        ret
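        /* The cset/cneg pair above is roughly:
               return data1 == data2 ? 0 : (data1 < data2 ? -1 : 1);
           An unsigned dword compare orders the strings correctly here
           because on big-endian the first byte in memory is the most
           significant. (Illustrative note, not part of the source.) */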
1:
        /* Re-compute the NUL-byte detection, using a byte-reversed value. */
        rev     tmp3, data1
        sub     tmp1, tmp3, zeroones
        orr     tmp2, tmp3, #REP8_7f
        bic     has_nul, tmp1, tmp2
        rev     has_nul, has_nul
        orr     syndrome, diff, has_nul
        clz     pos, syndrome
        /* The most-significant-non-zero bit of the syndrome marks either the
           first bit that is different, or the top bit of the first zero byte.
           Shifting left now will bring the critical information into the
           top bits. */
L(end_quick):
        lsl     data1, data1, pos
        lsl     data2, data2, pos
        /* But we need to zero-extend (char is unsigned) the value and then
           perform a signed 32-bit subtraction. */
        lsr     data1, data1, #56
        sub     result, data1, data2, lsr #56
        ret
#endif

L(mutual_align):
        /* Sources are mutually aligned, but are not currently at an
           alignment boundary. Round down the addresses and then mask off
           the bytes that precede the start point.
           We also need to adjust the limit calculations, but without
           overflowing if the limit is near ULONG_MAX. */
        bic     src1, src1, #7
        bic     src2, src2, #7
        ldr     data1, [src1], #8
        neg     tmp3, count, lsl #3     /* 64 - bits(bytes beyond align). */
        ldr     data2, [src2], #8
        mov     tmp2, #~0
        LS_FW   tmp2, tmp2, tmp3        /* Shift (count & 63). */
        /* Adjust the limit and ensure it doesn't overflow. */
        adds    limit, limit, count
        csinv   limit, limit, xzr, lo
        orr     data1, data1, tmp2
        orr     data2, data2, tmp2
        b       L(start_realigned)
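        /* Illustrative little-endian C model of the realignment above
           (hypothetical helper load64; not part of the source; count is
           nonzero on this path):

               count = src1 & 7;                 // bytes before the start
               src1 &= ~7ull;  src2 &= ~7ull;
               pre = ~0ull >> (64 - 8 * count);  // the preceding bytes
               data1 = load64 (src1) | pre;      // force them to 0xff so
               data2 = load64 (src2) | pre;      // they match and are non-NUL
               limit = limit + count < count ? ~0ull : limit + count;
        */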

        .p2align 4
        /* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
        cmp     limit, #16
        b.hs    L(try_misaligned_words)

L(byte_loop):
        /* Perhaps we can do better than this. */
        ldrb    data1w, [src1], #1
        ldrb    data2w, [src2], #1
        subs    limit, limit, #1
        ccmp    data1w, #1, #0, hi      /* NZCV = 0b0000. */
        ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000. */
        b.eq    L(byte_loop)
L(done):
        sub     result, data1, data2
        ret
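        /* Illustrative C model of the byte loop (not part of the source):

               do {
                   c1 = *src1++;
                   c2 = *src2++;
                   limit--;
               } while (limit > 0 && c1 != 0 && c1 == c2);
               return c1 - c2;

           The two ccmp instructions chain all three loop conditions into a
           single flag result for b.eq. */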
        /* Align the SRC1 to a dword by doing a bytewise compare and then do
           the dword loop. */
L(try_misaligned_words):
        cbz     count, L(src1_aligned)

        neg     count, count
        and     count, count, #7
        sub     limit, limit, count

L(page_end_loop):
        ldrb    data1w, [src1], #1
        ldrb    data2w, [src2], #1
        cmp     data1w, #1
        ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000. */
        b.ne    L(done)
        subs    count, count, #1
        b.hi    L(page_end_loop)

        /* The following diagram explains the comparison of misaligned strings.
           The bytes are shown in natural order. For little-endian, it is
           reversed in the registers. The "x" bytes are before the string.
           The "|" separates data that is loaded at one time.
           src1 | a a a a a a a a | b b b c c c c c | . . .
           src2 | x x x x x a a a a a a a a b b b | c c c c c . . .

           After shifting in each step, the data looks like this:
                    STEP_A              STEP_B              STEP_C
           data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
           data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c

           The bytes with "0" are eliminated from the syndrome via mask.

           Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
           time from SRC2. The comparison happens in 3 steps. After each step
           the loop can exit, or read from SRC1 or SRC2. */
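        /* Illustrative little-endian C model of how the 8 bytes of SRC2
           that line up with data1 are assembled from two adjacent dwords
           (not part of the source; offset is nonzero mod 64 here because
           SRC2 is not 8-byte aligned on this path):

               data2 = (tmp1 >> offset) | (tmp2 << (64 - offset));

           offset is the bit offset of the string start within its dword;
           big-endian swaps the shift directions via LS_FW/LS_BK. */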
L(src1_aligned):
        /* Calculate offset from 8 byte alignment to string start in bits. No
           need to mask offset since shifts are ignoring upper bits. */
        lsl     offset, src2, #3
        bic     src2, src2, #0xf
        mov     mask, -1
        neg     neg_offset, offset
        ldr     data1, [src1], #8
        ldp     tmp1, tmp2, [src2], #16
        LS_BK   mask, mask, neg_offset
        and     neg_offset, neg_offset, #63     /* Need actual value for cmp later. */
        /* Skip the first compare if data in tmp1 is irrelevant. */
        tbnz    offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
        /* STEP_A: Compare full 8 bytes when there is enough data from SRC2. */
        LS_FW   data2, tmp1, offset
        LS_BK   tmp1, tmp2, neg_offset
        subs    limit, limit, #8
        orr     data2, data2, tmp1      /* 8 bytes from SRC2 combined from two regs. */
        sub     has_nul, data1, zeroones
        eor     diff, data1, data2      /* Non-zero if differences found. */
        orr     tmp3, data1, #REP8_7f
        csinv   endloop, diff, xzr, hi  /* If limit, set to all ones. */
        bic     has_nul, has_nul, tmp3  /* Non-zero if NUL byte found in SRC1. */
        orr     tmp3, endloop, has_nul
        cbnz    tmp3, L(full_check)

        ldr     data1, [src1], #8
L(misaligned_mid_loop):
        /* STEP_B: Compare first part of data1 to second part of tmp2. */
        LS_FW   data2, tmp2, offset
#ifdef __AARCH64EB__
        /* For big-endian we do a byte reverse to avoid carry-propagation
           problem described above. This way we can reuse the has_nul in the
           next step and also use syndrome value trick at the end. */
        rev     tmp3, data1
#define data1_fixed tmp3
#else
#define data1_fixed data1
#endif
        sub     has_nul, data1_fixed, zeroones
        orr     tmp3, data1_fixed, #REP8_7f
        eor     diff, data2, data1      /* Non-zero if differences found. */
        bic     has_nul, has_nul, tmp3  /* Non-zero if NUL terminator. */
#ifdef __AARCH64EB__
        rev     has_nul, has_nul
#endif
        cmp     limit, neg_offset, lsr #3
        orr     syndrome, diff, has_nul
        bic     syndrome, syndrome, mask        /* Ignore later bytes. */
        csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
        cbnz    tmp3, L(syndrome_check)

        /* STEP_C: Compare second part of data1 to first part of tmp1. */
        ldp     tmp1, tmp2, [src2], #16
        cmp     limit, #8
        LS_BK   data2, tmp1, neg_offset
        eor     diff, data2, data1      /* Non-zero if differences found. */
        orr     syndrome, diff, has_nul
        and     syndrome, syndrome, mask        /* Ignore earlier bytes. */
        csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
        cbnz    tmp3, L(syndrome_check)

        ldr     data1, [src1], #8
        sub     limit, limit, #8
        b       L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
        clz     pos, syndrome
        cmp     pos, limit, lsl #3
        b.lo    L(end_quick)
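        /* Illustrative: on big-endian the syndrome is already in memory
           order, so clz yields the bit index of the first NUL or difference
           directly; L(end_quick) is taken only when that bit lies inside
           the remaining limit, i.e. clz64 (syndrome) < limit * 8. */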
#endif

L(ret0):
        mov     result, #0
        ret
END(__strncmp_aarch64)