Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/arm64/lib/strncmp.S
26424 views
1
/* SPDX-License-Identifier: GPL-2.0-only */
2
/*
3
* Copyright (c) 2013-2022, Arm Limited.
4
*
5
* Adapted from the original at:
6
* https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
7
*/
8
9
#include <linux/linkage.h>
10
#include <asm/assembler.h>
11
12
/* Assumptions:
13
*
14
* ARMv8-a, AArch64.
15
* MTE compatible.
16
*/
17
18
#define L(label) .L ## label
19
20
#define REP8_01 0x0101010101010101
21
#define REP8_7f 0x7f7f7f7f7f7f7f7f
22
23
/* Parameters and result. */
24
#define src1 x0
25
#define src2 x1
26
#define limit x2
27
#define result x0
28
29
/* Internal variables. */
30
#define data1 x3
31
#define data1w w3
32
#define data2 x4
33
#define data2w w4
34
#define has_nul x5
35
#define diff x6
36
#define syndrome x7
37
#define tmp1 x8
38
#define tmp2 x9
39
#define tmp3 x10
40
#define zeroones x11
41
#define pos x12
42
#define mask x13
43
#define endloop x14
44
#define count mask
45
#define offset pos
46
#define neg_offset x15
47
48
/* Define endian dependent shift operations.
49
On big-endian early bytes are at MSB and on little-endian LSB.
50
LS_FW means shifting towards early bytes.
51
LS_BK means shifting towards later bytes.
52
*/
53
#ifdef __AARCH64EB__
54
#define LS_FW lsl
55
#define LS_BK lsr
56
#else
57
#define LS_FW lsr
58
#define LS_BK lsl
59
#endif
60
61
SYM_FUNC_START(__pi_strncmp)
62
cbz limit, L(ret0)
63
eor tmp1, src1, src2
64
mov zeroones, #REP8_01
65
tst tmp1, #7
66
and count, src1, #7
67
b.ne L(misaligned8)
68
cbnz count, L(mutual_align)
69
70
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
71
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
72
can be done in parallel across the entire word. */
73
.p2align 4
74
L(loop_aligned):
75
ldr data1, [src1], #8
76
ldr data2, [src2], #8
77
L(start_realigned):
78
subs limit, limit, #8
79
sub tmp1, data1, zeroones
80
orr tmp2, data1, #REP8_7f
81
eor diff, data1, data2 /* Non-zero if differences found. */
82
csinv endloop, diff, xzr, hi /* Last Dword or differences. */
83
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
84
ccmp endloop, #0, #0, eq
85
b.eq L(loop_aligned)
86
/* End of main loop */
87
88
L(full_check):
89
#ifndef __AARCH64EB__
90
orr syndrome, diff, has_nul
91
add limit, limit, 8 /* Rewind limit to before last subs. */
92
L(syndrome_check):
93
/* Limit was reached. Check if the NUL byte or the difference
94
is before the limit. */
95
rev syndrome, syndrome
96
rev data1, data1
97
clz pos, syndrome
98
rev data2, data2
99
lsl data1, data1, pos
100
cmp limit, pos, lsr #3
101
lsl data2, data2, pos
102
/* But we need to zero-extend (char is unsigned) the value and then
103
perform a signed 32-bit subtraction. */
104
lsr data1, data1, #56
105
sub result, data1, data2, lsr #56
106
csel result, result, xzr, hi
107
ret
108
#else
109
/* Not reached the limit, must have found the end or a diff. */
110
tbz limit, #63, L(not_limit)
111
add tmp1, limit, 8
112
cbz limit, L(not_limit)
113
114
lsl limit, tmp1, #3 /* Bits -> bytes. */
115
mov mask, #~0
116
lsr mask, mask, limit
117
bic data1, data1, mask
118
bic data2, data2, mask
119
120
/* Make sure that the NUL byte is marked in the syndrome. */
121
orr has_nul, has_nul, mask
122
123
L(not_limit):
124
/* For big-endian we cannot use the trick with the syndrome value
125
as carry-propagation can corrupt the upper bits if the trailing
126
bytes in the string contain 0x01. */
127
/* However, if there is no NUL byte in the dword, we can generate
128
the result directly. We can't just subtract the bytes as the
129
MSB might be significant. */
130
cbnz has_nul, 1f
131
cmp data1, data2
132
cset result, ne
133
cneg result, result, lo
134
ret
135
1:
136
/* Re-compute the NUL-byte detection, using a byte-reversed value. */
137
rev tmp3, data1
138
sub tmp1, tmp3, zeroones
139
orr tmp2, tmp3, #REP8_7f
140
bic has_nul, tmp1, tmp2
141
rev has_nul, has_nul
142
orr syndrome, diff, has_nul
143
clz pos, syndrome
144
/* The most-significant-non-zero bit of the syndrome marks either the
145
first bit that is different, or the top bit of the first zero byte.
146
Shifting left now will bring the critical information into the
147
top bits. */
148
L(end_quick):
149
lsl data1, data1, pos
150
lsl data2, data2, pos
151
/* But we need to zero-extend (char is unsigned) the value and then
152
perform a signed 32-bit subtraction. */
153
lsr data1, data1, #56
154
sub result, data1, data2, lsr #56
155
ret
156
#endif
157
158
L(mutual_align):
159
/* Sources are mutually aligned, but are not currently at an
160
alignment boundary. Round down the addresses and then mask off
161
the bytes that precede the start point.
162
We also need to adjust the limit calculations, but without
163
overflowing if the limit is near ULONG_MAX. */
164
bic src1, src1, #7
165
bic src2, src2, #7
166
ldr data1, [src1], #8
167
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
168
ldr data2, [src2], #8
169
mov tmp2, #~0
170
LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
171
/* Adjust the limit and ensure it doesn't overflow. */
172
adds limit, limit, count
173
csinv limit, limit, xzr, lo
174
orr data1, data1, tmp2
175
orr data2, data2, tmp2
176
b L(start_realigned)
177
178
.p2align 4
179
/* Don't bother with dwords for up to 16 bytes. */
180
L(misaligned8):
181
cmp limit, #16
182
b.hs L(try_misaligned_words)
183
184
L(byte_loop):
185
/* Perhaps we can do better than this. */
186
ldrb data1w, [src1], #1
187
ldrb data2w, [src2], #1
188
subs limit, limit, #1
189
ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
190
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
191
b.eq L(byte_loop)
192
L(done):
193
sub result, data1, data2
194
ret
195
/* Align the SRC1 to a dword by doing a bytewise compare and then do
196
the dword loop. */
197
L(try_misaligned_words):
198
cbz count, L(src1_aligned)
199
200
neg count, count
201
and count, count, #7
202
sub limit, limit, count
203
204
L(page_end_loop):
205
ldrb data1w, [src1], #1
206
ldrb data2w, [src2], #1
207
cmp data1w, #1
208
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
209
b.ne L(done)
210
subs count, count, #1
211
b.hi L(page_end_loop)
212
213
/* The following diagram explains the comparison of misaligned strings.
214
The bytes are shown in natural order. For little-endian, it is
215
reversed in the registers. The "x" bytes are before the string.
216
The "|" separates data that is loaded at one time.
217
src1 | a a a a a a a a | b b b c c c c c | . . .
218
src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
219
220
After shifting in each step, the data looks like this:
221
STEP_A STEP_B STEP_C
222
data1 a a a a a a a a b b b c c c c c b b b c c c c c
223
data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
224
225
The bytes with "0" are eliminated from the syndrome via mask.
226
227
Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
228
time from SRC2. The comparison happens in 3 steps. After each step
229
the loop can exit, or read from SRC1 or SRC2. */
230
L(src1_aligned):
231
/* Calculate offset from 8 byte alignment to string start in bits. No
232
need to mask offset since shifts are ignoring upper bits. */
233
lsl offset, src2, #3
234
bic src2, src2, #0xf
235
mov mask, -1
236
neg neg_offset, offset
237
ldr data1, [src1], #8
238
ldp tmp1, tmp2, [src2], #16
239
LS_BK mask, mask, neg_offset
240
and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
241
/* Skip the first compare if data in tmp1 is irrelevant. */
242
tbnz offset, 6, L(misaligned_mid_loop)
243
244
L(loop_misaligned):
245
/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
246
LS_FW data2, tmp1, offset
247
LS_BK tmp1, tmp2, neg_offset
248
subs limit, limit, #8
249
orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
250
sub has_nul, data1, zeroones
251
eor diff, data1, data2 /* Non-zero if differences found. */
252
orr tmp3, data1, #REP8_7f
253
csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
254
bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
255
orr tmp3, endloop, has_nul
256
cbnz tmp3, L(full_check)
257
258
ldr data1, [src1], #8
259
L(misaligned_mid_loop):
260
/* STEP_B: Compare first part of data1 to second part of tmp2. */
261
LS_FW data2, tmp2, offset
262
#ifdef __AARCH64EB__
263
/* For big-endian we do a byte reverse to avoid carry-propagation
264
problem described above. This way we can reuse the has_nul in the
265
next step and also use syndrome value trick at the end. */
266
rev tmp3, data1
267
#define data1_fixed tmp3
268
#else
269
#define data1_fixed data1
270
#endif
271
sub has_nul, data1_fixed, zeroones
272
orr tmp3, data1_fixed, #REP8_7f
273
eor diff, data2, data1 /* Non-zero if differences found. */
274
bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
275
#ifdef __AARCH64EB__
276
rev has_nul, has_nul
277
#endif
278
cmp limit, neg_offset, lsr #3
279
orr syndrome, diff, has_nul
280
bic syndrome, syndrome, mask /* Ignore later bytes. */
281
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
282
cbnz tmp3, L(syndrome_check)
283
284
/* STEP_C: Compare second part of data1 to first part of tmp1. */
285
ldp tmp1, tmp2, [src2], #16
286
cmp limit, #8
287
LS_BK data2, tmp1, neg_offset
288
eor diff, data2, data1 /* Non-zero if differences found. */
289
orr syndrome, diff, has_nul
290
and syndrome, syndrome, mask /* Ignore earlier bytes. */
291
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
292
cbnz tmp3, L(syndrome_check)
293
294
ldr data1, [src1], #8
295
sub limit, limit, #8
296
b L(loop_misaligned)
297
298
#ifdef __AARCH64EB__
299
L(syndrome_check):
300
clz pos, syndrome
301
cmp pos, limit, lsl #3
302
b.lo L(end_quick)
303
#endif
304
305
L(ret0):
306
mov result, #0
307
ret
308
SYM_FUNC_END(__pi_strncmp)
309
SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp)
310
EXPORT_SYMBOL_NOKASAN(strncmp)
311
312