CoCalc -- strncmp.S

GitHub Repository: torvalds/linux
Path: blob/master/arch/arm64/lib/strncmp.S
²⁶⁴²⁴ views
1
/* SPDX-License-Identifier: GPL-2.0-only */
2
/*
3
 * Copyright (c) 2013-2022, Arm Limited.
4
 *
5
 * Adapted from the original at:
6
 * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
7
 */
8

9
#include <linux/linkage.h>
10
#include <asm/assembler.h>
11

12
/* Assumptions:
13
 *
14
 * ARMv8-a, AArch64.
15
 * MTE compatible.
16
 */
17

18
#define L(label) .L ## label
19

20
#define REP8_01 0x0101010101010101
21
#define REP8_7f 0x7f7f7f7f7f7f7f7f
22

23
/* Parameters and result.  */
24
#define src1		x0
25
#define src2		x1
26
#define limit		x2
27
#define result		x0
28

29
/* Internal variables.  */
30
#define data1		x3
31
#define data1w		w3
32
#define data2		x4
33
#define data2w		w4
34
#define has_nul		x5
35
#define diff		x6
36
#define syndrome	x7
37
#define tmp1		x8
38
#define tmp2		x9
39
#define tmp3		x10
40
#define zeroones	x11
41
#define pos		x12
42
#define mask		x13
43
#define endloop		x14
44
#define count		mask
45
#define offset		pos
46
#define neg_offset	x15
47

48
/* Define endian dependent shift operations.
49
   On big-endian early bytes are at MSB and on little-endian LSB.
50
   LS_FW means shifting towards early bytes.
51
   LS_BK means shifting towards later bytes.
52
   */
53
#ifdef __AARCH64EB__
54
#define LS_FW lsl
55
#define LS_BK lsr
56
#else
57
#define LS_FW lsr
58
#define LS_BK lsl
59
#endif
60

61
SYM_FUNC_START(__pi_strncmp)
62
	cbz	limit, L(ret0)
63
	eor	tmp1, src1, src2
64
	mov	zeroones, #REP8_01
65
	tst	tmp1, #7
66
	and	count, src1, #7
67
	b.ne	L(misaligned8)
68
	cbnz	count, L(mutual_align)
69

70
	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
71
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
72
	   can be done in parallel across the entire word.  */
73
	.p2align 4
74
L(loop_aligned):
75
	ldr	data1, [src1], #8
76
	ldr	data2, [src2], #8
77
L(start_realigned):
78
	subs	limit, limit, #8
79
	sub	tmp1, data1, zeroones
80
	orr	tmp2, data1, #REP8_7f
81
	eor	diff, data1, data2	/* Non-zero if differences found.  */
82
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
83
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
84
	ccmp	endloop, #0, #0, eq
85
	b.eq	L(loop_aligned)
86
	/* End of main loop */
87

88
L(full_check):
89
#ifndef __AARCH64EB__
90
	orr	syndrome, diff, has_nul
91
	add	limit, limit, 8	/* Rewind limit to before last subs. */
92
L(syndrome_check):
93
	/* Limit was reached. Check if the NUL byte or the difference
94
	   is before the limit. */
95
	rev	syndrome, syndrome
96
	rev	data1, data1
97
	clz	pos, syndrome
98
	rev	data2, data2
99
	lsl	data1, data1, pos
100
	cmp	limit, pos, lsr #3
101
	lsl	data2, data2, pos
102
	/* But we need to zero-extend (char is unsigned) the value and then
103
	   perform a signed 32-bit subtraction.  */
104
	lsr	data1, data1, #56
105
	sub	result, data1, data2, lsr #56
106
	csel result, result, xzr, hi
107
	ret
108
#else
109
	/* Not reached the limit, must have found the end or a diff.  */
110
	tbz	limit, #63, L(not_limit)
111
	add	tmp1, limit, 8
112
	cbz	limit, L(not_limit)
113

114
	lsl	limit, tmp1, #3	/* Bits -> bytes.  */
115
	mov	mask, #~0
116
	lsr	mask, mask, limit
117
	bic	data1, data1, mask
118
	bic	data2, data2, mask
119

120
	/* Make sure that the NUL byte is marked in the syndrome.  */
121
	orr	has_nul, has_nul, mask
122

123
L(not_limit):
124
	/* For big-endian we cannot use the trick with the syndrome value
125
	   as carry-propagation can corrupt the upper bits if the trailing
126
	   bytes in the string contain 0x01.  */
127
	/* However, if there is no NUL byte in the dword, we can generate
128
	   the result directly.  We can't just subtract the bytes as the
129
	   MSB might be significant.  */
130
	cbnz	has_nul, 1f
131
	cmp	data1, data2
132
	cset	result, ne
133
	cneg	result, result, lo
134
	ret
135
1:
136
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
137
	rev	tmp3, data1
138
	sub	tmp1, tmp3, zeroones
139
	orr	tmp2, tmp3, #REP8_7f
140
	bic	has_nul, tmp1, tmp2
141
	rev	has_nul, has_nul
142
	orr	syndrome, diff, has_nul
143
	clz	pos, syndrome
144
	/* The most-significant-non-zero bit of the syndrome marks either the
145
	   first bit that is different, or the top bit of the first zero byte.
146
	   Shifting left now will bring the critical information into the
147
	   top bits.  */
148
L(end_quick):
149
	lsl	data1, data1, pos
150
	lsl	data2, data2, pos
151
	/* But we need to zero-extend (char is unsigned) the value and then
152
	   perform a signed 32-bit subtraction.  */
153
	lsr	data1, data1, #56
154
	sub	result, data1, data2, lsr #56
155
	ret
156
#endif
157

158
L(mutual_align):
159
	/* Sources are mutually aligned, but are not currently at an
160
	   alignment boundary.  Round down the addresses and then mask off
161
	   the bytes that precede the start point.
162
	   We also need to adjust the limit calculations, but without
163
	   overflowing if the limit is near ULONG_MAX.  */
164
	bic	src1, src1, #7
165
	bic	src2, src2, #7
166
	ldr	data1, [src1], #8
167
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
168
	ldr	data2, [src2], #8
169
	mov	tmp2, #~0
170
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
171
	/* Adjust the limit and ensure it doesn't overflow.  */
172
	adds	limit, limit, count
173
	csinv	limit, limit, xzr, lo
174
	orr	data1, data1, tmp2
175
	orr	data2, data2, tmp2
176
	b	L(start_realigned)
177

178
	.p2align 4
179
	/* Don't bother with dwords for up to 16 bytes.  */
180
L(misaligned8):
181
	cmp	limit, #16
182
	b.hs	L(try_misaligned_words)
183

184
L(byte_loop):
185
	/* Perhaps we can do better than this.  */
186
	ldrb	data1w, [src1], #1
187
	ldrb	data2w, [src2], #1
188
	subs	limit, limit, #1
189
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
190
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
191
	b.eq	L(byte_loop)
192
L(done):
193
	sub	result, data1, data2
194
	ret
195
	/* Align the SRC1 to a dword by doing a bytewise compare and then do
196
	   the dword loop.  */
197
L(try_misaligned_words):
198
	cbz	count, L(src1_aligned)
199

200
	neg	count, count
201
	and	count, count, #7
202
	sub	limit, limit, count
203

204
L(page_end_loop):
205
	ldrb	data1w, [src1], #1
206
	ldrb	data2w, [src2], #1
207
	cmp	data1w, #1
208
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
209
	b.ne	L(done)
210
	subs	count, count, #1
211
	b.hi	L(page_end_loop)
212

213
	/* The following diagram explains the comparison of misaligned strings.
214
	   The bytes are shown in natural order. For little-endian, it is
215
	   reversed in the registers. The "x" bytes are before the string.
216
	   The "|" separates data that is loaded at one time.
217
	   src1     | a a a a a a a a | b b b c c c c c | . . .
218
	   src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
219

220
	   After shifting in each step, the data looks like this:
221
	                STEP_A              STEP_B              STEP_C
222
	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
223
	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
224

225
	   The bytes with "0" are eliminated from the syndrome via mask.
226

227
	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
228
	   time from SRC2. The comparison happens in 3 steps. After each step
229
	   the loop can exit, or read from SRC1 or SRC2. */
230
L(src1_aligned):
231
	/* Calculate offset from 8 byte alignment to string start in bits. No
232
	   need to mask offset since shifts are ignoring upper bits. */
233
	lsl	offset, src2, #3
234
	bic	src2, src2, #0xf
235
	mov	mask, -1
236
	neg	neg_offset, offset
237
	ldr	data1, [src1], #8
238
	ldp	tmp1, tmp2, [src2], #16
239
	LS_BK	mask, mask, neg_offset
240
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
241
	/* Skip the first compare if data in tmp1 is irrelevant. */
242
	tbnz	offset, 6, L(misaligned_mid_loop)
243

244
L(loop_misaligned):
245
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
246
	LS_FW	data2, tmp1, offset
247
	LS_BK	tmp1, tmp2, neg_offset
248
	subs	limit, limit, #8
249
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/
250
	sub	has_nul, data1, zeroones
251
	eor	diff, data1, data2	/* Non-zero if differences found.  */
252
	orr	tmp3, data1, #REP8_7f
253
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
254
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
255
	orr	tmp3, endloop, has_nul
256
	cbnz	tmp3, L(full_check)
257

258
	ldr	data1, [src1], #8
259
L(misaligned_mid_loop):
260
	/* STEP_B: Compare first part of data1 to second part of tmp2. */
261
	LS_FW	data2, tmp2, offset
262
#ifdef __AARCH64EB__
263
	/* For big-endian we do a byte reverse to avoid carry-propagation
264
	problem described above. This way we can reuse the has_nul in the
265
	next step and also use syndrome value trick at the end. */
266
	rev	tmp3, data1
267
	#define data1_fixed tmp3
268
#else
269
	#define data1_fixed data1
270
#endif
271
	sub	has_nul, data1_fixed, zeroones
272
	orr	tmp3, data1_fixed, #REP8_7f
273
	eor	diff, data2, data1	/* Non-zero if differences found.  */
274
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
275
#ifdef __AARCH64EB__
276
	rev	has_nul, has_nul
277
#endif
278
	cmp	limit, neg_offset, lsr #3
279
	orr	syndrome, diff, has_nul
280
	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
281
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
282
	cbnz	tmp3, L(syndrome_check)
283

284
	/* STEP_C: Compare second part of data1 to first part of tmp1. */
285
	ldp	tmp1, tmp2, [src2], #16
286
	cmp	limit, #8
287
	LS_BK	data2, tmp1, neg_offset
288
	eor	diff, data2, data1	/* Non-zero if differences found.  */
289
	orr	syndrome, diff, has_nul
290
	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
291
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
292
	cbnz	tmp3, L(syndrome_check)
293

294
	ldr	data1, [src1], #8
295
	sub	limit, limit, #8
296
	b	L(loop_misaligned)
297

298
#ifdef	__AARCH64EB__
299
L(syndrome_check):
300
	clz	pos, syndrome
301
	cmp	pos, limit, lsl #3
302
	b.lo	L(end_quick)
303
#endif
304

305
L(ret0):
306
	mov	result, #0
307
	ret
308
SYM_FUNC_END(__pi_strncmp)
309
SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp)
310
EXPORT_SYMBOL_NOKASAN(strncmp)
311

312
Product

Resources

Company