GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/strncmp.S
/*
 * strncmp - compare two strings
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#include "asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result. */
#define src1       x0
#define src2       x1
#define limit      x2
#define result     x0

/* Internal variables. */
#define data1      x3
#define data1w     w3
#define data2      x4
#define data2w     w4
#define has_nul    x5
#define diff       x6
#define syndrome   x7
#define tmp1       x8
#define tmp2       x9
#define tmp3       x10
#define zeroones   x11
#define pos        x12
#define mask       x13
#define endloop    x14
#define count      mask
#define offset     pos
#define neg_offset x15

/* Define endian dependent shift operations.
   On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif

ENTRY (__strncmp_aarch64)
        cbz     limit, L(ret0)
        eor     tmp1, src1, src2
        mov     zeroones, #REP8_01
        tst     tmp1, #7
        and     count, src1, #7
        b.ne    L(misaligned8)
        cbnz    count, L(mutual_align)

        /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
           (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
           can be done in parallel across the entire word. */
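        /* Illustrative sketch, not part of the upstream source: the same
           zero-byte test written as C, assuming uint64_t words (REP8_80 is
           a hypothetical name for 0x8080808080808080):

               uint64_t has_zero_byte (uint64_t x)
               {
                   return (x - REP8_01) & ~x & REP8_80;
               }

           The loop below uses the equivalent (x - REP8_01) & ~(x | REP8_7f)
           form so that a single BICS both combines the terms and sets the
           flags consumed by the following CCMP. */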
        .p2align 4
L(loop_aligned):
        ldr     data1, [src1], #8
        ldr     data2, [src2], #8
L(start_realigned):
        subs    limit, limit, #8
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        eor     diff, data1, data2      /* Non-zero if differences found. */
        csinv   endloop, diff, xzr, hi  /* Last Dword or differences. */
        bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator. */
        ccmp    endloop, #0, #0, eq
        b.eq    L(loop_aligned)
        /* End of main loop */
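        /* Illustrative C model of one iteration above (hypothetical helper
           load64; not part of the source):

               data1 = load64 (src1); src1 += 8;
               data2 = load64 (src2); src2 += 8;
               int last = limit <= 8;          // "hi" flags from the subs
               limit -= 8;
               diff = data1 ^ data2;
               endloop = last ? ~0ull : diff;  // csinv
               has_nul = (data1 - REP8_01) & ~(data1 | REP8_7f);  // bics
               if (has_nul == 0 && endloop == 0)
                   continue;                   // ccmp + b.eq
        */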

L(full_check):
#ifndef __AARCH64EB__
        orr     syndrome, diff, has_nul
        add     limit, limit, 8 /* Rewind limit to before last subs. */
L(syndrome_check):
        /* Limit was reached. Check if the NUL byte or the difference
           is before the limit. */
        rev     syndrome, syndrome
        rev     data1, data1
        clz     pos, syndrome
        rev     data2, data2
        lsl     data1, data1, pos
        cmp     limit, pos, lsr #3
        lsl     data2, data2, pos
        /* But we need to zero-extend (char is unsigned) the value and then
           perform a signed 32-bit subtraction. */
        lsr     data1, data1, #56
        sub     result, data1, data2, lsr #56
        csel    result, result, xzr, hi
        ret
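        /* Illustrative C model of the little-endian tail above (hypothetical
           helpers byterev/clz64; not part of the source):

               syndrome = byterev (syndrome);
               pos = clz64 (syndrome);        // bit index of first marker
               if (limit <= pos >> 3)         // marker lies past the limit
                   return 0;
               c1 = (byterev (data1) << pos) >> 56;
               c2 = (byterev (data2) << pos) >> 56;
               return c1 - c2;
        */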
#else
        /* Not reached the limit, must have found the end or a diff. */
        tbz     limit, #63, L(not_limit)
        add     tmp1, limit, 8
        cbz     tmp1, L(not_limit)

        lsl     limit, tmp1, #3 /* Bytes -> bits. */
        mov     mask, #~0
        lsr     mask, mask, limit
        bic     data1, data1, mask
        bic     data2, data2, mask

        /* Make sure that the NUL byte is marked in the syndrome. */
        orr     has_nul, has_nul, mask
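        /* Illustrative: with rem = tmp1 = bytes still inside the limit
           (1..7), the sequence above is roughly this C (big-endian, so the
           later bytes sit at the least-significant end):

               mask = ~0ull >> (rem * 8);   // ones over bytes past the limit
               data1 &= ~mask;
               data2 &= ~mask;
               has_nul |= mask;             // byte after the limit acts as NUL
        */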

L(not_limit):
        /* For big-endian we cannot use the trick with the syndrome value
           as carry-propagation can corrupt the upper bits if the trailing
           bytes in the string contain 0x01. */
        /* However, if there is no NUL byte in the dword, we can generate
           the result directly. We can't just subtract the bytes as the
           MSB might be significant. */
        cbnz    has_nul, 1f
        cmp     data1, data2
        cset    result, ne
        cneg    result, result, lo
        ret
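        /* The cset/cneg pair above is roughly:
               return data1 == data2 ? 0 : (data1 < data2 ? -1 : 1);
           An unsigned dword compare orders the strings correctly here
           because on big-endian the first byte in memory is the most
           significant. (Illustrative note, not part of the source.) */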
1:
        /* Re-compute the NUL-byte detection, using a byte-reversed value. */
        rev     tmp3, data1
        sub     tmp1, tmp3, zeroones
        orr     tmp2, tmp3, #REP8_7f
        bic     has_nul, tmp1, tmp2
        rev     has_nul, has_nul
        orr     syndrome, diff, has_nul
        clz     pos, syndrome
        /* The most-significant-non-zero bit of the syndrome marks either the
           first bit that is different, or the top bit of the first zero byte.
           Shifting left now will bring the critical information into the
           top bits. */
L(end_quick):
        lsl     data1, data1, pos
        lsl     data2, data2, pos
        /* But we need to zero-extend (char is unsigned) the value and then
           perform a signed 32-bit subtraction. */
        lsr     data1, data1, #56
        sub     result, data1, data2, lsr #56
        ret
#endif

L(mutual_align):
        /* Sources are mutually aligned, but are not currently at an
           alignment boundary. Round down the addresses and then mask off
           the bytes that precede the start point.
           We also need to adjust the limit calculations, but without
           overflowing if the limit is near ULONG_MAX. */
        bic     src1, src1, #7
        bic     src2, src2, #7
        ldr     data1, [src1], #8
        neg     tmp3, count, lsl #3     /* 64 - bits(bytes beyond align). */
        ldr     data2, [src2], #8
        mov     tmp2, #~0
        LS_FW   tmp2, tmp2, tmp3        /* Shift (count & 63). */
        /* Adjust the limit and ensure it doesn't overflow. */
        adds    limit, limit, count
        csinv   limit, limit, xzr, lo
        orr     data1, data1, tmp2
        orr     data2, data2, tmp2
        b       L(start_realigned)
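        /* Illustrative little-endian C model of the realignment above
           (hypothetical helper load64; not part of the source; count is
           nonzero on this path):

               count = src1 & 7;                 // bytes before the start
               src1 &= ~7ull;  src2 &= ~7ull;
               pre = ~0ull >> (64 - 8 * count);  // the preceding bytes
               data1 = load64 (src1) | pre;      // force them to 0xff so
               data2 = load64 (src2) | pre;      // they match and are non-NUL
               limit = limit + count < count ? ~0ull : limit + count;
        */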

        .p2align 4
        /* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
        cmp     limit, #16
        b.hs    L(try_misaligned_words)

L(byte_loop):
        /* Perhaps we can do better than this. */
        ldrb    data1w, [src1], #1
        ldrb    data2w, [src2], #1
        subs    limit, limit, #1
        ccmp    data1w, #1, #0, hi      /* NZCV = 0b0000. */
        ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000. */
        b.eq    L(byte_loop)
L(done):
        sub     result, data1, data2
        ret
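        /* Illustrative C model of the byte loop (not part of the source):

               do {
                   c1 = *src1++;
                   c2 = *src2++;
                   limit--;
               } while (limit > 0 && c1 != 0 && c1 == c2);
               return c1 - c2;

           The two ccmp instructions chain all three loop conditions into a
           single flag result for b.eq. */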
        /* Align the SRC1 to a dword by doing a bytewise compare and then do
           the dword loop. */
L(try_misaligned_words):
        cbz     count, L(src1_aligned)

        neg     count, count
        and     count, count, #7
        sub     limit, limit, count

L(page_end_loop):
        ldrb    data1w, [src1], #1
        ldrb    data2w, [src2], #1
        cmp     data1w, #1
        ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000. */
        b.ne    L(done)
        subs    count, count, #1
        b.hi    L(page_end_loop)

        /* The following diagram explains the comparison of misaligned strings.
           The bytes are shown in natural order. For little-endian, it is
           reversed in the registers. The "x" bytes are before the string.
           The "|" separates data that is loaded at one time.
           src1 | a a a a a a a a | b b b c c c c c | . . .
           src2 | x x x x x a a a a a a a a b b b | c c c c c . . .

           After shifting in each step, the data looks like this:
                    STEP_A              STEP_B              STEP_C
           data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
           data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c

           The bytes with "0" are eliminated from the syndrome via mask.

           Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
           time from SRC2. The comparison happens in 3 steps. After each step
           the loop can exit, or read from SRC1 or SRC2. */
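        /* Illustrative little-endian C model of how the 8 bytes of SRC2
           that line up with data1 are assembled from two adjacent dwords
           (not part of the source; offset is nonzero mod 64 here because
           SRC2 is not 8-byte aligned on this path):

               data2 = (tmp1 >> offset) | (tmp2 << (64 - offset));

           offset is the bit offset of the string start within its dword;
           big-endian swaps the shift directions via LS_FW/LS_BK. */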
L(src1_aligned):
        /* Calculate offset from 8 byte alignment to string start in bits. No
           need to mask offset since shifts are ignoring upper bits. */
        lsl     offset, src2, #3
        bic     src2, src2, #0xf
        mov     mask, -1
        neg     neg_offset, offset
        ldr     data1, [src1], #8
        ldp     tmp1, tmp2, [src2], #16
        LS_BK   mask, mask, neg_offset
        and     neg_offset, neg_offset, #63     /* Need actual value for cmp later. */
        /* Skip the first compare if data in tmp1 is irrelevant. */
        tbnz    offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
        /* STEP_A: Compare full 8 bytes when there is enough data from SRC2. */
        LS_FW   data2, tmp1, offset
        LS_BK   tmp1, tmp2, neg_offset
        subs    limit, limit, #8
        orr     data2, data2, tmp1      /* 8 bytes from SRC2 combined from two regs. */
        sub     has_nul, data1, zeroones
        eor     diff, data1, data2      /* Non-zero if differences found. */
        orr     tmp3, data1, #REP8_7f
        csinv   endloop, diff, xzr, hi  /* If limit, set to all ones. */
        bic     has_nul, has_nul, tmp3  /* Non-zero if NUL byte found in SRC1. */
        orr     tmp3, endloop, has_nul
        cbnz    tmp3, L(full_check)

        ldr     data1, [src1], #8
L(misaligned_mid_loop):
        /* STEP_B: Compare first part of data1 to second part of tmp2. */
        LS_FW   data2, tmp2, offset
#ifdef __AARCH64EB__
        /* For big-endian we do a byte reverse to avoid carry-propagation
           problem described above. This way we can reuse the has_nul in the
           next step and also use syndrome value trick at the end. */
        rev     tmp3, data1
#define data1_fixed tmp3
#else
#define data1_fixed data1
#endif
        sub     has_nul, data1_fixed, zeroones
        orr     tmp3, data1_fixed, #REP8_7f
        eor     diff, data2, data1      /* Non-zero if differences found. */
        bic     has_nul, has_nul, tmp3  /* Non-zero if NUL terminator. */
#ifdef __AARCH64EB__
        rev     has_nul, has_nul
#endif
        cmp     limit, neg_offset, lsr #3
        orr     syndrome, diff, has_nul
        bic     syndrome, syndrome, mask        /* Ignore later bytes. */
        csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
        cbnz    tmp3, L(syndrome_check)

        /* STEP_C: Compare second part of data1 to first part of tmp1. */
        ldp     tmp1, tmp2, [src2], #16
        cmp     limit, #8
        LS_BK   data2, tmp1, neg_offset
        eor     diff, data2, data1      /* Non-zero if differences found. */
        orr     syndrome, diff, has_nul
        and     syndrome, syndrome, mask        /* Ignore earlier bytes. */
        csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
        cbnz    tmp3, L(syndrome_check)

        ldr     data1, [src1], #8
        sub     limit, limit, #8
        b       L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
        clz     pos, syndrome
        cmp     pos, limit, lsl #3
        b.lo    L(end_quick)
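        /* Illustrative: on big-endian the syndrome is already in memory
           order, so clz yields the bit index of the first NUL or difference
           directly; L(end_quick) is taken only when that bit lies inside
           the remaining limit, i.e. clz64 (syndrome) < limit * 8. */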
#endif

L(ret0):
        mov     result, #0
        ret
END(__strncmp_aarch64)