Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/strlen.S
39486 views
/*
 * strlen - calculate the length of a string.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 * Not MTE compatible.
 */

#include "asmdefs.h"

/* Register aliases.

   Several names deliberately map to the same physical register; each
   name is only used at points where the register's other role is no
   longer needed:

     x0  srcin (incoming pointer)   / len (returned length)
     x2  data1 (scalar path)        / tmp, tmpw (SIMD paths)
     x3  data2 (scalar path)        / synd, syndw (SIMD paths)
     x4  tmp1 / has_nul1 (scalar)   / shift (page-cross path)
     x5  tmp2 / has_nul2 (scalar)

   Keep this in mind when reordering instructions: writing one alias
   destroys the value held under the other name.  */

#define srcin		x0
#define len		x0

#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8

/* SIMD views: vN (vector), qN (128-bit) and dN (low 64 bits) name the
   same register N.  */
#define maskv		v0
#define maskd		d0
#define dataq1		q1
#define dataq2		q2
#define datav1		v1
#define datav2		v2
#define tmp		x2
#define tmpw		w2
#define synd		x3
#define syndw		w3
#define shift		x4

/* For the first 32 bytes, NUL detection works on the principle that
   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
   byte is zero, and can be done in parallel across the entire word.  */

/* 0x01 replicated in all eight bytes: subtracted from a data word to
   form the (X - 1) term.  */
#define REP8_01 0x0101010101010101
/* 0x7f replicated in all eight bytes: ORed with a data word to form
   the (X | 0x7f) term.  */
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* To test the page crossing code path more thoroughly, compile with
   -DTEST_PAGE_CROSS - this will force all calls through the slower
   entry path.  This option is not intended for production use.  */

/* MIN_PAGE_SIZE is the page size assumed by the entry check: the fast
   unaligned path is only taken when (srcin % MIN_PAGE_SIZE) is at most
   MIN_PAGE_SIZE - 32.  With the test value of 32 only addresses that
   are a multiple of 32 take the fast path.  */
#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif

/* Core algorithm:

   Since strings are short on average, we check the first 32 bytes of the
   string for a NUL character without aligning the string.  In order to use
   unaligned loads safely we must do a page cross check first.

   If there is a NUL byte we calculate the length from the 2 8-byte words
   using conditional select to reduce branch mispredictions (it is unlikely
   strlen will be repeatedly called on strings with the same length).

   If the string is longer than 32 bytes, align src so we don't need further
   page cross checks, and process 32 bytes per iteration using a fast SIMD
   loop.

   If the page cross check fails, we read 32 bytes from an aligned address,
   and ignore any characters before the string.  If it contains a NUL
   character, return the length, if not, continue in the main loop.

   Interface:
     In:       x0 (srcin) = address of the NUL-terminated string.
     Out:      x0 (len)   = number of bytes before the first NUL.
     Clobbers: x1-x8, v0-v2, condition flags.
     Stack:    not used; the routine makes no calls.  */

ENTRY (__strlen_aarch64)
	/* Take the slow path if a 32-byte read starting at srcin could run
	   past the end of the current MIN_PAGE_SIZE page.  */
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	cmp	tmp1, MIN_PAGE_SIZE - 32
	b.hi	L(page_cross)

	/* Look for a NUL byte in the first 16 bytes.  */
	ldp	data1, data2, [srcin]
	mov	zeroones, REP8_01

#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* has_nulN = (dataN - 0x01..01) & ~(dataN | 0x7f..7f).  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	/* If has_nul1 == 0 (eq), compare has_nul2 with 0, which sets C and
	   sets Z only when has_nul2 is also 0.  Otherwise force NZCV = 0.
	   So Z means "no NUL in either word", and C means has_nul1 == 0.  */
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(bytes16_31)

	/* Find the exact offset of the first NUL byte in the first 16 bytes
	   from the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc	/* Word holding the NUL.  */
	mov	len, 8
	rev	has_nul1, has_nul1	/* First string byte to the top.  */
	csel	len, xzr, len, cc	/* Base offset: 0 or 8.  */
	clz	tmp1, has_nul1
	add	len, len, tmp1, lsr 3	/* Bit index / 8 = byte index.  */
	ret

	/* Look for a NUL byte at offset 16..31 in the string.  */
L(bytes16_31):
	ldp	data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	/* Same NUL test and flag convention as for the first 16 bytes.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(loop_entry)

	/* Find the exact offset of the first NUL byte at offset 16..31 from
	   the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 24
	rev	has_nul1, has_nul1
	mov	tmp3, 16
	clz	tmp1, has_nul1
	csel	len, tmp3, len, cc	/* Base offset: 16 or 24.  */
	add	len, len, tmp1, lsr 3
	ret

	/* NOTE(review): this nop looks like padding for the code layout
	   ahead of the aligned loop - confirm before removing it.  */
	nop
L(loop_entry):
	/* Round src down to a 32-byte boundary.  The loop pre-increments
	   by 32, so the first block read starts at most 32 bytes past
	   srcin; all bytes before that have already been checked.  */
	bic	src, srcin, 31

	.p2align 5
L(loop):
	/* Load the next aligned 32 bytes (src += 32 with writeback), then
	   fold them to 8 bytes with two pairwise minimums: a result byte is
	   zero iff one of the four bytes it covers is zero.  */
	ldp	dataq1, dataq2, [src, 32]!
	uminp	maskv.16b, datav1.16b, datav2.16b
	uminp	maskv.16b, maskv.16b, maskv.16b
	cmeq	maskv.8b, maskv.8b, 0
	fmov	synd, maskd
	cbz	synd, L(loop)

	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
	cmeq	maskv.16b, datav1.16b, 0
	sub	len, src, srcin		/* Offset of this 32-byte block.  */
	cbnz	syndw, 1f
	/* The NUL is in the second 16 bytes.  */
	cmeq	maskv.16b, datav2.16b, 0
	add	len, len, 16
1:
	/* Generate a bitmask and compute correct byte offset.  The narrowing
	   shift leaves 4 mask bits per source byte in the low 64 bits, hence
	   the divide by 4 (lsr 2) below.  */
	shrn	maskv.8b, maskv.8h, 4
	fmov	synd, maskd
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	tmp, synd
	add	len, len, tmp, lsr 2
	ret

L(page_cross):
	/* Read the aligned 32-byte block containing srcin; an aligned block
	   cannot straddle a page boundary.  */
	bic	src, srcin, 31
	/* tmpw = 0xc0300c03: per-byte masks 0x03, 0x0c, 0x30, 0xc0 so that,
	   after the two pairwise adds below, every source byte owns 2 bits
	   of the 64-bit syndrome.  */
	mov	tmpw, 0x0c03
	movk	tmpw, 0xc030, lsl 16
	ld1	{datav1.16b, datav2.16b}, [src]
	dup	maskv.4s, tmpw
	cmeq	datav1.16b, datav1.16b, 0
	cmeq	datav2.16b, datav2.16b, 0
	and	datav1.16b, datav1.16b, maskv.16b
	and	datav2.16b, datav2.16b, maskv.16b
	addp	maskv.16b, datav1.16b, datav2.16b
	addp	maskv.16b, maskv.16b, maskv.16b
	fmov	synd, maskd
	/* Drop the syndrome bits of the bytes that precede the string:
	   2 bits per byte.  A register shift uses the amount modulo 64,
	   i.e. 2 * (srcin % 32).  */
	lsl	shift, srcin, 1
	lsr	synd, synd, shift
	/* No NUL in the rest of this block: carry on with the main loop,
	   which starts at the next aligned 32 bytes.  */
	cbz	synd, L(loop)

	/* Count trailing zero bits; two bits per byte gives the length.  */
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 1
	ret

END (__strlen_aarch64)
