Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-opt-memset-memchr.S
213799 views
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains assembly-optimized implementations of Scalable Matrix
/// Extension (SME) compatible memset and memchr functions.
///
/// These implementations depend on unaligned access and floating-point support.
///
/// Routines taken from libc/AOR_v20.02/string/aarch64.
///
//===----------------------------------------------------------------------===//

#include "../assembly.h"
20
21
//
22
// __arm_sc_memset
23
//
24
25
#define dstin x0
26
#define val x1
27
#define valw w1
28
#define count x2
29
#define dst x3
30
#define dstend2 x4
31
#define zva_val x5
32
33
DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
34
#ifdef __ARM_FEATURE_SVE
35
mov z0.b, valw
36
#else
37
bfi valw, valw, #8, #8
38
bfi valw, valw, #16, #16
39
bfi val, val, #32, #32
40
fmov d0, val
41
fmov v0.d[1], val
42
#endif
43
add dstend2, dstin, count
44
45
cmp count, 96
46
b.hi 7f // set_long
47
cmp count, 16
48
b.hs 4f // set_medium
49
mov val, v0.D[0]
50
51
/* Set 0..15 bytes. */
52
tbz count, 3, 1f
53
str val, [dstin]
54
str val, [dstend2, -8]
55
ret
56
nop
57
1: tbz count, 2, 2f
58
str valw, [dstin]
59
str valw, [dstend2, -4]
60
ret
61
2: cbz count, 3f
62
strb valw, [dstin]
63
tbz count, 1, 3f
64
strh valw, [dstend2, -2]
65
3: ret
66
67
/* Set 17..96 bytes. */
68
4: // set_medium
69
str q0, [dstin]
70
tbnz count, 6, 6f // set96
71
str q0, [dstend2, -16]
72
tbz count, 5, 5f
73
str q0, [dstin, 16]
74
str q0, [dstend2, -32]
75
5: ret
76
77
.p2align 4
78
/* Set 64..96 bytes. Write 64 bytes from the start and
79
32 bytes from the end. */
80
6: // set96
81
str q0, [dstin, 16]
82
stp q0, q0, [dstin, 32]
83
stp q0, q0, [dstend2, -32]
84
ret
85
86
.p2align 4
87
7: // set_long
88
and valw, valw, 255
89
bic dst, dstin, 15
90
str q0, [dstin]
91
cmp count, 160
92
ccmp valw, 0, 0, hs
93
b.ne 9f // no_zva
94
95
#ifndef SKIP_ZVA_CHECK
96
mrs zva_val, dczid_el0
97
and zva_val, zva_val, 31
98
cmp zva_val, 4 /* ZVA size is 64 bytes. */
99
b.ne 9f // no_zva
100
#endif
101
str q0, [dst, 16]
102
stp q0, q0, [dst, 32]
103
bic dst, dst, 63
104
sub count, dstend2, dst /* Count is now 64 too large. */
105
sub count, count, 128 /* Adjust count and bias for loop. */
106
107
.p2align 4
108
8: // zva_loop
109
add dst, dst, 64
110
dc zva, dst
111
subs count, count, 64
112
b.hi 8b // zva_loop
113
stp q0, q0, [dstend2, -64]
114
stp q0, q0, [dstend2, -32]
115
ret
116
117
9: // no_zva
118
sub count, dstend2, dst /* Count is 16 too large. */
119
sub dst, dst, 16 /* Dst is biased by -32. */
120
sub count, count, 64 + 16 /* Adjust count and bias for loop. */
121
10: // no_zva_loop
122
stp q0, q0, [dst, 32]
123
stp q0, q0, [dst, 64]!
124
subs count, count, 64
125
b.hi 10b // no_zva_loop
126
stp q0, q0, [dstend2, -64]
127
stp q0, q0, [dstend2, -32]
128
ret
129
END_COMPILERRT_FUNCTION(__arm_sc_memset)
130
131
//
132
// __arm_sc_memchr
133
//
134
135
#define srcin x0
136
#define chrin w1
137
#define cntin x2
138
139
#define result x0
140
141
#define src x3
142
#define tmp x4
143
#define wtmp2 w5
144
#define synd x6
145
#define soff x9
146
#define cntrem x10
147
148
#define vrepchr v0
149
#define vdata1 v1
150
#define vdata2 v2
151
#define vhas_chr1 v3
152
#define vhas_chr2 v4
153
#define vrepmask v5
154
#define vend v6
155
/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting trailing zeros allows us to
 * identify exactly which byte has matched.
 */

167
DEFINE_COMPILERRT_FUNCTION(__arm_sc_memchr)
168
/* Do not dereference srcin if no bytes to compare. */
169
cbz cntin, 4f
170
/*
171
* Magic constant 0x40100401 allows us to identify which lane matches
172
* the requested byte.
173
*/
174
mov wtmp2, #0x0401
175
movk wtmp2, #0x4010, lsl #16
176
dup vrepchr.16b, chrin
177
/* Work with aligned 32-byte chunks */
178
bic src, srcin, #31
179
dup vrepmask.4s, wtmp2
180
ands soff, srcin, #31
181
and cntrem, cntin, #31
182
b.eq 0f
183
184
/*
185
* Input string is not 32-byte aligned. We calculate the syndrome
186
* value for the aligned 32 bytes block containing the first bytes
187
* and mask the irrelevant part.
188
*/
189
190
ld1 {vdata1.16b, vdata2.16b}, [src], #32
191
sub tmp, soff, #32
192
adds cntin, cntin, tmp
193
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
194
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
195
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
196
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
197
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
198
addp vend.16b, vend.16b, vend.16b /* 128->64 */
199
mov synd, vend.d[0]
200
/* Clear the soff*2 lower bits */
201
lsl tmp, soff, #1
202
lsr synd, synd, tmp
203
lsl synd, synd, tmp
204
/* The first block can also be the last */
205
b.ls 2f
206
/* Have we found something already? */
207
cbnz synd, 3f
208
209
0: // loop
210
ld1 {vdata1.16b, vdata2.16b}, [src], #32
211
subs cntin, cntin, #32
212
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
213
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
214
/* If we're out of data we finish regardless of the result */
215
b.ls 1f
216
/* Use a fast check for the termination condition */
217
orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
218
addp vend.2d, vend.2d, vend.2d
219
mov synd, vend.d[0]
220
/* We're not out of data, loop if we haven't found the character */
221
cbz synd, 0b
222
223
1: // end
224
/* Termination condition found, let's calculate the syndrome value */
225
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
226
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
227
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
228
addp vend.16b, vend.16b, vend.16b /* 128->64 */
229
mov synd, vend.d[0]
230
/* Only do the clear for the last possible block */
231
b.hi 3f
232
233
2: // masklast
234
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
235
add tmp, cntrem, soff
236
and tmp, tmp, #31
237
sub tmp, tmp, #32
238
neg tmp, tmp, lsl #1
239
lsl synd, synd, tmp
240
lsr synd, synd, tmp
241
242
3: // tail
243
/* Count the trailing zeros using bit reversing */
244
rbit synd, synd
245
/* Compensate the last post-increment */
246
sub src, src, #32
247
/* Check that we have found a character */
248
cmp synd, #0
249
/* And count the leading zeros */
250
clz synd, synd
251
/* Compute the potential result */
252
add result, src, synd, lsr #1
253
/* Select result or NULL */
254
csel result, xzr, result, eq
255
ret
256
257
4: // zero_length
258
mov result, #0
259
ret
260
END_COMPILERRT_FUNCTION(__arm_sc_memchr)
261
262
263