freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memset-sve.S
/*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2024-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#include "asmdefs.h"

.arch armv8-a+sve
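/* Register aliases.  off/dst both name x3, and zva_val, vlen and dstend2
   all name x5; aliases that share a register are never live at the same
   time.  */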
#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define zva_val	x5
#define vlen	x5
#define off	x3
#define dstend2	x5
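/* The byte value is broadcast to a vector once.  Sets below 16 bytes use a
   single predicated SVE store, 16..128 bytes use overlapping Neon stores,
   and longer sets fall back to a Neon store loop, or to the DC ZVA
   cache-line clear for zero fills of 256 bytes or more.  */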
ENTRY (__memset_aarch64_sve)
	dup	v0.16B, valw
	cmp	count, 16
	b.lo	L(set_16)

	add	dstend, dstin, count
	cmp	count, 64
	b.hs	L(set_128)

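	/* off is 16 when count >= 32 and 0 otherwise, so the four (possibly
	   overlapping) 16-byte stores below cover dstin..dstend-1 for any
	   count in 16..63.  */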
	/* Set 16..63 bytes.  */
	mov	off, 16
	and	off, off, count, lsr 1
	sub	dstend2, dstend, off
	str	q0, [dstin]
	str	q0, [dstin, off]
	str	q0, [dstend2, -16]
	str	q0, [dstend, -16]
	ret

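	/* 0..15 bytes: whilelo builds a predicate with the low count byte
	   lanes active, so one predicated SVE store handles every small
	   size, including zero, without a branch.  */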
	.p2align 4
L(set_16):
	whilelo	p0.b, xzr, count
	st1b	z0.b, p0, [dstin]
	ret

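	/* 64..128 bytes: two overlapping 64-byte blocks, one from the start
	   and one from the end.  dst is also aligned down to 16 bytes here
	   for use by the long path.  */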
	.p2align 4
L(set_128):
	bic	dst, dstin, 15
	cmp	count, 128
	b.hi	L(set_long)
	stp	q0, q0, [dstin]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

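	/* More than 128 bytes.  DC ZVA is only used for fills of at least
	   256 bytes, and it can only write zeroes, so non-zero values take
	   the Neon store loop below.  */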
	.p2align 4
L(set_long):
	cmp	count, 256
	b.lo	L(no_zva)
	tst	valw, 255
	b.ne	L(no_zva)

#ifndef SKIP_ZVA_CHECK
	mrs	zva_val, dczid_el0
	and	zva_val, zva_val, 31
	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
	b.ne	L(no_zva)
#endif
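	/* Fill the head up to the first 64-byte boundary with Neon stores
	   and write the last bytes ahead of the loop; count is biased so
	   the DC ZVA loop below stops before the prewritten tail.  */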
	str	q0, [dstin]
	str	q0, [dst, 16]
	bic	dst, dstin, 31
	stp	q0, q0, [dst, 32]
	bic	dst, dstin, 63
	sub	count, dstend, dst	/* Count is now 64 too large.  */
	sub	count, count, 128	/* Adjust count and bias for loop.  */

	sub	x8, dstend, 1		/* Write last bytes before ZVA loop.  */
	bic	x8, x8, 15
	stp	q0, q0, [x8, -48]
	str	q0, [x8, -16]
	str	q0, [dstend, -16]

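	/* Each DC ZVA zeroes one aligned 64-byte block without reading it
	   first, which is why this path is restricted to val == 0.  */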
	.p2align 4
L(zva64_loop):
	add	dst, dst, 64
	dc	zva, dst
	subs	count, count, 64
	b.hi	L(zva64_loop)
	ret

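	/* Fallback for non-zero values, fills below 256 bytes, or a ZVA
	   block size other than 64 bytes: a 64-bytes-per-iteration Neon
	   store loop, with the tail finished by two overlapping stores
	   from dstend.  */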
L(no_zva):
	str	q0, [dstin]
	sub	count, dstend, dst	/* Count is 16 too large.  */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
L(no_zva_loop):
	stp	q0, q0, [dst, 16]
	stp	q0, q0, [dst, 48]
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(no_zva_loop)
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

END (__memset_aarch64_sve)