/* Retrieved from the GitHub repository freebsd/freebsd-src,
   path contrib/arm-optimized-routines/string/aarch64/strcpy.S (branch main).  */
/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

/* Register aliases used by the copy routine below.

   Several names deliberately share one physical register; such a pair must
   never be live at the same time:
     dstin / result   -> x0
     len   / synd     -> x4
     tmp   / shift    -> x5
     dataq / vdata    -> SIMD register 0 (q and v views)
     dataq2/ vhas_nul -> SIMD register 1 (q and v views)
     vend  / dend     -> SIMD register 2 (v and d views)  */

/* Incoming arguments and the return value.  */
#define dstin	x0
#define srcin	x1
#define result	x0

/* Integer working registers.  */
#define src	x2
#define dst	x3
#define len	x4
#define synd	x4
#define tmp	x5
#define shift	x5
#define data1	x6
#define dataw1	w6
#define data2	x7
#define dataw2	w7

/* SIMD working registers.  */
#define dataq	q0
#define vdata	v0
#define vhas_nul	v1
#define vend	v2
#define dend	d2
#define dataq2	q1

/* Select which entry point this file builds.

   STRCPY is the symbol name of the routine.

   IFSTPCPY (insn) emits INSN only when BUILD_STPCPY is defined.  The
   preprocessor splits an instruction such as `add result, dstin, len' at its
   commas, so the stpcpy variant of the macro glues the first argument and the
   remaining ones back together with a comma.  In the strcpy build the macro
   expands to nothing.  */
#ifdef BUILD_STPCPY
# define STRCPY __stpcpy_aarch64
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY __strcpy_aarch64
# define IFSTPCPY(X,...)
#endif

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
   per byte. We take 4 bits of every comparison byte with the shift right and
   narrow by 4 instruction (shrn). Since the bits in the nibble mask reflect
   the order in which things occur in the original string, counting leading
   zeros (after reversing the bits on little-endian) identifies exactly which
   byte matched. */

/* STRCPY: copy the NUL-terminated string at srcin to dstin, including the
   terminator.

   In:	x0 = dstin, x1 = srcin.
   Out:	x0 = result.  In the strcpy build IFSTPCPY emits nothing, so x0 is
	never written and still holds dstin.  In the stpcpy build x0 is set
	to the address of the NUL byte written to the destination.
   Uses: x2-x7, v0-v2 and the condition flags (subs in L(less8)).  No
	instruction here references sp or makes a call.
   ENTRY, END and L() are not defined in this file; presumably they come from
   asmdefs.h.

   Throughout, "len" is the index of the NUL byte relative to srcin.  Short
   strings are copied with two possibly overlapping loads/stores whose union
   covers bytes 0..len.  */
ENTRY (STRCPY)
	/* Probe the aligned 16-byte chunk that contains the first source byte.
	   The nibble mask has 4 bits per byte, so shifting right by
	   4 * srcin discards the bytes that precede srcin; the
	   register-controlled shift only uses the amount modulo 64, i.e.
	   4 * (srcin & 15).  */
	bic	src, srcin, 15
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0
	lsl	shift, srcin, 2
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	lsr	synd, synd, shift
	cbnz	synd, L(tail)

	/* No NUL yet: advance src (pre-index writeback) and probe the next
	   aligned chunk.  */
	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	cbz	synd, L(start_loop)

	/* NUL is in the second chunk.  len = (src - srcin) + index of the NUL
	   within the chunk; src - srcin is 1..16, so len is 1..31.  The bit
	   reversal is skipped in big-endian builds.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	sub	tmp, src, srcin
	clz	len, synd
	add	len, tmp, len, lsr 2
	tbz	len, 4, L(less16)
	/* 16 <= len <= 31: copy bytes 0..15 and the 16 bytes ending at the
	   NUL.  */
	sub	tmp, len, 15
	ldr	dataq, [srcin]
	ldr	dataq2, [srcin, tmp]
	str	dataq, [dstin]
	str	dataq2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	/* NUL is in the first chunk.  synd was already shifted so that its
	   lowest nibble corresponds to srcin, hence len = ctz (synd) / 4.  */
L(tail):
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 2
L(less16):
	tbz	len, 3, L(less8)
	/* 8 <= len <= 15: copy bytes 0..7 and the 8 bytes ending at the NUL.  */
	sub	tmp, len, 7
	ldr	data1, [srcin]
	ldr	data2, [srcin, tmp]
	str	data1, [dstin]
	str	data2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(less8):
	/* tmp = len - 3; a borrow (b.lo) means len < 3.  */
	subs	tmp, len, 3
	b.lo	L(less4)
	/* 3 <= len <= 7: copy bytes 0..3 and the 4 bytes ending at the NUL.  */
	ldr	dataw1, [srcin]
	ldr	dataw2, [srcin, tmp]
	str	dataw1, [dstin]
	str	dataw2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

L(less4):
	/* len is 0, 1 or 2.  For 1 or 2 copy the first two bytes, then store
	   the terminator explicitly at index len.  */
	cbz	len, L(zerobyte)
	ldrh	dataw1, [srcin]
	strh	dataw1, [dstin]
L(zerobyte):
	strb	wzr, [dstin, len]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(start_loop):
	/* Neither probed chunk holds a NUL.  tmp = srcin - dstin is the
	   constant distance between the two buffers, so dst = src - tmp is the
	   destination address matching the aligned src.  Copy the first 16
	   (possibly unaligned) bytes, then fall into the loop.  */
	sub	tmp, srcin, dstin
	ldr	dataq2, [srcin]
	sub	dst, src, tmp
	str	dataq2, [dstin]
L(loop):
	/* On entry dataq holds the chunk at src, already known to contain no
	   NUL.  Each iteration handles two chunks.  umaxp folds the 16 compare
	   bytes pairwise into the low 64 bits, so synd is nonzero iff the
	   chunk contains a NUL.  */
	str	dataq, [dst], 32
	ldr	dataq, [src, 16]
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loopend)
	str	dataq, [dst, -16]
	ldr	dataq, [src, 32]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)
	/* Make dst agree with the other loop exit: 16 past the destination of
	   the chunk held in dataq.  */
	add	dst, dst, 16
L(loopend):
	/* vhas_nul still describes the chunk containing the NUL.  Build the
	   exact nibble mask, then point dst at (destination of the NUL) - 15
	   and copy the final 16 bytes ending at the NUL from dst + tmp.  */
	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
	fmov	synd, dend
	sub	dst, dst, 31
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	len, synd
	lsr	len, len, 2
	add	dst, dst, len
	ldr	dataq, [dst, tmp]
	str	dataq, [dst]
	IFSTPCPY (add result, dst, 15)
	ret

END (STRCPY)
