GitHub Repository: torvalds/linux
Path: blob/master/arch/arm64/lib/copy_template.S
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored
 * by Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
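
/*
 * Note: this file is a template rather than a standalone function. The
 * ldrb1/strb1, ldrh1/strh1, ldr1/str1, ldp1/stp1 and cpy1 operations used
 * below are macros supplied by whichever file includes this template, so
 * the same copy loop can be instantiated with different accessors (plain
 * loads/stores, unprivileged user-space accesses, and so on). A minimal
 * sketch of what an includer might define (illustrative only, not the
 * kernel's exact macros):
 *
 *	.macro ldrb1 reg, ptr, val
 *	ldrb \reg, [\ptr], \val
 *	.endm
 *
 *	.macro stp1 reg1, reg2, ptr, val
 *	stp \reg1, \reg2, [\ptr], \val
 *	.endm
 */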
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14
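
/*
 * The .req directives above create GNU assembler register aliases: each
 * name simply stands for the given register (so "dst" assembles as x6).
 * They are purely an assembly-time renaming and have no runtime cost.
 */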

	mov	dst, dstin

#ifdef CONFIG_AS_HAS_MOPS
alternative_if_not ARM64_HAS_MOPS
	b	.Lno_mops
alternative_else_nop_endif
	cpy1	dst, src, count
	b	.Lexitfunc
.Lno_mops:
#endif
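
	/*
	 * FEAT_MOPS path: if the assembler supports the memory-operations
	 * instructions (CONFIG_AS_HAS_MOPS) and the CPU advertises the
	 * ARM64_HAS_MOPS capability, the alternatives mechanism patches the
	 * branch above to a NOP and the whole copy is delegated to the
	 * includer's cpy1 macro (presumably a FEAT_MOPS cpyp/cpym/cpye-style
	 * sequence); otherwise we fall through to the software copy below.
	 */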

	cmp	count, #16
	/* When the length is less than 16 bytes, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading bytes from src to dst in increasing address
	 * order. This eliminates the risk of overwriting source data when
	 * the distance between src and dst is less than 16 bytes. The
	 * memory accesses here are aligned.
	 */
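	/*
	 * Alignment arithmetic: "neg tmp2, src" then "ands tmp2, tmp2, #15"
	 * computes (-src) mod 16, the number of bytes needed to bring src up
	 * to a 16-byte boundary. The tbz chain below copies exactly tmp2
	 * bytes by testing its bits in increasing order: bit 0 selects a
	 * 1-byte copy, bit 1 a 2-byte copy, bit 2 a 4-byte copy and bit 3 an
	 * 8-byte copy. For example, src = 0x1003 gives tmp2 = 13 (0b1101),
	 * copied as 1 + 4 + 8 bytes.
	 */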
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
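	/*
	 * .Ltail63 is reached both by falling through from here (count < 64)
	 * and from the bulk-copy paths below once fewer than 64 bytes
	 * remain, which is why only the low 6 bits of count need to be
	 * accurate on entry.
	 */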
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
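	/*
	 * The three ldp1/stp1 pairs above form a fall-through dispatch on
	 * bits 4-5 of count (masked by #0x30): 0x30 falls through all three
	 * pairs and copies 48 bytes, 0x20 branches to 1: and copies 32,
	 * 0x10 branches to 2: and copies 16, and 0x00 skips straight to
	 * .Ltiny15.
	 */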
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several smaller loads/stores that
	 * access memory in increasing address order, rather than loading and
	 * storing 16 bytes from (src-16) to (dst-16) after backing src up to
	 * an aligned address, as the original cortex-strings memcpy did. If
	 * the original scheme were kept here, memmove would have to satisfy
	 * the precondition that src is at least 16 bytes above dst, or some
	 * source data would be overwritten when memmove calls memcpy
	 * directly. To keep memmove simple and decouple memcpy from memmove,
	 * the original scheme was withdrawn.
	 */
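	/*
	 * Worked example: count = 13 (0b1101) copies 8 bytes (bit 3), then
	 * 4 bytes (bit 2), then 1 byte (bit 0), all in increasing address
	 * order, which is what makes this tail safe for the overlapping
	 * case described above.
	 */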
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc
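	/*
	 * Note the scheduling in the 64..127-byte block above: the B and C
	 * loads are issued ahead of their stores, presumably to overlap load
	 * latency with the store stream. The #128 bias applied to count is
	 * harmless here because "tst count, #0x3f" and the .Ltail63 path
	 * only use the low 6 bits of count, which a bias that is a multiple
	 * of 64 leaves unchanged.
	 */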

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load the first 64 bytes. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the loads of the next 64-byte block with the stores of
	 * the previously loaded 64 bytes.
	 */
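	/*
	 * Loop accounting: count was reduced by 128 at .Lcpy_over64, which
	 * covers both the 64 bytes pre-loaded above and the final 64 bytes
	 * stored after the loop. Each iteration stores the 64 bytes already
	 * held in A-D and loads the next 64, so "subs count, count, #64;
	 * b.ge 1b" keeps iterating while at least 64 more bytes remain to
	 * be loaded.
	 */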
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc:
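
/*
 * End of template: .Lexitfunc marks completion of the copy. The including
 * file continues after this point with its own epilogue, e.g. producing
 * the x0 return value described at the top of this file.
 */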