GitHub Repository: torvalds/linux
Path: blob/master/arch/arm64/lib/copy_template.S
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored
 * by Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
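
/*
 * Note: this file is a template rather than a standalone function. The
 * ldrb1/strb1, ldrh1/strh1, ldr1/str1, ldp1/stp1 and cpy1 operations used
 * below are macros supplied by whichever file includes this template, so
 * the same copy loop can be instantiated with different accessors (plain
 * loads/stores, unprivileged user-space accesses, and so on). A minimal
 * sketch of what an includer might define (illustrative only, not the
 * kernel's exact macros):
 *
 *	.macro ldrb1 reg, ptr, val
 *	ldrb \reg, [\ptr], \val
 *	.endm
 *
 *	.macro stp1 reg1, reg2, ptr, val
 *	stp \reg1, \reg2, [\ptr], \val
 *	.endm
 */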
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14
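
/*
 * The .req directives above create GNU assembler register aliases: each
 * name simply stands for the given register (so "dst" assembles as x6).
 * They are purely an assembly-time renaming and have no runtime cost.
 */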

	mov	dst, dstin

#ifdef CONFIG_AS_HAS_MOPS
alternative_if_not ARM64_HAS_MOPS
	b	.Lno_mops
alternative_else_nop_endif
	cpy1	dst, src, count
	b	.Lexitfunc
.Lno_mops:
#endif
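
	/*
	 * FEAT_MOPS path: if the assembler supports the memory-operations
	 * instructions (CONFIG_AS_HAS_MOPS) and the CPU advertises the
	 * ARM64_HAS_MOPS capability, the alternatives mechanism patches the
	 * branch above to a NOP and the whole copy is delegated to the
	 * includer's cpy1 macro (presumably a FEAT_MOPS cpyp/cpym/cpye-style
	 * sequence); otherwise we fall through to the software copy below.
	 */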

	cmp	count, #16
	/* When the length is less than 16 bytes, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading bytes from src to dst in increasing address
	 * order. This eliminates the risk of overwriting source data when
	 * the distance between src and dst is less than 16 bytes. The
	 * memory accesses here are aligned.
	 */
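	/*
	 * Alignment arithmetic: "neg tmp2, src" then "ands tmp2, tmp2, #15"
	 * computes (-src) mod 16, the number of bytes needed to bring src up
	 * to a 16-byte boundary. The tbz chain below copies exactly tmp2
	 * bytes by testing its bits in increasing order: bit 0 selects a
	 * 1-byte copy, bit 1 a 2-byte copy, bit 2 a 4-byte copy and bit 3 an
	 * 8-byte copy. For example, src = 0x1003 gives tmp2 = 13 (0b1101),
	 * copied as 1 + 4 + 8 bytes.
	 */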
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
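	/*
	 * .Ltail63 is reached both by falling through from here (count < 64)
	 * and from the bulk-copy paths below once fewer than 64 bytes
	 * remain, which is why only the low 6 bits of count need to be
	 * accurate on entry.
	 */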
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
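	/*
	 * The three ldp1/stp1 pairs above form a fall-through dispatch on
	 * bits 4-5 of count (masked by #0x30): 0x30 falls through all three
	 * pairs and copies 48 bytes, 0x20 branches to 1: and copies 32,
	 * 0x10 branches to 2: and copies 16, and 0x00 skips straight to
	 * .Ltiny15.
	 */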
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several smaller loads/stores that
	 * access memory in increasing address order, rather than loading and
	 * storing 16 bytes from (src-16) to (dst-16) after backing src up to
	 * an aligned address, as the original cortex-strings memcpy did. If
	 * the original scheme were kept here, memmove would have to satisfy
	 * the precondition that src is at least 16 bytes above dst, or some
	 * source data would be overwritten when memmove calls memcpy
	 * directly. To keep memmove simple and decouple memcpy from memmove,
	 * the original scheme was withdrawn.
	 */
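	/*
	 * Worked example: count = 13 (0b1101) copies 8 bytes (bit 3), then
	 * 4 bytes (bit 2), then 1 byte (bit 0), all in increasing address
	 * order, which is what makes this tail safe for the overlapping
	 * case described above.
	 */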
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc
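	/*
	 * Note the scheduling in the 64..127-byte block above: the B and C
	 * loads are issued ahead of their stores, presumably to overlap load
	 * latency with the store stream. The #128 bias applied to count is
	 * harmless here because "tst count, #0x3f" and the .Ltail63 path
	 * only use the low 6 bits of count, which a bias that is a multiple
	 * of 64 leaves unchanged.
	 */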

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load the first 64 bytes. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the loads of the next 64-byte block with the stores of
	 * the previously loaded 64 bytes.
	 */
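	/*
	 * Loop accounting: count was reduced by 128 at .Lcpy_over64, which
	 * covers both the 64 bytes pre-loaded above and the final 64 bytes
	 * stored after the loop. Each iteration stores the 64 bytes already
	 * held in A-D and loads the next 64, so "subs count, count, #64;
	 * b.ge 1b" keeps iterating while at least 64 more bytes remain to
	 * be loaded.
	 */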
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc:
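
/*
 * End of template: .Lexitfunc marks completion of the copy. The including
 * file continues after this point with its own epilogue, e.g. producing
 * the x0 return value described at the top of this file.
 */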