Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/alpha/lib/ev6-memcpy.S
26425 views
1
/* SPDX-License-Identifier: GPL-2.0 */
2
/*
3
* arch/alpha/lib/ev6-memcpy.S
4
* 21264 version by Rick Gorton <[email protected]>
5
*
6
* Reasonably optimized memcpy() routine for the Alpha 21264
7
*
8
* - bulk data is moved as quadwords (ldq/stq, or ldq_u pairs merged into aligned stq); leading and trailing bytes are moved with ldbu/stb
9
* - note: no cmpbge ("bcmpge") 8-bytes-in-parallel compare appears in this routine
10
*
11
* Much of the information about 21264 scheduling/coding comes from:
12
* Compiler Writer's Guide for the Alpha 21264
13
* abbreviated as 'CWG' in other comments here
14
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
15
* Scheduling notation:
16
* E - either cluster
17
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
18
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
19
*
20
* Temp usage notes:
21
* $1-$7 - scratch ($4 is the dest pointer and $16 a data temporary on the misaligned path)
22
*/
23
#include <linux/export.h>
24
/* Assembler settings: no instruction reordering and no implicit use of $at. */
.set noreorder
25
.set noat
26
27
/*
 * memcpy: copy $18 bytes from the address in $17 to the address in $16.
 *
 * In:   $16 = dest, $17 = src, $18 = byte count (tested as a signed
 *       value; a count <= 0 copies nothing)
 * Out:  $0  = dest as passed in
 * Uses: $1-$7 as temporaries; $16, $17 and $18 are modified.
 *       Returns through $26; no stack frame is set up.
 *
 * Paths:
 *  - src and dest equal mod 8: byte copies until both are 0mod8, then
 *    quadword copies (a 64-byte unrolled loop with wh64 hints when more
 *    than 127 bytes remain), then trailing quads and trailing bytes.
 *  - otherwise ($misaligned): byte copies until dest is 0mod8, then
 *    pairs of ldq_u loads merged with extql/extqh/bis into aligned
 *    quadword stores, then trailing bytes.
 */
.align 4
28
.globl memcpy
29
.ent memcpy
30
memcpy:
31
.frame $30,0,$26,0
32
.prologue 0
33
34
mov $16, $0 # E : copy dest to return
35
ble $18, $nomoredata # U : done with the copy?
36
xor $16, $17, $1 # E : are source and dest alignments the same?
37
and $1, 7, $1 # E : are they the same mod 8?
38
39
bne $1, $misaligned # U : Nope - gotta do this the slow way
40
/* source and dest are same mod 8 address */
41
and $16, 7, $1 # E : Are both 0mod8?
42
beq $1, $both_0mod8 # U : Yes
43
nop # E :
44
45
/*
46
* source and dest are same misalignment. move a byte at a time
47
* until a 0mod8 alignment for both is reached.
48
* At least one byte more to move
49
*/
50
51
$head_align:
52
ldbu $1, 0($17) # L : grab a byte
53
subq $18, 1, $18 # E : count--
54
addq $17, 1, $17 # E : src++
55
stb $1, 0($16) # L :
56
addq $16, 1, $16 # E : dest++
57
and $16, 7, $1 # E : Are we at 0mod8 yet?
58
ble $18, $nomoredata # U : done with the copy?
59
bne $1, $head_align # U :
60
61
/* Both pointers are 0mod8 here. A count of 127 or less skips the unrolled loop. */
$both_0mod8:
62
cmple $18, 127, $1 # E : Can we unroll the loop?
63
bne $1, $no_unroll # U :
64
and $16, 63, $1 # E : get mod64 alignment
65
beq $1, $do_unroll # U : no single quads to fiddle
66
67
/*
 * Copy one quad at a time until dest is 0mod64 (presumably so that the
 * wh64 hints below line up with whole 64-byte blocks of dest).
 */
$single_head_quad:
68
ldq $1, 0($17) # L : get 8 bytes
69
subq $18, 8, $18 # E : count -= 8
70
addq $17, 8, $17 # E : src += 8
71
nop # E :
72
73
stq $1, 0($16) # L : store
74
addq $16, 8, $16 # E : dest += 8
75
and $16, 63, $1 # E : get mod64 alignment
76
bne $1, $single_head_quad # U : still not fully aligned
77
78
/*
 * $7 is the address handed to wh64; it starts 64 bytes past dest.
 * The count is re-checked because the single-quad loop above reduced it.
 */
$do_unroll:
79
addq $16, 64, $7 # E : Initial (+1 trip) wh64 address
80
cmple $18, 127, $1 # E : Can we go through the unrolled loop?
81
bne $1, $tail_quads # U : Nope
82
nop # E :
83
84
/*
 * Each trip copies 64 bytes as two groups of four quads ($6, $4, $5, $3)
 * and lowers $18 by 64 once.  $2 = count at the start of the trip - 192;
 * when it is negative the wh64 address for the next trip is replaced by
 * the fallback in $1 (this trip's starting dest + 64) instead of the
 * value advanced by 64 above.
 */
$unroll_body:
85
wh64 ($7) # L1 : memory subsystem hint: 64 bytes at
86
# ($7) are about to be over-written
87
ldq $6, 0($17) # L0 : bytes 0..7
88
nop # E :
89
nop # E :
90
91
ldq $4, 8($17) # L : bytes 8..15
92
ldq $5, 16($17) # L : bytes 16..23
93
addq $7, 64, $7 # E : Update next wh64 address
94
nop # E :
95
96
ldq $3, 24($17) # L : bytes 24..31
97
addq $16, 64, $1 # E : fallback value for wh64
98
nop # E :
99
nop # E :
100
101
addq $17, 32, $17 # E : src += 32 bytes
102
stq $6, 0($16) # L : bytes 0..7
103
nop # E :
104
nop # E :
105
106
stq $4, 8($16) # L : bytes 8..15
107
stq $5, 16($16) # L : bytes 16..23
108
subq $18, 192, $2 # E : At least two more trips to go?
109
nop # E :
110
111
stq $3, 24($16) # L : bytes 24..31
112
addq $16, 32, $16 # E : dest += 32 bytes
113
nop # E :
114
nop # E :
115
116
ldq $6, 0($17) # L : bytes 0..7
117
ldq $4, 8($17) # L : bytes 8..15
118
cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use
119
# fallback wh64 address if < 2 more trips
120
nop # E :
121
122
ldq $5, 16($17) # L : bytes 16..23
123
ldq $3, 24($17) # L : bytes 24..31
124
addq $16, 32, $16 # E : dest += 32
125
subq $18, 64, $18 # E : count -= 64
126
127
addq $17, 32, $17 # E : src += 32
128
stq $6, -32($16) # L : bytes 0..7
129
stq $4, -24($16) # L : bytes 8..15
130
cmple $18, 63, $1 # E : At least one more trip?
131
132
stq $5, -16($16) # L : bytes 16..23
133
stq $3, -8($16) # L : bytes 24..31
134
nop # E :
135
beq $1, $unroll_body # U : loop while more than 63 bytes remain
136
137
/*
 * Quad-at-a-time copy for whatever is left.  $18 is kept biased by -8
 * in this section, so its sign tells whether another whole quad remains.
 */
$tail_quads:
138
$no_unroll:
139
.align 4
140
subq $18, 8, $18 # E : At least a quad left?
141
blt $18, $less_than_8 # U : Nope
142
nop # E :
143
nop # E :
144
145
$move_a_quad:
146
ldq $1, 0($17) # L : fetch 8
147
subq $18, 8, $18 # E : count -= 8
148
addq $17, 8, $17 # E : src += 8
149
nop # E :
150
151
stq $1, 0($16) # L : store 8
152
addq $16, 8, $16 # E : dest += 8
153
bge $18, $move_a_quad # U :
154
nop # E :
155
156
/* Undo the -8 bias; what remains in $18 is the number of trailing bytes. */
$less_than_8:
157
.align 4
158
addq $18, 8, $18 # E : add back for trailing bytes
159
ble $18, $nomoredata # U : All-done
160
nop # E :
161
nop # E :
162
163
/* Trailing bytes */
164
$tail_bytes:
165
subq $18, 1, $18 # E : count--
166
ldbu $1, 0($17) # L : fetch a byte
167
addq $17, 1, $17 # E : src++
168
nop # E :
169
170
stb $1, 0($16) # L : store a byte
171
addq $16, 1, $16 # E : dest++
172
bgt $18, $tail_bytes # U : more to be done?
173
nop # E :
174
175
/* branching to exit takes 3 extra cycles, so replicate exit here */
176
ret $31, ($26), 1 # L0 :
177
nop # E :
178
nop # E :
179
nop # E :
180
181
/*
 * src and dest differ mod 8.  $4 becomes the running dest pointer; it is
 * seeded from $0, which still equals the dest passed in $16.
 */
$misaligned:
182
mov $0, $4 # E : dest temp
183
and $0, 7, $1 # E : dest alignment mod8
184
beq $1, $dest_0mod8 # U : life doesnt totally suck
185
nop
186
187
/* Byte copies until $4 (dest) is 0mod8, stopping early if the count runs out. */
$aligndest:
188
ble $18, $nomoredata # U :
189
ldbu $1, 0($17) # L : fetch a byte
190
subq $18, 1, $18 # E : count--
191
addq $17, 1, $17 # E : src++
192
193
stb $1, 0($4) # L : store it
194
addq $4, 1, $4 # E : dest++
195
and $4, 7, $1 # E : dest 0mod8 yet?
196
bne $1, $aligndest # U : go until we are aligned.
197
198
/* Source has unknown alignment, but dest is known to be 0mod8 */
199
$dest_0mod8:
200
subq $18, 8, $18 # E : At least a quad left?
201
blt $18, $misalign_tail # U : Nope
202
ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes
203
nop # E :
204
205
/*
 * $3 holds the previously loaded source quad and $16 receives the next
 * one, so $16 no longer holds a dest address from here on.  extql and
 * extqh, both keyed on the src address in $17, produce the two parts
 * that bis merges into the quad stored at ($4).
 */
$mis_quad:
206
ldq_u $16, 8($17) # L : Fetch next 8
207
extql $3, $17, $3 # U : masking
208
extqh $16, $17, $1 # U : masking
209
bis $3, $1, $1 # E : merged bytes to store
210
211
subq $18, 8, $18 # E : count -= 8
212
addq $17, 8, $17 # E : src += 8
213
stq $1, 0($4) # L : store 8 (aligned)
214
mov $16, $3 # E : "rotate" source data
215
216
addq $4, 8, $4 # E : dest += 8
217
bge $18, $mis_quad # U : More quads to move
218
nop
219
nop
220
221
/* Undo the -8 bias applied at $dest_0mod8; $18 is the trailing byte count. */
$misalign_tail:
222
addq $18, 8, $18 # E : account for tail stuff
223
ble $18, $nomoredata # U :
224
nop
225
nop
226
227
$misalign_byte:
228
ldbu $1, 0($17) # L : fetch 1
229
subq $18, 1, $18 # E : count--
230
addq $17, 1, $17 # E : src++
231
nop # E :
232
233
stb $1, 0($4) # L : store
234
addq $4, 1, $4 # E : dest++
235
bgt $18, $misalign_byte # U : more to go?
236
nop
237
238
239
/* Common exit; $0 was loaded with dest at entry. */
$nomoredata:
240
ret $31, ($26), 1 # L0 :
241
nop # E :
242
nop # E :
243
nop # E :
244
245
.end memcpy
246
/* Export memcpy through the EXPORT_SYMBOL macro (presumably provided by <linux/export.h>, which this file includes). */
EXPORT_SYMBOL(memcpy)
247
248
/* For backwards module compatibility: make __memcpy a global alias for memcpy. */
249
__memcpy = memcpy
250
.globl __memcpy
251
252