GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/alpha/lib/ev6-memcpy.S
/*
 * arch/alpha/lib/ev6-memcpy.S
 * 21264 version by Rick Gorton <[email protected]>
 *
 * Reasonably optimized memcpy() routine for the Alpha 21264
 *
 *	- memory accessed as aligned quadwords only
 *	- uses cmpbge to compare 8 bytes in parallel
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 *
 * Temp usage notes:
 *	$1,$2,	- scratch
 */

	.set noreorder
	.set noat

	.align 4
	.globl memcpy
	.ent memcpy
memcpy:
	.frame $30,0,$26,0
	.prologue 0

	mov	$16, $0		# E : copy dest to return
	ble	$18, $nomoredata	# U : done with the copy?
	xor	$16, $17, $1	# E : are source and dest alignments the same?
	and	$1, 7, $1	# E : are they the same mod 8?

	bne	$1, $misaligned	# U : Nope - gotta do this the slow way
	/* source and dest are same mod 8 address */
	and	$16, 7, $1	# E : Are both 0mod8?
	beq	$1, $both_0mod8	# U : Yes
	nop			# E :

	/*
	 * source and dest are same misalignment. move a byte at a time
	 * until a 0mod8 alignment for both is reached.
	 * At least one byte more to move
	 */

$head_align:
	ldbu	$1, 0($17)	# L : grab a byte
	subq	$18, 1, $18	# E : count--
	addq	$17, 1, $17	# E : src++
	stb	$1, 0($16)	# L :
	addq	$16, 1, $16	# E : dest++
	and	$16, 7, $1	# E : Are we at 0mod8 yet?
	ble	$18, $nomoredata	# U : done with the copy?
	bne	$1, $head_align	# U :

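/*
 * Both src and dest are now 0mod8.  If fewer than 128 bytes remain,
 * skip straight to the simple quad loop; otherwise move single
 * quadwords until dest is 64-byte aligned, so each trip of the
 * unrolled loop below writes one whole, aligned 64-byte block
 * (matching its wh64 hints).
 */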
$both_0mod8:
	cmple	$18, 127, $1	# E : Can we unroll the loop?
	bne	$1, $no_unroll	# U :
	and	$16, 63, $1	# E : get mod64 alignment
	beq	$1, $do_unroll	# U : no single quads to fiddle

$single_head_quad:
	ldq	$1, 0($17)	# L : get 8 bytes
	subq	$18, 8, $18	# E : count -= 8
	addq	$17, 8, $17	# E : src += 8
	nop			# E :

	stq	$1, 0($16)	# L : store
	addq	$16, 8, $16	# E : dest += 8
	and	$16, 63, $1	# E : get mod64 alignment
	bne	$1, $single_head_quad	# U : still not fully aligned

$do_unroll:
	addq	$16, 64, $7	# E : Initial (+1 trip) wh64 address
	cmple	$18, 127, $1	# E : Can we go through the unrolled loop?
	bne	$1, $tail_quads	# U : Nope
	nop			# E :

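/*
 * Main unrolled loop: each trip copies 64 bytes as two 32-byte
 * load/store groups.  wh64 tells the memory system that the aligned
 * 64-byte block at ($7) is about to be entirely overwritten, so the
 * cache can allocate it without fetching it from memory.  The hint
 * normally runs one trip ahead of the stores; when fewer than two
 * trips remain, the cmovlt below falls back to the block the next
 * trip itself writes, keeping the hint inside memory this copy
 * actually stores to.
 */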
$unroll_body:
	wh64	($7)		# L1 : memory subsystem hint: 64 bytes at
				# ($7) are about to be over-written
	ldq	$6, 0($17)	# L0 : bytes 0..7
	nop			# E :
	nop			# E :

	ldq	$4, 8($17)	# L : bytes 8..15
	ldq	$5, 16($17)	# L : bytes 16..23
	addq	$7, 64, $7	# E : Update next wh64 address
	nop			# E :

	ldq	$3, 24($17)	# L : bytes 24..31
	addq	$16, 64, $1	# E : fallback value for wh64
	nop			# E :
	nop			# E :

	addq	$17, 32, $17	# E : src += 32 bytes
	stq	$6, 0($16)	# L : bytes 0..7
	nop			# E :
	nop			# E :

	stq	$4, 8($16)	# L : bytes 8..15
	stq	$5, 16($16)	# L : bytes 16..23
	subq	$18, 192, $2	# E : At least two more trips to go?
	nop			# E :

	stq	$3, 24($16)	# L : bytes 24..31
	addq	$16, 32, $16	# E : dest += 32 bytes
	nop			# E :
	nop			# E :

	ldq	$6, 0($17)	# L : bytes 0..7
	ldq	$4, 8($17)	# L : bytes 8..15
	cmovlt	$2, $1, $7	# E : Latency 2, extra map slot - Use
				# fallback wh64 address if < 2 more trips
	nop			# E :

	ldq	$5, 16($17)	# L : bytes 16..23
	ldq	$3, 24($17)	# L : bytes 24..31
	addq	$16, 32, $16	# E : dest += 32
	subq	$18, 64, $18	# E : count -= 64

	addq	$17, 32, $17	# E : src += 32
	stq	$6, -32($16)	# L : bytes 0..7
	stq	$4, -24($16)	# L : bytes 8..15
	cmple	$18, 63, $1	# E : At least one more trip?

	stq	$5, -16($16)	# L : bytes 16..23
	stq	$3, -8($16)	# L : bytes 24..31
	nop			# E :
	beq	$1, $unroll_body

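/*
 * Short or leftover copies: move whole quadwords while at least 8
 * bytes remain, then fall through to the trailing-byte loop.
 */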
$tail_quads:
$no_unroll:
	.align 4
	subq	$18, 8, $18	# E : At least a quad left?
	blt	$18, $less_than_8	# U : Nope
	nop			# E :
	nop			# E :

$move_a_quad:
	ldq	$1, 0($17)	# L : fetch 8
	subq	$18, 8, $18	# E : count -= 8
	addq	$17, 8, $17	# E : src += 8
	nop			# E :

	stq	$1, 0($16)	# L : store 8
	addq	$16, 8, $16	# E : dest += 8
	bge	$18, $move_a_quad	# U :
	nop			# E :

$less_than_8:
	.align 4
	addq	$18, 8, $18	# E : add back for trailing bytes
	ble	$18, $nomoredata	# U : All-done
	nop			# E :
	nop			# E :

	/* Trailing bytes */
$tail_bytes:
	subq	$18, 1, $18	# E : count--
	ldbu	$1, 0($17)	# L : fetch a byte
	addq	$17, 1, $17	# E : src++
	nop			# E :

	stb	$1, 0($16)	# L : store a byte
	addq	$16, 1, $16	# E : dest++
	bgt	$18, $tail_bytes	# U : more to be done?
	nop			# E :

	/* branching to exit takes 3 extra cycles, so replicate exit here */
	ret	$31, ($26), 1	# L0 :
	nop			# E :
	nop			# E :
	nop			# E :

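/*
 * Source and dest alignments differ mod 8.  Byte-copy until dest is
 * 0mod8, then build whole quadwords from the unaligned source with
 * ldq_u plus extract/merge, and finish any remainder a byte at a time.
 */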
$misaligned:
	mov	$0, $4		# E : dest temp
	and	$0, 7, $1	# E : dest alignment mod8
	beq	$1, $dest_0mod8	# U : life doesn't totally suck
	nop

$aligndest:
	ble	$18, $nomoredata	# U :
	ldbu	$1, 0($17)	# L : fetch a byte
	subq	$18, 1, $18	# E : count--
	addq	$17, 1, $17	# E : src++

	stb	$1, 0($4)	# L : store it
	addq	$4, 1, $4	# E : dest++
	and	$4, 7, $1	# E : dest 0mod8 yet?
	bne	$1, $aligndest	# U : go until we are aligned.

	/* Source has unknown alignment, but dest is known to be 0mod8 */
$dest_0mod8:
	subq	$18, 8, $18	# E : At least a quad left?
	blt	$18, $misalign_tail	# U : Nope
	ldq_u	$3, 0($17)	# L : seed (rotating load) of 8 bytes
	nop			# E :

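/*
 * $3 holds the previously fetched (unaligned) source quadword.  Each
 * trip loads the next one with ldq_u, shifts the pair with extql/extqh
 * by the byte offset in $17, and ORs them into a single aligned
 * quadword for the store.  $16 is free for scratch here because the
 * return value was already saved in $0 and dest is tracked in $4.
 */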
$mis_quad:
	ldq_u	$16, 8($17)	# L : Fetch next 8
	extql	$3, $17, $3	# U : masking
	extqh	$16, $17, $1	# U : masking
	bis	$3, $1, $1	# E : merged bytes to store

	subq	$18, 8, $18	# E : count -= 8
	addq	$17, 8, $17	# E : src += 8
	stq	$1, 0($4)	# L : store 8 (aligned)
	mov	$16, $3		# E : "rotate" source data

	addq	$4, 8, $4	# E : dest += 8
	bge	$18, $mis_quad	# U : More quads to move
	nop
	nop

$misalign_tail:
	addq	$18, 8, $18	# E : account for tail stuff
	ble	$18, $nomoredata	# U :
	nop
	nop

$misalign_byte:
	ldbu	$1, 0($17)	# L : fetch 1
	subq	$18, 1, $18	# E : count--
	addq	$17, 1, $17	# E : src++
	nop			# E :

	stb	$1, 0($4)	# L : store
	addq	$4, 1, $4	# E : dest++
	bgt	$18, $misalign_byte	# U : more to go?
	nop

$nomoredata:
	ret	$31, ($26), 1	# L0 :
	nop			# E :
	nop			# E :
	nop			# E :

	.end memcpy

	/* For backwards module compatibility. */
__memcpy = memcpy
	.globl __memcpy
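
As a reading aid, the control flow above corresponds roughly to the following C sketch: byte-copy until the destination is 8-byte aligned, copy whole quadwords, then copy the trailing bytes. This is only an illustration, not part of the kernel source; sketch_memcpy is a hypothetical name, and the sketch omits the wh64 cache hints, the 64-byte loop unrolling, and the ldq_u/extql/extqh merging that the assembly uses for a misaligned source.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical reference sketch, not part of this file. */
void *sketch_memcpy(void *dst, const void *src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	/* Same alignment mod 8: byte-copy the head, then whole quadwords. */
	if ((((uintptr_t)d ^ (uintptr_t)s) & 7) == 0) {
		while (n && ((uintptr_t)d & 7)) {
			*d++ = *s++;
			n--;
		}
		while (n >= 8) {
			*(uint64_t *)(void *)d = *(const uint64_t *)(const void *)s;
			d += 8;
			s += 8;
			n -= 8;
		}
	}

	/* Trailing bytes, or the whole copy when the alignments differ. */
	while (n) {
		*d++ = *s++;
		n--;
	}

	return dst;
}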