/* Source: torvalds/linux — arch/alpha/lib/ev6-copy_page.S */
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <[email protected]>.

   First Problem: STQ overflows.
   -----------------------------

   It would be nice if EV6 handled every resource overflow efficiently,
   but for some it doesn't.  Including store queue overflows.  It causes
   a trap and a restart of the pipe.

   To get around this we sometimes use (to borrow a term from a VSSAD
   researcher) "aeration".  The idea is to slow the rate at which the
   processor receives valid instructions by inserting nops in the fetch
   path.  In doing so, you can prevent the overflow and actually make
   the code run faster.  You can, of course, take advantage of the fact
   that the processor can fetch at most 4 aligned instructions per cycle.

   I inserted enough nops to force it to take 10 cycles to fetch the
   loop code.  In theory, EV6 should be able to execute this loop in
   9 cycles but I was not able to get it to run that fast -- the initial
   conditions were such that I could not reach this optimum rate on
   (chaotic) EV6.  I wrote the code such that everything would issue
   in order.

   Second Problem: Dcache index matches.
   -------------------------------------

   If you are going to use this routine on random aligned pages, there
   is a 25% chance that the pages will be at the same dcache indices.
   This results in many nasty memory traps without care.

   The solution is to schedule the prefetches to avoid the memory
   conflicts.  I schedule the wh64 prefetches farther ahead of the
   read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

   In order to improve the code I added deeper prefetching to take the
   most advantage of EV6's bandwidth.

   I also prefetched the read stream.  Note that adding the read prefetch
   forced me to add another cycle to the inner-most kernel - up to 11
   from the original 8 cycles per iteration.  We could improve performance
   further by unrolling the loop and doing multiple prefetches per cycle.

   I think that the code below will be very robust and fast code for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */
#include <linux/export.h>

/*
 * void copy_page(void *to, void *from)
 *
 * Alpha calling convention: $16 = to (destination), $17 = from (source).
 * Copies one page (118 + 10 iterations of 64 bytes = 8192 bytes, i.e.
 * the 8 KB Alpha page) in 64-byte cache-line chunks.
 *
 * Register roles:
 *   $0-$7  eight 8-byte data words of the current cache line
 *   $16    destination pointer, advanced by 64 each iteration
 *   $17    source pointer, advanced by 64 each iteration
 *   $18    remaining-iteration counter
 *   $19    running write-hint (wh64) address, kept 10 lines ahead
 *   $31    architectural zero register; "ldl $31,..." discards the
 *          loaded value and serves purely as a read prefetch
 *
 * The instruction order, the nop/unop padding ("aeration") and the
 * prefetch distances are deliberate EV6 scheduling — see the header
 * comment.  Do not reorder or remove instructions here.
 */
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0			/* leaf function, no stack frame */

	/* Prefetch 5 read cachelines; write-hint 10 cache lines. */
	wh64 ($16)			/* write-hint dst line 0 */
	ldl $31,0($17)			/* read-prefetch src line 0 */
	ldl $31,64($17)			/* read-prefetch src line 1 */
	lda $1,1*64($16)

	wh64 ($1)			/* write-hint dst line 1 */
	ldl $31,128($17)		/* read-prefetch src line 2 */
	ldl $31,192($17)		/* read-prefetch src line 3 */
	lda $1,2*64($16)

	wh64 ($1)			/* write-hint dst line 2 */
	ldl $31,256($17)		/* read-prefetch src line 4 */
	lda $18,118			/* main loop count: 118 of 128 lines */
	lda $1,3*64($16)

	wh64 ($1)			/* write-hint dst line 3 */
	nop
	lda $1,4*64($16)
	lda $2,5*64($16)

	wh64 ($1)			/* write-hint dst line 4 */
	wh64 ($2)			/* write-hint dst line 5 */
	lda $1,6*64($16)
	lda $2,7*64($16)

	wh64 ($1)			/* write-hint dst line 6 */
	wh64 ($2)			/* write-hint dst line 7 */
	lda $1,8*64($16)
	lda $2,9*64($16)

	wh64 ($1)			/* write-hint dst line 8 */
	wh64 ($2)			/* write-hint dst line 9 */
	lda $19,10*64($16)		/* $19 = next line to write-hint (10 ahead) */
	nop

	/* Main prefetching/write-hinting loop.  Each iteration moves one
	   64-byte line; the unops pad fetch to the intended cycle count. */
1:	ldq $0,0($17)
	ldq $1,8($17)
	unop
	unop

	unop
	unop
	ldq $2,16($17)
	ldq $3,24($17)

	ldq $4,32($17)
	ldq $5,40($17)
	unop
	unop

	unop
	unop
	ldq $6,48($17)
	ldq $7,56($17)

	ldl $31,320($17)		/* read-prefetch 5 lines ahead of src */
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum. */
	unop
	unop
	unop
	unop

	wh64 ($19)			/* write-hint 10 lines ahead of dst */
	unop
	unop
	unop

	stq $0,0($16)
	subq $18,1,$18			/* --count */
	stq $1,8($16)
	unop

	unop
	stq $2,16($16)
	addq $17,64,$17			/* src += 64 */
	stq $3,24($16)

	stq $4,32($16)
	stq $5,40($16)
	addq $19,64,$19			/* hint addr += 64 */
	unop

	stq $6,48($16)
	stq $7,56($16)
	addq $16,64,$16			/* dst += 64 */
	bne $18, 1b

	/* Prefetch the final 5 cache lines of the read stream. */
	lda $18,10			/* cleanup loop count: last 10 lines */
	ldl $31,320($17)
	ldl $31,384($17)
	ldl $31,448($17)

	ldl $31,512($17)
	ldl $31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  Prefetching here would run past the
	   end of the source page. */
2:	ldq $0,0($17)
	ldq $1,8($17)
	ldq $2,16($17)
	ldq $3,24($17)

	ldq $4,32($17)
	ldq $5,40($17)
	ldq $6,48($17)
	ldq $7,56($17)

	stq $0,0($16)
	subq $18,1,$18			/* --count */
	stq $1,8($16)
	addq $17,64,$17			/* src += 64 */

	stq $2,16($16)
	stq $3,24($16)
	stq $4,32($16)
	stq $5,40($16)

	stq $6,48($16)
	stq $7,56($16)
	addq $16,64,$16			/* dst += 64 */
	bne $18, 2b

	ret
	/* Padding after ret keeps the next routine's entry aligned. */
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)