/* Source: torvalds/linux — arch/alpha/lib/ev6-copy_page.S */
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <[email protected]>.

   First Problem: STQ overflows.
   -----------------------------

   It would be nice if EV6 handled every resource overflow efficiently,
   but for some it doesn't.  Including store queue overflows.  It causes
   a trap and a restart of the pipe.

   To get around this we sometimes use (to borrow a term from a VSSAD
   researcher) "aeration".  The idea is to slow the rate at which the
   processor receives valid instructions by inserting nops in the fetch
   path.  In doing so, you can prevent the overflow and actually make
   the code run faster.  You can, of course, take advantage of the fact
   that the processor can fetch at most 4 aligned instructions per cycle.

   I inserted enough nops to force it to take 10 cycles to fetch the
   loop code.  In theory, EV6 should be able to execute this loop in
   9 cycles but I was not able to get it to run that fast -- the initial
   conditions were such that I could not reach this optimum rate on
   (chaotic) EV6.  I wrote the code such that everything would issue
   in order.

   Second Problem: Dcache index matches.
   -------------------------------------

   If you are going to use this routine on random aligned pages, there
   is a 25% chance that the pages will be at the same dcache indices.
   This results in many nasty memory traps without care.

   The solution is to schedule the prefetches to avoid the memory
   conflicts.  I schedule the wh64 prefetches farther ahead of the
   read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

   In order to improve the code I added deeper prefetching to take the
   most advantage of EV6's bandwidth.

   I also prefetched the read stream.  Note that adding the read prefetch
   forced me to add another cycle to the inner-most kernel - up to 11
   from the original 8 cycles per iteration.  We could improve performance
   further by unrolling the loop and doing multiple prefetches per cycle.

   I think that the code below will be very robust and fast code for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */
#include <linux/export.h>

/*
 * void copy_page(void *to, void *from)
 *
 * Alpha calling convention: $16 = to (destination), $17 = from (source).
 * Copies one page (118 + 10 iterations of 64 bytes = 8192 bytes, i.e.
 * the 8 KB Alpha page) in 64-byte cache-line chunks.
 *
 * Register roles:
 *   $0-$7  eight 8-byte data words of the current cache line
 *   $16    destination pointer, advanced by 64 each iteration
 *   $17    source pointer, advanced by 64 each iteration
 *   $18    remaining-iteration counter
 *   $19    running write-hint (wh64) address, kept 10 lines ahead
 *   $31    architectural zero register; "ldl $31,..." discards the
 *          loaded value and serves purely as a read prefetch
 *
 * The instruction order, the nop/unop padding ("aeration") and the
 * prefetch distances are deliberate EV6 scheduling — see the header
 * comment.  Do not reorder or remove instructions here.
 */
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0			/* leaf function, no stack frame */

	/* Prefetch 5 read cachelines; write-hint 10 cache lines. */
	wh64 ($16)			/* write-hint dst line 0 */
	ldl $31,0($17)			/* read-prefetch src line 0 */
	ldl $31,64($17)			/* read-prefetch src line 1 */
	lda $1,1*64($16)

	wh64 ($1)			/* write-hint dst line 1 */
	ldl $31,128($17)		/* read-prefetch src line 2 */
	ldl $31,192($17)		/* read-prefetch src line 3 */
	lda $1,2*64($16)

	wh64 ($1)			/* write-hint dst line 2 */
	ldl $31,256($17)		/* read-prefetch src line 4 */
	lda $18,118			/* main loop count: 118 of 128 lines */
	lda $1,3*64($16)

	wh64 ($1)			/* write-hint dst line 3 */
	nop
	lda $1,4*64($16)
	lda $2,5*64($16)

	wh64 ($1)			/* write-hint dst line 4 */
	wh64 ($2)			/* write-hint dst line 5 */
	lda $1,6*64($16)
	lda $2,7*64($16)

	wh64 ($1)			/* write-hint dst line 6 */
	wh64 ($2)			/* write-hint dst line 7 */
	lda $1,8*64($16)
	lda $2,9*64($16)

	wh64 ($1)			/* write-hint dst line 8 */
	wh64 ($2)			/* write-hint dst line 9 */
	lda $19,10*64($16)		/* $19 = next line to write-hint (10 ahead) */
	nop

	/* Main prefetching/write-hinting loop.  Each iteration moves one
	   64-byte line; the unops pad fetch to the intended cycle count. */
1:	ldq $0,0($17)
	ldq $1,8($17)
	unop
	unop

	unop
	unop
	ldq $2,16($17)
	ldq $3,24($17)

	ldq $4,32($17)
	ldq $5,40($17)
	unop
	unop

	unop
	unop
	ldq $6,48($17)
	ldq $7,56($17)

	ldl $31,320($17)		/* read-prefetch 5 lines ahead of src */
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum. */
	unop
	unop
	unop
	unop

	wh64 ($19)			/* write-hint 10 lines ahead of dst */
	unop
	unop
	unop

	stq $0,0($16)
	subq $18,1,$18			/* --count */
	stq $1,8($16)
	unop

	unop
	stq $2,16($16)
	addq $17,64,$17			/* src += 64 */
	stq $3,24($16)

	stq $4,32($16)
	stq $5,40($16)
	addq $19,64,$19			/* hint addr += 64 */
	unop

	stq $6,48($16)
	stq $7,56($16)
	addq $16,64,$16			/* dst += 64 */
	bne $18, 1b

	/* Prefetch the final 5 cache lines of the read stream. */
	lda $18,10			/* cleanup loop count: last 10 lines */
	ldl $31,320($17)
	ldl $31,384($17)
	ldl $31,448($17)

	ldl $31,512($17)
	ldl $31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  Prefetching here would run past the
	   end of the source page. */
2:	ldq $0,0($17)
	ldq $1,8($17)
	ldq $2,16($17)
	ldq $3,24($17)

	ldq $4,32($17)
	ldq $5,40($17)
	ldq $6,48($17)
	ldq $7,56($17)

	stq $0,0($16)
	subq $18,1,$18			/* --count */
	stq $1,8($16)
	addq $17,64,$17			/* src += 64 */

	stq $2,16($16)
	stq $3,24($16)
	stq $4,32($16)
	stq $5,40($16)

	stq $6,48($16)
	stq $7,56($16)
	addq $16,64,$16			/* dst += 64 */
	bne $18, 2b

	ret
	/* Padding after ret keeps the next routine's entry aligned. */
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)