/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <[email protected]>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow efficiently,
	but for some it doesn't.  Including store queue overflows.  It causes
	a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order.

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	This results in many nasty memory traps without care.

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream.  Note that adding the read
	prefetch forced me to add another cycle to the inner-most kernel -
	up to 11 from the original 8 cycles per iteration.  We could improve
	performance further by unrolling the loop and doing multiple
	prefetches per cycle.

	I think that the code below will be very robust and fast code for
	the purposes of copying aligned pages.  It is slower when both
	source and destination pages are in the dcache, but it is my guess
	that this is less important than the dcache miss case.  */

	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)
	nop

	/* Main prefetching/write-hinting loop.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.  */
	lda	$18,10
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	ret
	nop
	unop
	nop

	.end copy_page
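For readers who want the algorithm without the EV6 scheduling details, here is a hedged, portable C sketch of what the routine does: copy one 8 KB page as 64-bit quadwords, one 64-byte cache line (eight quadwords) per iteration. This is an illustration, not kernel code; the name `copy_page_sketch` is invented here, the wh64 write hints, ldl prefetches, and nop aeration have no portable C equivalent and are only noted in comments, and the 8 KB page / 64-byte line sizes are the Alpha values the assembly assumes.

```c
#include <stdint.h>
#include <stddef.h>

/* Assumed Alpha parameters: 8 KB pages, 64-byte cache lines. */
#define PAGE_SIZE 8192
#define LINE_SIZE 64

/* Portable sketch of the copy loop.  The assembly splits the 128
   line-sized iterations into 118 prefetching/write-hinting ones plus a
   10-iteration cleanup loop; here they are collapsed into one loop,
   since plain C cannot express wh64 write hints or read prefetches. */
static void copy_page_sketch(void *dst, const void *src)
{
	uint64_t *d = dst;
	const uint64_t *s = src;
	size_t lines = PAGE_SIZE / LINE_SIZE;	/* 128 iterations */

	while (lines--) {
		/* Eight 8-byte loads then eight 8-byte stores,
		   mirroring the ldq/stq pattern of the loops above. */
		uint64_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
		uint64_t q4 = s[4], q5 = s[5], q6 = s[6], q7 = s[7];
		d[0] = q0; d[1] = q1; d[2] = q2; d[3] = q3;
		d[4] = q4; d[5] = q5; d[6] = q6; d[7] = q7;
		s += 8;
		d += 8;
	}
}
```

The load-all-then-store-all order within each line echoes the assembly's grouping, which keeps the stores bunched so the wh64-hinted destination lines are written whole; a compiler is free to reorder these in C, so the sketch preserves only the semantics, not the schedule.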