GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/x86/lib/mmx_32.c
/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQ's. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code neutral filler for the short jump
 *		leal ebx,[ebx] is apparently best for K6-2, but Cyrix ??
 *	We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution figured.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/i387.h>
#include <asm/asm.h>

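/*
 * Note on the ".fixup" sections below: the 3DNow! prefetch instructions can
 * fault on CPUs that do not implement them.  When that happens, the fixup
 * code at local label 3 overwrites the faulting prefetch at label 1 with a
 * two-byte short jump (the word 0x1AEB is "jmp +26" in little-endian byte
 * order, 0x05EB is "jmp +5") and then jumps back to label 2, so later
 * passes through the same code skip the prefetch block instead of faulting
 * again.
 */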
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6; /* len/64 */

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n" /* This set is 28 bytes */
		" prefetch 64(%0)\n"
		" prefetch 128(%0)\n"
		" prefetch 192(%0)\n"
		" prefetch 256(%0)\n"
		"2: \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
		" jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));

	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		" movq 8(%0), %%mm1\n"
		" movq 16(%0), %%mm2\n"
		" movq 24(%0), %%mm3\n"
		" movq %%mm0, (%1)\n"
		" movq %%mm1, 8(%1)\n"
		" movq %%mm2, 16(%1)\n"
		" movq %%mm3, 24(%1)\n"
		" movq 32(%0), %%mm0\n"
		" movq 40(%0), %%mm1\n"
		" movq 48(%0), %%mm2\n"
		" movq 56(%0), %%mm3\n"
		" movq %%mm0, 32(%1)\n"
		" movq %%mm1, 40(%1)\n"
		" movq %%mm2, 48(%1)\n"
		" movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
		" jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
		" movq (%0), %%mm0\n"
		" movq 8(%0), %%mm1\n"
		" movq 16(%0), %%mm2\n"
		" movq 24(%0), %%mm3\n"
		" movq %%mm0, (%1)\n"
		" movq %%mm1, 8(%1)\n"
		" movq %%mm2, 16(%1)\n"
		" movq %%mm3, 24(%1)\n"
		" movq 32(%0), %%mm0\n"
		" movq 40(%0), %%mm1\n"
		" movq 48(%0), %%mm2\n"
		" movq 56(%0), %%mm3\n"
		" movq %%mm0, 32(%1)\n"
		" movq %%mm1, 40(%1)\n"
		" movq %%mm2, 48(%1)\n"
		" movq %%mm3, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();

	return p;
}
EXPORT_SYMBOL(_mmx_memcpy);
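
/*
 * Illustrative sketch, not part of the original file: _mmx_memcpy() only
 * pays off when the copy is large enough to amortize the FPU state
 * save/restore done by kernel_fpu_begin()/kernel_fpu_end().  A caller could
 * dispatch on size roughly like this; the name and the 512-byte threshold
 * are assumptions chosen for illustration, not values taken from this file.
 */
static inline void *mmx_memcpy_example(void *to, const void *from, size_t len)
{
	if (len < 512)				/* small copy: setup cost dominates */
		return __memcpy(to, from, len);
	return _mmx_memcpy(to, from, len);	/* large copy: 64-byte MMX blocks */
}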

#ifdef CONFIG_MK7

/*
 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 * other MMX using processors do not.
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		" pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		" movntq %%mm0, (%0)\n"
		" movntq %%mm0, 8(%0)\n"
		" movntq %%mm0, 16(%0)\n"
		" movntq %%mm0, 24(%0)\n"
		" movntq %%mm0, 32(%0)\n"
		" movntq %%mm0, 40(%0)\n"
		" movntq %%mm0, 48(%0)\n"
		" movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page += 64;
	}

	/*
	 * Since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence\n"::);

	kernel_fpu_end();
}
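
/*
 * Note: movntq is a non-temporal store; it goes out through write-combining
 * buffers instead of allocating cache lines, so clearing or copying a whole
 * page does not pollute the caller's working set in the cache.  That is also
 * why the weakly-ordered stores need the trailing "sfence" above (and in
 * fast_copy_page() below) to become ordered again.
 */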

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/*
	 * maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__(
		"1: prefetch (%0)\n"
		" prefetch 64(%0)\n"
		" prefetch 128(%0)\n"
		" prefetch 192(%0)\n"
		" prefetch 256(%0)\n"
		"2: \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
		" jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		" movntq %%mm0, (%1)\n"
		" movq 8(%0), %%mm1\n"
		" movntq %%mm1, 8(%1)\n"
		" movq 16(%0), %%mm2\n"
		" movntq %%mm2, 16(%1)\n"
		" movq 24(%0), %%mm3\n"
		" movntq %%mm3, 24(%1)\n"
		" movq 32(%0), %%mm4\n"
		" movntq %%mm4, 32(%1)\n"
		" movq 40(%0), %%mm5\n"
		" movntq %%mm5, 40(%1)\n"
		" movq 48(%0), %%mm6\n"
		" movntq %%mm6, 48(%1)\n"
		" movq 56(%0), %%mm7\n"
		" movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
		" jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		" movntq %%mm0, (%1)\n"
		" movq 8(%0), %%mm1\n"
		" movntq %%mm1, 8(%1)\n"
		" movq 16(%0), %%mm2\n"
		" movntq %%mm2, 16(%1)\n"
		" movq 24(%0), %%mm3\n"
		" movntq %%mm3, 24(%1)\n"
		" movq 32(%0), %%mm4\n"
		" movntq %%mm4, 32(%1)\n"
		" movq 40(%0), %%mm5\n"
		" movntq %%mm5, 40(%1)\n"
		" movq 48(%0), %%mm6\n"
		" movntq %%mm6, 48(%1)\n"
		" movq 56(%0), %%mm7\n"
		" movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/*
	 * Since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence \n"::);
	kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 * Generic MMX implementation without K7 specific streaming
 */
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		" pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
		" movq %%mm0, (%0)\n"
		" movq %%mm0, 8(%0)\n"
		" movq %%mm0, 16(%0)\n"
		" movq %%mm0, 24(%0)\n"
		" movq %%mm0, 32(%0)\n"
		" movq %%mm0, 40(%0)\n"
		" movq %%mm0, 48(%0)\n"
		" movq %%mm0, 56(%0)\n"
		" movq %%mm0, 64(%0)\n"
		" movq %%mm0, 72(%0)\n"
		" movq %%mm0, 80(%0)\n"
		" movq %%mm0, 88(%0)\n"
		" movq %%mm0, 96(%0)\n"
		" movq %%mm0, 104(%0)\n"
		" movq %%mm0, 112(%0)\n"
		" movq %%mm0, 120(%0)\n"
		: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		" prefetch 64(%0)\n"
		" prefetch 128(%0)\n"
		" prefetch 192(%0)\n"
		" prefetch 256(%0)\n"
		"2: \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
		" jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		" movq 8(%0), %%mm1\n"
		" movq 16(%0), %%mm2\n"
		" movq 24(%0), %%mm3\n"
		" movq %%mm0, (%1)\n"
		" movq %%mm1, 8(%1)\n"
		" movq %%mm2, 16(%1)\n"
		" movq %%mm3, 24(%1)\n"
		" movq 32(%0), %%mm0\n"
		" movq 40(%0), %%mm1\n"
		" movq 48(%0), %%mm2\n"
		" movq 56(%0), %%mm3\n"
		" movq %%mm0, 32(%1)\n"
		" movq %%mm1, 40(%1)\n"
		" movq %%mm2, 48(%1)\n"
		" movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
		" jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 * Favour MMX for page clear and copy:
 */
static void slow_zero_page(void *page)
{
	int d0, d1;

	/* 1024 stosl iterations x 4 bytes = one 4096-byte page */
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"

		: "=&c" (d0), "=&D" (d1)
		:"a" (0), "1" (page), "0" (1024)
		:"memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

	/* 1024 movsl iterations x 4 bytes = one 4096-byte page */
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);
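
/*
 * Illustrative sketch, not taken from this file: on 32-bit kernels built
 * with 3DNow! support the arch headers are expected to route the generic
 * page helpers to the exported MMX versions above.  The guard and names
 * below are hypothetical placeholders showing that wiring, not the real
 * header contents.
 */
#ifdef EXAMPLE_USE_MMX_PAGE_OPS			/* hypothetical config guard */
static inline void example_clear_page(void *page)
{
	mmx_clear_page(page);	/* falls back to rep stosl in interrupt context */
}

static inline void example_copy_page(void *to, void *from)
{
	mmx_copy_page(to, from);	/* falls back to rep movsl in interrupt context */
}
#endif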