/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle ([email protected])
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
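/*
 * Illustrative sketch (not part of the original file): seen from C, the
 * __copy_user contract is what lets the generic uaccess helpers report
 * "bytes not copied" to their callers, e.g.:
 *
 *     if (copy_from_user(kbuf, ubuf, n))  // nonzero => n not fully copied
 *             return -EFAULT;
 *
 * A zero return means all n bytes were copied; otherwise the return value
 * is the residue that __copy_user left in len (a2). kbuf/ubuf/n are
 * example names, not identifiers from this file.
 */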

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contains the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry).
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
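/*
 * Illustrative sketch (not part of the original file): invariants (1)-(3)
 * are exactly what the load-fault fixup below (l_exc) needs. In C-like
 * pseudocode, with fault_addr the address that faulted:
 *
 *     len = AT - fault_addr;                  // uncopied bytes, by (1)+(2)
 *     zero_from = dst + (fault_addr - src);   // valid because of (3)
 *     memset(zero_from, 0, len);              // never leak stale kernel data
 */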

#define EXC(inst_reg,addr,handler) \
9:      inst_reg, addr; \
        .section __ex_table,"a"; \
        PTR 9b, handler; \
        .previous
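/*
 * Illustrative expansion (not part of the original file): for instance,
 * EXC( LOAD t0, UNIT(0)(src), l_exc) emits the load under local label 9
 * and records a __ex_table entry pairing the instruction's address with
 * its fixup handler:
 *
 *     9: LOAD t0, UNIT(0)(src);
 *        .section __ex_table,"a";
 *        PTR 9b, l_exc;
 *        .previous
 */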

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD ld
#define LOADL ldl
#define LOADR ldr
#define STOREL sdl
#define STORER sdr
#define STORE sd
#define ADD daddu
#define SUB dsubu
#define SRL dsrl
#define SRA dsra
#define SLL dsll
#define SLLV dsllv
#define SRLV dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0 $8
#define t1 $9
#define t2 $10
#define t3 $11
#define t4 $12
#define t5 $13
#define t6 $14
#define t7 $15

#else

#define LOAD lw
#define LOADL lwl
#define LOADR lwr
#define STOREL swl
#define STORER swr
#define STORE sw
#define ADD addu
#define SUB subu
#define SRL srl
#define SLL sll
#define SRA sra
#define SLLV sllv
#define SRLV srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */
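/*
 * Illustrative note (not part of the original file): with USE_DOUBLE in
 * effect, the generic mnemonics above widen to their 64-bit forms, so a
 * line such as
 *
 *     LOAD t0, 0(src)
 *
 * assembles as `ld $8, 0($5)` and moves NBYTES == 8 bytes per access
 * instead of 4.
 */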

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST LOADL
#define STFIRST STORER
#define STREST STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST LOADR
#define STFIRST STOREL
#define STREST STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit) (FIRST(unit)+NBYTES-1)
#define UNIT(unit) FIRST(unit)

#define ADDRMASK (NBYTES-1)
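/*
 * Illustrative note (not part of the original file): LDFIRST/LDREST pair
 * the unaligned-load instructions so that one sequence works on either
 * endianness. To fetch one misaligned NBYTES-wide unit N:
 *
 *     LDFIRST t0, FIRST(N)(src)   # bytes from the unaligned start
 *     LDREST  t0, REST(N)(src)    # merge the remainder of the unit
 *
 * After the pair, t0 holds the same value an aligned LOAD would have
 * produced.
 */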

        .text
        .set noreorder
        .set noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
        .align 5
LEAF(memcpy) /* a0=dst a1=src a2=len */
        move v0, dst /* return value */
__memcpy:
FEXPORT(__copy_user)
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
        #
        # Octeon doesn't care if the destination is unaligned. The hardware
        # can fix it faster than we can special case the assembly.
        #
        pref 0, 0(src)
        sltu t0, len, NBYTES # Check if < 1 word
        bnez t0, copy_bytes_checklen
        and t0, src, ADDRMASK # Check if src unaligned
        bnez t0, src_unaligned
        sltu t0, len, 4*NBYTES # Check if < 4 words
        bnez t0, less_than_4units
        sltu t0, len, 8*NBYTES # Check if < 8 words
        bnez t0, less_than_8units
        sltu t0, len, 16*NBYTES # Check if < 16 words
        bnez t0, cleanup_both_aligned
        sltu t0, len, 128+1 # Check if len < 129
        bnez t0, 1f # Skip prefetch if len is too short
        sltu t0, len, 256+1 # Check if len < 257
        bnez t0, 1f # Skip prefetch if len is too short
        pref 0, 128(src) # We must not prefetch invalid addresses
        #
        # This is where we loop if there are more than 128 bytes left
2:      pref 0, 256(src) # We must not prefetch invalid addresses
        #
        # This is where we loop if we can't prefetch anymore
1:
EXC( LOAD t0, UNIT(0)(src), l_exc)
EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
        SUB len, len, 16*NBYTES
EXC( STORE t0, UNIT(0)(dst), s_exc_p16u)
EXC( STORE t1, UNIT(1)(dst), s_exc_p15u)
EXC( STORE t2, UNIT(2)(dst), s_exc_p14u)
EXC( STORE t3, UNIT(3)(dst), s_exc_p13u)
EXC( LOAD t0, UNIT(4)(src), l_exc_copy)
EXC( LOAD t1, UNIT(5)(src), l_exc_copy)
EXC( LOAD t2, UNIT(6)(src), l_exc_copy)
EXC( LOAD t3, UNIT(7)(src), l_exc_copy)
EXC( STORE t0, UNIT(4)(dst), s_exc_p12u)
EXC( STORE t1, UNIT(5)(dst), s_exc_p11u)
EXC( STORE t2, UNIT(6)(dst), s_exc_p10u)
        ADD src, src, 16*NBYTES
EXC( STORE t3, UNIT(7)(dst), s_exc_p9u)
        ADD dst, dst, 16*NBYTES
EXC( LOAD t0, UNIT(-8)(src), l_exc_copy)
EXC( LOAD t1, UNIT(-7)(src), l_exc_copy)
EXC( LOAD t2, UNIT(-6)(src), l_exc_copy)
EXC( LOAD t3, UNIT(-5)(src), l_exc_copy)
EXC( STORE t0, UNIT(-8)(dst), s_exc_p8u)
EXC( STORE t1, UNIT(-7)(dst), s_exc_p7u)
EXC( STORE t2, UNIT(-6)(dst), s_exc_p6u)
EXC( STORE t3, UNIT(-5)(dst), s_exc_p5u)
EXC( LOAD t0, UNIT(-4)(src), l_exc_copy)
EXC( LOAD t1, UNIT(-3)(src), l_exc_copy)
EXC( LOAD t2, UNIT(-2)(src), l_exc_copy)
EXC( LOAD t3, UNIT(-1)(src), l_exc_copy)
EXC( STORE t0, UNIT(-4)(dst), s_exc_p4u)
EXC( STORE t1, UNIT(-3)(dst), s_exc_p3u)
EXC( STORE t2, UNIT(-2)(dst), s_exc_p2u)
EXC( STORE t3, UNIT(-1)(dst), s_exc_p1u)
        sltu t0, len, 256+1 # See if we can prefetch more
        beqz t0, 2b
        sltu t0, len, 128 # See if we can loop once more
        beqz t0, 1b
        nop
        #
        # Jump here if there are less than 16*NBYTES left.
        #
cleanup_both_aligned:
        beqz len, done
        sltu t0, len, 8*NBYTES
        bnez t0, less_than_8units
        nop
EXC( LOAD t0, UNIT(0)(src), l_exc)
EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
        SUB len, len, 8*NBYTES
EXC( STORE t0, UNIT(0)(dst), s_exc_p8u)
EXC( STORE t1, UNIT(1)(dst), s_exc_p7u)
EXC( STORE t2, UNIT(2)(dst), s_exc_p6u)
EXC( STORE t3, UNIT(3)(dst), s_exc_p5u)
EXC( LOAD t0, UNIT(4)(src), l_exc_copy)
EXC( LOAD t1, UNIT(5)(src), l_exc_copy)
EXC( LOAD t2, UNIT(6)(src), l_exc_copy)
EXC( LOAD t3, UNIT(7)(src), l_exc_copy)
EXC( STORE t0, UNIT(4)(dst), s_exc_p4u)
EXC( STORE t1, UNIT(5)(dst), s_exc_p3u)
EXC( STORE t2, UNIT(6)(dst), s_exc_p2u)
EXC( STORE t3, UNIT(7)(dst), s_exc_p1u)
        ADD src, src, 8*NBYTES
        beqz len, done
        ADD dst, dst, 8*NBYTES
        #
        # Jump here if there are less than 8*NBYTES left.
        #
less_than_8units:
        sltu t0, len, 4*NBYTES
        bnez t0, less_than_4units
        nop
EXC( LOAD t0, UNIT(0)(src), l_exc)
EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
        SUB len, len, 4*NBYTES
EXC( STORE t0, UNIT(0)(dst), s_exc_p4u)
EXC( STORE t1, UNIT(1)(dst), s_exc_p3u)
EXC( STORE t2, UNIT(2)(dst), s_exc_p2u)
EXC( STORE t3, UNIT(3)(dst), s_exc_p1u)
        ADD src, src, 4*NBYTES
        beqz len, done
        ADD dst, dst, 4*NBYTES
        #
        # Jump here if there are less than 4*NBYTES left. This means
        # we may need to copy up to 3 NBYTES words.
        #
less_than_4units:
        sltu t0, len, 1*NBYTES
        bnez t0, copy_bytes_checklen
        nop
        #
        # 1) Copy NBYTES, then check length again
        #
EXC( LOAD t0, 0(src), l_exc)
        SUB len, len, NBYTES
        sltu t1, len, 8
EXC( STORE t0, 0(dst), s_exc_p1u)
        ADD src, src, NBYTES
        bnez t1, copy_bytes_checklen
        ADD dst, dst, NBYTES
        #
        # 2) Copy NBYTES, then check length again
        #
EXC( LOAD t0, 0(src), l_exc)
        SUB len, len, NBYTES
        sltu t1, len, 8
EXC( STORE t0, 0(dst), s_exc_p1u)
        ADD src, src, NBYTES
        bnez t1, copy_bytes_checklen
        ADD dst, dst, NBYTES
        #
        # 3) Copy NBYTES, then check length again
        #
EXC( LOAD t0, 0(src), l_exc)
        SUB len, len, NBYTES
        ADD src, src, NBYTES
        ADD dst, dst, NBYTES
        b copy_bytes_checklen
EXC( STORE t0, -8(dst), s_exc_p1u)

src_unaligned:
#define rem t8
        SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
        beqz t0, cleanup_src_unaligned
        and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES
1:
        /*
         * Avoid consecutive LD*'s to the same register since some MIPS
         * implementations can't issue them in the same cycle.
         * It's OK to load FIRST(N+1) before REST(N) because the two addresses
         * are to the same unit (unless src is aligned, but it's not).
         */
EXC( LDFIRST t0, FIRST(0)(src), l_exc)
EXC( LDFIRST t1, FIRST(1)(src), l_exc_copy)
        SUB len, len, 4*NBYTES
EXC( LDREST t0, REST(0)(src), l_exc_copy)
EXC( LDREST t1, REST(1)(src), l_exc_copy)
EXC( LDFIRST t2, FIRST(2)(src), l_exc_copy)
EXC( LDFIRST t3, FIRST(3)(src), l_exc_copy)
EXC( LDREST t2, REST(2)(src), l_exc_copy)
EXC( LDREST t3, REST(3)(src), l_exc_copy)
        ADD src, src, 4*NBYTES
EXC( STORE t0, UNIT(0)(dst), s_exc_p4u)
EXC( STORE t1, UNIT(1)(dst), s_exc_p3u)
EXC( STORE t2, UNIT(2)(dst), s_exc_p2u)
EXC( STORE t3, UNIT(3)(dst), s_exc_p1u)
        bne len, rem, 1b
        ADD dst, dst, 4*NBYTES

cleanup_src_unaligned:
        beqz len, done
        and rem, len, NBYTES-1 # rem = len % NBYTES
        beq rem, len, copy_bytes
        nop
1:
EXC( LDFIRST t0, FIRST(0)(src), l_exc)
EXC( LDREST t0, REST(0)(src), l_exc_copy)
        SUB len, len, NBYTES
EXC( STORE t0, 0(dst), s_exc_p1u)
        ADD src, src, NBYTES
        bne len, rem, 1b
        ADD dst, dst, NBYTES

copy_bytes_checklen:
        beqz len, done
        nop
copy_bytes:
        /* 0 < len < NBYTES */
#define COPY_BYTE(N) \
EXC( lb t0, N(src), l_exc); \
        SUB len, len, 1; \
        beqz len, done; \
EXC( sb t0, N(dst), s_exc_p1)
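/*
 * Illustrative expansion (not part of the original file): COPY_BYTE(0)
 * becomes a guarded one-byte copy whose store sits in the delay slot of
 * the exit branch:
 *
 *     9: lb t0, 0(src)     # + __ex_table entry -> l_exc
 *        SUB len, len, 1
 *        beqz len, done
 *     9: sb t0, 0(dst)     # + __ex_table entry -> s_exc_p1
 */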

        COPY_BYTE(0)
        COPY_BYTE(1)
#ifdef USE_DOUBLE
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
#endif
EXC( lb t0, NBYTES-2(src), l_exc)
        SUB len, len, 1
        jr ra
EXC( sb t0, NBYTES-2(dst), s_exc_p1)
done:
        jr ra
        nop
        END(memcpy)

l_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD t0, TI_TASK($28)
        nop
        LOAD t0, THREAD_BUADDR(t0)
1:
EXC( lb t1, 0(src), l_exc)
        ADD src, src, 1
        sb t1, 0(dst) # can't fault -- we're copy_from_user
        bne src, t0, 1b
        ADD dst, dst, 1
l_exc:
        LOAD t0, TI_TASK($28)
        nop
        LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address
        nop
        SUB len, AT, t0 # len = number of uncopied bytes
        /*
         * Here's where we rely on src and dst being incremented in tandem;
         * see (3) above.
         * dst += (fault addr - src) to put dst at first byte to clear
         */
        ADD dst, t0 # compute start address in a0
        SUB dst, src
        /*
         * Clear len bytes starting at dst. Can't call __bzero because it
         * might modify len. An inefficient loop for these rare times...
         */
        beqz len, done
        SUB src, len, 1
1:      sb zero, 0(dst)
        ADD dst, dst, 1
        bnez src, 1b
        SUB src, src, 1
        jr ra
        nop


#define SEXC(n) \
s_exc_p ## n ## u: \
        jr ra; \
        ADD len, len, n*NBYTES
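/*
 * Illustrative expansion (not part of the original file): each SEXC(n)
 * below defines the store-fault handler for the point where n*NBYTES
 * bytes of the current block were still unstored; e.g. SEXC(16) yields:
 *
 *     s_exc_p16u:
 *             jr ra
 *             ADD len, len, 16*NBYTES  # add back the unstored bytes
 */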

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
        jr ra
        ADD len, len, 1
s_exc:
        jr ra
        nop

        .align 5
LEAF(memmove)
        ADD t0, a0, a2
        ADD t1, a1, a2
        sltu t0, a1, t0 # dst + len <= src -> memcpy
        sltu t1, a0, t1 # dst >= src + len -> memcpy
        and t0, t1
        beqz t0, __memcpy
        move v0, a0 /* return value */
        beqz a2, r_out
        END(memmove)

        /* fall through to __rmemcpy */
LEAF(__rmemcpy) /* a0=dst a1=src a2=len */
        sltu t0, a1, a0
        beqz t0, r_end_bytes_up # src >= dst
        nop
        ADD a0, a2 # dst = dst + len
        ADD a1, a2 # src = src + len

r_end_bytes:
        lb t0, -1(a1)
        SUB a2, a2, 0x1
        sb t0, -1(a0)
        SUB a1, a1, 0x1
        bnez a2, r_end_bytes
        SUB a0, a0, 0x1

r_out:
        jr ra
        move a2, zero

r_end_bytes_up:
        lb t0, (a1)
        SUB a2, a2, 0x1
        sb t0, (a0)
        ADD a1, a1, 0x1
        bnez a2, r_end_bytes_up
        ADD a0, a0, 0x1

        jr ra
        move a2, zero
        END(__rmemcpy)
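/*
 * Illustrative sketch (not part of the original file): the memmove and
 * __rmemcpy logic above, restated in C:
 *
 *     void *memmove(void *dst, const void *src, size_t len)
 *     {
 *             if (src + len <= dst || dst + len <= src)
 *                     return memcpy(dst, src, len);   // no overlap
 *             if (src < dst)
 *                     copy_backwards(dst, src, len);  // r_end_bytes
 *             else
 *                     copy_forwards(dst, src, len);   // r_end_bytes_up
 *             return dst;
 *     }
 *
 * copy_backwards/copy_forwards are hypothetical names for the two
 * byte-at-a-time loops in __rmemcpy.
 */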