GitHub Repository: torvalds/linux
Path: blob/master/arch/mips/cavium-octeon/octeon-memcpy.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle ([email protected])
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <linux/export.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
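
/*
 * A rough C-level sketch of that contract (illustrative only; the real
 * callers in uaccess.h pass dst/src/len in a0/a1/a2 and read the remainder
 * back from a2 rather than using a normal return value):
 *
 *	remaining = __copy_user(dst, src, len);
 *	// remaining == 0 on success, else an upper bound on uncopied bytes
 */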

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
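
/*
 * Illustration of the invariants above (with made-up numbers): for a
 * 100-byte copy, uaccess.h sets AT = src_entry + 100.  If a load faults
 * once 64 bytes have been copied, src has advanced by 64, so the handler
 * reports len = AT - bad_address = 36, an upper bound on the bytes left
 * uncopied.
 */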

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR_WD	9b, handler;			\
	.previous
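
/*
 * For example, EXC(LOAD t0, UNIT(0)(src), l_exc) emits the load under the
 * local label 9: and records a (9b, l_exc) pair in __ex_table, so a fault
 * on that single instruction is redirected to the l_exc fixup handler.
 */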

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */

#define LOAD		ld
#define LOADL		ldl
#define LOADR		ldr
#define STOREL		sdl
#define STORER		sdr
#define STORE		sd
#define ADD		daddu
#define SUB		dsubu
#define SRL		dsrl
#define SRA		dsra
#define SLL		dsll
#define SLLV		dsllv
#define SRLV		dsrlv
#define NBYTES		8
#define LOG_NBYTES	3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif
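
/*
 * LDFIRST/LDREST (and STFIRST/STREST) pair ldl/ldr (sdl/sdr) to access one
 * unaligned doubleword: one instruction handles the bytes up to the next
 * NBYTES boundary and the other the remainder, with the left/right roles
 * swapped between big- and little-endian configurations.
 */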

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
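
/*
 * Worked example with NBYTES == 8: unit 1 is bytes 8..15 of the copy, so
 * FIRST(1) == 8 addresses one end of that doubleword and REST(1) == 15 the
 * other; ADDRMASK == 7 masks out an address's misalignment within a unit.
 */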

	.text
	.set noreorder
	.set noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
EXPORT_SYMBOL(memcpy)
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__raw_copy_from_user)
EXPORT_SYMBOL(__raw_copy_from_user)
FEXPORT(__raw_copy_to_user)
EXPORT_SYMBOL(__raw_copy_to_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	#
	# Octeon doesn't care if the destination is unaligned. The hardware
	# can fix it faster than we can special case the assembly.
	#
	pref	0, 0(src)
	sltu	t0, len, NBYTES		# Check if < 1 word
	bnez	t0, copy_bytes_checklen
	 and	t0, src, ADDRMASK	# Check if src unaligned
	bnez	t0, src_unaligned
	 sltu	t0, len, 4*NBYTES	# Check if < 4 words
	bnez	t0, less_than_4units
	 sltu	t0, len, 8*NBYTES	# Check if < 8 words
	bnez	t0, less_than_8units
	 sltu	t0, len, 16*NBYTES	# Check if < 16 words
	bnez	t0, cleanup_both_aligned
	 sltu	t0, len, 128+1		# Check if len < 129
	bnez	t0, 1f			# Skip prefetch if len is too short
	 sltu	t0, len, 256+1		# Check if len < 257
	bnez	t0, 1f			# Skip prefetch if len is too short
	 pref	0, 128(src)		# We must not prefetch invalid addresses
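	#
	# Why 129/257 (an explanatory note, not in the original source): each
	# pass of the loop below copies 128 bytes while prefetching 128 or
	# 256 bytes ahead, so these checks keep both pref instructions inside
	# the source buffer.  The pref above sits in the delay slot of the
	# second branch, so it still issues when that branch is taken, which
	# is safe because len >= 129 there.
	#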
	#
	# This is where we loop if there is more than 128 bytes left
2:	pref	0, 256(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if we can't prefetch anymore
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 16*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p16u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p15u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p14u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p13u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p12u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p11u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p10u)
	ADD	src, src, 16*NBYTES
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p9u)
	ADD	dst, dst, 16*NBYTES
EXC(	LOAD	t0, UNIT(-8)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t1, UNIT(-7)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t2, UNIT(-6)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t3, UNIT(-5)(src),	l_exc_copy_rewind16)
EXC(	STORE	t0, UNIT(-8)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(-7)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(-4)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t1, UNIT(-3)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t2, UNIT(-2)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t3, UNIT(-1)(src),	l_exc_copy_rewind16)
EXC(	STORE	t0, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(-1)(dst),	s_exc_p1u)
	sltu	t0, len, 256+1		# See if we can prefetch more
	beqz	t0, 2b
	 sltu	t0, len, 128		# See if we can loop one more time
	beqz	t0, 1b
	 nop
	#
	# Jump here if there are less than 16*NBYTES left.
	#
cleanup_both_aligned:
	beqz	len, done
	 sltu	t0, len, 8*NBYTES
	bnez	t0, less_than_8units
	 nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p1u)
	ADD	src, src, 8*NBYTES
	beqz	len, done
	 ADD	dst, dst, 8*NBYTES
	#
	# Jump here if there are less than 8*NBYTES left.
	#
less_than_8units:
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	 nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	ADD	src, src, 4*NBYTES
	beqz	len, done
	 ADD	dst, dst, 4*NBYTES
	#
	# Jump here if there are less than 4*NBYTES left. This means
	# we may need to copy up to 3 NBYTES words.
	#
less_than_4units:
	sltu	t0, len, 1*NBYTES
	bnez	t0, copy_bytes_checklen
	 nop
	#
	# 1) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	 ADD	dst, dst, NBYTES
	#
	# 2) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	 ADD	dst, dst, NBYTES
	#
	# 3) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	ADD	src, src, NBYTES
	ADD	dst, dst, NBYTES
	b	copy_bytes_checklen
EXC(	 STORE	t0, -8(dst),		s_exc_p1u)

src_unaligned:
#define rem t8
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	bne	len, rem, 1b
	 ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	 and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bne	len, rem, 1b
	 ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	 nop
copy_bytes:
	/* 0 < len < NBYTES */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	 sb	t0, N(dst), s_exc_p1)
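
/*
 * Here 0 < len < NBYTES, i.e. at most 7 bytes remain.  Each COPY_BYTE()
 * copies one byte and returns via done when len hits zero; the sb sits in
 * the beqz delay slot, so the byte is still stored on the taken branch.
 */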

	COPY_BYTE(0)
	COPY_BYTE(1)
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	 sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	 nop
	END(memcpy)

l_exc_copy_rewind16:
	/* Rewind src and dst by 16*NBYTES for l_exc_copy */
	SUB	src, src, 16*NBYTES
	SUB	dst, dst, 16*NBYTES
l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	 ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	SUB	len, AT, t0		# len = number of uncopied bytes
	jr	ra
	 nop
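
/*
 * Store-fault handlers.  s_exc_p<n>u is attached to a store with n units
 * still pending: len was already decremented for them, so the handler adds
 * n*NBYTES back before returning, making len an upper bound on the bytes
 * left uncopied.
 */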

#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	 ADD	len, len, n*NBYTES

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
	jr	ra
	 ADD	len, len, 1
s_exc:
	jr	ra
	 nop
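
#
# The overlap test at the top of memmove below computes, in rough C terms:
#	if ((src < dst + len) && (dst < src + len))
#		fall through to __rmemcpy;	# regions overlap
#	else
#		goto __memcpy;			# disjoint: forward copy is safe
#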

	.align	5
LEAF(memmove)
EXPORT_SYMBOL(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	 move	v0, a0				/* return value */
	beqz	a2, r_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	sltu	t0, a1, a0
	beqz	t0, r_end_bytes_up		# src >= dst
	 nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len
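
	#
	# dst > src here: copy backwards from the ends so each byte is read
	# before the overlapping region can overwrite it.
	#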

r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	bnez	a2, r_end_bytes
	 SUB	a0, a0, 0x1

r_out:
	jr	ra
	 move	a2, zero

r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	bnez	a2, r_end_bytes_up
	 ADD	a0, a0, 0x1

	jr	ra
	 move	a2, zero
	END(__rmemcpy)