arch/mips/cavium-octeon/octeon-memcpy.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle ([email protected])
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table, "a";		\
	PTR	9b, handler;			\
	.previous
/*
 * Only on a 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we share the code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the registers from the n64
 * ABI naming to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
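/*
 * A minimal sketch of how an LDFIRST/LDREST pair handles an unaligned
 * source (big-endian and USE_DOUBLE assumed, so LDFIRST is ldl and
 * LDREST is ldr; the address is hypothetical).  For src = 0x1003:
 *
 *	LDFIRST	t0, FIRST(0)(src)	# ldl at 0x1003: bytes 0x1003..0x1007
 *					# fill the high end of t0
 *	LDREST	t0, REST(0)(src)	# ldr at 0x100a: bytes 0x1008..0x100a
 *					# fill the low end of t0
 *
 * Together the pair assembles the eight unaligned source bytes into t0,
 * which can then be written out with a single STORE.
 */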
	.text
	.set	noreorder
	.set	noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__copy_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	#
	# Octeon doesn't care if the destination is unaligned. The hardware
	# can fix it faster than we can special case the assembly.
	#
	pref	0, 0(src)
	sltu	t0, len, NBYTES		# Check if < 1 word
	bnez	t0, copy_bytes_checklen
	and	t0, src, ADDRMASK	# Check if src unaligned
	bnez	t0, src_unaligned
	sltu	t0, len, 4*NBYTES	# Check if < 4 words
	bnez	t0, less_than_4units
	sltu	t0, len, 8*NBYTES	# Check if < 8 words
	bnez	t0, less_than_8units
	sltu	t0, len, 16*NBYTES	# Check if < 16 words
	bnez	t0, cleanup_both_aligned
	sltu	t0, len, 128+1		# Check if len < 129
	bnez	t0, 1f			# Skip prefetch if len is too short
	sltu	t0, len, 256+1		# Check if len < 257
	bnez	t0, 1f			# Skip prefetch if len is too short
	pref	0, 128(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if there are more than 128 bytes left
2:	pref	0, 256(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if we can't prefetch anymore
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 16*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p16u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p15u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p14u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p13u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p12u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p11u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p10u)
	ADD	src, src, 16*NBYTES
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p9u)
	ADD	dst, dst, 16*NBYTES
EXC(	LOAD	t0, UNIT(-8)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(-7)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(-6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(-5)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(-8)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(-7)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(-4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(-3)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(-2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(-1)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(-1)(dst),	s_exc_p1u)
	sltu	t0, len, 256+1		# See if we can prefetch more
	beqz	t0, 2b
	sltu	t0, len, 128		# See if we can loop one more time
	beqz	t0, 1b
	nop
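	#
	# Note on the store fixup labels above: len has already been
	# decremented by 16*NBYTES when the stores are issued, so a fault
	# on the store of unit k still leaves (16-k)*NBYTES bytes uncopied.
	# The s_exc_pNu handlers defined later in this file simply add
	# N*NBYTES back onto len and return.
	#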
	#
	# Jump here if there are less than 16*NBYTES left.
	#
cleanup_both_aligned:
	beqz	len, done
	sltu	t0, len, 8*NBYTES
	bnez	t0, less_than_8units
	nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p1u)
	ADD	src, src, 8*NBYTES
	beqz	len, done
	ADD	dst, dst, 8*NBYTES
	#
	# Jump here if there are less than 8*NBYTES left.
	#
less_than_8units:
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	ADD	src, src, 4*NBYTES
	beqz	len, done
	ADD	dst, dst, 4*NBYTES
	#
	# Jump here if there are less than 4*NBYTES left.  This means
	# we may need to copy up to 3 NBYTES words.
	#
less_than_4units:
	sltu	t0, len, 1*NBYTES
	bnez	t0, copy_bytes_checklen
	nop
	#
	# 1) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	ADD	dst, dst, NBYTES
	#
	# 2) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	ADD	dst, dst, NBYTES
	#
	# 3) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	ADD	src, src, NBYTES
	ADD	dst, dst, NBYTES
	b	copy_bytes_checklen
EXC(	STORE	t0, -8(dst),		s_exc_p1u)

src_unaligned:
#define rem t8
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
1:
	/*
	 * Avoid consecutive LD*'s to the same register since some mips
	 * implementations can't issue them in the same cycle.
	 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
	 * are to the same unit (unless src is aligned, but it's not).
	 */
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	bne	len, rem, 1b
	ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, copy_bytes
	nop
1:
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bne	len, rem, 1b
	ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	nop
copy_bytes:
	/* 0 < len < NBYTES */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	sb	t0, N(dst), s_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	nop
	END(memcpy)
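/*
 * Exception fixup handlers.  l_exc_copy/l_exc recover from faulting
 * loads (copy_from_user): bytes are copied one at a time up to the
 * faulting address, then the rest of the destination buffer is zeroed,
 * as required by the spec above.  The s_exc* handlers recover from
 * faulting stores (copy_to_user) and only need to fix up len.
 */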
l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in dst (a0)
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	beqz	len, done
	SUB	src, len, 1
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
	bnez	src, 1b
	SUB	src, src, 1
	jr	ra
	nop


#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	ADD	len, len, n*NBYTES

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
	jr	ra
	ADD	len, len, 1
s_exc:
	jr	ra
	nop

	.align	5
LEAF(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0		# dst + len <= src -> memcpy
	sltu	t1, a0, t1		# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	move	v0, a0			/* return value */
	beqz	a2, r_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)				/* a0=dst a1=src a2=len */
	sltu	t0, a1, a0
	beqz	t0, r_end_bytes_up	# src >= dst
	nop
	ADD	a0, a2			# dst = dst + len
	ADD	a1, a2			# src = src + len

r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	bnez	a2, r_end_bytes
	SUB	a0, a0, 0x1

r_out:
	jr	ra
	move	a2, zero

r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	bnez	a2, r_end_bytes_up
	ADD	a0, a0, 0x1

	jr	ra
	move	a2, zero
	END(__rmemcpy)
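/*
 * Note on memmove's overlap test above: the ranges [src, src+len) and
 * [dst, dst+len) overlap iff (a1 < a0 + a2) && (a0 < a1 + a2), which is
 * exactly what the two sltu instructions and the and compute.  If either
 * test fails the buffers are disjoint and the faster forward __memcpy is
 * used; otherwise __rmemcpy picks a copy direction (backwards, from the
 * last byte, when src < dst) that never overwrites source bytes before
 * they have been read.
 */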