/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle ([email protected])
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 * Copyright (C) 2007  Maciej W. Rozycki
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory.  It's also a seriously bad idea on non dma-coherent
 * systems.
 */
#ifdef CONFIG_DMA_NONCOHERENT
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
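/*
 * For example, EXC(LOAD t0, UNIT(0)(src), .Ll_exc) emits the load under a
 * local "9:" label and records a "PTR 9b, .Ll_exc" pair in the __ex_table
 * section; if that particular access faults, the kernel's exception code
 * looks the faulting instruction up in __ex_table and resumes at the named
 * fixup handler instead of treating the fault as fatal.
 */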
/*
 * Only the 64-bit kernel can make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the registers from the
 * n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST	LOADR
#define LDREST	LOADL
#define STFIRST	STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST	LOADL
#define LDREST	LOADR
#define STFIRST	STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)

	.text
	.set	noreorder
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
	move	v0, dst				/* return value */
.L__memcpy:
FEXPORT(__copy_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#define rem t8

	R10KCBARRIER(0(ra))
	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	PREF(	0, 0(src) )
	PREF(	1, 0(dst) )
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	PREF(	0, 1*32(src) )
	PREF(	1, 1*32(dst) )
	bnez	t2, .Lcopy_bytes_checklen
	 and	t0, src, ADDRMASK
	PREF(	0, 2*32(src) )
	PREF(	1, 2*32(dst) )
	bnez	t1, .Ldst_unaligned
	 nop
	bnez	t0, .Lsrc_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
.Lboth_aligned:
	 SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
	beqz	t0, .Lcleanup_both_aligned	# len < 8*NBYTES
	 and	rem, len, (8*NBYTES-1)		# rem = len % (8*NBYTES)
	PREF(	0, 3*32(src) )
	PREF(	1, 3*32(dst) )
	.align	4
1:
	R10KCBARRIER(0(ra))
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	LOAD	t4, UNIT(4)(src),	.Ll_exc_copy)
EXC(	LOAD	t7, UNIT(5)(src),	.Ll_exc_copy)
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc_p7u)
EXC(	LOAD	t0, UNIT(6)(src),	.Ll_exc_copy)
EXC(	LOAD	t1, UNIT(7)(src),	.Ll_exc_copy)
	ADD	src, src, 8*NBYTES
	ADD	dst, dst, 8*NBYTES
EXC(	STORE	t2, UNIT(-6)(dst),	.Ls_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	.Ls_exc_p5u)
EXC(	STORE	t4, UNIT(-4)(dst),	.Ls_exc_p4u)
EXC(	STORE	t7, UNIT(-3)(dst),	.Ls_exc_p3u)
EXC(	STORE	t0, UNIT(-2)(dst),	.Ls_exc_p2u)
EXC(	STORE	t1, UNIT(-1)(dst),	.Ls_exc_p1u)
	PREF(	0, 8*32(src) )
	PREF(	1, 8*32(dst) )
	bne	len, rem, 1b
	 nop

	/*
	 * len == rem == the number of bytes left to copy < 8*NBYTES
	 */
.Lcleanup_both_aligned:
	beqz	len, .Ldone
	 sltu	t0, len, 4*NBYTES
	bnez	t0, .Lless_than_4units
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
	R10KCBARRIER(0(ra))
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc_p1u)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	beqz	len, .Ldone
	.set	noreorder
.Lless_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, .Lcopy_bytes
	 nop
1:
	R10KCBARRIER(0(ra))
EXC(	LOAD	t0, 0(src),		.Ll_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		.Ls_exc_p1u)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
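	/*
	 * Illustration (the 32-bit case, NBYTES == 4): if rem == len == 3,
	 * then "SLL rem, len, 3" leaves 24 bits to keep, bits becomes
	 * 8*NBYTES - 24 = 8 bits to discard, SHIFT_DISCARD shifts the one
	 * unwanted byte out of the loaded word, and STREST to -1(t1) (the
	 * last byte of dst) writes the three surviving bytes to dst..dst+2
	 * without ever reading dst.
	 */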
#define bits t2
	beqz	len, .Ldone
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		.Ll_exc)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
EXC(	STREST	t0, -1(t1),		.Ls_exc)
	jr	ra
	 move	len, zero
.Ldst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	.Ll_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	.Ll_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
	R10KCBARRIER(0(ra))
EXC(	STFIRST	t3, FIRST(0)(dst),	.Ls_exc)
	beq	len, t2, .Ldone
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, .Lboth_aligned
	 ADD	src, src, t2

.Lsrc_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	PREF(	0, 3*32(src) )
	beqz	t0, .Lcleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
	PREF(	1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
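/*
 * Note: LDFIRST/LDREST are the lwl/lwr pair (ldl/ldr with USE_DOUBLE);
 * LDFIRST picks up the bytes at the leading end of a misaligned unit
 * (address FIRST(N)) and LDREST the bytes at its trailing end (address
 * REST(N)), together assembling one full NBYTES unit in a register.
 * Which of the two instructions comes "first" is selected by the
 * endianness-dependent defines above.
 */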
	R10KCBARRIER(0(ra))
EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	.Ll_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	.Ll_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	.Ll_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	.Ll_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	.Ll_exc_copy)
	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc_p1u)
	PREF(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcleanup_src_unaligned:
	beqz	len, .Ldone
	 and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, .Lcopy_bytes
	 nop
1:
	R10KCBARRIER(0(ra))
EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		.Ls_exc_p1u)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcopy_bytes_checklen:
	beqz	len, .Ldone
	 nop
.Lcopy_bytes:
	/* 0 < len < NBYTES */
	R10KCBARRIER(0(ra))
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), .Ll_exc);	\
	SUB	len, len, 1;		\
	beqz	len, .Ldone;		\
EXC(	 sb	t0, N(dst), .Ls_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), .Ll_exc)
	SUB	len, len, 1
	jr	ra
EXC(	 sb	t0, NBYTES-2(dst), .Ls_exc_p1)
.Ldone:
	jr	ra
	 nop
	END(memcpy)

.Ll_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	.Ll_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 1
	bne	src, t0, 1b
	.set	noreorder
.Ll_exc:
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address of the clear
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	.set	reorder				/* DADDI_WAR */
	SUB	src, len, 1
	beqz	len, .Ldone
	.set	noreorder
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	bnez	src, 1b
	 SUB	src, src, 1
#else
	.set	push
	.set	noat
	li	v1, 1
	bnez	src, 1b
	 SUB	src, src, v1
	.set	pop
#endif
	jr	ra
	 nop


#define SEXC(n)							\
	.set	reorder;			/* DADDI_WAR */	\
.Ls_exc_p ## n ## u:						\
	ADD	len, len, n*NBYTES;				\
	jr	ra;						\
	.set	noreorder

SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

.Ls_exc_p1:
	.set	reorder				/* DADDI_WAR */
	ADD	len, len, 1
	jr	ra
	.set	noreorder
.Ls_exc:
	jr	ra
	 nop
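/*
 * memmove's overlap test below: t0 = (src < dst + len) and
 * t1 = (dst < src + len).  Both are non-zero only if the regions
 * [src, src + len) and [dst, dst + len) overlap; in that case fall
 * through to __rmemcpy, which picks a safe copy direction, otherwise
 * a plain memcpy is safe and we branch straight to it.
 */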
	.align	5
LEAF(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, .L__memcpy
	 move	v0, a0				/* return value */
	beqz	a2, .Lr_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	sltu	t0, a1, a0
	beqz	t0, .Lr_end_bytes_up		# src >= dst
	 nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len

.Lr_end_bytes:
	R10KCBARRIER(0(ra))
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	.set	reorder				/* DADDI_WAR */
	SUB	a0, a0, 0x1
	bnez	a2, .Lr_end_bytes
	.set	noreorder

.Lr_out:
	jr	ra
	 move	a2, zero

.Lr_end_bytes_up:
	R10KCBARRIER(0(ra))
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	.set	reorder				/* DADDI_WAR */
	ADD	a0, a0, 0x1
	bnez	a2, .Lr_end_bytes_up
	.set	noreorder

	jr	ra
	 move	a2, zero
	END(__rmemcpy)