CoCalc -- memcpy-sh4.S

GitHub Repository: torvalds/linux
Path: blob/master/arch/sh/lib/memcpy-sh4.S
⁵⁰⁵¹⁴ views
1
/* SPDX-License-Identifier: GPL-2.0 */
2
/*
3
 * "memcpy" implementation of SuperH
4
 *
5
 * Copyright (C) 1999  Niibe Yutaka
6
 * Copyright (c) 2002  STMicroelectronics Ltd
7
 *   Modified from memcpy.S and micro-optimised for SH4
8
 *   Stuart Menefy ([email protected])
9
 *
10
 */
11
#include <linux/linkage.h>
12

13
/*
14
 * void *memcpy(void *dst, const void *src, size_t n);
15
 *
16
 * It is assumed that there is no overlap between src and dst.
17
 * If there is an overlap, then the results are undefined.
18
 */
19

20
	!
21
	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
22
	!
23

24
	! Size is 16 or greater, and may have trailing bytes
25

26
	.balign	32
27
.Lcase1:
	! Bulk copy for (src - dst) & 3 == 1.  On entry r0 is the long word
	! aligned dst pointer (the copy runs downwards towards r4) and r0+r5
	! is the matching src address.  Every long word stored is assembled
	! from two neighbouring aligned src long words.
	! r2 = loop limit, r7 = previous load, r1/r3/r6 = temporaries.
28
	! Read a long word and write a long word at once
29
	! At the start of each iteration, r7 contains last long load
30
	! Bias r5 by -1 so that r0+r5 is long word aligned for the loads
	add	#-1,r5		!  79 EX
31
	mov	r4,r2		!   5 MT (0 cycles latency)
32

33
	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
34
	add	#-4,r5		!  50 EX
35

36
	! r2 = r4+7: the loop runs while r0 > r4+7 (unsigned), tested before
	! the store in the delay slot pre-decrements r0
	add	#7,r2		!  79 EX
37
	!
38
#ifdef CONFIG_CPU_LITTLE_ENDIAN
39
	! 6 cycles, 4 bytes per iteration
40
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
41
	mov	r7, r3		!   5 MT (latency=0)	! RQPO
42

43
	cmp/hi	r2,r0		!  57 MT
44
	shll16	r3		! 103 EX
45

46
	mov	r1,r6		!   5 MT (latency=0)
47
	shll8	r3		! 102 EX		! Oxxx
48

49
	shlr8	r6		! 106 EX		! xNML
50
	mov	r1, r7		!   5 MT (latency=0)
51

52
	or	r6,r3		!  82 EX		! ONML
53
	bt/s	3b		! 109 BR
54

55
	 mov.l	r3,@-r0		!  30 LS
56
#else
57
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
58
	mov	r7,r3		!   5 MT (latency=0)	! OPQR
59

60
	cmp/hi	r2,r0		!  57 MT
61
	shlr16	r3		! 107 EX
62

63
	shlr8	r3		! 106 EX		! xxxO
64
	mov	r1,r6		!   5 MT (latency=0)
65

66
	shll8	r6		! 102 EX		! LMNx
67
	mov	r1,r7		!   5 MT (latency=0)
68

69
	or	r6,r3		!  82 EX		! LMNO
70
	bt/s	3b		! 109 BR
71

72
	 mov.l	r3,@-r0		!  30 LS
73
#endif
74
	! Finally, copy a byte at once, if necessary
75

76
	! r5 goes from (src - dst) - 5 to (src - dst) - 1, so that @(r0,r5)
	! is the source of the byte written by the following @-r0 store
	add	#4,r5		!  50 EX
77
	cmp/eq	r4,r0		!  54 MT
78

79
	! r2 = r4+1: limit for the byte loop (add does not disturb the T bit
	! set by cmp/eq above)
	add	#-6,r2		!  50 EX
80
	bt	9f		! 109 BR
81

82
8:	cmp/hi	r2,r0		!  57 MT
83
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
84

85
	bt/s	8b		! 109 BR
86

87
	 mov.b	r1,@-r0		!  29 LS
88

89
9:	rts
90
	 nop
91

92

93
	!
94
	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
95
	!
96

97
	! Size is 16 or greater, and may have trailing bytes
98

99
	.balign	32
100
.Lcase3:
	! Bulk copy for (src - dst) & 3 == 3.  On entry r0 is the long word
	! aligned dst pointer (the copy runs downwards towards r4) and r0+r5
	! is the matching src address.  Every long word stored is assembled
	! from two neighbouring aligned src long words.
	! r2 = loop limit, r7 = previous load, r1/r3/r6 = temporaries.
101
	! Read a long word and write a long word at once
102
	! At the start of each iteration, r7 contains last long load
103
	! Bias r5 by -3 so that r0+r5 is long word aligned for the loads
	add	#-3,r5		! 79 EX
104
	mov	r4,r2		!  5 MT (0 cycles latency)
105

106
	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
107
	add	#-4,r5		! 50 EX
108

109
	! r2 = r4+7: the loop runs while r0 > r4+7 (unsigned), tested before
	! the store in the delay slot pre-decrements r0
	add	#7,r2		!  79 EX
110
	!
111
#ifdef CONFIG_CPU_LITTLE_ENDIAN
112
	! 6 cycles, 4 bytes per iteration
113
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
114
	mov	r7, r3		!   5 MT (latency=0)	! RQPO
115

116
	cmp/hi	r2,r0		!  57 MT
117
	shll8	r3		! 102 EX		! QPOx
118

119
	mov	r1,r6		!   5 MT (latency=0)
120
	shlr16	r6		! 107 EX
121

122
	shlr8	r6		! 106 EX		! xxxN
123
	mov	r1, r7		!   5 MT (latency=0)
124

125
	or	r6,r3		!  82 EX		! QPON
126
	bt/s	3b		! 109 BR
127

128
	 mov.l	r3,@-r0		!  30 LS
129
#else
130
3:	mov	r7,r3		! OPQR
131
	shlr8	r3		! xOPQ
132
	mov.l	@(r0,r5),r7	! KLMN
133
	mov	r7,r6
134
	shll16	r6
135
	shll8	r6		! Nxxx
136
	or	r6,r3		! NOPQ
137
	cmp/hi	r2,r0
138
	bt/s	3b
139
	 mov.l	r3,@-r0
140
#endif
141

142
	! Finally, copy a byte at once, if necessary
143

144
	! r5 goes from (src - dst) - 7 to (src - dst) - 1, so that @(r0,r5)
	! is the source of the byte written by the following @-r0 store
	add	#6,r5		!  50 EX
145
	cmp/eq	r4,r0		!  54 MT
146

147
	! r2 = r4+1: limit for the byte loop (add does not disturb the T bit
	! set by cmp/eq above)
	add	#-6,r2		!  50 EX
148
	bt	9f		! 109 BR
149

150
8:	cmp/hi	r2,r0		!  57 MT
151
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
152

153
	bt/s	8b		! 109 BR
154

155
	 mov.b	r1,@-r0		!  29 LS
156

157
9:	rts
158
	 nop
159

160
ENTRY(memcpy)
	! Entry point.  In: r4 = dst, r5 = src, r6 = n (argument order of the
	! prototype comment near the top of the file).
	!
	! The copy runs backwards.  r0 is set to dst + n and is pre-decremented
	! by every store, moving down towards r4.  r5 is turned into the offset
	! (src - dst), so @(r0,r5) addresses the source data belonging at r0.
	! Several paths bias r5 by a small constant; this is noted where it
	! happens.
	!
	! This block handles zero length (returns r4 in r0), copies of fewer
	! than 16 bytes (byte loop below), aligns the destination and then
	! branches to one of the .Lcase* bulk routines.
161

162
	! Calculate the invariants which will be used in the remainder
163
	! of the code:
164
	!
165
	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
166
	!	         [ ...  ]                 [ ...  ]
167
	!	           :                        :
168
	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
169
	!
170
	!
171

172
	! Short circuit the common case of src, dst and len being 32 bit aligned
173
	! and test for zero length move
174

175
	! r0 = n | dst | src: its low two bits are clear only when all three
	! are long word aligned
	mov	r6, r0		!   5 MT (0 cycle latency)
176
	or	r4, r0		!  82 EX
177

178
	or	r5, r0		!  82 EX
179
	tst	r6, r6		!  86 MT
180

181
	bt/s	99f		! 111 BR		(zero len)
182
	 tst	#3, r0		!  87 MT
183

184
	! r0 = dst + n.  The T bit from the tst #3 above is still what the
	! bt/s .Lcase00 below tests.
	mov	r4, r0		!   5 MT (0 cycle latency)
185
	add	r6, r0		!  49 EX
186

187
	mov	#16, r1		!   6 EX
188
	bt/s	.Lcase00	! 111 BR		(aligned)
189

190
	! delay slot, executed on both paths: r5 = src - dst
	 sub	r4, r5		!  75 EX
191

192
	! Arguments are not nicely long word aligned or zero len.
193
	! Check for small copies, and if so do a simple byte at a time copy.
194
	!
195
	! Deciding on an exact value of 'small' is not easy, as the point at which
196
	! using the optimised routines become worthwhile varies (these are the
197
	! cycle counts for different sizes using byte-at-a-time vs. optimised):
198
	!	size	byte-at-time	long	word	byte
199
	!	16	42		39-40	46-50	50-55
200
	!	24	58		43-44	54-58	62-67
201
	!	36	82		49-50	66-70	80-85
202
	! However the penalty for getting it 'wrong' is much higher for long word
203
	! aligned data (and this is more common), so use a value of 16.
204

205
	! T = (16 > n), signed compare
	cmp/gt	r6,r1		!  56 MT
206

207
	! r5 = (src - dst) - 1 from here on: @(r0,r5) is the source of the
	! byte written by the next @-r0 byte store
	add	#-1,r5		!  50 EX
208
	bf/s	6f		! 108 BR		(not small)
209

210
	 mov	r5, r3		!   5 MT (latency=0)
	! Small copy: shlr leaves bit 0 of n in T and n / 2 (the number of
	! byte pairs) in r6
211
	shlr	r6		! 104 EX
212

213
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
214
	bf/s	4f		! 111 BR
215

	! delay slot: r3 = r5 - 1 addresses the second byte of each pair
216
	 add	#-1,r3		!  50 EX
	! n is odd: store the single byte; if there are no pairs we are done
217
	tst	r6, r6		!  86 MT
218

219
	bt/s	98f		! 110 BR
220
	 mov.b	r1,@-r0		!  29 LS
221

222
	! 4 cycles, 2 bytes per iteration
223
3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
224

225
4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
226
	dt	r6		!  67 EX
227

228
	mov.b	r1,@-r0		!  29 LS
229
	bf/s	3b		! 111 BR
230

231
	 mov.b	r2,@-r0		!  29 LS
232
98:
233
	rts
234
	 nop
235

	! Zero length: return dst (r4) in r0
236
99:	rts
237
	 mov	r4, r0
238

239
	! Size is not small, so it's worthwhile looking for optimisations.
240
	! First align destination to a long word boundary.
241
	!
242
	! r5 = normal value -1
243

244
6:	tst	#3, r0		!  87 MT
245
        mov	#3, r3		!   6 EX
246

247
	bt/s	2f		! 111 BR
	! delay slot: r3 = r0 & 3 = number of bytes to copy before r0 is
	! long word aligned
248
	 and	r0,r3		!  78 EX
249

250
	! 3 cycles, 1 byte per iteration
	! r6 is kept as the number of bytes still to be copied
251
1:	dt	r3		!  67 EX
252
	mov.b	@(r0,r5),r1	!  19 LS (latency=2)
253

254
	add	#-1, r6		!  79 EX
255
	bf/s	1b		! 109 BR
256

257
	 mov.b	r1,@-r0		!  28 LS
258

	! Undo the -1 bias: r5 = src - dst again
259
2:	add	#1, r5		!  79 EX
260

261
	! Now select the appropriate bulk transfer code based on relative
262
	! alignment of src and dst.
	!
	!   (src - dst) & 3 == 0:  .Lcase0 (r6 < 64)  or  .Lcase0b (r6 >= 64)
	!   (src - dst) & 3 == 2:  .Lcase2 (r6 < 64)  or  .Lcase2b (r6 >= 64)
	!   (src - dst) & 3 == 1:  .Lcase1
	!   (src - dst) & 3 == 3:  .Lcase3
	!
	! tst #imm only operates on r0, so the dst pointer is parked in r3
	! while the low bits of r5 are examined; every branch below restores
	! r0 from r3 in its delay slot (or just before it).
263

264
	mov	r0, r3		!   5 MT (latency=0)
265

266
	mov	r5, r0		!   5 MT (latency=0)
267
	tst	#1, r0		!  87 MT
268

269
	bf/s	1f		! 111 BR
270
	 mov	#64, r7		!   6 EX
271

272
	! bit 0 clear
273

	! T = (r6 >= 64), signed compare
274
	cmp/ge	r7, r6		!  55 MT
275

276
	bt/s	2f		! 111 BR
277
	 tst	#2, r0		!  87 MT
278

279
	! small
280
	bt/s	.Lcase0
281
	 mov	r3, r0
282

283
	bra	.Lcase2
284
	 nop
285

286
	! big
287
2:	bt/s	.Lcase0b
288
	 mov	r3, r0
289

290
	bra	.Lcase2b
291
	 nop
292

293
	! bit 0 set
294
1:	tst	#2, r0		! 87 MT
295

296
	bt/s	.Lcase1
297
	 mov	r3, r0
298

299
	bra	.Lcase3
300
	 nop
301

302

303
	!
304
	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
305
	!
306

307
	! src, dst and size are all long word aligned
308
	! size is non-zero
309

310
	.balign	32
311
.Lcase00:
	! Reached straight from the entry code when src, dst and n are all
	! long word aligned.  On entry r0 = dst + n, r5 = src - dst, r6 = n.
	! Copies of 64 bytes or more are handed to .Lcase00b; smaller ones
	! are copied here two long words per iteration.
312
	mov	#64, r1		!   6 EX
313
	mov	r5, r3		!   5 MT (latency=0)
314

	! T = (64 > n), signed compare
315
	cmp/gt	r6, r1		!  56 MT
	! r5 = (src - dst) - 4: @(r0,r5) is the source of the long word
	! written by the next @-r0 long store
316
	add	#-4, r5		!  50 EX
317

318
	bf	.Lcase00b	! 108 BR		(big loop)
	! r6 = n / 4 = number of long words
319
	shlr2	r6		! 105 EX
320

	! T = bit 0 of the long word count, r6 = number of long word pairs
321
	shlr	r6		! 104 EX
322
	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
323

324
	bf/s	4f		! 111 BR
	! delay slot: r3 = (src - dst) - 8 addresses the second long word of
	! each pair
325
	 add	#-8, r3		!  50 EX
326

	! Odd long word count: store one long word; done if there are no pairs
327
	tst	r6, r6		!  86 MT
328
	bt/s	5f		! 110 BR
329

330
	 mov.l	r1,@-r0		!  30 LS
331

332
	! 4 cycles, 2 long words per iteration
333
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
334

335
4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
336
	dt	r6		!  67 EX
337

338
	mov.l	r1, @-r0	!  30 LS
339
	bf/s	3b		! 109 BR
340

341
	 mov.l	r2, @-r0	!  30 LS
342

343
5:	rts
344
	 nop
345

346

347
	! Size is 16 or greater and less than 64, but may have trailing bytes
348

349
	.balign	32
350
.Lcase0:
	! Bulk copy for (src - dst) & 3 == 0 when fewer than 64 bytes remain.
	! On entry r0 is the long word aligned dst pointer (the copy runs
	! downwards towards r4), r5 = src - dst and r6 = bytes remaining.
	! r7 = loop limit, r1/r2/r3 = temporaries.
351
	! r5 = (src - dst) - 4: source of the long word written by the next
	! @-r0 long store
	add	#-4, r5		!  50 EX
352
	mov	r4, r7		!   5 MT (latency=0)
353

354
	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
355
	mov	#4, r2		!   6 EX
356

	! r7 = r4+11: the pair loop runs while r0 > r4+11 (unsigned)
357
	add	#11, r7		!  50 EX
	! T = ((r6 & 4) == 0), i.e. the number of whole long words is even
358
	tst	r2, r6		!  86 MT
359

360
	mov	r5, r3		!   5 MT (latency=0)
361
	bt/s	4f		! 111 BR
362

	! delay slot: r3 = r5 - 4 addresses the second long word of each pair
363
	 add	#-4, r3		!  50 EX
	! Odd number of long words: store one before entering the pair loop
364
	mov.l	r1,@-r0		!  30 LS
365

366
	! 4 cycles, 2 long words per iteration
367
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
368

369
4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
370
	cmp/hi	r7, r0
371

372
	mov.l	r1, @-r0	!  30 LS
373
	bt/s	3b		! 109 BR
374

375
	 mov.l	r2, @-r0	!  30 LS
376

377
	! Copy the final 0-3 bytes
378

	! r5 = (src - dst) - 1: source of the byte written by the next @-r0
	! byte store
379
	add	#3,r5		!  50 EX
380

381
	cmp/eq	r0, r4		!  54 MT
	! r7 = r4+1: limit for the byte loop
382
	add	#-10, r7	!  50 EX
383

384
	bt	9f		! 110 BR
385

386
	! 3 cycles, 1 byte per iteration
387
1:	mov.b	@(r0,r5),r1	!  19 LS
388
	cmp/hi	r7,r0		!  57 MT
389

390
	bt/s	1b		! 111 BR
391
	 mov.b	r1,@-r0		!  28 LS
392

393
9:	rts
394
	 nop
395

396
	! Size is at least 64 bytes, so will be going round the big loop at least once.
397
	!
398
	!   r2 = rounded up r4
399
	!   r3 = rounded down r0
400

401
	.balign	32
402
.Lcase0b:
	! Bulk copy for (src - dst) & 3 == 0 when at least 64 bytes remain.
	! On entry r0 is the long word aligned dst pointer (the copy runs
	! downwards towards r4) and r5 = src - dst.  .Lcase00b is a second
	! entry used by .Lcase00, which has already subtracted 4 from r5.
	!
	! Three phases: long words until r0 is 32-byte aligned, 32-byte blocks
	! down to r2, then the remaining long words and bytes.
403
	add	#-4, r5		!  50 EX
404

405
.Lcase00b:
406
	mov	r0, r3		!   5 MT (latency=0)
407
	mov	#(~0x1f), r1	!   6 EX
408

409
	and	r1, r3		!  78 EX
410
	mov	r4, r2		!   5 MT (latency=0)
411

	! T = (r0 is already 32-byte aligned)
412
	cmp/eq	r3, r0		!  54 MT
413
	add	#0x1f, r2	!  50 EX
414

415
	bt/s	1f		! 110 BR
416
	 and	r1, r2		!  78 EX
417

418
	! copy initial words until cache line aligned
419

420
	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	! T = ((r0 & 4) == 0), i.e. an even number of long words to go
421
	tst	#4, r0		!  87 MT
422

	! r6 = r5 - 4 addresses the second long word of each pair
423
	mov	r5, r6		!   5 MT (latency=0)
424
	add	#-4, r6		!  50 EX
425

426
	bt/s	4f		! 111 BR
	! delay slot: r3 = aligned target + 8, because the pair loop compares
	! r0 with r3 before its two stores
427
	 add	#8, r3		!  50 EX
428

	! Odd count: store one long word; if that alone reaches the 32-byte
	! boundary skip the pair loop
429
	tst	#0x18, r0	!  87 MT
430

431
	bt/s	1f		! 109 BR
432
	 mov.l	r1,@-r0		!  30 LS
433

434
	! 4 cycles, 2 long words per iteration
435
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
436

437
4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
438
	cmp/eq	r3, r0		!  54 MT
439

440
	mov.l	r1, @-r0	!  30 LS
441
	bf/s	3b		! 109 BR
442

443
	 mov.l	r7, @-r0	!  30 LS
444

445
	! Copy the cache line aligned blocks
446
	!
447
	! In use: r0, r2, r4, r5
448
	! Scratch: r1, r3, r6, r7
449
	!
450
	! We could do this with the four scratch registers, but if src
451
	! and dest hit the same cache line, this will thrash, so make
452
	! use of additional registers.
453
	!
454
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
455
	!   r5:	 src (was r0+r5)
456
	!   r1:	 dest (was r0)
457
	! this can be reversed at the end, so we don't need to save any extra
458
	! state.
459
	!
	! r8-r11 are saved on the stack (r15) here and restored after the loop.
	! r5 becomes an absolute address: r0 + (src - dst) - 4 - 0x1c, i.e. the
	! start of the 32 source bytes that belong just below r0.
460
1:	mov.l	r8, @-r15	!  30 LS
461
	add	r0, r5		!  49 EX
462

463
	mov.l	r9, @-r15	!  30 LS
464
	mov	r0, r1		!   5 MT (latency=0)
465

466
	mov.l	r10, @-r15	!  30 LS
467
	add	#-0x1c, r5	!  50 EX
468

469
	mov.l	r11, @-r15	!  30 LS
470

471
	! 16 cycles, 32 bytes per iteration
	! r1 and r5 both step down by 0x20 per iteration; the loop ends when
	! r1 reaches r2
472
2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
473
	add	#-0x20, r1	! 50 EX
474
	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
475
	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
476
	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
477
	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
478
	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
479
	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
480
	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
481
	movca.l	r0,@r1		! 40 LS (latency=3-7)
482
	mov.l	r3,@(0x04,r1)	! 33 LS
483
	mov.l	r6,@(0x08,r1)	! 33 LS
484
	mov.l	r7,@(0x0c,r1)	! 33 LS
485

486
	mov.l	r8,@(0x10,r1)	! 33 LS
487
	add	#-0x20, r5	! 50 EX
488

489
	mov.l	r9,@(0x14,r1)	! 33 LS
490
	cmp/eq	r2,r1		! 54 MT
491

492
	mov.l	r10,@(0x18,r1)	!  33 LS
493
	bf/s	2b		! 109 BR
494

495
	 mov.l	r11,@(0x1c,r1)	!  33 LS
496

	! Restore the invariant: r0 = dst pointer, r5 = offset again (biased
	! by -0x20 at this point), and pop r11..r8
497
	mov	r1, r0		!   5 MT (latency=0)
498

499
	mov.l	@r15+, r11	!  15 LS
500
	sub	r1, r5		!  75 EX
501

502
	mov.l	@r15+, r10	!  15 LS
503
	cmp/eq	r4, r0		!  54 MT
504

505
	bf/s	1f		! 109 BR
506
	 mov.l	 @r15+, r9	!  15 LS
507

	! The pop of r8 below serves both as the delay slot of this rts
	! (nothing left to copy) and as the target of the bf/s above
508
	rts
509
1:	 mov.l	@r15+, r8	!  15 LS
510
	sub	r4, r1		!  75 EX		(len remaining)
511

512
	! number of trailing bytes is non-zero
513
	!
514
	! invariants restored (r5 already decremented by 4)
515
	! also r1=num bytes remaining
516

517
	mov	#4, r2		!   6 EX
518
	mov	r4, r7		!   5 MT (latency=0)
519

520
	add	#0x1c, r5	!  50 EX		(back to -4)
	! T = (r1 >= 4), unsigned: fewer than 4 bytes left goes straight to
	! the byte loop
521
	cmp/hs	r2, r1		!  58 MT
522

523
	bf/s	5f		! 108 BR
	! delay slot: r7 = r4+11, limit for the pair loop
524
	 add	 #11, r7	!  50 EX
525

526
	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
	! T = ((r1 & 4) == 0), i.e. an even number of whole long words
527
	tst	r2, r1		!  86 MT
528

529
	mov	r5, r3		!   5 MT (latency=0)
530
	bt/s	4f		! 111 BR
531

532
	 add	#-4, r3		!  50 EX
	! NOTE(review): r2 and r1 are unchanged since the cmp/hs above, which
	! already established r1 >= 4, so the bt/s below appears to be always
	! taken: one long word is stored and everything else is left to the
	! byte loop at 5.  Confirm whether this is intended.
533
	cmp/hs	r2, r1		!  58 MT
534

535
	bt/s	5f		! 111 BR
536
	 mov.l	r6,@-r0		!  30 LS
537

538
	! 4 cycles, 2 long words per iteration
539
3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
540

541
4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
542
	cmp/hi	r7, r0
543

544
	mov.l	r6, @-r0	!  30 LS
545
	bt/s	3b		! 109 BR
546

547
	 mov.l	r2, @-r0	!  30 LS
548

549
	! Copy the final 0-3 bytes
550

551
5:	cmp/eq	r0, r4		!  54 MT
	! r7 = r4+1: limit for the byte loop
552
	add	#-10, r7	!  50 EX
553

554
	bt	9f		! 110 BR
	! r5 = (src - dst) - 1: source of the byte written by the next @-r0
	! byte store
555
	add	#3,r5		!  50 EX
556

557
	! 3 cycles, 1 byte per iteration
558
1:	mov.b	@(r0,r5),r1	!  19 LS
559
	cmp/hi	r7,r0		!  57 MT
560

561
	bt/s	1b		! 111 BR
562
	 mov.b	r1,@-r0		!  28 LS
563

564
9:	rts
565
	 nop
566

567
	!
568
	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
569
	!
570

571
	.balign	32
572
.Lcase2:
	! Bulk copy for (src - dst) & 3 == 2 when fewer than 64 bytes remain.
	! On entry r0 is the long word aligned dst pointer (the copy runs
	! downwards towards r4) and r5 = src - dst.  Data is moved as 16-bit
	! words, two per iteration; the rest is finished at label 10.
573
	! Size is 16 or greater and less than 64, but may have trailing bytes
574

	! r5 = (src - dst) - 2 and r6 = (src - dst) - 4: sources of the two
	! words written by each pair of @-r0 word stores
575
2:	mov	r5, r6		!   5 MT (latency=0)
576
	add	#-2,r5		!  50 EX
577

578
	mov	r4,r2		!   5 MT (latency=0)
579
	add	#-4,r6		!  50 EX
580

	! r2 = r4+7: the loop runs while r0 > r4+7 (unsigned), tested before
	! the two stores
581
	add	#7,r2		!  50 EX
582
3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
583

584
	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
585
	cmp/hi	r2,r0		!  57 MT
586

587
	mov.w	r1,@-r0		!  29 LS
588
	bt/s	3b		! 111 BR
589

590
	 mov.w	r3,@-r0		!  29 LS
591

	! Finish with the word/byte tail at label 10, which expects r5 to be
	! the offset biased by -2 (as it is here)
592
	bra	10f
593
	 nop
594

595

596
	.balign	32
597
.Lcase2b:
	! Bulk copy for (src - dst) & 3 == 2 when at least 64 bytes remain.
	! On entry r0 is the long word aligned dst pointer (the copy runs
	! downwards towards r4) and r5 = src - dst.
	!
	! Three phases: 16-bit words until r0 is 32-byte aligned, 32-byte
	! blocks down to r2, then the word/byte tail at label 10 (which is
	! also used by .Lcase2).
598
	! Size is at least 64 bytes, so will be going round the big loop at least once.
599
	!
600
	!   r2 = rounded up r4
601
	!   r3 = rounded down r0
602

603
	mov	r0, r3		!   5 MT (latency=0)
604
	mov	#(~0x1f), r1	!   6 EX
605

606
	and	r1, r3		!  78 EX
607
	mov	r4, r2		!   5 MT (latency=0)
608

	! T = (r0 is already 32-byte aligned)
609
	cmp/eq	r3, r0		!  54 MT
610
	add	#0x1f, r2	!  50 EX
611

	! r5 = (src - dst) - 2: source of the word written by the next @-r0
	! word store
612
	add	#-2, r5		!  50 EX
613
	bt/s	1f		! 110 BR
614
	 and	r1, r2		!  78 EX
615

616
	! Copy a short word one at a time until we are cache line aligned
617
	!   Normal values: r0, r2, r3, r4
618
	!   Unused: r1, r6, r7
619
	!   Mod: r5 (=r5-2)
620
	!
	! r3 = aligned target + 2, because the loop compares r0 with r3
	! before the store pre-decrements r0
621
	add	#2, r3		!  50 EX
622

623
2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
624
	cmp/eq	r3,r0		!  54 MT
625

626
	bf/s	2b		! 111 BR
627

628
	 mov.w	r1,@-r0		!  29 LS
629

630
	! Copy the cache line aligned blocks
631
	!
632
	! In use: r0, r2, r4, r5 (=r5-2)
633
	! Scratch: r1, r3, r6, r7
634
	!
635
	! We could do this with the four scratch registers, but if src
636
	! and dest hit the same cache line, this will thrash, so make
637
	! use of additional registers.
638
	!
639
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
640
	!   r5:	 src (was r0+r5)
641
	!   r1:	 dest (was r0)
642
	! this can be reversed at the end, so we don't need to save any extra
643
	! state.
644
	!
	! r8-r12 are saved on the stack (r15) here and restored after the loop.
	! r5 becomes an absolute address: r0 + (src - dst) - 2 - 0x1e, i.e. the
	! start of the 32 source bytes that belong just below r0.
645
1:	mov.l	r8, @-r15	!  30 LS
646
	add	r0, r5		!  49 EX
647

648
	mov.l	r9, @-r15	!  30 LS
649
	mov	r0, r1		!   5 MT (latency=0)
650

651
	mov.l	r10, @-r15	!  30 LS
652
	add	#-0x1e, r5	!  50 EX
653

654
	mov.l	r11, @-r15	!  30 LS
655

656
	mov.l	r12, @-r15	!  30 LS
657

658
	! 17 cycles, 32 bytes per iteration
	! In both variants r1 and r5 step down by 0x20 per iteration, xtrct
	! merges neighbouring 16-bit halves into destination long words, and
	! the loop ends when r1 reaches r2.
659
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! The block is read upwards with post-increment loads (2 + 7*4 + 2 =
	! 0x20 bytes); r5 is then rewound by 0x40 for a net step of -0x20.
660
2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
661
	add	#-0x20, r1	!  50 EX
662

663
	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK
664

665
	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
666
	shll16	r0		! 103 EX			JI..
667

668
	mov.l	@r5+, r7	!  15 LS (latency=2)
669
	xtrct	r3, r0		!  48 EX			LKJI
670

671
	mov.l	@r5+, r8	!  15 LS (latency=2)
672
	xtrct	r6, r3		!  48 EX			PONM
673

674
	mov.l	@r5+, r9	!  15 LS (latency=2)
675
	xtrct	r7, r6		!  48 EX
676

677
	mov.l	@r5+, r10	!  15 LS (latency=2)
678
	xtrct	r8, r7		!  48 EX
679

680
	mov.l	@r5+, r11	!  15 LS (latency=2)
681
	xtrct	r9, r8		!  48 EX
682

683
	mov.w	@r5+, r12	!  15 LS (latency=2)
684
	xtrct	r10, r9		!  48 EX
685

686
	movca.l	r0,@r1		!  40 LS (latency=3-7)
687
	xtrct	r11, r10	!  48 EX
688

689
	mov.l	r3, @(0x04,r1)	!  33 LS
690
	xtrct	r12, r11	!  48 EX
691

692
	mov.l	r6, @(0x08,r1)	!  33 LS
693

694
	mov.l	r7, @(0x0c,r1)	!  33 LS
695

696
	mov.l	r8, @(0x10,r1)	!  33 LS
697
	add	#-0x40, r5	!  50 EX
698

699
	mov.l	r9, @(0x14,r1)	!  33 LS
700
	cmp/eq	r2,r1		!  54 MT
701

702
	mov.l	r10, @(0x18,r1)	!  33 LS
703
	bf/s	2b		! 109 BR
704

705
	 mov.l	r11, @(0x1c,r1)	!  33 LS
706
#else
	! The block is read from its highest address downwards using
	! displacement loads; r5 steps by -2 and -0x1e, r1 by -4 and -0x1c.
707
2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
708
	add	#-2, r5		!  50 EX
709

710
	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
711
	add	#-4, r1		!  50 EX
712

713
	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
714
	shll16	r0		! 103 EX
715

716
	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
717
	xtrct	r3, r0		!  48 EX
718

719
	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
720
	xtrct	r6, r3		!  48 EX
721

722
	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
723
	xtrct	r7, r6		!  48 EX
724

725
	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
726
	xtrct	r8, r7		!  48 EX
727

728
	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
729
	xtrct	r9, r8		!  48 EX
730

731
	mov.l   @(0x00,r5), r12 !  18 LS (latency=2)
732
    	xtrct	r10, r9		!  48 EX
733

734
	movca.l	r0,@r1		!  40 LS (latency=3-7)
735
	add	#-0x1c, r1	!  50 EX
736

737
	mov.l	r3, @(0x18,r1)	!  33 LS
738
	xtrct	r11, r10	!  48 EX
739

740
	mov.l	r6, @(0x14,r1)	!  33 LS
741
	xtrct	r12, r11	!  48 EX
742

743
	mov.l	r7, @(0x10,r1)	!  33 LS
744

745
	mov.l	r8, @(0x0c,r1)	!  33 LS
746
	add	#-0x1e, r5	!  50 EX
747

748
	mov.l	r9, @(0x08,r1)	!  33 LS
749
	cmp/eq	r2,r1		!  54 MT
750

751
	mov.l	r10, @(0x04,r1)	!  33 LS
752
	bf/s	2b		! 109 BR
753

754
	 mov.l	r11, @(0x00,r1)	!  33 LS
755
#endif
756

	! Restore the invariant: r0 = dst pointer, r5 = offset again (biased
	! by -0x20 at this point), and pop r12..r8
757
	mov.l	@r15+, r12
758
	mov	r1, r0		!   5 MT (latency=0)
759

760
	mov.l	@r15+, r11	!  15 LS
761
	sub	r1, r5		!  75 EX
762

763
	mov.l	@r15+, r10	!  15 LS
764
	cmp/eq	r4, r0		!  54 MT
765

766
	bf/s	1f		! 109 BR
767
	 mov.l	 @r15+, r9	!  15 LS
768

	! The pop of r8 below serves both as the delay slot of this rts
	! (nothing left to copy) and as the target of the bf/s above
769
	rts
770
1:	 mov.l	@r15+, r8	!  15 LS
771

	! r5 goes from offset - 0x20 back to offset - 2
772
	add	#0x1e, r5	!  50 EX
773

774
	! Finish off a short word at a time
775
	! r5 must be invariant - 2
	! (also entered from .Lcase2 via bra 10f)
776
10:	mov	r4,r2		!   5 MT (latency=0)
777
	add	#1,r2		!  50 EX
778

	! T = (r0 > r4+1), unsigned: at least one whole word is left
779
	cmp/hi	r2, r0		!  57 MT
780
	bf/s	1f		! 109 BR
781

	! delay slot: r2 = r4+3, limit for the word loop
782
	 add	#2, r2		!  50 EX
783

784
3:	mov.w	@(r0,r5),r1	!  20 LS
785
	cmp/hi	r2,r0		!  57 MT
786

787
	bt/s	3b		! 109 BR
788

789
	 mov.w	r1,@-r0		!  29 LS
790
1:
791

792
	!
793
	! Finally, copy the last byte if necessary
794
	cmp/eq	r4,r0		!  54 MT
	! 9b is the nearest preceding "9:" label (an rts with a nop in its
	! delay slot).  The delay slot here moves r5 to offset - 1 for the
	! byte load below.
795
	bt/s	9b
796
	 add	#1,r5
797
	mov.b	@(r0,r5),r1
	! The final byte store executes in the rts delay slot
798
	rts
799
	 mov.b	r1,@-r0
800

801

802
Product

Resources

Company