Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/sh/lib/memcpy-sh4.S
26424 views
1
/* SPDX-License-Identifier: GPL-2.0 */
2
/*
3
* "memcpy" implementation of SuperH
4
*
5
* Copyright (C) 1999 Niibe Yutaka
6
* Copyright (c) 2002 STMicroelectronics Ltd
7
* Modified from memcpy.S and micro-optimised for SH4
8
* Stuart Menefy ([email protected])
9
*
10
*/
11
#include <linux/linkage.h>
12
13
/*
14
* void *memcpy(void *dst, const void *src, size_t n);
15
*
16
* It is assumed that there is no overlap between src and dst.
17
* If there is an overlap, then the results are undefined.
18
*/
19
20
!
21
! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.
22
!
23
24
! Size is 16 or greater, and may have trailing bytes
25
26
.balign 32
27
.Lcase1:
! Bulk path chosen by the dispatch code in memcpy when (src - dst) & 3 == 1.
! Entry state prepared by memcpy: r0 = current end of the destination
! (already long-word aligned), r4 = dst, r5 = src - dst.
! The copy runs from high addresses down: each destination long word is
! assembled from two neighbouring aligned source long words with shifts
! and an OR, then stored through the pre-decrementing @-r0.
28
! Read a long word and write a long word at once
29
! At the start of each iteration, r7 contains last long load
30
add #-1,r5 ! 79 EX
31
mov r4,r2 ! 5 MT (0 cycles latency)
32
33
mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)
34
add #-4,r5 ! 50 EX
35
36
add #7,r2 ! 79 EX
! r2 = dst + 7 is the loop bound: the loop repeats while r0 > r2 (unsigned).
! cmp/hi samples r0 before the store in the bt/s delay slot decrements it.
37
!
38
#ifdef CONFIG_CPU_LITTLE_ENDIAN
39
! 6 cycles, 4 bytes per iteration
40
3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
41
mov r7, r3 ! 5 MT (latency=0) ! RQPO
42
43
cmp/hi r2,r0 ! 57 MT
44
shll16 r3 ! 103 EX
45
46
mov r1,r6 ! 5 MT (latency=0)
47
shll8 r3 ! 102 EX ! Oxxx
48
49
shlr8 r6 ! 106 EX ! xNML
50
mov r1, r7 ! 5 MT (latency=0)
51
52
or r6,r3 ! 82 EX ! ONML
53
bt/s 3b ! 109 BR
54
55
! Delay slot: the store is executed whether or not the branch is taken.
mov.l r3,@-r0 ! 30 LS
56
#else
57
3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN
58
mov r7,r3 ! 5 MT (latency=0) ! OPQR
59
60
cmp/hi r2,r0 ! 57 MT
61
shlr16 r3 ! 107 EX
62
63
shlr8 r3 ! 106 EX ! xxxO
64
mov r1,r6 ! 5 MT (latency=0)
65
66
shll8 r6 ! 102 EX ! LMNx
67
mov r1,r7 ! 5 MT (latency=0)
68
69
or r6,r3 ! 82 EX ! LMNO
70
bt/s 3b ! 109 BR
71
72
! Delay slot: the store is executed whether or not the branch is taken.
mov.l r3,@-r0 ! 30 LS
73
#endif
74
! Finally, copy a byte at once, if necessary
75
76
add #4,r5 ! 50 EX
! r5 is (src - dst) - 1 again, so @(r0,r5) addresses the source byte that
! belongs immediately below r0.
77
cmp/eq r4,r0 ! 54 MT
78
79
add #-6,r2 ! 50 EX
! r2 = dst + 1: bound for the byte loop. T still holds the cmp/eq result
! (r0 == dst means nothing is left to copy).
80
bt 9f ! 109 BR
81
82
8: cmp/hi r2,r0 ! 57 MT
83
mov.b @(r0,r5),r1 ! 20 LS (latency=2)
84
85
bt/s 8b ! 109 BR
86
87
mov.b r1,@-r0 ! 29 LS
88
89
9: rts
90
nop
91
92
93
!
94
! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R...
95
!
96
97
! Size is 16 or greater, and may have trailing bytes
98
99
.balign 32
100
.Lcase3:
! Bulk path chosen by the dispatch code in memcpy when (src - dst) & 3 == 3.
! Entry state prepared by memcpy: r0 = current end of the destination
! (already long-word aligned), r4 = dst, r5 = src - dst.
! Same scheme as the relative-alignment-1 path, but the two source long
! words are merged with the opposite shift amounts (8 and 24 bits).
101
! Read a long word and write a long word at once
102
! At the start of each iteration, r7 contains last long load
103
add #-3,r5 ! 79 EX
104
mov r4,r2 ! 5 MT (0 cycles latency)
105
106
mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)
107
add #-4,r5 ! 50 EX
108
109
add #7,r2 ! 79 EX
! r2 = dst + 7 is the loop bound: the loop repeats while r0 > r2 (unsigned).
! cmp/hi samples r0 before the store in the bt/s delay slot decrements it.
110
!
111
#ifdef CONFIG_CPU_LITTLE_ENDIAN
112
! 6 cycles, 4 bytes per iteration
113
3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
114
mov r7, r3 ! 5 MT (latency=0) ! RQPO
115
116
cmp/hi r2,r0 ! 57 MT
117
shll8 r3 ! 102 EX ! QPOx
118
119
mov r1,r6 ! 5 MT (latency=0)
120
shlr16 r6 ! 107 EX
121
122
shlr8 r6 ! 106 EX ! xxxN
123
mov r1, r7 ! 5 MT (latency=0)
124
125
or r6,r3 ! 82 EX ! QPON
126
bt/s 3b ! 109 BR
127
128
! Delay slot: the store is executed whether or not the branch is taken.
mov.l r3,@-r0 ! 30 LS
129
#else
! Big-endian variant: here the new long word is loaded straight into r7,
! after the previous value has been copied to r3.
130
3: mov r7,r3 ! OPQR
131
shlr8 r3 ! xOPQ
132
mov.l @(r0,r5),r7 ! KLMN
133
mov r7,r6
134
shll16 r6
135
shll8 r6 ! Nxxx
136
or r6,r3 ! NOPQ
137
cmp/hi r2,r0
138
bt/s 3b
139
! Delay slot: the store is executed whether or not the branch is taken.
mov.l r3,@-r0
140
#endif
141
142
! Finally, copy a byte at once, if necessary
143
144
add #6,r5 ! 50 EX
! r5 was (src - dst) - 7 during the loop; it is now (src - dst) - 1, so
! @(r0,r5) addresses the source byte that belongs immediately below r0.
145
cmp/eq r4,r0 ! 54 MT
146
147
add #-6,r2 ! 50 EX
! r2 = dst + 1: bound for the byte loop. T still holds the cmp/eq result
! (r0 == dst means nothing is left to copy).
148
bt 9f ! 109 BR
149
150
8: cmp/hi r2,r0 ! 57 MT
151
mov.b @(r0,r5),r1 ! 20 LS (latency=2)
152
153
bt/s 8b ! 109 BR
154
155
mov.b r1,@-r0 ! 29 LS
156
157
9: rts
158
nop
159
160
ENTRY(memcpy)
! Entry point: handles zero length, the fully aligned fast path, the
! small byte-wise copy, destination alignment, and the dispatch to the
! bulk copy routines.
!
! In:  r4 = dst, r5 = src, r6 = n (see the prototype and diagram).
! Out: r0 = dst on the zero-length path (label 99). Every other path
!      returns with r0 having been decremented down through the
!      destination. NOTE(review): presumably that always ends at dst -
!      confirm against each exit.
! The whole copy runs from the end of the buffers towards the start.
161
162
! Calculate the invariants which will be used in the remainder
163
! of the code:
164
!
165
! r4 --> [ ... ] DST [ ... ] SRC
166
! [ ... ] [ ... ]
167
! : :
168
! r0 --> [ ... ] r0+r5 --> [ ... ]
169
!
170
!
171
172
! Short circuit the common case of src, dst and len being 32 bit aligned
173
! and test for zero length move
174
175
! r0 = n | dst | src, so one "tst #3" checks all three for long-word
! alignment.
mov r6, r0 ! 5 MT (0 cycle latency)
176
or r4, r0 ! 82 EX
177
178
or r5, r0 ! 82 EX
179
tst r6, r6 ! 86 MT
180
181
bt/s 99f ! 111 BR (zero len)
182
! Delay slot: this alignment test runs on both paths. Its T result is
! consumed by "bt/s .Lcase00" below; the mov/add in between leave T alone.
tst #3, r0 ! 87 MT
183
184
! r0 = dst + n: one past the last destination byte.
mov r4, r0 ! 5 MT (0 cycle latency)
185
add r6, r0 ! 49 EX
186
187
mov #16, r1 ! 6 EX
188
bt/s .Lcase00 ! 111 BR (aligned)
189
190
! Delay slot: r5 = src - dst, established for both the aligned and the
! unaligned path.
sub r4, r5 ! 75 EX
191
192
! Arguments are not nicely long word aligned or zero len.
193
! Check for small copies, and if so do a simple byte at a time copy.
194
!
195
! Deciding on an exact value of 'small' is not easy, as the point at which
196
! using the optimised routines become worthwhile varies (these are the
197
! cycle counts for different sizes using byte-at-a-time vs. optimised):
198
! size byte-at-time long word byte
199
! 16 42 39-40 46-50 50-55
200
! 24 58 43-44 54-58 62-67
201
! 36 82 49-50 66-70 80-85
202
! However the penalty for getting it 'wrong' is much higher for long word
203
! aligned data (and this is more common), so use a value of 16.
204
205
! T = (16 > n), signed compare.
cmp/gt r6,r1 ! 56 MT
206
207
! r5 = (src - dst) - 1, so @(r0,r5) reads the source byte that the
! following pre-decrement store @-r0 will write.
add #-1,r5 ! 50 EX
208
bf/s 6f ! 108 BR (not small)
209
210
mov r5, r3 ! 5 MT (latency=0)
! Small copy: r6 = n / 2 (pair count), T = low bit of n (odd length).
211
shlr r6 ! 104 EX
212
213
mov.b @(r0,r5),r1 ! 20 LS (latency=2)
214
bf/s 4f ! 111 BR
215
216
! Delay slot: r3 = r5 - 1 addresses the second byte of each pair.
add #-1,r3 ! 50 EX
! Odd length: store the single leading byte (in the delay slot below) and
! return if no pairs remain.
217
tst r6, r6 ! 86 MT
218
219
bt/s 98f ! 110 BR
220
mov.b r1,@-r0 ! 29 LS
221
222
! 4 cycles, 2 bytes per iteration
223
3: mov.b @(r0,r5),r1 ! 20 LS (latency=2)
224
225
4: mov.b @(r0,r3),r2 ! 20 LS (latency=2)
226
dt r6 ! 67 EX
227
228
mov.b r1,@-r0 ! 29 LS
229
bf/s 3b ! 111 BR
230
231
mov.b r2,@-r0 ! 29 LS
232
98:
233
rts
234
nop
235
236
! Zero-length exit: return dst (the mov sits in the rts delay slot).
99: rts
237
mov r4, r0
238
239
! Size is not small, so it's worthwhile looking for optimisations.
240
! First align destination to a long word boundary.
241
!
242
! r5 = normal value -1
243
244
6: tst #3, r0 ! 87 MT
245
mov #3, r3 ! 6 EX
246
247
bt/s 2f ! 111 BR
248
! Delay slot: r3 = r0 & 3, the number of bytes to copy so that r0 becomes
! long-word aligned.
and r0,r3 ! 78 EX
249
250
! 3 cycles, 1 byte per iteration
251
1: dt r3 ! 67 EX
252
mov.b @(r0,r5),r1 ! 19 LS (latency=2)
253
254
! Keep the remaining length in r6 in step with the bytes copied.
add #-1, r6 ! 79 EX
255
bf/s 1b ! 109 BR
256
257
mov.b r1,@-r0 ! 28 LS
258
259
! Undo the -1 bias: r5 = src - dst again.
2: add #1, r5 ! 79 EX
260
261
! Now select the appropriate bulk transfer code based on relative
262
! alignment of src and dst.
263
264
! r0 is needed as the operand of "tst #imm", so the destination pointer
! is parked in r3 and restored in each branch delay slot below.
mov r0, r3 ! 5 MT (latency=0)
265
266
mov r5, r0 ! 5 MT (latency=0)
267
tst #1, r0 ! 87 MT
268
269
bf/s 1f ! 111 BR
270
mov #64, r7 ! 6 EX
271
272
! bit 0 clear
273
274
! T = (remaining length >= 64), signed compare.
cmp/ge r7, r6 ! 55 MT
275
276
bt/s 2f ! 111 BR
277
! Delay slot: the branch above has already used the size test; T now
! reports whether bit 1 of (src - dst) is clear.
tst #2, r0 ! 87 MT
278
279
! small
280
bt/s .Lcase0
281
mov r3, r0
282
283
bra .Lcase2
284
nop
285
286
! big
287
2: bt/s .Lcase0b
288
mov r3, r0
289
290
bra .Lcase2b
291
nop
292
293
! bit 0 set
294
1: tst #2, r0 ! 87 MT
295
296
bt/s .Lcase1
297
mov r3, r0
298
299
bra .Lcase3
300
nop
301
302
303
!
304
! GHIJ KLMN OPQR --> GHIJ KLMN OPQR
305
!
306
307
! src, dst and size are all long word aligned
308
! size is non-zero
309
310
.balign 32
311
.Lcase00:
! Fast path taken straight from the memcpy entry when src, dst and length
! are all long-word aligned. On entry r0 = dst + n, r5 = src - dst,
! r6 = n. Lengths of 64 or more are handed to .Lcase00b; shorter ones are
! copied here two long words per iteration.
312
mov #64, r1 ! 6 EX
313
mov r5, r3 ! 5 MT (latency=0)
314
315
! T = (64 > n), signed compare.
cmp/gt r6, r1 ! 56 MT
316
! r5 = (src - dst) - 4 so that @(r0,r5) matches the pre-decrement store.
! This adjustment is already in place when control moves to .Lcase00b.
add #-4, r5 ! 50 EX
317
318
! Plain bf (no delay slot): the shlr2 below is skipped when branching.
bf .Lcase00b ! 108 BR (big loop)
319
! r6 = number of long words.
shlr2 r6 ! 105 EX
320
321
! r6 = number of long-word pairs, T = 1 if the long-word count is odd.
shlr r6 ! 104 EX
322
mov.l @(r0, r5), r1 ! 21 LS (latency=2)
323
324
bf/s 4f ! 111 BR
325
! Delay slot: r3 = (src - dst) - 8 addresses the second long of a pair.
add #-8, r3 ! 50 EX
326
327
! Odd count: store the single long word (in the delay slot below) and
! return if no pairs remain.
tst r6, r6 ! 86 MT
328
bt/s 5f ! 110 BR
329
330
mov.l r1,@-r0 ! 30 LS
331
332
! 4 cycles, 2 long words per iteration
333
3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
334
335
4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
336
dt r6 ! 67 EX
337
338
mov.l r1, @-r0 ! 30 LS
339
bf/s 3b ! 109 BR
340
341
mov.l r2, @-r0 ! 30 LS
342
343
5: rts
344
nop
345
346
347
! Size is 16 or greater and less than 64, but may have trailing bytes
348
349
.balign 32
350
.Lcase0:
! Bulk path chosen by the dispatch code in memcpy when (src - dst) & 3 == 0
! and fewer than 64 bytes remain. On entry r0 = long-word aligned end of
! the destination, r4 = dst, r5 = src - dst, r6 = remaining length.
! Copies long words in pairs, then finishes with a byte loop.
351
add #-4, r5 ! 50 EX
352
mov r4, r7 ! 5 MT (latency=0)
353
354
mov.l @(r0, r5), r1 ! 21 LS (latency=2)
355
mov #4, r2 ! 6 EX
356
357
! r7 = dst + 11: the pair loop repeats while r0 > r7 (unsigned), with r0
! sampled before the two stores.
add #11, r7 ! 50 EX
! T = 1 when bit 2 of the length is clear, i.e. no odd long word to peel.
358
tst r2, r6 ! 86 MT
359
360
mov r5, r3 ! 5 MT (latency=0)
361
bt/s 4f ! 111 BR
362
363
! Delay slot: r3 = r5 - 4 addresses the second long of a pair.
add #-4, r3 ! 50 EX
364
mov.l r1,@-r0 ! 30 LS
365
366
! 4 cycles, 2 long words per iteration
367
3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
368
369
4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
370
cmp/hi r7, r0
371
372
mov.l r1, @-r0 ! 30 LS
373
bt/s 3b ! 109 BR
374
375
mov.l r2, @-r0 ! 30 LS
376
377
! Copy the final 0-3 bytes
378
379
! r5 = (src - dst) - 1 for the byte loop.
add #3,r5 ! 50 EX
380
381
cmp/eq r0, r4 ! 54 MT
382
! r7 = dst + 1: bound for the byte loop.
add #-10, r7 ! 50 EX
383
384
bt 9f ! 110 BR
385
386
! 3 cycles, 1 byte per iteration
387
1: mov.b @(r0,r5),r1 ! 19 LS
388
cmp/hi r7,r0 ! 57 MT
389
390
bt/s 1b ! 111 BR
391
mov.b r1,@-r0 ! 28 LS
392
393
9: rts
394
nop
395
396
! Size is at least 64 bytes, so will be going round the big loop at least once.
397
!
398
! r2 = rounded up r4
399
! r3 = rounded down r0
400
401
.balign 32
402
.Lcase0b:
! Bulk path for (src - dst) & 3 == 0 with at least 64 bytes remaining.
! Three phases: (a) copy long words until r0 is 32-byte aligned,
! (b) copy whole 32-byte lines with movca.l down to r2 (dst rounded up to
! a 32-byte boundary), (c) copy what is left below r2 as long words and
! bytes.
! .Lcase0b is entered from the dispatch with r5 = src - dst and applies
! the -4 bias itself; .Lcase00b is entered from .Lcase00, where that bias
! has already been applied.
403
add #-4, r5 ! 50 EX
404
405
.Lcase00b:
406
mov r0, r3 ! 5 MT (latency=0)
407
mov #(~0x1f), r1 ! 6 EX
408
409
and r1, r3 ! 78 EX
410
mov r4, r2 ! 5 MT (latency=0)
411
412
! T = 1 when r0 is already 32-byte aligned.
cmp/eq r3, r0 ! 54 MT
413
add #0x1f, r2 ! 50 EX
414
415
bt/s 1f ! 110 BR
416
! Delay slot: r2 = (dst + 0x1f) & ~0x1f on both paths.
and r1, r2 ! 78 EX
417
418
! copy initial words until cache line aligned
419
420
mov.l @(r0, r5), r1 ! 21 LS (latency=2)
421
! T = 1 when bit 2 of r0 is clear (an even number of long words to go).
tst #4, r0 ! 87 MT
422
423
mov r5, r6 ! 5 MT (latency=0)
424
add #-4, r6 ! 50 EX
425
426
bt/s 4f ! 111 BR
427
! Delay slot: r3 = boundary + 8. The pair loop compares r0 with r3 before
! storing 8 bytes, so it leaves with r0 on the boundary.
add #8, r3 ! 50 EX
428
429
! Odd long word: if bits 3 and 4 of r0 are clear as well, the single
! store in the delay slot below is all that is needed to reach alignment.
tst #0x18, r0 ! 87 MT
430
431
bt/s 1f ! 109 BR
432
mov.l r1,@-r0 ! 30 LS
433
434
! 4 cycles, 2 long words per iteration
435
3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
436
437
4: mov.l @(r0, r6), r7 ! 21 LS (latency=2)
438
cmp/eq r3, r0 ! 54 MT
439
440
mov.l r1, @-r0 ! 30 LS
441
bf/s 3b ! 109 BR
442
443
mov.l r7, @-r0 ! 30 LS
444
445
! Copy the cache line aligned blocks
446
!
447
! In use: r0, r2, r4, r5
448
! Scratch: r1, r3, r6, r7
449
!
450
! We could do this with the four scratch registers, but if src
451
! and dest hit the same cache line, this will thrash, so make
452
! use of additional registers.
453
!
454
! We also need r0 as a temporary (for movca), so 'undo' the invariant:
455
! r5: src (was r0+r5)
456
! r1: dest (was r0)
457
! this can be reversed at the end, so we don't need to save any extra
458
! state.
459
!
460
! r8-r11 are pushed here and popped again after the loop.
1: mov.l r8, @-r15 ! 30 LS
461
add r0, r5 ! 49 EX
462
463
mov.l r9, @-r15 ! 30 LS
464
mov r0, r1 ! 5 MT (latency=0)
465
466
mov.l r10, @-r15 ! 30 LS
467
! With the existing -4 bias this puts r5 at the start of the 32 source
! bytes that belong just below r1.
add #-0x1c, r5 ! 50 EX
468
469
mov.l r11, @-r15 ! 30 LS
470
471
! 16 cycles, 32 bytes per iteration
472
2: mov.l @(0x00,r5),r0 ! 18 LS (latency=2)
473
add #-0x20, r1 ! 50 EX
474
mov.l @(0x04,r5),r3 ! 18 LS (latency=2)
475
mov.l @(0x08,r5),r6 ! 18 LS (latency=2)
476
mov.l @(0x0c,r5),r7 ! 18 LS (latency=2)
477
mov.l @(0x10,r5),r8 ! 18 LS (latency=2)
478
mov.l @(0x14,r5),r9 ! 18 LS (latency=2)
479
mov.l @(0x18,r5),r10 ! 18 LS (latency=2)
480
mov.l @(0x1c,r5),r11 ! 18 LS (latency=2)
481
movca.l r0,@r1 ! 40 LS (latency=3-7)
482
mov.l r3,@(0x04,r1) ! 33 LS
483
mov.l r6,@(0x08,r1) ! 33 LS
484
mov.l r7,@(0x0c,r1) ! 33 LS
485
486
mov.l r8,@(0x10,r1) ! 33 LS
487
add #-0x20, r5 ! 50 EX
488
489
mov.l r9,@(0x14,r1) ! 33 LS
490
! Loop ends when r1 reaches r2 (dst rounded up to a 32-byte boundary).
cmp/eq r2,r1 ! 54 MT
491
492
mov.l r10,@(0x18,r1) ! 33 LS
493
bf/s 2b ! 109 BR
494
495
mov.l r11,@(0x1c,r1) ! 33 LS
496
497
! Re-establish the invariant: r0 = destination pointer, r5 relative again.
mov r1, r0 ! 5 MT (latency=0)
498
499
mov.l @r15+, r11 ! 15 LS
500
sub r1, r5 ! 75 EX
501
502
mov.l @r15+, r10 ! 15 LS
503
! T = 1 when everything has been copied (dst itself was 32-byte aligned).
cmp/eq r4, r0 ! 54 MT
504
505
bf/s 1f ! 109 BR
506
mov.l @r15+, r9 ! 15 LS
507
508
! The instruction at label 1 is both the delay slot of this rts and the
! target of the bf/s above, so r8 is restored on either path.
rts
509
1: mov.l @r15+, r8 ! 15 LS
510
sub r4, r1 ! 75 EX (len remaining)
511
512
! number of trailing bytes is non-zero
513
!
514
! invariants restored (r5 already decremented by 4)
515
! also r1=num bytes remaining
516
517
mov #4, r2 ! 6 EX
518
mov r4, r7 ! 5 MT (latency=0)
519
520
add #0x1c, r5 ! 50 EX (back to -4)
521
! T = (r1 >= 4), unsigned: fewer than 4 bytes left goes straight to the
! byte loop.
cmp/hs r2, r1 ! 58 MT
522
523
bf/s 5f ! 108 BR
524
! Delay slot: r7 = dst + 11, bound for the pair loop below.
add #11, r7 ! 50 EX
525
526
mov.l @(r0, r5), r6 ! 21 LS (latency=2)
527
! T = 1 when bit 2 of the remaining length is clear.
tst r2, r1 ! 86 MT
528
529
mov r5, r3 ! 5 MT (latency=0)
530
bt/s 4f ! 111 BR
531
532
add #-4, r3 ! 50 EX
533
! NOTE(review): r1 and r2 are unchanged since the identical cmp/hs above,
! which already established r1 >= 4 on this path, so T is always 1 here
! and the bt/s below is always taken. After the single long-word store
! the rest (possibly 8 or more bytes) is left to the byte loop at 5, and
! the pair loop at 3/4 is only entered via the bt/s 4f above. The result
! is still a complete copy; confirm whether a different threshold was
! intended.
cmp/hs r2, r1 ! 58 MT
534
535
bt/s 5f ! 111 BR
536
mov.l r6,@-r0 ! 30 LS
537
538
! 4 cycles, 2 long words per iteration
539
3: mov.l @(r0, r5), r6 ! 21 LS (latency=2)
540
541
4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
542
cmp/hi r7, r0
543
544
mov.l r6, @-r0 ! 30 LS
545
bt/s 3b ! 109 BR
546
547
mov.l r2, @-r0 ! 30 LS
548
549
! Copy the final 0-3 bytes
550
551
5: cmp/eq r0, r4 ! 54 MT
552
! r7 = dst + 1: bound for the byte loop.
add #-10, r7 ! 50 EX
553
554
bt 9f ! 110 BR
555
! r5 = (src - dst) - 1 for the byte loop.
add #3,r5 ! 50 EX
556
557
! 3 cycles, 1 byte per iteration
558
1: mov.b @(r0,r5),r1 ! 19 LS
559
cmp/hi r7,r0 ! 57 MT
560
561
bt/s 1b ! 111 BR
562
mov.b r1,@-r0 ! 28 LS
563
564
9: rts
565
nop
566
567
!
568
! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR..
569
!
570
571
.balign 32
572
.Lcase2:
! Bulk path chosen by the dispatch code in memcpy when (src - dst) & 3 == 2
! and fewer than 64 bytes remain. On entry r0 = long-word aligned end of
! the destination, r4 = dst, r5 = src - dst. Copies two 16-bit words per
! iteration, then jumps to the shared word/byte tail at label 10.
573
! Size is 16 or greater and less than 64, but may have trailing bytes
574
575
2: mov r5, r6 ! 5 MT (latency=0)
576
! r5 = (src - dst) - 2: the bias that the tail at label 10 expects.
add #-2,r5 ! 50 EX
577
578
mov r4,r2 ! 5 MT (latency=0)
579
! r6 = (src - dst) - 4 addresses the second word of each pair.
add #-4,r6 ! 50 EX
580
581
! r2 = dst + 7: the loop repeats while r0 > r2 (unsigned), with r0
! sampled before the two stores.
add #7,r2 ! 50 EX
582
3: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
583
584
mov.w @(r0,r6),r3 ! 20 LS (latency=2)
585
cmp/hi r2,r0 ! 57 MT
586
587
mov.w r1,@-r0 ! 29 LS
588
bt/s 3b ! 111 BR
589
590
mov.w r3,@-r0 ! 29 LS
591
592
bra 10f
593
nop
594
595
596
.balign 32
597
.Lcase2b:
! Bulk path chosen by the dispatch code in memcpy when (src - dst) & 3 == 2
! and at least 64 bytes remain. On entry r0 = long-word aligned end of the
! destination, r4 = dst, r5 = src - dst.
! Three phases: (a) copy 16-bit words until r0 is 32-byte aligned,
! (b) copy whole 32-byte lines, realigning the source data with xtrct and
! storing with movca.l, down to r2 (dst rounded up to a 32-byte boundary),
! (c) the word/byte tail at label 10, which the small .Lcase2 path shares.
598
! Size is at least 64 bytes, so will be going round the big loop at least once.
599
!
600
! r2 = rounded up r4
601
! r3 = rounded down r0
602
603
mov r0, r3 ! 5 MT (latency=0)
604
mov #(~0x1f), r1 ! 6 EX
605
606
and r1, r3 ! 78 EX
607
mov r4, r2 ! 5 MT (latency=0)
608
609
! T = 1 when r0 is already 32-byte aligned.
cmp/eq r3, r0 ! 54 MT
610
add #0x1f, r2 ! 50 EX
611
612
add #-2, r5 ! 50 EX
613
bt/s 1f ! 110 BR
614
! Delay slot: r2 = (dst + 0x1f) & ~0x1f on both paths.
and r1, r2 ! 78 EX
615
616
! Copy a short word one at a time until we are cache line aligned
617
! Normal values: r0, r2, r3, r4
618
! Unused: r1, r6, r7
619
! Mod: r5 (=r5-2)
620
!
621
! r3 = boundary + 2: r0 is compared before the 2-byte store, so the loop
! leaves with r0 on the boundary.
add #2, r3 ! 50 EX
622
623
2: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
624
cmp/eq r3,r0 ! 54 MT
625
626
bf/s 2b ! 111 BR
627
628
mov.w r1,@-r0 ! 29 LS
629
630
! Copy the cache line aligned blocks
631
!
632
! In use: r0, r2, r4, r5 (=r5-2)
633
! Scratch: r1, r3, r6, r7
634
!
635
! We could do this with the four scratch registers, but if src
636
! and dest hit the same cache line, this will thrash, so make
637
! use of additional registers.
638
!
639
! We also need r0 as a temporary (for movca), so 'undo' the invariant:
640
! r5: src (was r0+r5)
641
! r1: dest (was r0)
642
! this can be reversed at the end, so we don't need to save any extra
643
! state.
644
!
645
! r8-r12 are pushed here and popped again after the loop.
1: mov.l r8, @-r15 ! 30 LS
646
add r0, r5 ! 49 EX
647
648
mov.l r9, @-r15 ! 30 LS
649
mov r0, r1 ! 5 MT (latency=0)
650
651
mov.l r10, @-r15 ! 30 LS
652
add #-0x1e, r5 ! 50 EX
653
654
mov.l r11, @-r15 ! 30 LS
655
656
mov.l r12, @-r15 ! 30 LS
657
658
! 17 cycles, 32 bytes per iteration
659
#ifdef CONFIG_CPU_LITTLE_ENDIAN
! Little-endian variant: the post-increment loads advance r5 by 32 bytes
! in total (2 + 7*4 + 2), and the "add #-0x40" below then moves it back
! 64, a net step of -32 per iteration.
660
2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI
661
add #-0x20, r1 ! 50 EX
662
663
mov.l @r5+, r3 ! 15 LS (latency=2) NMLK
664
665
mov.l @r5+, r6 ! 15 LS (latency=2) RQPO
666
shll16 r0 ! 103 EX JI..
667
668
mov.l @r5+, r7 ! 15 LS (latency=2)
669
xtrct r3, r0 ! 48 EX LKJI
670
671
mov.l @r5+, r8 ! 15 LS (latency=2)
672
xtrct r6, r3 ! 48 EX PONM
673
674
mov.l @r5+, r9 ! 15 LS (latency=2)
675
xtrct r7, r6 ! 48 EX
676
677
mov.l @r5+, r10 ! 15 LS (latency=2)
678
xtrct r8, r7 ! 48 EX
679
680
mov.l @r5+, r11 ! 15 LS (latency=2)
681
xtrct r9, r8 ! 48 EX
682
683
mov.w @r5+, r12 ! 15 LS (latency=2)
684
xtrct r10, r9 ! 48 EX
685
686
movca.l r0,@r1 ! 40 LS (latency=3-7)
687
xtrct r11, r10 ! 48 EX
688
689
mov.l r3, @(0x04,r1) ! 33 LS
690
xtrct r12, r11 ! 48 EX
691
692
mov.l r6, @(0x08,r1) ! 33 LS
693
694
mov.l r7, @(0x0c,r1) ! 33 LS
695
696
mov.l r8, @(0x10,r1) ! 33 LS
697
add #-0x40, r5 ! 50 EX
698
699
mov.l r9, @(0x14,r1) ! 33 LS
700
! Loop ends when r1 reaches r2 (dst rounded up to a 32-byte boundary).
cmp/eq r2,r1 ! 54 MT
701
702
mov.l r10, @(0x18,r1) ! 33 LS
703
bf/s 2b ! 109 BR
704
705
mov.l r11, @(0x1c,r1) ! 33 LS
706
#else
! Big-endian variant: loads and stores walk the line from its highest
! offset downwards. r1 is stepped by -4 before the movca.l and by -0x1c
! after it, and r5 by -2 and -0x1e, so both move -0x20 per iteration.
707
2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2)
708
add #-2, r5 ! 50 EX
709
710
mov.l @(0x1c,r5), r3 ! 18 LS (latency=2)
711
add #-4, r1 ! 50 EX
712
713
mov.l @(0x18,r5), r6 ! 18 LS (latency=2)
714
shll16 r0 ! 103 EX
715
716
mov.l @(0x14,r5), r7 ! 18 LS (latency=2)
717
xtrct r3, r0 ! 48 EX
718
719
mov.l @(0x10,r5), r8 ! 18 LS (latency=2)
720
xtrct r6, r3 ! 48 EX
721
722
mov.l @(0x0c,r5), r9 ! 18 LS (latency=2)
723
xtrct r7, r6 ! 48 EX
724
725
mov.l @(0x08,r5), r10 ! 18 LS (latency=2)
726
xtrct r8, r7 ! 48 EX
727
728
mov.l @(0x04,r5), r11 ! 18 LS (latency=2)
729
xtrct r9, r8 ! 48 EX
730
731
mov.l @(0x00,r5), r12 ! 18 LS (latency=2)
732
xtrct r10, r9 ! 48 EX
733
734
movca.l r0,@r1 ! 40 LS (latency=3-7)
735
add #-0x1c, r1 ! 50 EX
736
737
mov.l r3, @(0x18,r1) ! 33 LS
738
xtrct r11, r10 ! 48 EX
739
740
mov.l r6, @(0x14,r1) ! 33 LS
741
xtrct r12, r11 ! 48 EX
742
743
mov.l r7, @(0x10,r1) ! 33 LS
744
745
mov.l r8, @(0x0c,r1) ! 33 LS
746
add #-0x1e, r5 ! 50 EX
747
748
mov.l r9, @(0x08,r1) ! 33 LS
749
! Loop ends when r1 reaches r2 (dst rounded up to a 32-byte boundary).
cmp/eq r2,r1 ! 54 MT
750
751
mov.l r10, @(0x04,r1) ! 33 LS
752
bf/s 2b ! 109 BR
753
754
mov.l r11, @(0x00,r1) ! 33 LS
755
#endif
756
757
! Restore the saved registers and re-establish the invariant:
! r0 = destination pointer, r5 relative to r0 again.
mov.l @r15+, r12
758
mov r1, r0 ! 5 MT (latency=0)
759
760
mov.l @r15+, r11 ! 15 LS
761
sub r1, r5 ! 75 EX
762
763
mov.l @r15+, r10 ! 15 LS
764
! T = 1 when everything has been copied (dst itself was 32-byte aligned).
cmp/eq r4, r0 ! 54 MT
765
766
bf/s 1f ! 109 BR
767
mov.l @r15+, r9 ! 15 LS
768
769
! The instruction at label 1 is both the delay slot of this rts and the
! target of the bf/s above, so r8 is restored on either path.
rts
770
1: mov.l @r15+, r8 ! 15 LS
771
772
! Undo the -0x1e offset applied before the line loop, leaving r5 at
! (src - dst) - 2 as the tail below requires.
add #0x1e, r5 ! 50 EX
773
774
! Finish off a short word at a time
775
! r5 must be invariant - 2
776
10: mov r4,r2 ! 5 MT (latency=0)
777
add #1,r2 ! 50 EX
778
779
! T = (r0 > dst + 1), unsigned: at least one whole word is left.
cmp/hi r2, r0 ! 57 MT
780
bf/s 1f ! 109 BR
781
782
! Delay slot: r2 = dst + 3, bound for the word loop.
add #2, r2 ! 50 EX
783
784
3: mov.w @(r0,r5),r1 ! 20 LS
785
cmp/hi r2,r0 ! 57 MT
786
787
bt/s 3b ! 109 BR
788
789
mov.w r1,@-r0 ! 29 LS
790
1:
791
792
!
793
! Finally, copy the last byte if necessary
794
cmp/eq r4,r0 ! 54 MT
795
! "9b" is the nearest preceding label 9, i.e. the "rts; nop" that ends
! the relative-alignment-0 big-copy section.
bt/s 9b
796
! Delay slot: r5 = (src - dst) - 1 for the single byte load.
add #1,r5
797
mov.b @(r0,r5),r1
798
rts
799
mov.b r1,@-r0
800
801
802