Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
awilliam
GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/sh/lib64/memcpy.S
10817 views
1
/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
2
/* Modified by SuperH, Inc. September 2003 */
3
!
4
! Fast SH memcpy
5
!
6
! by Toshiyasu Morita (tm@netcom.com)
7
! hacked by J"orn Rennecke ([email protected]) ("o for o-umlaut)
8
! SH5 code Copyright 2002 SuperH Ltd.
9
!
10
! Entry: ARG0: destination pointer
11
! ARG1: source pointer
12
! ARG2: byte count
13
!
14
! Exit: RESULT: destination pointer
15
! any other registers in the range r0-r7: trashed
16
!
17
! Notes: Usually one wants to do small reads and write a longword, but
18
! unfortunately it is difficult in some cases to concatenate bytes
19
! into a longword on the SH, so this does a longword read and small
20
! writes.
21
!
22
! This implementation makes two assumptions about how it is called:
23
!
24
! 1.: If the byte count is nonzero, the address of the last byte to be
25
! copied is unsigned greater than the address of the first byte to
26
! be copied. This could be easily swapped for a signed comparison,
27
! but the algorithm used needs some comparison.
28
!
29
! 2.: When there are two or three bytes in the last word of an 11-or-more
30
! bytes memory chunk to be copied, the rest of the word can be read
31
! without side effects.
32
! This could be easily changed by increasing the minimum size of
33
! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
34
! however, this would cost a few extra cycles on average.
35
! For SHmedia, the assumption is that any quadword can be read in its
36
! entirety if at least one byte is included in the copy.
37
!
38
39
.section .text..SHmedia32,"ax"
40
.globl memcpy
41
.type memcpy, @function
42
.align 5
43
44
/* memcpy entry: classify the byte count and jump to a size-specific handler. */
/* Register use visible in this routine: r2 is the store base (destination), */
/* r3 the load base (source), r4 the byte count; r2 is never written, so it */
/* still holds the destination pointer on return. */
/* NOTE(review): besides r0-r7 this SHmedia body also writes r8, r9, r19-r25, */
/* r27, r36, tr0 and tr1 - confirm all of these are caller-saved in the ABI. */
memcpy:
45
46
/* Unaligned access helpers: each expands to a lo/hi instruction pair on the */
/* quadword (Q, offsets O and O+7) or longword (L, offsets O and O+3) at P+O. */
/* The load variants leave two partial results in D0 and D1; the callers below */
/* OR them together. STUAQ and STUAL are not invoked in the code shown here. */
#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
47
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
48
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
49
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
50
51
ld.b r3,0,r63 /* byte load from the source with r63 as target; the value is never used - presumably an early touch of the source */
52
pta/l Large,tr0 /* tr0 = Large */
53
movi 25,r0
54
bgeu/u r4,r0,tr0 /* count >= 25 (unsigned): go to Large */
55
nsb r4,r0 /* r0 = sign-bit count of the byte count; it selects the size class (0, 1, 2-3, 4-7, 8-15, 16-24) */
56
shlli r0,5,r0 /* scale by 32: the handlers are laid out in 32-byte slots */
57
movi (L1-L0+63*32 + 1) & 0xffff,r1 /* distance from L0 to the 0-byte handler L1 plus 63 slots; the +1 presumably flags an SHmedia target - confirm */
58
sub r1, r0, r0 /* r0 = offset from L0 to the handler slot for this size class */
59
L0: ptrel r0,tr0 /* tr0 = L0 + r0 */
60
add r2,r4,r5 /* r5 = destination + count (one past the last destination byte) */
61
ptabs r18,tr1 /* tr1 = return target taken from r18 */
62
add r3,r4,r6 /* r6 = source + count (one past the last source byte) */
63
blink tr0,r63 /* jump to the selected handler */
64
65
/* Rearranged to make cut2 safe */
/* Handlers for counts 0..24. The dispatch at L0 scales the size class by 32 */
/* bytes and adds it to the address of L1, so starting at L1 each handler is */
/* expected to occupy one slot of eight instructions. The "cntd." fragments */
/* L2_3 and L8_15 fill the unused tail of a slot, so adding or removing an */
/* instruction between L1 and the 16..24 byte handler breaks the dispatch. */
/* On entry to every handler: r2 = destination, r3 = source, */
/* r5 = destination + count, r6 = source + count, tr1 = return target. */
66
.balign 8
67
L4_7: /* 4..7 byte memcpy cntd. */
/* r0 = first source longword, r6/r7 = the two parts of the last source longword. */
68
stlo.l r2, 0, r0
69
or r6, r7, r6 /* combine the two parts of the last longword */
70
sthi.l r5, -1, r6 /* store the last 4 bytes so that they end at destination + count */
71
stlo.l r5, -4, r6
72
blink tr1,r63 /* return */
73
74
.balign 8
75
L1: /* 0 byte memcpy */
76
nop
77
blink tr1,r63 /* return without copying */
78
nop
79
nop
80
nop
81
nop
82
83
L2_3: /* 2 or 3 byte memcpy cntd. */
84
st.b r5,-1,r6 /* write the last byte (loaded into r6 by the 2 or 3 byte handler) at destination + count - 1 */
85
blink tr1,r63 /* return */
86
87
/* 1 byte memcpy */
88
ld.b r3,0,r0
89
st.b r2,0,r0
90
blink tr1,r63 /* return */
91
92
L8_15: /* 8..15 byte memcpy cntd. */
/* r0 = first source quadword, r6/r7 = the two parts of the last source quadword. */
93
stlo.q r2, 0, r0
94
or r6, r7, r6 /* combine the two parts of the last quadword */
95
sthi.q r5, -1, r6 /* store the last 8 bytes so that they end at destination + count */
96
stlo.q r5, -8, r6
97
blink tr1,r63 /* return */
98
99
/* 2 or 3 byte memcpy */
/* Copies bytes 0 and 1 here, then the last byte in L2_3; for a 2-byte copy the last byte is byte 1 again. */
100
ld.b r3,0,r0
101
ld.b r2,0,r63 /* load from the destination into r63; the value is never used */
102
ld.b r3,1,r1
103
st.b r2,0,r0
104
pta/l L2_3,tr0
105
ld.b r6,-1,r6 /* r6 = last source byte (r6 was source + count) */
106
st.b r2,1,r1
107
blink tr0, r63 /* continue at L2_3 */
108
109
/* 4 .. 7 byte memcpy */
/* Copies the first and the last longword; the two overlap when count < 8. */
110
LDUAL (r3, 0, r0, r1)
111
pta L4_7, tr0
112
ldlo.l r6, -4, r7 /* one part of the last source longword (source + count - 4) */
113
or r0, r1, r0 /* r0 = first source longword */
114
sthi.l r2, 3, r0
115
ldhi.l r6, -1, r6 /* other part of the last source longword */
116
blink tr0, r63 /* continue at L4_7 */
117
118
/* 8 .. 15 byte memcpy */
/* Copies the first and the last quadword; the two overlap when count < 16. */
119
LDUAQ (r3, 0, r0, r1)
120
pta L8_15, tr0
121
ldlo.q r6, -8, r7 /* one part of the last source quadword (source + count - 8) */
122
or r0, r1, r0 /* r0 = first source quadword */
123
sthi.q r2, 7, r0
124
ldhi.q r6, -1, r6 /* other part of the last source quadword */
125
blink tr0, r63 /* continue at L8_15 */
126
127
/* 16 .. 24 byte memcpy */
/* Copies source bytes 0..15 as two quadwords, then the last quadword ending at count. */
128
LDUAQ (r3, 0, r0, r1)
129
LDUAQ (r3, 8, r8, r9)
130
or r0, r1, r0 /* r0 = source bytes 0..7 */
131
sthi.q r2, 7, r0
132
or r8, r9, r8 /* r8 = source bytes 8..15 */
133
sthi.q r2, 15, r8
134
ldlo.q r6, -8, r7 /* the two parts of the last source quadword */
135
ldhi.q r6, -1, r6
136
stlo.q r2, 8, r8
137
stlo.q r2, 0, r0
138
or r6, r7, r6
139
sthi.q r5, -1, r6 /* store the last 8 bytes so that they end at destination + count */
140
stlo.q r5, -8, r6
141
blink tr1,r63 /* return */
142
143
/* Copies of 25 bytes or more (the entry code branches here when count >= 25). */
/* r2 = destination, r3 = source, r4 = count on entry. The destination cursor */
/* r22 is chosen so that r22 + r6 (r6 = source - destination) is the first */
/* 8-byte boundary above the source pointer; every ldx.q through r22 + r6 */
/* therefore reads an aligned source quadword, while the stores use sthi.q / */
/* stlo.q pairs on the destination. */
Large:
144
ld.b r2, 0, r63 /* load from the destination into r63; the value is never used */
145
pta/l Loop_ua, tr1 /* tr1 is reused as the Loop_ua target; the return target is reloaded from r18 before the final blink */
146
ori r3, -8, r7 /* r7 = (source & 7) - 8, a value in -8..-1 */
147
sub r2, r7, r22 /* r22 = destination + 8 - (source & 7) */
148
sub r3, r2, r6 /* r6 = source - destination: added to a destination address it gives the matching source address */
149
add r2, r4, r5
150
ldlo.q r3, 0, r0 /* partial load of the first source quadword */
151
addi r5, -16, r5 /* r5 = destination + count - 16 */
/* r27 = 72: smallest count for which the 32-byte line loop is entered. */
152
movi 64+8, r27 // could subtract r7 from that.
153
stlo.q r2, 0, r0 /* write 8 bytes at the destination start; those from r22 onwards are rewritten by the later stores */
154
sthi.q r2, 7, r0
155
ldx.q r22, r6, r0 /* r0 = first aligned source quadword, stored to destination r22 later */
156
bgtu/l r27, r4, tr1 /* count < 72: skip the line loop and go to Loop_ua */
157
158
addi r5, -48, r27 /* r27 = destination + count - 64, the Loop_line limit */
159
pta/l Loop_line, tr0
160
addi r6, 64, r36 /* r36: source offset 64 bytes ahead of the cursor */
161
addi r6, -24, r19 /* r19..r21: source offsets for the 2nd, 3rd and 4th quadword of a 32-byte group */
162
addi r6, -16, r20
163
addi r6, -8, r21
164
165
/* Main loop: 32 bytes per iteration. r0 enters each iteration holding the */
/* quadword for the current r22 and is reloaded for the next iteration, */
/* so loads run one quadword ahead of the stores. */
Loop_line:
166
ldx.q r22, r36, r63 /* load 64 bytes ahead in the source into r63; the value is never used - presumably a prefetch */
167
alloco r22, 32 /* presumably allocates the destination cache block at r22 + 32 without fetching it - confirm against the SH-5 manual */
168
addi r22, 32, r22 /* advance the destination cursor; offsets below are relative to the new r22 */
169
ldx.q r22, r19, r23 /* r23 = source quadword for destination r22 - 24 */
170
sthi.q r22, -25, r0 /* store r0 to destination r22 - 32 (sthi/stlo pair, second half below) */
171
ldx.q r22, r20, r24 /* r24 = source quadword for destination r22 - 16 */
172
ldx.q r22, r21, r25 /* r25 = source quadword for destination r22 - 8 */
173
stlo.q r22, -32, r0
174
ldx.q r22, r6, r0 /* r0 = source quadword for destination r22, stored by the next iteration or by Loop_ua */
175
sthi.q r22, -17, r23
176
sthi.q r22, -9, r24
177
sthi.q r22, -1, r25
178
stlo.q r22, -24, r23
179
stlo.q r22, -16, r24
180
stlo.q r22, -8, r25
181
bgeu r27, r22, tr0 /* repeat while r22 <= destination + count - 64 (unsigned) */
182
183
/* Quadword loop: 8 bytes per iteration, same one-ahead pipelining through r0. */
Loop_ua:
184
addi r22, 8, r22
185
sthi.q r22, -1, r0 /* store r0 to destination r22 - 8 */
186
stlo.q r22, -8, r0
187
ldx.q r22, r6, r0 /* r0 = source quadword for destination r22 */
188
bgtu/l r5, r22, tr1 /* repeat while r22 < destination + count - 16 (unsigned) */
189
190
/* Tail: store the pending quadword in r0 at r22, then copy the last 8 source */
/* bytes so that they end exactly at destination + count. */
add r3, r4, r7 /* r7 = source + count */
191
ldlo.q r7, -8, r1 /* the two parts of the last source quadword */
192
sthi.q r22, 7, r0
193
ldhi.q r7, -1, r7
194
ptabs r18,tr1 /* tr1 = return target taken from r18 */
195
stlo.q r22, 0, r0
196
or r1, r7, r1 /* r1 = last 8 source bytes */
197
sthi.q r5, 15, r1 /* r5 + 16 = destination + count, so this pair writes destination + count - 8 .. count - 1 */
198
stlo.q r5, 8, r1
199
blink tr1, r63 /* return; r2 still holds the destination pointer */
200
201
.size memcpy,.-memcpy
202
203