GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/sh/lib64/copy_user_memcpy.S
!
!	Fast SH memcpy
!
!	by Toshiyasu Morita (tm@netcom.com)
!	hacked by J"orn Rennecke ([email protected]) ("o for o-umlaut)
!	SH5 code Copyright 2002 SuperH Ltd.
!
! Entry: ARG0: destination pointer
!        ARG1: source pointer
!        ARG2: byte count
!
! Exit:  RESULT: destination pointer
!        any other registers in the range r0-r7: trashed
!
! Notes: Usually one wants to do small reads and write a longword, but
!        unfortunately it is difficult in some cases to concatenate bytes
!        into a longword on the SH, so this does a longword read and small
!        writes.
!
! This implementation makes two assumptions about how it is called:
!
! 1.: If the byte count is nonzero, the address of the last byte to be
!     copied is unsigned greater than the address of the first byte to
!     be copied.  This could easily be swapped for a signed comparison,
!     but the algorithm used needs some comparison.
!
! 2.: When there are two or three bytes in the last word of an
!     11-or-more-byte memory chunk to be copied, the rest of the word can
!     be read without side effects.
!     This could easily be changed by increasing the minimum size of a
!     fast memcpy and making the amount subtracted from r7 before
!     L_2l_loop be 2; however, this would cost a few extra cycles on
!     average.
!     For SHmedia, the assumption is that any quadword can be read in its
!     entirety if at least one byte is included in the copy.

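! For reference, the convention above in C terms (a sketch; the exact kernel
! prototype may differ): under the SH5 ABI the arguments arrive in r2..r4
! and the result is returned in r2, so the routine behaves roughly like
!
!	void *copy_user_memcpy(void *dst, const void *src, unsigned long n);
!
! with dst in r2, src in r3 and the byte count in r4, which is how the
! registers are used throughout the code below.
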
/* Imported into Linux kernel by Richard Curnow.  This is used to implement
   the __copy_user function in the general case, so it has to be a distinct
   function from intra-kernel memcpy to allow for exception fix-ups in the
   event that the user pointer is bad somewhere in the copy (e.g. due to
   running off the end of the vma).

   Note, this algorithm will be slightly wasteful in the case where the
   source and destination pointers are equally aligned, because the
   stlo/sthi pairs could then be merged back into single stores.  If there
   are a lot of cache misses, this is probably offset by the stall lengths
   on the preloads.
*/

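! Illustrative C for the point above (a sketch, not the kernel's actual
! fault-handling code): keeping user copies out of the ordinary memcpy lets
! the fault handler recognise and fix up faults by their PC, e.g.
!
!	if (pc >= (unsigned long) copy_user_memcpy &&
!	    pc <  (unsigned long) copy_user_memcpy_end)
!		return fixup_user_copy_fault(regs);  /* hypothetical helper */
!
! hence the copy_user_memcpy_end label exported at the bottom of this file.
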
/* NOTE : Prefetches removed and allocos guarded by synco to avoid the
 * TAKum03020 erratum.  The first two prefetches are nop-ed out to avoid
 * upsetting the instruction counts used in the jump address calculation.
 */

	.section .text..SHmedia32,"ax"
	.little
	.balign 32
	.global copy_user_memcpy
	.global copy_user_memcpy_end
copy_user_memcpy:

#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1

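! The macros above compose unaligned accesses from the SHmedia lo/hi pairs:
! for an unaligned quadword at P, ldlo.q P,0 picks up the bytes that fall in
! the lower aligned quadword and ldhi.q P,7 the bytes in the upper one, so
! or-ing the two results yields the full value; stlo.q/sthi.q split a store
! the same way.  The pattern used throughout the handlers below is, e.g.:
!
!	LDUAQ (r3, 0, r0, r1)	! r0/r1 = the two halves of src[0..7]
!	or r0, r1, r0		! r0 = unaligned quadword loaded from r3
!	sthi.q r2, 7, r0	! high part of the unaligned store to r2
!	stlo.q r2, 0, r0	! low part of the same store
!
! The .l variants do the same for longwords on 4-byte granularity.
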
	nop ! ld.b r3,0,r63 ! TAKum03020
	pta/l Large,tr0
	movi 25,r0
	bgeu/u r4,r0,tr0
	nsb r4,r0
	shlli r0,5,r0
	movi (L1-L0+63*32 + 1) & 0xffff,r1
	sub r1, r0, r0
L0:	ptrel r0,tr0
	add r2,r4,r5
	ptabs r18,tr1
	add r3,r4,r6
	blink tr0,r63

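! How the dispatch above works (annotation added for exposition): counts of
! 25 or more take the Large path; otherwise nsb classifies r4 by the position
! of its most significant bit, so all counts in one power-of-two band
! (0; 1; 2-3; 4-7; 8-15; 16-24) share a handler.  shlli by 5 scales that
! class to a byte offset, since successive entry points below sit 32 bytes
! apart starting at L1 (the "cntd." continuation stanzas are packed into the
! slack between them), and the movi/sub/ptrel sequence turns it into the
! handler address relative to L0.  Meanwhile r5 and r6 are preloaded with
! dest+count and src+count, and tr1 holds the return address (r18), so each
! handler can finish with blink tr1,r63.
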
/* Rearranged to make cut2 safe */
	.balign 8
L4_7:	/* 4..7 byte memcpy cntd. */
	stlo.l r2, 0, r0
	or r6, r7, r6
	sthi.l r5, -1, r6
	stlo.l r5, -4, r6
	blink tr1,r63

	.balign 8
L1:	/* 0 byte memcpy */
	nop
	blink tr1,r63
	nop
	nop
	nop
	nop

L2_3:	/* 2 or 3 byte memcpy cntd. */
	st.b r5,-1,r6
	blink tr1,r63

	/* 1 byte memcpy */
	ld.b r3,0,r0
	st.b r2,0,r0
	blink tr1,r63

L8_15:	/* 8..15 byte memcpy cntd. */
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

	/* 2 or 3 byte memcpy */
	ld.b r3,0,r0
	nop ! ld.b r2,0,r63 ! TAKum03020
	ld.b r3,1,r1
	st.b r2,0,r0
	pta/l L2_3,tr0
	ld.b r6,-1,r6
	st.b r2,1,r1
	blink tr0, r63

	/* 4 .. 7 byte memcpy */
	LDUAL (r3, 0, r0, r1)
	pta L4_7, tr0
	ldlo.l r6, -4, r7
	or r0, r1, r0
	sthi.l r2, 3, r0
	ldhi.l r6, -1, r6
	blink tr0, r63

	/* 8 .. 15 byte memcpy */
	LDUAQ (r3, 0, r0, r1)
	pta L8_15, tr0
	ldlo.q r6, -8, r7
	or r0, r1, r0
	sthi.q r2, 7, r0
	ldhi.q r6, -1, r6
	blink tr0, r63

	/* 16 .. 24 byte memcpy */
	LDUAQ (r3, 0, r0, r1)
	LDUAQ (r3, 8, r8, r9)
	or r0, r1, r0
	sthi.q r2, 7, r0
	or r8, r9, r8
	sthi.q r2, 15, r8
	ldlo.q r6, -8, r7
	ldhi.q r6, -1, r6
	stlo.q r2, 8, r8
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

Large:
	! ld.b r2, 0, r63 ! TAKum03020
	pta/l Loop_ua, tr1
	ori r3, -8, r7
	sub r2, r7, r22
	sub r3, r2, r6
	add r2, r4, r5
	ldlo.q r3, 0, r0
	addi r5, -16, r5
	movi 64+8, r27 ! could subtract r7 from that.
	stlo.q r2, 0, r0
	sthi.q r2, 7, r0
	ldx.q r22, r6, r0
	bgtu/l r27, r4, tr1

	addi r5, -48, r27
	pta/l Loop_line, tr0
	addi r6, 64, r36
	addi r6, -24, r19
	addi r6, -16, r20
	addi r6, -8, r21

Loop_line:
	! ldx.q r22, r36, r63 ! TAKum03020
	alloco r22, 32
	synco
	addi r22, 32, r22
	ldx.q r22, r19, r23
	sthi.q r22, -25, r0
	ldx.q r22, r20, r24
	ldx.q r22, r21, r25
	stlo.q r22, -32, r0
	ldx.q r22, r6, r0
	sthi.q r22, -17, r23
	sthi.q r22, -9, r24
	sthi.q r22, -1, r25
	stlo.q r22, -24, r23
	stlo.q r22, -16, r24
	stlo.q r22, -8, r25
	bgeu r27, r22, tr0

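! Annotation (added for exposition): each pass of Loop_line copies one
! 32-byte cache line.  alloco preallocates a destination operand-cache line
! so it need not be fetched from memory only to be overwritten, and the
! synco right after it is the guard required by the TAKum03020 note at the
! top of the file.  r6/r19/r20/r21 hold source-minus-destination offsets for
! the four quadwords of a line, so ldx.q with r22 fetches the source bytes
! matching destination cursor r22, and every quadword is stored as an
! sthi/stlo pair because the destination may be unaligned.  The commented-out
! ldx.q via r36 (r6 + 64) was a source prefetch two lines ahead, removed for
! the same erratum.
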
Loop_ua:
	addi r22, 8, r22
	sthi.q r22, -1, r0
	stlo.q r22, -8, r0
	ldx.q r22, r6, r0
	bgtu/l r5, r22, tr1

	add r3, r4, r7
	ldlo.q r7, -8, r1
	sthi.q r22, 7, r0
	ldhi.q r7, -1, r7
	ptabs r18,tr1
	stlo.q r22, 0, r0
	or r1, r7, r1
	sthi.q r5, 15, r1
	stlo.q r5, 8, r1
	blink tr1, r63
copy_user_memcpy_end:
	nop
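
! copy_user_memcpy_end marks the first byte past the routine; together with
! copy_user_memcpy it bounds the PC range in which a fault must be treated
! as a user-copy fault, per the import note near the top.  The trailing nop
! is presumably padding so the end label does not coincide with whatever the
! linker places next.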