GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/alpha/lib/ev6-stxcpy.S
/*
 * arch/alpha/lib/ev6-stxcpy.S
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Copy a null-terminated string from SRC to DST.
 *
 * This is an internal routine used by strcpy, stpcpy, and strcat.
 * As such, it uses special linkage conventions to make implementation
 * of these public functions more efficient.
 *
 * On input:
 *	t9  = return address
 *	a0  = DST
 *	a1  = SRC
 *
 * On output:
 *	t12 = bitmask (with one bit set) indicating the last byte written
 *	a0  = unaligned address of the last *word* written
 *
 * Furthermore, v0, a3-a5, t11, and t12 are untouched.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */
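
/* As an illustrative C sketch (the function name here is hypothetical, not a
   kernel interface), the visible effect is a plain byte-by-byte string copy.
   The assembly below does the same job a 64-bit word at a time and, instead
   of returning a pointer, reports where it stopped through t12 (a one-hot
   mask selecting the last byte written within the last word) and a0 (the
   unaligned address of that last word):

	char *stxcpy_effect(char *dst, const char *src)
	{
		while ((*dst++ = *src++) != '\0')
			continue;
		return dst - 1;		// address of the stored null byte
	}
*/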

#include <asm/regdef.h>

	.set noat
	.set noreorder

	.text

/* Either gdb (as of 4.16) or gas (as of 2.7) has a problem with the entry
   point for a procedure sitting somewhere in the middle of the procedure
   descriptor.  Work around this by putting the aligned copy in its own
   procedure descriptor.  */

	.ent stxcpy_aligned
	.align 4
stxcpy_aligned:
	.frame sp, 0, t9
	.prologue 0

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == the first source word.  */

	/* Create the 1st output word and detect 0's in the 1st input word.  */
	lda	t2, -1		# E : build a mask against false zero
	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
	mskqh	t1, a1, t3	# U :
	ornot	t1, t2, t2	# E : (stall)

	mskql	t0, a1, t0	# U : assemble the first output word
	cmpbge	zero, t2, t8	# E : bits set iff null found
	or	t0, t3, t1	# E : (stall)
	bne	t8, $a_eos	# U : (stall)
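
	/* The null test above (and throughout this file) relies on cmpbge
	   against zero: bit i of the result is set iff byte i of the operand
	   is 0x00.  The "mask against false zero" ORs 0xff into the bytes
	   that precede the string's first byte within the word, so garbage
	   in those don't-care bytes cannot be mistaken for the terminator.
	   A C model, purely for illustration (the helper name is
	   hypothetical):

		unsigned long cmpbge_zero(unsigned long w)
		{
			unsigned long mask = 0;
			int i;

			for (i = 0; i < 8; i++)
				if (((w >> (8 * i)) & 0xff) == 0)
					mask |= 1ul << i;
			return mask;	// nonzero iff the word holds a 0 byte
		}
	*/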

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == a source word not containing a null.  */
	/* Nops here to separate store quads from load quads */

$a_loop:
	stq_u	t1, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	nop
	nop

	ldq_u	t1, 0(a1)	# L : Latency=3
	addq	a1, 8, a1	# E :
	cmpbge	zero, t1, t8	# E : (3 cycle stall)
	beq	t8, $a_loop	# U : (stall for t8)

	/* Take care of the final (partial) word store.
	   On entry to this basic block we have:
	   t1 == the source word containing the null
	   t8 == the cmpbge mask that found it.  */
$a_eos:
	negq	t8, t6		# E : find low bit set
	and	t8, t6, t12	# E : (stall)
	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it.  */
	and	t12, 0x80, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t0, 0(a0)	# L : Latency=3
	subq	t12, 1, t6	# E :
	zapnot	t1, t6, t1	# U : clear src bytes >= null (stall)
	or	t12, t6, t8	# E : (stall)

	zap	t0, t8, t0	# E : clear dst bytes <= null
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	.end stxcpy_aligned
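
/* A C model of the $a_eos tail merge above (the analogous $u_final tail
   appears later in the file); the helper and its name are hypothetical,
   purely for illustration.  'found' stands for the cmpbge result, so its
   lowest set bit marks the byte holding the terminating null:

	unsigned long merge_last_word(unsigned long src, unsigned long dst,
				      unsigned long found)
	{
		unsigned long last = found & -found;	// one-hot: the null byte (t12)
		unsigned long below = last - 1;		// bytes before the null
		unsigned long keep = 0;
		int i;

		for (i = 0; i < 8; i++) {
			unsigned long byte = 0xfful << (8 * i);

			if (below & (1ul << i))
				keep |= src & byte;	// zapnot: keep src bytes before the null
			else if (!((last | below) & (1ul << i)))
				keep |= dst & byte;	// zap: keep dst bytes after the null
		}
		return keep;		// the word stored by the final stq_u
	}
*/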

	.align 4
	.ent __stxcpy
	.globl __stxcpy
__stxcpy:
	.frame sp, 0, t9
	.prologue 0

	/* Are source and destination co-aligned?  */
	xor	a0, a1, t0	# E :
	unop			# E :
	and	t0, 7, t0	# E : (stall)
	bne	t0, $unaligned	# U : (stall)
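
	/* In C terms the test above simply asks whether DST and SRC share the
	   same offset within an 8-byte word; an illustrative rewrite:

		if (((unsigned long)dst ^ (unsigned long)src) & 7)
			goto unaligned;
	*/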

	/* We are co-aligned; take care of a partial first word.  */
	ldq_u	t1, 0(a1)	# L : load first src word
	and	a0, 7, t0	# E : take care not to load a word ...
	addq	a1, 8, a1	# E :
	beq	t0, stxcpy_aligned	# U : ... if we won't need it (stall)

	ldq_u	t0, 0(a0)	# L :
	br	stxcpy_aligned	# L0 : Latency=3
	nop
	nop


/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */
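
/* Over-reading is survivable at all only because every load here is an
   aligned 8-byte ldq_u, and an aligned quadword never straddles a page
   boundary (with 8KB pages, (p & ~7ul) >> 13 == ((p & ~7ul) + 7) >> 13 for
   any address p).  Reading the rest of an aligned word that contains a byte
   we are entitled to touch therefore cannot fault; only advancing to the
   next aligned word before the current one is known to be null-free could,
   and that is what the checks below avoid.  */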

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, for masking back in, if needed else 0
	   t1 == the low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	ldq_u	t2, 8(a1)	# L :
	addq	a1, 8, a1	# E :
	extql	t1, a1, t1	# U : (stall on a1)
	extqh	t2, a1, t4	# U : (stall on a1)

	mskql	t0, a0, t0	# U :
	or	t1, t4, t1	# E :
	mskqh	t1, a0, t1	# U : (stall on t1)
	or	t0, t1, t1	# E : (stall on t1)

	or	t1, t6, t6	# E :
	cmpbge	zero, t6, t8	# E : (stall)
	lda	t6, -1		# E : for masking just below
	bne	t8, $u_final	# U : (stall)

	mskql	t6, a1, t6	# U : mask out the bits we have
	or	t6, t2, t2	# E :   already extracted before (stall)
	cmpbge	zero, t2, t8	# E : testing eos (stall)
	bne	t8, $u_late_head_exit	# U : (stall)

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */

	stq_u	t1, 0(a0)	# L : store first output word
	addq	a0, 8, a0	# E :
	extql	t2, a1, t0	# U : position ho-bits of lo word
	ldq_u	t2, 8(a1)	# U : read next high-order source word

	addq	a1, 8, a1	# E :
	cmpbge	zero, t2, t8	# E : (stall for t2)
	nop			# E :
	bne	t8, $u_eos	# U : (stall)

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */
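
	/* A rough C model of the loop's steady state, with the null detection
	   and tail handling omitted (names here are hypothetical, purely for
	   illustration): each aligned destination word is the carried-over
	   piece of the previous aligned source word OR'd with a piece of the
	   current one, and the skew is known to be nonzero on this path:

		void copy_skewed_words(unsigned long *dst,
				       const unsigned long *src,
				       unsigned int skew,	// 1..7 bytes
				       unsigned long nwords)
		{
			unsigned int s = 8 * skew;		// skew in bits
			unsigned long carry = src[0] >> s;	// plays the role of t0
			unsigned long i;

			for (i = 0; i < nwords; i++) {
				unsigned long cur = src[i + 1];		// ldq_u next source word
				dst[i] = carry | (cur << (64 - s));	// extql piece | extqh piece
				carry = cur >> s;			// low bits for next time
			}
		}
	*/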

	.align 3
$u_loop:
	extqh	t2, a1, t1	# U : extract high bits for current word
	addq	a1, 8, a1	# E : (stall)
	extql	t2, a1, t3	# U : extract low bits for next time (stall)
	addq	a0, 8, a0	# E :

	or	t0, t1, t1	# E : current dst word now complete
	ldq_u	t2, 0(a1)	# L : Latency=3 load high word for next time
	stq_u	t1, -8(a0)	# L : save the current word (stall)
	mov	t3, t0		# E :

	cmpbge	zero, t2, t8	# E : test new word for eos
	beq	t8, $u_loop	# U : (stall)
	nop
	nop

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
$u_eos:
	extqh	t2, a1, t1	# U :
	or	t0, t1, t1	# E : first (partial) source word complete (stall)
	cmpbge	zero, t1, t8	# E : is the null in this first word? (stall)
	bne	t8, $u_final	# U : (stall)

$u_late_head_exit:
	stq_u	t1, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	extql	t2, a1, t1	# U :
	cmpbge	zero, t1, t8	# E : (stall)

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t1 == assembled source word
	   t8 == cmpbge mask that found the null.  */
$u_final:
	negq	t8, t6		# E : isolate low bit set
	and	t6, t8, t12	# E : (stall)
	and	t12, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t0, 0(a0)	# E :
	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E : (stall)
	zapnot	t1, t6, t1	# U : kill source bytes >= null (stall)

	zap	t0, t8, t0	# U : kill dest bytes <= null (2 cycle data stall)
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t0	# E :

	mov	zero, t6	# E :
	beq	t4, 1f		# U :
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U :
	nop
	nop
	nop
1:
	subq	a1, t4, a1	# E : sub dest misalignment from src addr
	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */
	cmplt	t4, t5, t12	# E :
	beq	t12, $u_head	# U :
	lda	t2, -1		# E : mask out leading garbage in source

	mskqh	t2, t5, t2	# U :
	ornot	t1, t2, t3	# E : (stall)
	cmpbge	zero, t3, t8	# E : is there a zero? (stall)
	beq	t8, $u_head	# U : (stall)
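
	/* A worked example of the check above, purely for illustration: say
	   DST is misaligned by t4 == 2 and SRC by t5 == 5.  The ldq_u at
	   $unaligned picked up only bytes 5..7 of the first aligned source
	   word, and after "subq a1, t4, a1" the rewound a1 sits at offset 3,
	   so $u_head's "ldq_u t2, 8(a1)" would touch the next aligned source
	   word.  If the string already terminates in bytes 5..7, that next
	   word may lie on an unmapped page, hence this test: force the
	   garbage bytes 0..4 to 0xff (mskqh/ornot) and cmpbge for a null
	   among the real bytes; if one is found, fall through and finish
	   with a single partial store instead of entering $u_head.  */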

	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */

	ldq_u	t0, 0(a0)	# L :
	negq	t8, t6		# E : build bitmask of bytes <= zero
	and	t6, t8, t12	# E : (stall)
	and	a1, 7, t5	# E :

	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E : (stall)
	srl	t12, t5, t12	# U : adjust final null return value
	zapnot	t2, t8, t2	# U : prepare source word; mirror changes (stall)

	and	t1, t2, t1	# E : to source validity mask
	extql	t2, a1, t2	# U :
	extql	t1, a1, t1	# U : (stall)
	andnot	t0, t2, t0	# E : zero place for source to reside (stall)

	or	t0, t1, t1	# E : and put it there
	stq_u	t1, 0(a0)	# L : (stall)
	ret	(t9)		# L0 :
	nop

	.end __stxcpy