/*
 * arch/alpha/lib/ev6-strncpy_from_user.S
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Just like strncpy except in the return value:
 *
 *	-EFAULT       if an exception occurs before the terminator is copied.
 *	N             if the buffer filled.
 *
 * Otherwise the length of the string is returned.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * A bunch of instructions got moved and temp registers were changed
 * to aid in scheduling.  Control flow was also re-arranged to eliminate
 * branches, and to provide longer code sequences to enable better scheduling.
 * A total rewrite (using byte load/stores for start & tail sequences)
 * is desirable, but very difficult to do without a from-scratch rewrite.
 * Save that for the future.
 */
#include <asm/errno.h>
#include <asm/regdef.h>


/* Allow an exception for an insn; exit if we get one.
 * Each EX()-wrapped instruction gets an __ex_table entry: on a user-space
 * fault the kernel's fixup resumes execution at $exception below.
 * NOTE(review): the register fields encoded in the lda (here $31 and $0)
 * select the fixup's target registers per the alpha __ex_table convention;
 * confirm the exact error-value semantics against the arch fault handler.
 */
#define EX(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.long 99b - .;			\
	lda $31, $exception-99b($0);	\
	.previous
	.set noat
	.set noreorder
	.text

	.globl __strncpy_from_user
	.ent __strncpy_from_user
	.frame $30, 0, $26
	.prologue 0

/*
 * __strncpy_from_user(to /* a0 */, from /* user, a1 */, count /* a2 */)
 *
 * Register roles (as established by the code below):
 *	a0  - current destination address
 *	a1  - current source address (user space; all loads via EX())
 *	a2  - quadword loop counter after setup
 *	a3  - count - 1 (after biasing by dest misalignment)
 *	t10 - single-bit "end-of-count byte" mask within the last quadword
 *	v0  - saved start of destination; becomes the return length
 */
	.align 4
__strncpy_from_user:
	and	a0, 7, t3	# E : find dest misalignment
	beq	a2, $zerolength	# U :

	/* Are source and destination co-aligned?  */
	mov	a0, v0		# E : save the string start
	xor	a0, a1, t4	# E :
	EX( ldq_u t1, 0(a1) )	# L : Latency=3 load first quadword
	ldq_u	t0, 0(a0)	# L : load first (partial) aligned dest quadword

	addq	a2, t3, a2	# E : bias count by dest misalignment
	subq	a2, 1, a3	# E :
	addq	zero, 1, t10	# E :
	and	t4, 7, t4	# E : misalignment between the two

	and	a3, 7, t6	# E : number of tail bytes
	sll	t10, t6, t10	# E : t10 = bitmask of last count byte
	bne	t4, $unaligned	# U :
	lda	t2, -1		# E : build a mask against false zero

	/*
	 * We are co-aligned; take care of a partial first word.
	 * On entry to this basic block:
	 *	t0 == the first destination word for masking back in
	 *	t1 == the first source word.
	 */

	srl	a3, 3, a2	# E : a2 = loop counter = (count - 1)/8
	addq	a1, 8, a1	# E :
	mskqh	t2, a1, t2	# U : detection in the src word
	nop

	/* Create the 1st output word and detect 0's in the 1st input word.  */
	mskqh	t1, a1, t3	# U :
	mskql	t0, a1, t0	# U : assemble the first output word
	ornot	t1, t2, t2	# E :
	nop

	cmpbge	zero, t2, t8	# E : bits set iff null found
	or	t0, t3, t0	# E :
	beq	a2, $a_eoc	# U :
	bne	t8, $a_eos	# U : 2nd branch in a quad. Bad.

	/* On entry to this basic block:
	 *	t0 == a source quad not containing a null.
	 *	a0 - current aligned destination address
	 *	a1 - current aligned source address
	 *	a2 - count of quadwords to move.
	 * NOTE: Loop improvement - unrolling this is going to be
	 *	a huge win, since we're going to stall otherwise.
	 *	Fix this later. For _really_ large copies, look
	 *	at using wh64 on a look-ahead basis. See the code
	 *	in clear_user.S and copy_user.S.
	 * Presumably, since (a0) and (a1) do not overlap (by C definition)
	 * Lots of nops here:
	 *	- Separate loads from stores
	 *	- Keep it to 1 branch/quadpack so the branch predictor
	 *	  can train.
	 */
$a_loop:
	stq_u	t0, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	nop
	subq	a2, 1, a2	# E :

	EX( ldq_u t0, 0(a1) )	# L :
	addq	a1, 8, a1	# E :
	cmpbge	zero, t0, t8	# E : Stall 2 cycles on t0
	beq	a2, $a_eoc	# U :

	beq	t8, $a_loop	# U :
	nop
	nop
	nop

	/* Take care of the final (partial) word store.  At this point
	 * the end-of-count bit is set in t8 iff it applies.
	 *
	 * On entry to this basic block we have:
	 *	t0 == the source word containing the null
	 *	t8 == the cmpbge mask that found it.
	 */
$a_eos:
	negq	t8, t12		# E : find low bit set
	and	t8, t12, t12	# E :

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :

	or	t12, t6, t8	# E :
	zapnot	t0, t8, t0	# U : clear src bytes > null
	zap	t1, t8, t1	# U : clear dst bytes <= null
	or	t0, t1, t0	# E :

	stq_u	t0, 0(a0)	# L :
	br	$finish_up	# L0 :
	nop
	nop

	/* Add the end-of-count bit to the eos detection bitmask.  */
	.align 4
$a_eoc:
	or	t10, t8, t8
	br	$a_eos
	nop
	nop


/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, unmasked
	   t1 == the shifted low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	EX( ldq_u t2, 8(a1) )	# L : load second src word
	addq	a1, 8, a1	# E :
	mskql	t0, a0, t0	# U : mask trailing garbage in dst
	extqh	t2, a1, t4	# U :

	or	t1, t4, t1	# E : first aligned src word complete
	mskqh	t1, a0, t1	# U : mask leading garbage in src
	or	t0, t1, t0	# E : first output word complete
	or	t0, t6, t6	# E : mask original data for zero test

	cmpbge	zero, t6, t8	# E :
	beq	a2, $u_eocfin	# U :
	bne	t8, $u_final	# U : bad news - 2nd branch in a quad
	lda	t6, -1		# E : mask out the bits we have

	mskql	t6, a1, t6	# U :   already seen
	stq_u	t0, 0(a0)	# L : store first output word
	or	t6, t2, t2	# E :
	cmpbge	zero, t2, t8	# E : find nulls in second partial

	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	bne	t8, $u_late_head_exit	# U :
	nop

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */

	extql	t2, a1, t1	# U : position hi-bits of lo word
	EX( ldq_u t2, 8(a1) )	# L : read next high-order source word
	addq	a1, 8, a1	# E :
	cmpbge	zero, t2, t8	# E :

	beq	a2, $u_eoc	# U :
	bne	t8, $u_eos	# U :
	nop
	nop

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */

	/*
	 * Extra nops here:
	 *	separate load quads from store quads
	 *	only one branch/quad to permit predictor training
	 */

	.align 4
$u_loop:
	extqh	t2, a1, t0	# U : extract high bits for current word
	addq	a1, 8, a1	# E :
	extql	t2, a1, t3	# U : extract low bits for next time
	addq	a0, 8, a0	# E :

	or	t0, t1, t0	# E : current dst word now complete
	EX( ldq_u t2, 0(a1) )	# L : load high word for next time
	subq	a2, 1, a2	# E :
	nop

	stq_u	t0, -8(a0)	# L : save the current word
	mov	t3, t1		# E :
	cmpbge	zero, t2, t8	# E : test new word for eos
	beq	a2, $u_eoc	# U :

	beq	t8, $u_loop	# U :
	nop
	nop
	nop

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
	.align 4
$u_eos:
	extqh	t2, a1, t0	# U :
	or	t0, t1, t0	# E : first (partial) source word complete
	cmpbge	zero, t0, t8	# E : is the null in this first bit?
	nop

	bne	t8, $u_final	# U :
	stq_u	t0, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :

	.align 4
$u_late_head_exit:
	extql	t2, a1, t0	# U :
	cmpbge	zero, t0, t8	# E :
	or	t8, t10, t6	# E :
	cmoveq	a2, t6, t8	# E :

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t0 == assembled source word
	   t8 == cmpbge mask that found the null.  */
	.align 4
$u_final:
	negq	t8, t6		# E : isolate low bit set
	and	t6, t8, t12	# E :
	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :

	or	t6, t12, t8	# E :
	zapnot	t0, t8, t0	# U : kill source bytes > null
	zap	t1, t8, t1	# U : kill dest bytes <= null
	or	t0, t1, t0	# E :

	stq_u	t0, 0(a0)	# E :
	br	$finish_up	# U :
	nop
	nop

	.align 4
$u_eoc:			# end-of-count
	extqh	t2, a1, t0	# U :
	or	t0, t1, t0	# E :
	cmpbge	zero, t0, t8	# E :
	nop

	.align 4
$u_eocfin:		# end-of-count, final word
	or	t10, t8, t8	# E :
	br	$u_final	# U :
	nop
	nop

	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	srl	a3, 3, a2	# U : a2 = loop counter = (count - 1)/8
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	mov	zero, t0	# E :

	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */

	mov	zero, t6	# E :
	beq	t4, 1f		# U :
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# E :
	nop
	nop
	nop

	.align 4
1:
	subq	a1, t4, a1	# E : sub dest misalignment from src addr
	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */
	cmplt	t4, t5, t12	# E :
	extql	t1, a1, t1	# U : shift src into place
	lda	t2, -1		# E : for creating masks later

	beq	t12, $u_head	# U :
	mskqh	t2, t5, t2	# U : begin src byte validity mask
	cmpbge	zero, t1, t8	# E : is there a zero?
	nop

	extql	t2, a1, t2	# U :
	or	t8, t10, t5	# E : test for end-of-count too
	cmpbge	zero, t2, t3	# E :
	cmoveq	a2, t5, t8	# E : Latency=2, extra map slot

	nop			# E : goes with cmov
	andnot	t8, t3, t8	# E :
	beq	t8, $u_head	# U :
	nop

	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */

	ldq_u	t0, 0(a0)	# L :
	negq	t8, t6		# E : build bitmask of bytes <= zero
	mskqh	t1, t4, t1	# U :
	and	t6, t8, t12	# E :

	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E :
	zapnot	t2, t8, t2	# U : prepare source word; mirror changes
	zapnot	t1, t8, t1	# U : to source validity mask

	andnot	t0, t2, t0	# E : zero place for source to reside
	or	t0, t1, t0	# E : and put it there
	stq_u	t0, 0(a0)	# L :
	nop

	/* Compute the return length: locate the address of the last byte
	   written via a binary search on the single-bit mask t12, then
	   subtract the saved start (v0).  t4 adds one when the last byte
	   written was NOT the null terminator (buffer filled).  */
	.align 4
$finish_up:
	zapnot	t0, t12, t4	# U : was last byte written null?
	and	t12, 0xf0, t3	# E : binary search for the address of the
	cmovne	t4, 1, t4	# E : Latency=2, extra map slot
	nop			# E : with cmovne

	and	t12, 0xcc, t2	# E : last byte written
	and	t12, 0xaa, t1	# E :
	cmovne	t3, 4, t3	# E : Latency=2, extra map slot
	nop			# E : with cmovne

	bic	a0, 7, t0
	cmovne	t2, 2, t2	# E : Latency=2, extra map slot
	nop			# E : with cmovne
	nop

	cmovne	t1, 1, t1	# E : Latency=2, extra map slot
	nop			# E : with cmovne
	addq	t0, t3, t0	# E :
	addq	t1, t2, t1	# E :

	addq	t0, t1, t0	# E :
	addq	t0, t4, t0	# add one if we filled the buffer
	subq	t0, v0, v0	# find string length
	ret			# L0 :

	/* count == 0: return 0.  Falls through to the shared return at
	   $exception (reached from EX() fixups as well).  */
	.align 4
$zerolength:
	nop
	nop
	nop
	clr	v0

$exception:
	nop
	nop
	nop
	ret

	.end __strncpy_from_user