Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/microblaze/lib/fastcopy.S
26424 views
1
/*
2
* Copyright (C) 2008-2009 Michal Simek <[email protected]>
3
* Copyright (C) 2008-2009 PetaLogix
4
* Copyright (C) 2008 Jim Law - Iris LP All rights reserved.
5
*
6
* This file is subject to the terms and conditions of the GNU General
7
* Public License. See the file COPYING in the main directory of this
8
* archive for more details.
9
*
10
* Written by Jim Law <[email protected]>
11
*
12
* intended to replace:
13
* memcpy in memcpy.c and
14
* memmove in memmove.c
15
* ... in arch/microblaze/lib
16
*
17
*
18
* assly_fastcopy.S
19
*
20
* Attempt at quicker memcpy and memmove for MicroBlaze
21
* Input : Operand1 in Reg r5 - destination address
22
* Operand2 in Reg r6 - source address
23
* Operand3 in Reg r7 - number of bytes to transfer
24
* Output: Result in Reg r3 - starting destination address
25
*
26
*
27
* Explanation:
28
* Perform (possibly unaligned) copy of a block of memory
29
* between mem locations with size of xfer spec'd in bytes
30
*/
31
32
#include <linux/linkage.h>
33
.text
34
.globl memcpy
35
.type memcpy, @function
36
.ent memcpy
37
38
memcpy:
39
fast_memcpy_ascending:
40
/* move d to return register as value of function */
41
addi r3, r5, 0
42
43
addi r4, r0, 4 /* n = 4 */
44
cmpu r4, r4, r7 /* n = c - n (unsigned) */
45
blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
46
47
/* transfer first 0~3 bytes to get aligned dest address */
48
andi r4, r5, 3 /* n = d & 3 */
49
/* if zero, destination already aligned */
50
beqi r4, a_dalign_done
51
/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
52
rsubi r4, r4, 4
53
rsub r7, r4, r7 /* c = c - n adjust c */
54
55
a_xfer_first_loop:
56
/* if no bytes left to transfer, transfer the bulk */
57
beqi r4, a_dalign_done
58
lbui r11, r6, 0 /* h = *s */
59
sbi r11, r5, 0 /* *d = h */
60
addi r6, r6, 1 /* s++ */
61
addi r5, r5, 1 /* d++ */
62
brid a_xfer_first_loop /* loop */
63
addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
64
65
a_dalign_done:
66
addi r4, r0, 32 /* n = 32 */
67
cmpu r4, r4, r7 /* n = c - n (unsigned) */
68
/* if n < 0, less than one block to transfer */
69
blti r4, a_block_done
70
71
a_block_xfer:
72
andi r4, r7, 0xffffffe0 /* n = c & ~31 */
73
rsub r7, r4, r7 /* c = c - n */
74
75
andi r9, r6, 3 /* t1 = s & 3 */
76
/* if temp != 0, unaligned transfers needed */
77
bnei r9, a_block_unaligned
78
79
a_block_aligned:
80
lwi r9, r6, 0 /* t1 = *(s + 0) */
81
lwi r10, r6, 4 /* t2 = *(s + 4) */
82
lwi r11, r6, 8 /* t3 = *(s + 8) */
83
lwi r12, r6, 12 /* t4 = *(s + 12) */
84
swi r9, r5, 0 /* *(d + 0) = t1 */
85
swi r10, r5, 4 /* *(d + 4) = t2 */
86
swi r11, r5, 8 /* *(d + 8) = t3 */
87
swi r12, r5, 12 /* *(d + 12) = t4 */
88
lwi r9, r6, 16 /* t1 = *(s + 16) */
89
lwi r10, r6, 20 /* t2 = *(s + 20) */
90
lwi r11, r6, 24 /* t3 = *(s + 24) */
91
lwi r12, r6, 28 /* t4 = *(s + 28) */
92
swi r9, r5, 16 /* *(d + 16) = t1 */
93
swi r10, r5, 20 /* *(d + 20) = t2 */
94
swi r11, r5, 24 /* *(d + 24) = t3 */
95
swi r12, r5, 28 /* *(d + 28) = t4 */
96
addi r6, r6, 32 /* s = s + 32 */
97
addi r4, r4, -32 /* n = n - 32 */
98
bneid r4, a_block_aligned /* while (n) loop */
99
addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
100
bri a_block_done
101
102
a_block_unaligned:
103
andi r8, r6, 0xfffffffc /* as = s & ~3 */
104
add r6, r6, r4 /* s = s + n */
105
lwi r11, r8, 0 /* h = *(as + 0) */
106
107
addi r9, r9, -1
108
beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */
109
addi r9, r9, -1
110
beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */
111
112
a_block_u3:
113
bslli r11, r11, 24 /* h = h << 24 */
114
a_bu3_loop:
115
lwi r12, r8, 4 /* v = *(as + 4) */
116
bsrli r9, r12, 8 /* t1 = v >> 8 */
117
or r9, r11, r9 /* t1 = h | t1 */
118
swi r9, r5, 0 /* *(d + 0) = t1 */
119
bslli r11, r12, 24 /* h = v << 24 */
120
lwi r12, r8, 8 /* v = *(as + 8) */
121
bsrli r9, r12, 8 /* t1 = v >> 8 */
122
or r9, r11, r9 /* t1 = h | t1 */
123
swi r9, r5, 4 /* *(d + 4) = t1 */
124
bslli r11, r12, 24 /* h = v << 24 */
125
lwi r12, r8, 12 /* v = *(as + 12) */
126
bsrli r9, r12, 8 /* t1 = v >> 8 */
127
or r9, r11, r9 /* t1 = h | t1 */
128
swi r9, r5, 8 /* *(d + 8) = t1 */
129
bslli r11, r12, 24 /* h = v << 24 */
130
lwi r12, r8, 16 /* v = *(as + 16) */
131
bsrli r9, r12, 8 /* t1 = v >> 8 */
132
or r9, r11, r9 /* t1 = h | t1 */
133
swi r9, r5, 12 /* *(d + 12) = t1 */
134
bslli r11, r12, 24 /* h = v << 24 */
135
lwi r12, r8, 20 /* v = *(as + 20) */
136
bsrli r9, r12, 8 /* t1 = v >> 8 */
137
or r9, r11, r9 /* t1 = h | t1 */
138
swi r9, r5, 16 /* *(d + 16) = t1 */
139
bslli r11, r12, 24 /* h = v << 24 */
140
lwi r12, r8, 24 /* v = *(as + 24) */
141
bsrli r9, r12, 8 /* t1 = v >> 8 */
142
or r9, r11, r9 /* t1 = h | t1 */
143
swi r9, r5, 20 /* *(d + 20) = t1 */
144
bslli r11, r12, 24 /* h = v << 24 */
145
lwi r12, r8, 28 /* v = *(as + 28) */
146
bsrli r9, r12, 8 /* t1 = v >> 8 */
147
or r9, r11, r9 /* t1 = h | t1 */
148
swi r9, r5, 24 /* *(d + 24) = t1 */
149
bslli r11, r12, 24 /* h = v << 24 */
150
lwi r12, r8, 32 /* v = *(as + 32) */
151
bsrli r9, r12, 8 /* t1 = v >> 8 */
152
or r9, r11, r9 /* t1 = h | t1 */
153
swi r9, r5, 28 /* *(d + 28) = t1 */
154
bslli r11, r12, 24 /* h = v << 24 */
155
addi r8, r8, 32 /* as = as + 32 */
156
addi r4, r4, -32 /* n = n - 32 */
157
bneid r4, a_bu3_loop /* while (n) loop */
158
addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
159
bri a_block_done
160
161
a_block_u1:
162
bslli r11, r11, 8 /* h = h << 8 */
163
a_bu1_loop:
164
lwi r12, r8, 4 /* v = *(as + 4) */
165
bsrli r9, r12, 24 /* t1 = v >> 24 */
166
or r9, r11, r9 /* t1 = h | t1 */
167
swi r9, r5, 0 /* *(d + 0) = t1 */
168
bslli r11, r12, 8 /* h = v << 8 */
169
lwi r12, r8, 8 /* v = *(as + 8) */
170
bsrli r9, r12, 24 /* t1 = v >> 24 */
171
or r9, r11, r9 /* t1 = h | t1 */
172
swi r9, r5, 4 /* *(d + 4) = t1 */
173
bslli r11, r12, 8 /* h = v << 8 */
174
lwi r12, r8, 12 /* v = *(as + 12) */
175
bsrli r9, r12, 24 /* t1 = v >> 24 */
176
or r9, r11, r9 /* t1 = h | t1 */
177
swi r9, r5, 8 /* *(d + 8) = t1 */
178
bslli r11, r12, 8 /* h = v << 8 */
179
lwi r12, r8, 16 /* v = *(as + 16) */
180
bsrli r9, r12, 24 /* t1 = v >> 24 */
181
or r9, r11, r9 /* t1 = h | t1 */
182
swi r9, r5, 12 /* *(d + 12) = t1 */
183
bslli r11, r12, 8 /* h = v << 8 */
184
lwi r12, r8, 20 /* v = *(as + 20) */
185
bsrli r9, r12, 24 /* t1 = v >> 24 */
186
or r9, r11, r9 /* t1 = h | t1 */
187
swi r9, r5, 16 /* *(d + 16) = t1 */
188
bslli r11, r12, 8 /* h = v << 8 */
189
lwi r12, r8, 24 /* v = *(as + 24) */
190
bsrli r9, r12, 24 /* t1 = v >> 24 */
191
or r9, r11, r9 /* t1 = h | t1 */
192
swi r9, r5, 20 /* *(d + 20) = t1 */
193
bslli r11, r12, 8 /* h = v << 8 */
194
lwi r12, r8, 28 /* v = *(as + 28) */
195
bsrli r9, r12, 24 /* t1 = v >> 24 */
196
or r9, r11, r9 /* t1 = h | t1 */
197
swi r9, r5, 24 /* *(d + 24) = t1 */
198
bslli r11, r12, 8 /* h = v << 8 */
199
lwi r12, r8, 32 /* v = *(as + 32) */
200
bsrli r9, r12, 24 /* t1 = v >> 24 */
201
or r9, r11, r9 /* t1 = h | t1 */
202
swi r9, r5, 28 /* *(d + 28) = t1 */
203
bslli r11, r12, 8 /* h = v << 8 */
204
addi r8, r8, 32 /* as = as + 32 */
205
addi r4, r4, -32 /* n = n - 32 */
206
bneid r4, a_bu1_loop /* while (n) loop */
207
addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
208
bri a_block_done
209
210
a_block_u2:
211
bslli r11, r11, 16 /* h = h << 16 */
212
a_bu2_loop:
213
lwi r12, r8, 4 /* v = *(as + 4) */
214
bsrli r9, r12, 16 /* t1 = v >> 16 */
215
or r9, r11, r9 /* t1 = h | t1 */
216
swi r9, r5, 0 /* *(d + 0) = t1 */
217
bslli r11, r12, 16 /* h = v << 16 */
218
lwi r12, r8, 8 /* v = *(as + 8) */
219
bsrli r9, r12, 16 /* t1 = v >> 16 */
220
or r9, r11, r9 /* t1 = h | t1 */
221
swi r9, r5, 4 /* *(d + 4) = t1 */
222
bslli r11, r12, 16 /* h = v << 16 */
223
lwi r12, r8, 12 /* v = *(as + 12) */
224
bsrli r9, r12, 16 /* t1 = v >> 16 */
225
or r9, r11, r9 /* t1 = h | t1 */
226
swi r9, r5, 8 /* *(d + 8) = t1 */
227
bslli r11, r12, 16 /* h = v << 16 */
228
lwi r12, r8, 16 /* v = *(as + 16) */
229
bsrli r9, r12, 16 /* t1 = v >> 16 */
230
or r9, r11, r9 /* t1 = h | t1 */
231
swi r9, r5, 12 /* *(d + 12) = t1 */
232
bslli r11, r12, 16 /* h = v << 16 */
233
lwi r12, r8, 20 /* v = *(as + 20) */
234
bsrli r9, r12, 16 /* t1 = v >> 16 */
235
or r9, r11, r9 /* t1 = h | t1 */
236
swi r9, r5, 16 /* *(d + 16) = t1 */
237
bslli r11, r12, 16 /* h = v << 16 */
238
lwi r12, r8, 24 /* v = *(as + 24) */
239
bsrli r9, r12, 16 /* t1 = v >> 16 */
240
or r9, r11, r9 /* t1 = h | t1 */
241
swi r9, r5, 20 /* *(d + 20) = t1 */
242
bslli r11, r12, 16 /* h = v << 16 */
243
lwi r12, r8, 28 /* v = *(as + 28) */
244
bsrli r9, r12, 16 /* t1 = v >> 16 */
245
or r9, r11, r9 /* t1 = h | t1 */
246
swi r9, r5, 24 /* *(d + 24) = t1 */
247
bslli r11, r12, 16 /* h = v << 16 */
248
lwi r12, r8, 32 /* v = *(as + 32) */
249
bsrli r9, r12, 16 /* t1 = v >> 16 */
250
or r9, r11, r9 /* t1 = h | t1 */
251
swi r9, r5, 28 /* *(d + 28) = t1 */
252
bslli r11, r12, 16 /* h = v << 16 */
253
addi r8, r8, 32 /* as = as + 32 */
254
addi r4, r4, -32 /* n = n - 32 */
255
bneid r4, a_bu2_loop /* while (n) loop */
256
addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
257
258
a_block_done:
259
addi r4, r0, 4 /* n = 4 */
260
cmpu r4, r4, r7 /* n = c - n (unsigned) */
261
blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
262
263
a_word_xfer:
264
andi r4, r7, 0xfffffffc /* n = c & ~3 */
265
addi r10, r0, 0 /* offset = 0 */
266
267
andi r9, r6, 3 /* t1 = s & 3 */
268
/* if temp != 0, unaligned transfers needed */
269
bnei r9, a_word_unaligned
270
271
a_word_aligned:
272
lw r9, r6, r10 /* t1 = *(s+offset) */
273
sw r9, r5, r10 /* *(d+offset) = t1 */
274
addi r4, r4,-4 /* n-- */
275
bneid r4, a_word_aligned /* loop */
276
addi r10, r10, 4 /* offset++ (IN DELAY SLOT) */
277
278
bri a_word_done
279
280
a_word_unaligned:
281
andi r8, r6, 0xfffffffc /* as = s & ~3 */
282
lwi r11, r8, 0 /* h = *(as + 0) */
283
addi r8, r8, 4 /* as = as + 4 */
284
285
addi r9, r9, -1
286
beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */
287
addi r9, r9, -1
288
beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */
289
290
a_word_u3:
291
bslli r11, r11, 24 /* h = h << 24 */
292
a_wu3_loop:
293
lw r12, r8, r10 /* v = *(as + offset) */
294
bsrli r9, r12, 8 /* t1 = v >> 8 */
295
or r9, r11, r9 /* t1 = h | t1 */
296
sw r9, r5, r10 /* *(d + offset) = t1 */
297
bslli r11, r12, 24 /* h = v << 24 */
298
addi r4, r4,-4 /* n = n - 4 */
299
bneid r4, a_wu3_loop /* while (n) loop */
300
addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
301
302
bri a_word_done
303
304
a_word_u1:
305
bslli r11, r11, 8 /* h = h << 8 */
306
a_wu1_loop:
307
lw r12, r8, r10 /* v = *(as + offset) */
308
bsrli r9, r12, 24 /* t1 = v >> 24 */
309
or r9, r11, r9 /* t1 = h | t1 */
310
sw r9, r5, r10 /* *(d + offset) = t1 */
311
bslli r11, r12, 8 /* h = v << 8 */
312
addi r4, r4,-4 /* n = n - 4 */
313
bneid r4, a_wu1_loop /* while (n) loop */
314
addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
315
316
bri a_word_done
317
318
a_word_u2:
319
bslli r11, r11, 16 /* h = h << 16 */
320
a_wu2_loop:
321
lw r12, r8, r10 /* v = *(as + offset) */
322
bsrli r9, r12, 16 /* t1 = v >> 16 */
323
or r9, r11, r9 /* t1 = h | t1 */
324
sw r9, r5, r10 /* *(d + offset) = t1 */
325
bslli r11, r12, 16 /* h = v << 16 */
326
addi r4, r4,-4 /* n = n - 4 */
327
bneid r4, a_wu2_loop /* while (n) loop */
328
addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
329
330
a_word_done:
331
add r5, r5, r10 /* d = d + offset */
332
add r6, r6, r10 /* s = s + offset */
333
rsub r7, r10, r7 /* c = c - offset */
334
335
a_xfer_end:
336
a_xfer_end_loop:
337
beqi r7, a_done /* while (c) */
338
lbui r9, r6, 0 /* t1 = *s */
339
addi r6, r6, 1 /* s++ */
340
sbi r9, r5, 0 /* *d = t1 */
341
addi r7, r7, -1 /* c-- */
342
brid a_xfer_end_loop /* loop */
343
addi r5, r5, 1 /* d++ (IN DELAY SLOT) */
344
345
a_done:
346
rtsd r15, 8
347
nop
348
349
.size memcpy, . - memcpy
350
.end memcpy
351
/*----------------------------------------------------------------------------*/
352
.globl memmove
353
.type memmove, @function
354
.ent memmove
355
356
memmove:
357
cmpu r4, r5, r6 /* n = s - d */
358
bgei r4,fast_memcpy_ascending
359
360
fast_memcpy_descending:
361
/* move d to return register as value of function */
362
addi r3, r5, 0
363
364
add r5, r5, r7 /* d = d + c */
365
add r6, r6, r7 /* s = s + c */
366
367
addi r4, r0, 4 /* n = 4 */
368
cmpu r4, r4, r7 /* n = c - n (unsigned) */
369
blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
370
371
/* transfer first 0~3 bytes to get aligned dest address */
372
andi r4, r5, 3 /* n = d & 3 */
373
/* if zero, destination already aligned */
374
beqi r4,d_dalign_done
375
rsub r7, r4, r7 /* c = c - n adjust c */
376
377
d_xfer_first_loop:
378
/* if no bytes left to transfer, transfer the bulk */
379
beqi r4,d_dalign_done
380
addi r6, r6, -1 /* s-- */
381
addi r5, r5, -1 /* d-- */
382
lbui r11, r6, 0 /* h = *s */
383
sbi r11, r5, 0 /* *d = h */
384
brid d_xfer_first_loop /* loop */
385
addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
386
387
d_dalign_done:
388
addi r4, r0, 32 /* n = 32 */
389
cmpu r4, r4, r7 /* n = c - n (unsigned) */
390
/* if n < 0, less than one block to transfer */
391
blti r4, d_block_done
392
393
d_block_xfer:
394
andi r4, r7, 0xffffffe0 /* n = c & ~31 */
395
rsub r7, r4, r7 /* c = c - n */
396
397
andi r9, r6, 3 /* t1 = s & 3 */
398
/* if temp != 0, unaligned transfers needed */
399
bnei r9, d_block_unaligned
400
401
d_block_aligned:
402
addi r6, r6, -32 /* s = s - 32 */
403
addi r5, r5, -32 /* d = d - 32 */
404
lwi r9, r6, 28 /* t1 = *(s + 28) */
405
lwi r10, r6, 24 /* t2 = *(s + 24) */
406
lwi r11, r6, 20 /* t3 = *(s + 20) */
407
lwi r12, r6, 16 /* t4 = *(s + 16) */
408
swi r9, r5, 28 /* *(d + 28) = t1 */
409
swi r10, r5, 24 /* *(d + 24) = t2 */
410
swi r11, r5, 20 /* *(d + 20) = t3 */
411
swi r12, r5, 16 /* *(d + 16) = t4 */
412
lwi r9, r6, 12 /* t1 = *(s + 12) */
413
lwi r10, r6, 8 /* t2 = *(s + 8) */
414
lwi r11, r6, 4 /* t3 = *(s + 4) */
415
lwi r12, r6, 0 /* t4 = *(s + 0) */
416
swi r9, r5, 12 /* *(d + 12) = t1 */
417
swi r10, r5, 8 /* *(d + 8) = t2 */
418
swi r11, r5, 4 /* *(d + 4) = t3 */
419
addi r4, r4, -32 /* n = n - 32 */
420
bneid r4, d_block_aligned /* while (n) loop */
421
swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */
422
bri d_block_done
423
424
d_block_unaligned:
425
andi r8, r6, 0xfffffffc /* as = s & ~3 */
426
rsub r6, r4, r6 /* s = s - n */
427
lwi r11, r8, 0 /* h = *(as + 0) */
428
429
addi r9, r9, -1
430
beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */
431
addi r9, r9, -1
432
beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */
433
434
d_block_u3:
435
bsrli r11, r11, 8 /* h = h >> 8 */
436
d_bu3_loop:
437
addi r8, r8, -32 /* as = as - 32 */
438
addi r5, r5, -32 /* d = d - 32 */
439
lwi r12, r8, 28 /* v = *(as + 28) */
440
bslli r9, r12, 24 /* t1 = v << 24 */
441
or r9, r11, r9 /* t1 = h | t1 */
442
swi r9, r5, 28 /* *(d + 28) = t1 */
443
bsrli r11, r12, 8 /* h = v >> 8 */
444
lwi r12, r8, 24 /* v = *(as + 24) */
445
bslli r9, r12, 24 /* t1 = v << 24 */
446
or r9, r11, r9 /* t1 = h | t1 */
447
swi r9, r5, 24 /* *(d + 24) = t1 */
448
bsrli r11, r12, 8 /* h = v >> 8 */
449
lwi r12, r8, 20 /* v = *(as + 20) */
450
bslli r9, r12, 24 /* t1 = v << 24 */
451
or r9, r11, r9 /* t1 = h | t1 */
452
swi r9, r5, 20 /* *(d + 20) = t1 */
453
bsrli r11, r12, 8 /* h = v >> 8 */
454
lwi r12, r8, 16 /* v = *(as + 16) */
455
bslli r9, r12, 24 /* t1 = v << 24 */
456
or r9, r11, r9 /* t1 = h | t1 */
457
swi r9, r5, 16 /* *(d + 16) = t1 */
458
bsrli r11, r12, 8 /* h = v >> 8 */
459
lwi r12, r8, 12 /* v = *(as + 12) */
460
bslli r9, r12, 24 /* t1 = v << 24 */
461
or r9, r11, r9 /* t1 = h | t1 */
462
swi r9, r5, 12 /* *(d + 112) = t1 */
463
bsrli r11, r12, 8 /* h = v >> 8 */
464
lwi r12, r8, 8 /* v = *(as + 8) */
465
bslli r9, r12, 24 /* t1 = v << 24 */
466
or r9, r11, r9 /* t1 = h | t1 */
467
swi r9, r5, 8 /* *(d + 8) = t1 */
468
bsrli r11, r12, 8 /* h = v >> 8 */
469
lwi r12, r8, 4 /* v = *(as + 4) */
470
bslli r9, r12, 24 /* t1 = v << 24 */
471
or r9, r11, r9 /* t1 = h | t1 */
472
swi r9, r5, 4 /* *(d + 4) = t1 */
473
bsrli r11, r12, 8 /* h = v >> 8 */
474
lwi r12, r8, 0 /* v = *(as + 0) */
475
bslli r9, r12, 24 /* t1 = v << 24 */
476
or r9, r11, r9 /* t1 = h | t1 */
477
swi r9, r5, 0 /* *(d + 0) = t1 */
478
addi r4, r4, -32 /* n = n - 32 */
479
bneid r4, d_bu3_loop /* while (n) loop */
480
bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
481
bri d_block_done
482
483
d_block_u1:
484
bsrli r11, r11, 24 /* h = h >> 24 */
485
d_bu1_loop:
486
addi r8, r8, -32 /* as = as - 32 */
487
addi r5, r5, -32 /* d = d - 32 */
488
lwi r12, r8, 28 /* v = *(as + 28) */
489
bslli r9, r12, 8 /* t1 = v << 8 */
490
or r9, r11, r9 /* t1 = h | t1 */
491
swi r9, r5, 28 /* *(d + 28) = t1 */
492
bsrli r11, r12, 24 /* h = v >> 24 */
493
lwi r12, r8, 24 /* v = *(as + 24) */
494
bslli r9, r12, 8 /* t1 = v << 8 */
495
or r9, r11, r9 /* t1 = h | t1 */
496
swi r9, r5, 24 /* *(d + 24) = t1 */
497
bsrli r11, r12, 24 /* h = v >> 24 */
498
lwi r12, r8, 20 /* v = *(as + 20) */
499
bslli r9, r12, 8 /* t1 = v << 8 */
500
or r9, r11, r9 /* t1 = h | t1 */
501
swi r9, r5, 20 /* *(d + 20) = t1 */
502
bsrli r11, r12, 24 /* h = v >> 24 */
503
lwi r12, r8, 16 /* v = *(as + 16) */
504
bslli r9, r12, 8 /* t1 = v << 8 */
505
or r9, r11, r9 /* t1 = h | t1 */
506
swi r9, r5, 16 /* *(d + 16) = t1 */
507
bsrli r11, r12, 24 /* h = v >> 24 */
508
lwi r12, r8, 12 /* v = *(as + 12) */
509
bslli r9, r12, 8 /* t1 = v << 8 */
510
or r9, r11, r9 /* t1 = h | t1 */
511
swi r9, r5, 12 /* *(d + 112) = t1 */
512
bsrli r11, r12, 24 /* h = v >> 24 */
513
lwi r12, r8, 8 /* v = *(as + 8) */
514
bslli r9, r12, 8 /* t1 = v << 8 */
515
or r9, r11, r9 /* t1 = h | t1 */
516
swi r9, r5, 8 /* *(d + 8) = t1 */
517
bsrli r11, r12, 24 /* h = v >> 24 */
518
lwi r12, r8, 4 /* v = *(as + 4) */
519
bslli r9, r12, 8 /* t1 = v << 8 */
520
or r9, r11, r9 /* t1 = h | t1 */
521
swi r9, r5, 4 /* *(d + 4) = t1 */
522
bsrli r11, r12, 24 /* h = v >> 24 */
523
lwi r12, r8, 0 /* v = *(as + 0) */
524
bslli r9, r12, 8 /* t1 = v << 8 */
525
or r9, r11, r9 /* t1 = h | t1 */
526
swi r9, r5, 0 /* *(d + 0) = t1 */
527
addi r4, r4, -32 /* n = n - 32 */
528
bneid r4, d_bu1_loop /* while (n) loop */
529
bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
530
bri d_block_done
531
532
d_block_u2:
533
bsrli r11, r11, 16 /* h = h >> 16 */
534
d_bu2_loop:
535
addi r8, r8, -32 /* as = as - 32 */
536
addi r5, r5, -32 /* d = d - 32 */
537
lwi r12, r8, 28 /* v = *(as + 28) */
538
bslli r9, r12, 16 /* t1 = v << 16 */
539
or r9, r11, r9 /* t1 = h | t1 */
540
swi r9, r5, 28 /* *(d + 28) = t1 */
541
bsrli r11, r12, 16 /* h = v >> 16 */
542
lwi r12, r8, 24 /* v = *(as + 24) */
543
bslli r9, r12, 16 /* t1 = v << 16 */
544
or r9, r11, r9 /* t1 = h | t1 */
545
swi r9, r5, 24 /* *(d + 24) = t1 */
546
bsrli r11, r12, 16 /* h = v >> 16 */
547
lwi r12, r8, 20 /* v = *(as + 20) */
548
bslli r9, r12, 16 /* t1 = v << 16 */
549
or r9, r11, r9 /* t1 = h | t1 */
550
swi r9, r5, 20 /* *(d + 20) = t1 */
551
bsrli r11, r12, 16 /* h = v >> 16 */
552
lwi r12, r8, 16 /* v = *(as + 16) */
553
bslli r9, r12, 16 /* t1 = v << 16 */
554
or r9, r11, r9 /* t1 = h | t1 */
555
swi r9, r5, 16 /* *(d + 16) = t1 */
556
bsrli r11, r12, 16 /* h = v >> 16 */
557
lwi r12, r8, 12 /* v = *(as + 12) */
558
bslli r9, r12, 16 /* t1 = v << 16 */
559
or r9, r11, r9 /* t1 = h | t1 */
560
swi r9, r5, 12 /* *(d + 112) = t1 */
561
bsrli r11, r12, 16 /* h = v >> 16 */
562
lwi r12, r8, 8 /* v = *(as + 8) */
563
bslli r9, r12, 16 /* t1 = v << 16 */
564
or r9, r11, r9 /* t1 = h | t1 */
565
swi r9, r5, 8 /* *(d + 8) = t1 */
566
bsrli r11, r12, 16 /* h = v >> 16 */
567
lwi r12, r8, 4 /* v = *(as + 4) */
568
bslli r9, r12, 16 /* t1 = v << 16 */
569
or r9, r11, r9 /* t1 = h | t1 */
570
swi r9, r5, 4 /* *(d + 4) = t1 */
571
bsrli r11, r12, 16 /* h = v >> 16 */
572
lwi r12, r8, 0 /* v = *(as + 0) */
573
bslli r9, r12, 16 /* t1 = v << 16 */
574
or r9, r11, r9 /* t1 = h | t1 */
575
swi r9, r5, 0 /* *(d + 0) = t1 */
576
addi r4, r4, -32 /* n = n - 32 */
577
bneid r4, d_bu2_loop /* while (n) loop */
578
bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
579
580
d_block_done:
581
addi r4, r0, 4 /* n = 4 */
582
cmpu r4, r4, r7 /* n = c - n (unsigned) */
583
blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
584
585
d_word_xfer:
586
andi r4, r7, 0xfffffffc /* n = c & ~3 */
587
rsub r5, r4, r5 /* d = d - n */
588
rsub r6, r4, r6 /* s = s - n */
589
rsub r7, r4, r7 /* c = c - n */
590
591
andi r9, r6, 3 /* t1 = s & 3 */
592
/* if temp != 0, unaligned transfers needed */
593
bnei r9, d_word_unaligned
594
595
d_word_aligned:
596
addi r4, r4,-4 /* n-- */
597
lw r9, r6, r4 /* t1 = *(s+n) */
598
bneid r4, d_word_aligned /* loop */
599
sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */
600
601
bri d_word_done
602
603
d_word_unaligned:
604
andi r8, r6, 0xfffffffc /* as = s & ~3 */
605
lw r11, r8, r4 /* h = *(as + n) */
606
607
addi r9, r9, -1
608
beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */
609
addi r9, r9, -1
610
beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */
611
612
d_word_u3:
613
bsrli r11, r11, 8 /* h = h >> 8 */
614
d_wu3_loop:
615
addi r4, r4,-4 /* n = n - 4 */
616
lw r12, r8, r4 /* v = *(as + n) */
617
bslli r9, r12, 24 /* t1 = v << 24 */
618
or r9, r11, r9 /* t1 = h | t1 */
619
sw r9, r5, r4 /* *(d + n) = t1 */
620
bneid r4, d_wu3_loop /* while (n) loop */
621
bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
622
623
bri d_word_done
624
625
d_word_u1:
626
bsrli r11, r11, 24 /* h = h >> 24 */
627
d_wu1_loop:
628
addi r4, r4,-4 /* n = n - 4 */
629
lw r12, r8, r4 /* v = *(as + n) */
630
bslli r9, r12, 8 /* t1 = v << 8 */
631
or r9, r11, r9 /* t1 = h | t1 */
632
sw r9, r5, r4 /* *(d + n) = t1 */
633
bneid r4, d_wu1_loop /* while (n) loop */
634
bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
635
636
bri d_word_done
637
638
d_word_u2:
639
bsrli r11, r11, 16 /* h = h >> 16 */
640
d_wu2_loop:
641
addi r4, r4,-4 /* n = n - 4 */
642
lw r12, r8, r4 /* v = *(as + n) */
643
bslli r9, r12, 16 /* t1 = v << 16 */
644
or r9, r11, r9 /* t1 = h | t1 */
645
sw r9, r5, r4 /* *(d + n) = t1 */
646
bneid r4, d_wu2_loop /* while (n) loop */
647
bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
648
649
d_word_done:
650
651
d_xfer_end:
652
d_xfer_end_loop:
653
beqi r7, a_done /* while (c) */
654
addi r6, r6, -1 /* s-- */
655
lbui r9, r6, 0 /* t1 = *s */
656
addi r5, r5, -1 /* d-- */
657
sbi r9, r5, 0 /* *d = t1 */
658
brid d_xfer_end_loop /* loop */
659
addi r7, r7, -1 /* c-- (IN DELAY SLOT) */
660
661
d_done:
662
rtsd r15, 8
663
nop
664
665
.size memmove, . - memmove
666
.end memmove
667
668