GitHub Repository: torvalds/linux
Path: blob/master/arch/riscv/lib/memmove.S
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
	/*
	 * Returns
	 * a0 - dest
	 *
	 * Parameters
	 * a0 - Inclusive first byte of dest
	 * a1 - Inclusive first byte of src
	 * a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */
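
	/*
	 * Illustrative usage sketch (not part of the original kernel
	 * source; assumes standard C memmove semantics):
	 *
	 *	char buf[8] = "abcdef";
	 *	memmove(buf + 2, buf, 5);	// dest > src: overlap handled by
	 *					// copying in reverse below
	 *	memmove(buf, buf + 2, 5);	// dest < src: a forward copy is safe
	 *
	 * The "bltu a1, a0, ..." checks below select the reverse path
	 * whenever src < dest, which is exactly the case where a naive
	 * forward copy would overwrite source bytes before reading them.
	 */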

	/* Return if nothing to do */
	beq a0, a1, .Lreturn_from_memmove
	beqz a2, .Lreturn_from_memmove

	/*
	 * Register Uses
	 * Forward Copy: a1 - Index counter of src
	 * Reverse Copy: a4 - Index counter of src
	 * Forward Copy: t3 - Index counter of dest
	 * Reverse Copy: t4 - Index counter of dest
	 * Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 * Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
	 * Both Copy Modes: t0 - Link / Temporary for load-store
	 * Both Copy Modes: t1 - Temporary for load-store
	 * Both Copy Modes: t2 - Temporary for load-store
	 * Both Copy Modes: a5 - dest to src alignment offset
	 * Both Copy Modes: a6 - Shift amount
	 * Both Copy Modes: a7 - Inverse shift amount
	 * Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv t3, a0
	add t4, a0, a2
	add a4, a1, a2

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi t0, a2, -(2 * SZREG)
	beqz t0, .Lbyte_copy
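
	/*
	 * Illustrative note (assuming RV64, so SZREG = 8): the andi above
	 * computes t0 = a2 & ~0xf, which is zero exactly when n < 16,
	 * i.e. when n < 2 * SZREG, so such small copies take the plain
	 * byte-copy path.
	 */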

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest (register t3) rounded down to the nearest naturally
	 * aligned SZREG address does not equal dest, then add SZREG
	 * to find the low-bound of SZREG alignment in the dest memory
	 * region. Note that this could overshoot the dest memory
	 * region if n is less than SZREG. This is one reason why
	 * we always byte copy if n is less than SZREG.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq t5, t3, 1f
	addi t5, t5, SZREG
	1:
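
	/*
	 * Worked example (illustrative, assuming RV64, SZREG = 8):
	 * dest = 0x1003 gives t5 = (0x1003 & -8) + 8 = 0x1008, the first
	 * 8-byte-aligned address inside the dest region; dest = 0x1000
	 * is already aligned and t5 stays 0x1000.
	 */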

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, .Lcoaligned_copy
	/* Fall through to misaligned fixup copy */
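
	/*
	 * Illustrative sketch of the test above (assuming RV64, SZREG = 8):
	 * dest = 0x2005, src = 0x300d -> (dest ^ src) & 7 = 0, so both
	 * pointers sit at the same offset within an 8-byte word and the
	 * co-aligned path applies; dest = 0x2005, src = 0x300c gives 1,
	 * so the misaligned fixup copy is needed.
	 */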

.Lmisaligned_fixup_copy:
	bltu a1, a0, .Lmisaligned_fixup_copy_reverse

.Lmisaligned_fixup_copy_forward:
	jal t0, .Lbyte_copy_until_aligned_forward

	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * Two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
	not a7, a6
	addi a7, a7, (SZREG * 8 + 1)
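
	/*
	 * Worked example (illustrative, assuming RV64, so XLEN = 64):
	 * if src is 3 bytes past alignment then a6 = 3 * 8 = 24 and
	 *	a7 = ~24 + (64 + 1) = -25 + 65 = 40 = XLEN - a6,
	 * so (load_val0 >> 24) | (load_val1 << 40) assembles one aligned
	 * SZREG-sized store from two neighbouring aligned loads on
	 * little-endian.
	 */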

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 *	load_val1 = load_ptr[1];
	 *	store_ptr += 2;
	 *	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val0 = load_ptr[2];
	 *	load_ptr += 2;
	 *	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t0, (0 * SZREG)(a1)
	1:
	REG_L t1, (1 * SZREG)(a1)
	addi t3, t3, (2 * SZREG)
	srl t0, t0, a6
	sll t2, t1, a7
	or t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

	beq t3, a2, 2f

	REG_L t0, (2 * SZREG)(a1)
	addi a1, a1, (2 * SZREG)
	srl t1, t1, a6
	sll t2, t0, a7
	or t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

	bne t3, t6, 1b
	2:
	mv t3, t6 /* Fix the dest pointer in case the loop was broken */

	add a1, t3, a5 /* Restore the src pointer */
	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lmisaligned_fixup_copy_reverse:
	jal t0, .Lbyte_copy_until_aligned_reverse

	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * Two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
	not a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 *	load_val0 = load_ptr[-1];
	 *	store_ptr -= 2;
	 *	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val1 = load_ptr[-2];
	 *	load_ptr -= 2;
	 *	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t1, ( 0 * SZREG)(a4)
	1:
	REG_L t0, (-1 * SZREG)(a4)
	addi t4, t4, (-2 * SZREG)
	sll t1, t1, a7
	srl t2, t0, a6
	or t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)

	beq t4, a2, 2f

	REG_L t1, (-2 * SZREG)(a4)
	addi a4, a4, (-2 * SZREG)
	sll t0, t0, a7
	srl t2, t1, a6
	or t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)

	bne t4, t5, 1b
	2:
	mv t4, t5 /* Fix the dest pointer in case the loop was broken */

	add a4, t4, a5 /* Restore the src pointer */
	j .Lbyte_copy_reverse /* Copy any remaining bytes */

	/*
	 * Simple copy loops for SZREG co-aligned memory locations.
	 * These also make calls to do byte copies for any unaligned
	 * data at their terminations.
	 */
.Lcoaligned_copy:
	bltu a1, a0, .Lcoaligned_copy_reverse

.Lcoaligned_copy_forward:
	jal t0, .Lbyte_copy_until_aligned_forward

	1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi a1, a1, SZREG
	addi t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne t3, t6, 1b

	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lcoaligned_copy_reverse:
	jal t0, .Lbyte_copy_until_aligned_reverse

	1:
	REG_L t1, (-1 * SZREG)(a4)
	addi a4, a4, -SZREG
	addi t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne t4, t5, 1b

	j .Lbyte_copy_reverse /* Copy any remaining bytes */

	/*
	 * These are basically sub-functions within the function. They
	 * are used to byte copy until the dest pointer is in alignment,
	 * at which point a bulk copy method can be used by the
	 * calling code. These work on the same registers as the bulk
	 * copy loops. Therefore, the register values can be picked
	 * up from where they were left and we avoid code duplication
	 * without any overhead except the call in and return jumps.
	 */
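
	/*
	 * Illustrative sketch of the call convention used for these
	 * sub-functions (standard RISC-V jal/jalr semantics assumed):
	 *
	 *	jal	t0, .Lbyte_copy_until_aligned_forward
	 *		// t0 <- address of the next instruction
	 *	...
	 * .Lbyte_copy_until_aligned_forward:
	 *	...
	 *	jalr	zero, 0x0(t0)	// jump back; rd = zero, so no new
	 *				// link register is written
	 */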
.Lbyte_copy_until_aligned_forward:
	beq t3, t5, 2f
	1:
	lb t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb t1, -1(t3)
	bne t3, t5, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

.Lbyte_copy_until_aligned_reverse:
	beq t4, t6, 2f
	1:
	lb t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb t1, 0(t4)
	bne t4, t6, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

	/*
	 * Simple byte copy loops.
	 * These will byte copy until they reach the end of data to copy.
	 * At that point, they will call to return from memmove.
	 */
.Lbyte_copy:
	bltu a1, a0, .Lbyte_copy_reverse

.Lbyte_copy_forward:
	beq t3, t4, 2f
	1:
	lb t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb t1, -1(t3)
	bne t3, t4, 1b
	2:
	ret

.Lbyte_copy_reverse:
	beq t4, t3, 2f
	1:
	lb t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb t1, 0(t4)
	bne t4, t3, 1b
	2:

.Lreturn_from_memmove:
	ret

SYM_FUNC_END(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
SYM_FUNC_ALIAS(__pi_memmove, __memmove)
SYM_FUNC_ALIAS(__pi___memmove, __memmove)