/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
	/*
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */

	/* Return if nothing to do */
	beq a0, a1, .Lreturn_from_memmove
	beqz a2, .Lreturn_from_memmove

	/*
	 * Register Uses
	 *      Forward Copy: a1 - Index counter of src
	 *      Reverse Copy: a4 - Index counter of src
	 *      Forward Copy: t3 - Index counter of dest
	 *      Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 *   Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv t3, a0
	add t4, a0, a2
	add a4, a1, a2

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi t0, a2, -(2 * SZREG)
	beqz t0, .Lbyte_copy

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest (register t3) rounded down to the nearest naturally
	 * aligned SZREG address does not equal dest, then add SZREG
	 * to find the low-bound of SZREG alignment in the dest memory
	 * region.  Note that this could overshoot the dest memory
	 * region if n is less than SZREG.  This is one reason why
	 * we always byte copy if n is less than SZREG.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq t5, t3, 1f
		addi t5, t5, SZREG
	1:

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, .Lcoaligned_copy
	/* Fall through to misaligned fixup copy */

.Lmisaligned_fixup_copy:
	bltu a1, a0, .Lmisaligned_fixup_copy_reverse

.Lmisaligned_fixup_copy_forward:
	jal t0, .Lbyte_copy_until_aligned_forward

	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * Two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not a7, a6
	addi a7, a7, (SZREG * 8 + 1)
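
	/*
	 * Worked example (illustrative): on RV64, SZREG = 8 and
	 * XLEN = 64.  If the src alignment offset found above is 3,
	 * then a6 = 24 and a7 = 40, and every store in the loop below
	 * combines the upper 40 bits of one aligned src word with the
	 * lower 24 bits of the next.  Because src and dest are not
	 * co-aligned on this path, that offset is never 0, so both
	 * shift amounts stay within 1..(XLEN - 1).
	 */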

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 * 	load_val1 = load_ptr[1];
	 * 	store_ptr += 2;
	 * 	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val0 = load_ptr[2];
	 * 	load_ptr += 2;
	 * 	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t0, (0 * SZREG)(a1)
	1:
	REG_L t1, (1 * SZREG)(a1)
	addi t3, t3, (2 * SZREG)
	srl t0, t0, a6
	sll t2, t1, a7
	or t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

	beq t3, a2, 2f

	REG_L t0, (2 * SZREG)(a1)
	addi a1, a1, (2 * SZREG)
	srl t1, t1, a6
	sll t2, t0, a7
	or t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

	bne t3, t6, 1b
	2:
	mv t3, t6 /* Fix the dest pointer in case the loop was broken */

	add a1, t3, a5 /* Restore the src pointer */
	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lmisaligned_fixup_copy_reverse:
	jal t0, .Lbyte_copy_until_aligned_reverse

	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * Two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 * 	load_val0 = load_ptr[-1];
	 * 	store_ptr -= 2;
	 * 	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val1 = load_ptr[-2];
	 * 	load_ptr -= 2;
	 * 	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t1, ( 0 * SZREG)(a4)
	1:
	REG_L t0, (-1 * SZREG)(a4)
	addi t4, t4, (-2 * SZREG)
	sll t1, t1, a7
	srl t2, t0, a6
	or t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)

	beq t4, a2, 2f

	REG_L t1, (-2 * SZREG)(a4)
	addi a4, a4, (-2 * SZREG)
	sll t0, t0, a7
	srl t2, t1, a6
	or t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)

	bne t4, t5, 1b
	2:
	mv t4, t5 /* Fix the dest pointer in case the loop was broken */

	add a4, t4, a5 /* Restore the src pointer */
	j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
 */
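/*
 * Roughly, in the pseudocode style of the misaligned fixup loops
 * above (illustrative sketch only; load_ptr/store_ptr stand for the
 * src/dest index registers, and store_ptr_end is t6 for the forward
 * loop and t5 for the reverse loop):
 *
 * Forward:
 * do {
 * 	load_val = load_ptr[0];
 * 	load_ptr += 1;
 * 	store_ptr += 1;
 * 	store_ptr[-1] = load_val;
 * } while (store_ptr != store_ptr_end);
 *
 * Reverse:
 * do {
 * 	load_val = load_ptr[-1];
 * 	load_ptr -= 1;
 * 	store_ptr -= 1;
 * 	store_ptr[0] = load_val;
 * } while (store_ptr != store_ptr_end);
 */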
.Lcoaligned_copy:
	bltu a1, a0, .Lcoaligned_copy_reverse

.Lcoaligned_copy_forward:
	jal t0, .Lbyte_copy_until_aligned_forward

	1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi a1, a1, SZREG
	addi t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne t3, t6, 1b

	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lcoaligned_copy_reverse:
	jal t0, .Lbyte_copy_until_aligned_reverse

	1:
	REG_L t1, (-1 * SZREG)(a4)
	addi a4, a4, -SZREG
	addi t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne t4, t5, 1b

	j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * These are basically sub-functions within the function.  They
 * are used to byte copy until the dest pointer is in alignment,
 * at which point a bulk copy method can be used by the calling
 * code.  These work on the same registers as the bulk copy
 * loops, so the register values can be picked up from where
 * they were left and we avoid code duplication without any
 * overhead except the call-in and return jumps.
 */
.Lbyte_copy_until_aligned_forward:
	beq t3, t5, 2f
	1:
	lb t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb t1, -1(t3)
	bne t3, t5, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

.Lbyte_copy_until_aligned_reverse:
	beq t4, t6, 2f
	1:
	lb t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb t1, 0(t4)
	bne t4, t6, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

/*
 * Simple byte copy loops.
 * These will byte copy until they reach the end of data to copy.
 * At that point, they return from memmove.
 */
.Lbyte_copy:
	bltu a1, a0, .Lbyte_copy_reverse

.Lbyte_copy_forward:
	beq t3, t4, 2f
	1:
	lb t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb t1, -1(t3)
	bne t3, t4, 1b
	2:
	ret

.Lbyte_copy_reverse:
	beq t4, t3, 2f
	1:
	lb t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb t1, 0(t4)
	bne t4, t3, 1b
	2:
	/* Fall through to return */

.Lreturn_from_memmove:
	ret

SYM_FUNC_END(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
SYM_FUNC_ALIAS(__pi_memmove, __memmove)
SYM_FUNC_ALIAS(__pi___memmove, __memmove)