Path: contrib/arm-optimized-routines/string/arm/memcpy.S
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

#include "asmdefs.h"

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

ENTRY (__memcpy_arm)

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bhs	L(cpy_not_short)
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

L(tail63unaligned):
#ifdef USE_NEON
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
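	/* The dispatch below is a computed branch: reading PC yields the
	   address of the current instruction plus PC_OFFSET (8 in ARM
	   state).  Worked example of the arithmetic: with tmp1 =
	   count & 0x3c pending bytes, the rsb below leaves 58 - tmp1,
	   and "add pc, pc, tmp1, lsl #1" skips (60 - tmp1) / 4 of the 15
	   ldr/str pairs.  With 60 bytes pending, pc advances 8 - 4 bytes
	   to the first pair; with 0 pending it clears all 120 bytes of
	   pairs.  The pointers are pre-advanced, so the remaining pairs
	   copy through negative offsets.  */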
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]	/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

L(cpy_not_short):
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	L(cpy_notaligned)

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blo	L(tail63aligned)

	cmp	tmp2, #512
	bhs	L(cpy_body_long)

L(cpy_body_medium):			/* Count in tmp2.  */
#ifdef USE_VFP
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bhs	1b
	tst	tmp2, #0x3f
	beq	L(done)
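	/* tmp2 went negative on the final subs above; as the GP-register
	   tail further down notes, that does not matter: the bottom six
	   bits still hold the number of bytes left to copy.  */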
L(tail63aligned):			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bhs	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	add	src, src, #8
	add	dst, dst, #8

L(tail63aligned):			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

L(done):
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

L(cpy_body_long):			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blo	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bhs	1b

2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	L(cpy_body_medium)
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
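	/* "SMS style" refers to software pipelining (software modulo
	   scheduling): each iteration stores the four doublewords loaded
	   on the previous iteration while issuing the loads for the next
	   one, overlapping loads and stores.  The "b 1f" below enters at
	   the loop's store half to prime the registers, and the code
	   after "bcs 2b" drains the final loads.  B-D live in the
	   callee-saved registers r4-r9, so they are spilled into the
	   stack frame alongside the first loads.  */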
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	L(tail63aligned)
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

L(cpy_notaligned):
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrlo	tmp2, [sp], #FRAME_SIZE
	blo	L(tail63unaligned)
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	blo	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bhs	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

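	/* Unlike the aligned loop above, the loads here are pairs of
	   word LDRs rather than LDRDs: dst has been brought to 64-bit
	   alignment but src has not, and while plain LDR supports the
	   unaligned accesses this routine assumes, LDRD faults on
	   addresses that are not word-aligned.  The stores can remain
	   STRD because they target the aligned side.  */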
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	L(tail63unaligned)
	bx	lr

END (__memcpy_arm)
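/* Usage sketch: the register assignments above (dstin = r0, src = r1,
   count = r2) correspond to the standard memcpy contract; the C
   prototype is an assumption, not part of this file:

       void *__memcpy_arm (void *dst, const void *src, size_t n);

   r0 is never written after entry (the routine copies through the dst
   alias in ip), so the destination pointer is returned unchanged, as
   memcpy requires.  */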