CoCalc -- sme-libc-opt-memcpy-memmove.S

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-opt-memcpy-memmove.S
213799 views
1
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains assembly-optimized implementations of Scalable Matrix
/// Extension (SME) compatible memcpy and memmove functions.
///
/// These implementations depend on unaligned access support.
///
/// Routines taken from libc/AOR_v20.02/string/aarch64.
///
//===----------------------------------------------------------------------===//
18

19
#include "../assembly.h"
20

21
//
//  __arm_sc_memcpy / __arm_sc_memmove
//
//  Symbolic register names used by the copy routine in this file.
//

/* Incoming values, in the first three argument registers.  */
#define dstin    x0
#define src      x1
#define count    x2

/* Working destination pointer and the two end pointers.  */
#define dst      x3
#define srcend1  x4
#define dstend1  x5

/* Data registers, used as low/high pairs for ldp/stp.  The *_lw names are
   the 32-bit views of the matching *_l register.  */
#define A_l      x6
#define A_lw     w6
#define A_h      x7
#define B_l      x8
#define B_lw     w8
#define B_h      x9
#define C_l      x10
#define C_lw     w10
#define C_h      x11
#define D_l      x12
#define D_h      x13
#define E_l      x14
#define E_h      x15
#define F_l      x16
#define F_h      x17

/* G and H are not extra registers: they reuse count, dst, src and srcend1,
   so loading into them overwrites those values.  */
#define G_l      count
#define G_h      dst
#define H_l      src
#define H_h      srcend1

/* Scratch register.  It is the same register as E_l (x14), so the two must
   not be live at the same time.  */
#define tmp1     x14
51

52
/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
64

65
//
// __arm_sc_memcpy, also exported as __arm_sc_memmove.
//
// In:    dstin (x0) = destination pointer
//        src   (x1) = source pointer
//        count (x2) = number of bytes to copy
// Out:   x0 is never written, so it still holds the destination on return
// Clobb: may overwrite x1-x17 and the condition flags
// Notes: only general-purpose registers are used and the stack is not touched
//
// Copies of up to 128 bytes load all of their data before the first store.
// Larger copies choose the copy direction from the distance between the two
// buffers.  Both properties are needed for overlapping buffers, which is why
// a single body can serve both exported names.
//
DEFINE_COMPILERRT_FUNCTION(__arm_sc_memcpy)
        add     srcend1, src, count     // one past the last source byte
        add     dstend1, dstin, count   // one past the last destination byte
        cmp     count, 128
        b.hi    7f  // copy_long
        cmp     count, 32
        b.hi    4f  // copy32_128

        /* Small copies: 0..32 bytes.  */
        cmp     count, 16
        b.lo    0f  // copy16
        /* 16..32 bytes: copy the first 16 and the last 16 bytes.  The two
           16-byte windows overlap whenever count is below 32.  */
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend1, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend1, -16]
        ret

        /* Copy 8-15 bytes.  */
        /* count is below 16 here, so bit 3 set means 8..15 bytes: copy the
           first 8 and the last 8 bytes.  */
0:  // copy16
        tbz     count, 3, 1f  // copy8
        ldr     A_l, [src]
        ldr     A_h, [srcend1, -8]
        str     A_l, [dstin]
        str     A_h, [dstend1, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes.  */
        /* count is below 8 here, so bit 2 set means 4..7 bytes: copy the
           first 4 and the last 4 bytes.  */
1:  // copy8
        tbz     count, 2, 2f  // copy4
        ldr     A_lw, [src]
        ldr     B_lw, [srcend1, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend1, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence.  */
        /* For 1..3 bytes the first byte, the byte at offset count / 2 and
           the last byte are copied.  Some of these coincide when count is
           1 or 2, which is harmless because all loads precede the stores.  */
2:  // copy4
        cbz     count, 3f // copy0
        lsr     tmp1, count, 1          // tmp1 = count / 2
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend1, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend1, -1]
3:  // copy0
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes.  */
        /* Load the first 32 and the last 32 bytes up front.  That is the
           whole copy for 33..64 bytes.  */
4:  // copy32_128
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend1, -32]
        ldp     D_l, D_h, [srcend1, -16]
        cmp     count, 64
        b.hi    5f  // copy128
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes.  */
        /* E and F extend the block loaded from the start to 64 bytes.  */
5:  // copy128
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    6f  // copy96
        /* 97..128 bytes: also load 32 more bytes addressed from the end.
           G and H reuse the count, dst, src and srcend1 registers, none of
           which is read again on this path.  */
        ldp     G_l, G_h, [srcend1, -64]
        ldp     H_l, H_h, [srcend1, -48]
        stp     G_l, G_h, [dstend1, -64]
        stp     H_l, H_h, [dstend1, -48]
6:  // copy96
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes.  */
7:  // copy_long
        /* Use backwards copy if there is an overlap.  */
        sub     tmp1, dstin, src
        cbz     tmp1, 3b  // copy0      (source and destination are identical)
        /* Unsigned test: (dstin - src) below count means the destination
           starts inside the source range.  */
        cmp     tmp1, count
        b.lo    10f //copy_long_backwards

        /* Copy 16 bytes and then align dst to 16-byte alignment.  */

        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15         // offset of dstin within its 16-byte block
        bic     dst, dstin, 15          // dst = dstin rounded down to 16 bytes
        sub     src, src, tmp1          // move src back by the same offset
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]       // first 16 bytes, written unaligned
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!    // pre-index writeback: src += 64
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    9f  // copy64_from_end
        /* Each iteration stores the 64 bytes loaded by the previous one and
           loads the next 64.  Both pointers advance by 64 through the
           pre-index writeback on the D pair.  */
8:  // loop64
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    8b  // loop64

        /* Write the last iteration and copy 64 bytes from the end.  */
        /* A, B and C are each reloaded from the end of the source right
           after their pending value has been stored.  */
9:  // copy64_from_end
        ldp     E_l, E_h, [srcend1, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend1, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend1, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend1, -64]
        stp     A_l, A_h, [dstend1, -48]
        stp     B_l, B_h, [dstend1, -32]
        stp     C_l, C_h, [dstend1, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dst to 16-byte alignment.  */
10: // copy_long_backwards
        ldp     D_l, D_h, [srcend1, -16]
        and     tmp1, dstend1, 15       // offset of dstend1 within its 16-byte block
        sub     srcend1, srcend1, tmp1  // move the source end back by that offset
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend1, -16]
        stp     D_l, D_h, [dstend1, -16] // last 16 bytes, written unaligned
        ldp     B_l, B_h, [srcend1, -32]
        ldp     C_l, C_h, [srcend1, -48]
        ldp     D_l, D_h, [srcend1, -64]! // pre-index writeback: srcend1 -= 64
        sub     dstend1, dstend1, tmp1  // dstend1 is now 16-byte aligned
        subs    count, count, 128
        b.ls    12f // copy64_from_start

        /* Mirror image of the forward loop: store the previous 64 bytes,
           load the next 64, and move both end pointers down by 64.  */
11: // loop64_backwards
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [srcend1, -16]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [srcend1, -48]
        stp     D_l, D_h, [dstend1, -64]!
        ldp     D_l, D_h, [srcend1, -64]!
        subs    count, count, 64
        b.hi    11b // loop64_backwards

        /* Write the last iteration and copy 64 bytes from the start.  */
        /* src and dstin are unchanged on the backwards path, so the first
           64 bytes are addressed from them.  G reuses the count and dst
           registers, which are not read again.  */
12: // copy64_from_start
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend1, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret
END_COMPILERRT_FUNCTION(__arm_sc_memcpy)

// The routine above already copes with overlapping buffers, so the memmove
// name is exported as an alias of the same entry point.
DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
248

249

250
Product

Resources

Company