Path: blob/main/contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Routines taken from libc/AOR_v20.02/string/aarch64

#include "../assembly.h"

#ifdef __aarch64__

#define L(l) .L ## l

//
// __arm_sc_memcpy / __arm_sc_memmove
//

#define dstin    x0
#define src      x1
#define count    x2
#define dst      x3
#define srcend1  x4
#define dstend1  x5
#define A_l      x6
#define A_lw     w6
#define A_h      x7
#define B_l      x8
#define B_lw     w8
#define B_h      x9
#define C_l      x10
#define C_lw     w10
#define C_h      x11
#define D_l      x12
#define D_h      x13
#define E_l      x14
#define E_h      x15
#define F_l      x16
#define F_h      x17
#define G_l      count
#define G_h      dst
#define H_l      src
#define H_h      srcend1
#define tmp1     x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple while improving performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  */

DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
        add     srcend1, src, count
        add     dstend1, dstin, count
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes.  */
        cmp     count, 16
        b.lo    L(copy16)
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend1, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend1, -16]
        ret

        /* Copy 8-15 bytes.  */
L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend1, -8]
        str     A_l, [dstin]
        str     A_h, [dstend1, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes.  */
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend1, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend1, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend1, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend1, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes.  */
L(copy32_128):
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend1, -32]
        ldp     D_l, D_h, [srcend1, -16]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes.  */
L(copy128):
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_l, G_h, [srcend1, -64]
        ldp     H_l, H_h, [srcend1, -48]
        stp     G_l, G_h, [dstend1, -64]
        stp     H_l, H_h, [dstend1, -48]
L(copy96):
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes.  */
L(copy_long):
        /* Use backwards copy if there is an overlap.  */
        sub     tmp1, dstin, src
        cbz     tmp1, L(copy0)
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align dst to 16-byte alignment.  */
        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)
L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
        ldp     E_l, E_h, [srcend1, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend1, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend1, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend1, -64]
        stp     A_l, A_h, [dstend1, -48]
        stp     B_l, B_h, [dstend1, -32]
        stp     C_l, C_h, [dstend1, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dst to 16-byte alignment.  */
L(copy_long_backwards):
        ldp     D_l, D_h, [srcend1, -16]
        and     tmp1, dstend1, 15
        sub     srcend1, srcend1, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend1, -16]
        stp     D_l, D_h, [dstend1, -16]
        ldp     B_l, B_h, [srcend1, -32]
        ldp     C_l, C_h, [srcend1, -48]
        ldp     D_l, D_h, [srcend1, -64]!
        sub     dstend1, dstend1, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [srcend1, -16]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [srcend1, -48]
        stp     D_l, D_h, [dstend1, -64]!
        ldp     D_l, D_h, [srcend1, -64]!
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend1, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret
END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)

DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
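
/* For reference, a rough C sketch of the dispatch implemented above.  This is
   illustrative only: the helper names are hypothetical, and the real code is
   branchless within each size class.  The single unsigned comparison
   (uintptr_t)dst - (uintptr_t)src < n is what lets one entry point serve both
   memcpy and memmove: it is true exactly when dst lies inside [src, src + n),
   the only case where a forward copy would clobber not-yet-read source bytes.

     void *sc_memcpy_sketch(void *dst, const void *src, size_t n) {
         if (n <= 32)  return copy_0_32(dst, src, n);    // small
         if (n <= 128) return copy_33_128(dst, src, n);  // medium
         if (dst == src) return dst;                     // nothing to move
         if ((uintptr_t)dst - (uintptr_t)src < n)        // dst in [src, src+n)
             return copy_long_backwards(dst, src, n);
         return copy_long_forwards(dst, src, n);
     }
*/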

//
// __arm_sc_memset
//

#define dstin    x0
#define val      x1
#define valw     w1
#define count    x2
#define dst      x3
#define dstend2  x4
#define zva_val  x5

DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
#ifdef __ARM_FEATURE_SVE
        mov     z0.b, valw
#else
        bfi     valw, valw, #8, #8
        bfi     valw, valw, #16, #16
        bfi     val, val, #32, #32
        fmov    d0, val
        fmov    v0.d[1], val
#endif
        add     dstend2, dstin, count

        cmp     count, 96
        b.hi    L(set_long)
        cmp     count, 16
        b.hs    L(set_medium)
        mov     val, v0.D[0]

        /* Set 0..15 bytes.  */
        tbz     count, 3, 1f
        str     val, [dstin]
        str     val, [dstend2, -8]
        ret
        nop
1:      tbz     count, 2, 2f
        str     valw, [dstin]
        str     valw, [dstend2, -4]
        ret
2:      cbz     count, 3f
        strb    valw, [dstin]
        tbz     count, 1, 3f
        strh    valw, [dstend2, -2]
3:      ret

        /* Set 16..96 bytes.  */
L(set_medium):
        str     q0, [dstin]
        tbnz    count, 6, L(set96)
        str     q0, [dstend2, -16]
        tbz     count, 5, 1f
        str     q0, [dstin, 16]
        str     q0, [dstend2, -32]
1:      ret

        .p2align 4
        /* Set 64..96 bytes.  Write 64 bytes from the start and
           32 bytes from the end.  */
L(set96):
        str     q0, [dstin, 16]
        stp     q0, q0, [dstin, 32]
        stp     q0, q0, [dstend2, -32]
        ret

        .p2align 4
L(set_long):
        and     valw, valw, 255
        bic     dst, dstin, 15
        str     q0, [dstin]
        cmp     count, 160
        ccmp    valw, 0, 0, hs
        b.ne    L(no_zva)

#ifndef SKIP_ZVA_CHECK
        mrs     zva_val, dczid_el0
        and     zva_val, zva_val, 31
        cmp     zva_val, 4              /* ZVA size is 64 bytes.  */
        b.ne    L(no_zva)
#endif
        str     q0, [dst, 16]
        stp     q0, q0, [dst, 32]
        bic     dst, dst, 63
        sub     count, dstend2, dst     /* Count is now 64 too large.  */
        sub     count, count, 128       /* Adjust count and bias for loop.  */

        .p2align 4
L(zva_loop):
        add     dst, dst, 64
        dc      zva, dst
        subs    count, count, 64
        b.hi    L(zva_loop)
        stp     q0, q0, [dstend2, -64]
        stp     q0, q0, [dstend2, -32]
        ret

L(no_zva):
        sub     count, dstend2, dst     /* Count is 16 too large.  */
        sub     dst, dst, 16            /* Dst is biased by -32.  */
        sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
L(no_zva_loop):
        stp     q0, q0, [dst, 32]
        stp     q0, q0, [dst, 64]!
        subs    count, count, 64
        b.hi    L(no_zva_loop)
        stp     q0, q0, [dstend2, -64]
        stp     q0, q0, [dstend2, -32]
        ret
END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)
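
/* A note on the DC ZVA fast path above: it is only taken when the buffer is
   at least 160 bytes, the fill value is zero (the ccmp forces "ne" for a
   nonzero valw), and DCZID_EL0 advertises a 64-byte zeroing block.  A hedged
   C equivalent of that last check is sketched below; the helper name is
   hypothetical, and the field layout follows the Arm ARM (BS in bits [3:0]
   as log2 of the block size in words, DZP in bit 4 meaning DC ZVA is
   prohibited).

     static int zva_block_is_64_bytes(void) {
         unsigned long dczid;
         __asm__("mrs %0, dczid_el0" : "=r"(dczid));
         // Masking with 31 keeps BS and DZP together, so comparing with 4
         // succeeds only if DZP == 0 and the block is 2^4 words = 64 bytes.
         return (dczid & 31) == 4;
     }
*/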

#endif // __aarch64__
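
/* Usage sketch.  These routines are the streaming-compatible counterparts of
   memcpy/memmove/memset for SME: code in streaming SVE mode cannot assume the
   plain libc routines are safe to call, so the compiler lowers memory
   operations to these instead.  Assuming declarations along these lines (the
   ACLE additionally spells them with SME attributes such as
   __arm_streaming_compatible):

     void *__arm_sc_memcpy(void *dst, const void *src, size_t n);
     void *__arm_sc_memmove(void *dst, const void *src, size_t n);
     void *__arm_sc_memset(void *dst, int c, size_t n);
*/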