GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S

/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

#define srcin	x0
#define chrin	w1
#define cntin	x2
#define result	x0

#define src	x3
#define cntrem	x4
#define synd	x5
#define shift	x6
#define tmp	x7

#define vrepchr	v0
#define qdata	q1
#define vdata	v1
#define vhas_chr	v2
#define vend	v3
#define dend	d3
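
/* x0-x2 hold the incoming memchr arguments, with x0 reused for the return
   value; x3-x7 are scratch.  v0 holds the repeated search character, v1 the
   current data chunk, v2 the comparison result and v3 the narrowed syndrome,
   read back as a scalar through d3.  */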

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
   bits per byte. We take 4 bits of every comparison byte with a shift right
   and narrow by 4 (shrn) instruction. Since the bits in the nibble mask
   reflect the order in which things occur in the original string, counting
   leading zeros identifies exactly which byte matched. */
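
/* For example, a match in byte 3 of a chunk sets nibble 3 of the mask,
   i.e. 0x000000000000f000: rbit+clz then yield 12, and 12 >> 2 = 3
   recovers the matching byte index.  */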

ENTRY (__memchr_aarch64_mte)
	bic	src, srcin, 15
	cbz	cntin, L(nomatch)
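	/* Compare the 16-byte aligned chunk that contains srcin.  Aligned
	   loads never cross a 16-byte granule boundary, which keeps the
	   routine MTE safe; matches before srcin are shifted out of the
	   syndrome below.  */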
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	lsl	shift, srcin, 2
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend
	lsr	synd, synd, shift
	cbz	synd, L(start_loop)

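	/* A match was found within the first chunk: convert the syndrome
	   bit index to a byte offset from srcin, returning NULL if the
	   match lies at or beyond srcin + cntin.  */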
	rbit	synd, synd
	clz	synd, synd
	cmp	cntin, synd, lsr 2
	add	result, srcin, synd, lsr 2
	csel	result, result, xzr, hi
	ret

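	/* No match in the first chunk: work out how much of the count lies
	   beyond it, and give up if the buffer ends within that chunk.  */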
	.p2align 3
L(start_loop):
	sub	tmp, src, srcin
	add	tmp, tmp, 17
	subs	cntrem, cntin, tmp
	b.lo	L(nomatch)

	/* Make sure the loop won't over-read by a 16-byte chunk: bit 4 of
	   cntrem selects which half of the 32-byte loop to enter first.  */
	tbz	cntrem, 4, L(loop32_2)
	sub	src, src, 16
	.p2align 4
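	/* Main loop: examine 32 bytes per iteration.  umaxp reduces the
	   128-bit comparison result to 64 bits, which is enough to test for
	   any match; the exact position is recomputed at L(end).  */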
L(loop32):
	ldr	qdata, [src, 32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, 16]
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	subs	cntrem, cntrem, 32
	b.lo	L(end_2)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
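	/* Arrive here from the second half of the loop, either on a match
	   or with the count nearly exhausted; src still points at the
	   previous chunk, so step it forward before the common tail.  */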
L(end_2):
	add	src, src, 16
L(end):
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	sub	cntrem, src, srcin
	fmov	synd, dend
	sub	cntrem, cntin, cntrem
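	/* On little-endian the least significant syndrome bit corresponds
	   to the first byte, so reverse the bits before counting leading
	   zeros.  */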
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	cmp	cntrem, synd, lsr 2
	add	result, src, synd, lsr 2
	csel	result, result, xzr, hi
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memchr_aarch64_mte)