GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S

/*
 * strrchr - find last position of a character in a string.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */
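
/* MTE tags memory in 16-byte granules, and a load must never touch a
   granule beyond the one holding the terminating NUL.  Aligning src
   down and using only aligned 16-byte loads keeps every access inside
   granules that contain valid string bytes.  */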

#include "asmdefs.h"

#define srcin		x0
#define chrin		w1
#define result		x0

#define src		x2
#define tmp		x3
#define synd		x3
#define shift		x4
#define src_match	x4
#define nul_match	x5
#define chr_match	x6

#define vrepchr		v0
#define vdata		v1
#define vhas_nul	v2
#define vhas_chr	v3
#define vrepmask	v4
#define vend		v5
#define dend		d5

/* Core algorithm.

   For each 16-byte chunk we calculate a 64-bit syndrome value, with
   four bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems). For each tuple, bits 0-1 are set if
   the relevant byte matched the requested character; bits 2-3 are set
   if the relevant byte matched the NUL end of string.  */
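
/* For illustration only: a C sketch of how one normalized (little-endian
   bit order) syndrome maps back to a byte position.  The helper name and
   code are not part of the original source:

	#include <stdint.h>

	// synd encodes one 16-byte chunk, 4 bits per byte:
	// bits 0-1 of tuple i: byte i matched the character,
	// bits 2-3 of tuple i: byte i was the NUL terminator.
	static inline int last_match_offset (uint64_t synd)
	{
		uint64_t chr = synd & 0x3333333333333333;  // match bits only
		if (chr == 0)
			return -1;  // no character match in this chunk
		// Highest set tuple = last matching byte; each byte owns
		// 4 bits, so divide the leading-zero count by 4.
		return 15 - (__builtin_clzll (chr) >> 2);
	}

   The tail code below performs essentially this computation, with an
   extra mask so that only matches before the first NUL survive.  */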
ENTRY (__strrchr_aarch64_mte)
	bic	src, srcin, 15
	dup	vrepchr.16b, chrin
	movi	vrepmask.16b, 0x33
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	/* shift = (srcin & 15) * 4: 64-bit register shifts use the count
	   modulo 64, so the lsr/lsl pair clears the tuples of any bytes
	   that precede the real start of the string.  */
	lsl	shift, srcin, 2
	fmov	synd, dend
	lsr	synd, synd, shift
	lsl	synd, synd, shift
	ands	nul_match, synd, 0xcccccccccccccccc
	bne	L(tail)
	cbnz	synd, L(loop2_start)

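/* First chunk had neither a match nor a NUL: scan 32 bytes per
   iteration looking for either event.  cmhs merges the two tests:
   vhas_chr is 0xff on a character match, and vdata is 0 at the
   terminator, so vhas_chr >= vdata holds exactly in those cases.  */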
	.p2align 4
L(loop1):
	ldr	q1, [src, 16]
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loop1_end)
	ldr	q1, [src, 32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop1)
	sub	src, src, 16
L(loop1_end):
	add	src, src, 16
	cmeq	vhas_nul.16b, vdata.16b, 0
#ifdef __AARCH64EB__
	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	rbit	synd, synd
#else
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
#endif
	ands	nul_match, synd, 0xcccccccccccccccc
	beq	L(loop2_start)
L(tail):
	/* nul_match - 1 sets every bit below the first NUL tuple, so the
	   masking keeps only character matches at or before the NUL (the
	   NUL byte's own match bits stay in, so strrchr (s, '\0') finds
	   the terminator).  If nothing survives, csel returns NULL.  */
	sub	nul_match, nul_match, 1
	and	chr_match, synd, 0x3333333333333333
	ands	chr_match, chr_match, nul_match
	add	result, src, 15
	clz	tmp, chr_match
	sub	result, result, tmp, lsr 2
	csel	result, result, xzr, ne
	ret

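/* A character match has been seen but no NUL yet: keep scanning,
   remembering in src_match/chr_match the position and syndrome of the
   most recent chunk that contained a match, until a chunk with a NUL
   appears.  The narrowed vrepmask makes the umaxp-compressed syndrome
   position-exact for the last match within a chunk.  */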
	.p2align 4
	nop
	nop
L(loop2_start):
	add	src, src, 16
	bic	vrepmask.8h, 0xf0

L(loop2):
	cmp	synd, 0
	csel	src_match, src, src_match, ne
	csel	chr_match, synd, chr_match, ne
	ld1	{vdata.16b}, [src], 16
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	tst	synd, 0xcccccccccccccccc
	beq	L(loop2)

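/* The current chunk contains a NUL.  Rebuild an exact syndrome for it
   (addp folds the 16 masked comparison bytes into 64 bits), then pick
   the result: the last match before the NUL in this chunk if there is
   one, otherwise the remembered src_match/chr_match pair.  */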
	bic	vhas_nul.8h, 0x0f, lsl 8
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	and	nul_match, synd, 0xcccccccccccccccc
	sub	nul_match, nul_match, 1
	and	tmp, synd, 0x3333333333333333
	ands	tmp, tmp, nul_match
	csel	chr_match, tmp, chr_match, ne
	csel	src_match, src, src_match, ne
	sub	src_match, src_match, 1
	clz	tmp, chr_match
	sub	result, src_match, tmp, lsr 2
	ret

END (__strrchr_aarch64_mte)
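
/* Within arm-optimized-routines this entry point carries the standard
   strrchr signature:

	char *__strrchr_aarch64_mte (const char *s, int c);  */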