Book a Demo!
CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Policies · Sign Up · Sign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memrchr.S
39491 views
/*
 * memrchr - find last character in a memory zone.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */
#include "asmdefs.h"

/* Argument registers (AAPCS64: args in x0-x2, result in x0).  */
#define srcin		x0	/* in:  base of the memory zone */
#define chrin		w1	/* in:  character to search for (low byte used) */
#define cntin		x2	/* in:  number of bytes to search */
#define result		x0	/* out: pointer to last match, or NULL */

/* Scratch registers — all caller-saved under AAPCS64.  */
#define src		x3	/* current 16-byte-aligned chunk pointer */
#define cntrem		x4	/* bytes remaining to scan in the loop */
#define synd		x5	/* 64-bit nibble-mask syndrome of the compare */
#define shift		x6	/* bit count used to mask off out-of-range lanes */
#define tmp		x7	/* candidate result pointer */
#define end		x8	/* srcin + cntin: one past the last byte */
#define endm1		x9	/* end - 1: address of the last byte */

/* Vector registers.  qdata/vdata alias the same register (q1/v1).  */
#define vrepchr		v0	/* chrin replicated into all 16 lanes */
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2	/* per-byte 0x00/0xff compare result */
#define vend		v3	/* narrowed syndrome vector */
#define dend		d3	/* low 64 bits of vend, moved to synd */

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
   per byte. We take 4 bits of every comparison byte with shift right and narrow
   by 4 instruction. Since the bits in the nibble mask reflect the order in
   which things occur in the original string, counting leading zeros identifies
   exactly which byte matched. */
/*
 * void *__memrchr_aarch64 (const void *srcin, int chrin, size_t cntin)
 *
 * Returns a pointer to the last occurrence of (unsigned char) chrin in the
 * first cntin bytes at srcin, or NULL (0) if it does not occur.
 * Scans backwards in 16-byte SIMD chunks; MTE-compatible because every load
 * is confined to 16-byte granules that overlap the buffer.
 * All instructions below are byte-identical to the upstream routine; only
 * comments were added and scrape artifacts removed.
 */
ENTRY (__memrchr_aarch64)
	add	end, srcin, cntin	/* end  = one past the last byte */
	sub	endm1, end, 1		/* endm1 = address of the last byte */
	bic	src, endm1, 15		/* align down: chunk holding the last byte */
	cbz	cntin, L(nomatch)	/* empty buffer: return NULL */
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin	/* replicate the target byte */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* shift = -(end * 4) mod 64: lsl by this discards the nibble-mask bits
	   that correspond to bytes past the buffer end.  */
	neg	shift, end, lsl 2
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend
	lsl	synd, synd, shift	/* drop lanes beyond endm1 */
	cbz	synd, L(start_loop)

	/* Match in the first (last-addressed) chunk: clz/4 is the distance in
	   bytes back from endm1.  */
	clz	synd, synd
	sub	result, endm1, synd, lsr 2
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi	/* NULL if the match precedes srcin */
	ret

	nop				/* alignment filler before the loop */
L(start_loop):
	subs	cntrem, src, srcin	/* bytes left before the aligned chunk */
	b.ls	L(nomatch)		/* nothing below the first chunk */

	/* Make sure that it won't overread by a 16-byte chunk */
	sub	cntrem, cntrem, 1
	tbz	cntrem, 4, L(loop32_2)	/* odd number of chunks: enter mid-loop */
	add	src, src, 16

	.p2align 5
L(loop32):
	ldr	qdata, [src, -32]!	/* pre-decrement: walk backwards */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, -16]
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.lo	L(end_2)		/* ran out of buffer: finish up */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end_2):
	sub	src, src, 16		/* point src at the chunk just tested */
L(end):
	/* Recompute the precise nibble mask (umaxp above only told us that
	   some byte matched, not which).  */
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15		/* last byte of the matching chunk */
#ifdef __AARCH64EB__
	rbit	synd, synd		/* big-endian: reverse bit order first */
#endif
	clz	synd, synd
	sub	tmp, tmp, synd, lsr 2	/* step back clz/4 bytes to the match */
	cmp	tmp, srcin
	csel	result, tmp, xzr, hs	/* NULL if the match is below srcin */
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memrchr_aarch64)