Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memchr.S
39486 views
/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "asmdefs.h"

/* Arguments and results. */
#define srcin		x0	/* in: base of the memory zone to scan */
#define chrin		w1	/* in: byte value to search for */
#define cntin		x2	/* in: number of bytes to scan */

#define result		x0	/* out: address of first match, or NULL */

/* Scratch registers (all caller-saved under AAPCS64). */
#define src		x3	/* 32-byte-aligned running source pointer */
#define tmp		x4
#define wtmp2		w5
#define synd		x6	/* 64-bit syndrome: 2 bits per scanned byte */
#define soff		x9	/* srcin's offset within its 32-byte block */
#define cntrem		x10	/* cntin % 32, used to mask the final block */

#define vrepchr		v0	/* chrin replicated into all 16 lanes */
#define vdata1		v1	/* first 16 bytes of the current chunk */
#define vdata2		v2	/* second 16 bytes of the current chunk */
#define vhas_chr1	v3	/* per-lane match mask for vdata1 */
#define vhas_chr2	v4	/* per-lane match mask for vdata2 */
#define vrepmask	v5	/* 0x40100401 replicated: lane-id encoding */
#define vend		v6	/* pairwise-reduction accumulator */

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting trailing zeros allows to
 * identify exactly which byte has matched.
 */

/*
 * void *__memchr_aarch64 (const void *srcin, int chrin, size_t cntin)
 *
 * In:  x0 = srcin, w1 = chrin (only the low byte is used, via dup .16b),
 *      x2 = cntin.
 * Out: x0 = pointer to the first occurrence of the byte, or NULL when the
 *      byte is absent from the first cntin bytes or cntin is zero.
 */
ENTRY (__memchr_aarch64)
	/* Do not dereference srcin if no bytes to compare. */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	ands	soff, srcin, #31
	and	cntrem, cntin, #31
	b.eq	L(loop)

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32 bytes block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp	/* cntin -= (32 - soff); sets C/Z */
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last */
	b.ls	L(masklast)
	/* Have we found something already? */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	L(end)
	/* Use a fast check for the termination condition */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Only do the clear for the last possible block */
	b.hs	L(tail)

L(masklast):
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

L(tail):
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
	/* Compute the potential result */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq

	ret

L(zero_length):
	mov	result, #0
	ret

END (__memchr_aarch64)