GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/strchrnul.S

/*
 * strchrnul - find a character or nul in a string
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask	v7
#define vend1		v16
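
/* Note on the register choice (explanatory, an inference from AAPCS64):
   x0 carries both the srcin argument and the result, and v0-v7 and v16
   are caller-saved under the AArch64 procedure call standard, so the
   routine needs no vector register save/restore.  */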

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character or nul.  Since the
   bits in the syndrome reflect exactly the order in which things occur
   in the original string, a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination.  */
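
/* Illustrative C model of the syndrome computation (a sketch, not part
   of the build; "hunk", "c" and "ctz64" are hypothetical names):

	uint64_t syndrome (const uint8_t *hunk, uint8_t c)
	{
		uint64_t syn = 0;
		for (int i = 0; i < 32; i++)
			if (hunk[i] == c || hunk[i] == 0)
				syn |= 1ULL << (2 * i);
		return syn;
	}

   The first terminating byte then sits at offset ctz64 (syn) / 2,
   which is what the rbit/clz sequence in the tail computes.  */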

/* Locals and temporaries.  */

ENTRY (__strchrnul_aarch64)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the termination condition.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask.4s, wtmp2
	ands	tmp1, srcin, #31
	b.eq	L(loop)
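
	/* How the magic constant works (explanatory note): vrepmask holds
	   the byte pattern 0x01,0x04,0x10,0x40 repeating, i.e. weight
	   1 << (2 * (i % 4)) for byte lane i.  ANDing it with the
	   0x00/0xff compare results and then doing two pairwise ADDPs
	   sums each group of four weighted bytes into one byte with no
	   carries, packing the 32 byte-compare results into a 64-bit
	   syndrome, two bits per byte.  */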

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
	and	vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
	lsl	tmp1, tmp1, #1
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	mov	tmp3, #~0
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
	lsr	tmp1, tmp3, tmp1

	mov	tmp3, vend1.d[0]
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
	cbnz	tmp1, L(tail)
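
	/* Worked example of the masking above: if srcin & 31 == 3, then
	   tmp1 = -3, doubled to -6; the lsr shift amount is taken mod 64,
	   so ~0 >> 58 leaves the low 6 bits set, and the bic clears the
	   six syndrome bits (2 bits x 3 bytes) belonging to the padding
	   bytes before srcin.  */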

	.p2align 4
L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
	orr	vend1.16b, vhas_nul1.16b, vhas_nul2.16b
	umaxp	vend1.16b, vend1.16b, vend1.16b
	mov	tmp1, vend1.d[0]
	cbz	tmp1, L(loop)
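
	/* Two notes on the loop (explanatory): cmhs performs an unsigned
	   "vhas_chr >= vdata" byte compare, so a lane is set either when
	   vhas_chr is 0xff (character match) or when the data byte is 0
	   (nul), folding both termination tests into one instruction.
	   The orr/umaxp pair is only a cheap "anything set in these 32
	   bytes?" test; the full syndrome is rebuilt below once the loop
	   has exited.  */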

	/* Termination condition found.  Now need to establish exactly why
	   we terminated.  */
	and	vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64

	mov	tmp1, vend1.d[0]
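
	/* Tail arithmetic (explanatory example): if the first terminating
	   byte is byte 5 of the hunk, bit 10 is the lowest set syndrome
	   bit; rbit+clz yields 10, and src - 32 + (10 >> 1) points at
	   that byte.  Only even bits can ever be set, so the halved
	   count is exact.  */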
L(tail):
	/* Count the trailing zeros, by bit reversing... */
	rbit	tmp1, tmp1
	/* Re-bias source.  */
	sub	src, src, #32
	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
	/* tmp1 is twice the offset into the fragment.  */
	add	result, src, tmp1, lsr #1
	ret

END (__strchrnul_aarch64)