GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S

/*
 * strrchr - find last position of a character in a string.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */
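
/* MTE tags memory in 16-byte granules, and a load must never touch a
   granule beyond the one holding the terminating NUL.  Aligning src
   down and using only aligned 16-byte loads keeps every access inside
   granules that contain valid string bytes.  */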

#include "asmdefs.h"

#define srcin		x0
#define chrin		w1
#define result		x0

#define src		x2
#define tmp		x3
#define synd		x3
#define shift		x4
#define src_match	x4
#define nul_match	x5
#define chr_match	x6

#define vrepchr		v0
#define vdata		v1
#define vhas_nul	v2
#define vhas_chr	v3
#define vrepmask	v4
#define vend		v5
#define dend		d5

/* Core algorithm.

   For each 16-byte chunk we calculate a 64-bit syndrome value, with
   four bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems). For each tuple, bits 0-1 are set if
   the relevant byte matched the requested character; bits 2-3 are set
   if the relevant byte matched the NUL end of string.  */
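
/* For illustration only: a C sketch of how one normalized (little-endian
   bit order) syndrome maps back to a byte position.  The helper name and
   code are not part of the original source:

	#include <stdint.h>

	// synd encodes one 16-byte chunk, 4 bits per byte:
	// bits 0-1 of tuple i: byte i matched the character,
	// bits 2-3 of tuple i: byte i was the NUL terminator.
	static inline int last_match_offset (uint64_t synd)
	{
		uint64_t chr = synd & 0x3333333333333333;  // match bits only
		if (chr == 0)
			return -1;  // no character match in this chunk
		// Highest set tuple = last matching byte; each byte owns
		// 4 bits, so divide the leading-zero count by 4.
		return 15 - (__builtin_clzll (chr) >> 2);
	}

   The tail code below performs essentially this computation, with an
   extra mask so that only matches before the first NUL survive.  */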
ENTRY (__strrchr_aarch64_mte)
	bic	src, srcin, 15
	dup	vrepchr.16b, chrin
	movi	vrepmask.16b, 0x33
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	/* shift = (srcin & 15) * 4: 64-bit register shifts use the count
	   modulo 64, so the lsr/lsl pair clears the tuples of any bytes
	   that precede the real start of the string.  */
	lsl	shift, srcin, 2
	fmov	synd, dend
	lsr	synd, synd, shift
	lsl	synd, synd, shift
	ands	nul_match, synd, 0xcccccccccccccccc
	bne	L(tail)
	cbnz	synd, L(loop2_start)

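/* First chunk had neither a match nor a NUL: scan 32 bytes per
   iteration looking for either event.  cmhs merges the two tests:
   vhas_chr is 0xff on a character match, and vdata is 0 at the
   terminator, so vhas_chr >= vdata holds exactly in those cases.  */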
	.p2align 4
L(loop1):
	ldr	q1, [src, 16]
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loop1_end)
	ldr	q1, [src, 32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop1)
	sub	src, src, 16
L(loop1_end):
	add	src, src, 16
	cmeq	vhas_nul.16b, vdata.16b, 0
#ifdef __AARCH64EB__
	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	rbit	synd, synd
#else
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
#endif
	ands	nul_match, synd, 0xcccccccccccccccc
	beq	L(loop2_start)
L(tail):
	/* nul_match - 1 sets every bit below the first NUL tuple, so the
	   masking keeps only character matches at or before the NUL (the
	   NUL byte's own match bits stay in, so strrchr (s, '\0') finds
	   the terminator).  If nothing survives, csel returns NULL.  */
	sub	nul_match, nul_match, 1
	and	chr_match, synd, 0x3333333333333333
	ands	chr_match, chr_match, nul_match
	add	result, src, 15
	clz	tmp, chr_match
	sub	result, result, tmp, lsr 2
	csel	result, result, xzr, ne
	ret

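/* A character match has been seen but no NUL yet: keep scanning,
   remembering in src_match/chr_match the position and syndrome of the
   most recent chunk that contained a match, until a chunk with a NUL
   appears.  The narrowed vrepmask makes the umaxp-compressed syndrome
   position-exact for the last match within a chunk.  */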
	.p2align 4
	nop
	nop
L(loop2_start):
	add	src, src, 16
	bic	vrepmask.8h, 0xf0

L(loop2):
	cmp	synd, 0
	csel	src_match, src, src_match, ne
	csel	chr_match, synd, chr_match, ne
	ld1	{vdata.16b}, [src], 16
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	tst	synd, 0xcccccccccccccccc
	beq	L(loop2)

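/* The current chunk contains a NUL.  Rebuild an exact syndrome for it
   (addp folds the 16 masked comparison bytes into 64 bits), then pick
   the result: the last match before the NUL in this chunk if there is
   one, otherwise the remembered src_match/chr_match pair.  */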
	bic	vhas_nul.8h, 0x0f, lsl 8
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	and	nul_match, synd, 0xcccccccccccccccc
	sub	nul_match, nul_match, 1
	and	tmp, synd, 0x3333333333333333
	ands	tmp, tmp, nul_match
	csel	chr_match, tmp, chr_match, ne
	csel	src_match, src, src_match, ne
	sub	src_match, src_match, 1
	clz	tmp, chr_match
	sub	result, src_match, tmp, lsr 2
	ret

END (__strrchr_aarch64_mte)
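
/* Within arm-optimized-routines this entry point carries the standard
   strrchr signature:

	char *__strrchr_aarch64_mte (const char *s, int c);  */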