/* Origin: freebsd-src, contrib/arm-optimized-routines/string/aarch64/memcmp.S
   (non-source web-viewer chrome removed from this scraped copy).  */
/* memcmp - compare memory
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

#include "asmdefs.h"

#define src1 x0
15
#define src2 x1
16
#define limit x2
17
#define result w0
18
19
#define data1 x3
20
#define data1w w3
21
#define data2 x4
22
#define data2w w4
23
#define data3 x5
24
#define data3w w5
25
#define data4 x6
26
#define data4w w6
27
#define tmp x6
28
#define src1end x7
29
#define src2end x8
30
31
32
ENTRY (__memcmp_aarch64)
33
cmp limit, 16
34
b.lo L(less16)
35
ldp data1, data3, [src1]
36
ldp data2, data4, [src2]
37
ccmp data1, data2, 0, ne
38
ccmp data3, data4, 0, eq
39
b.ne L(return2)
40
41
add src1end, src1, limit
42
add src2end, src2, limit
43
cmp limit, 32
44
b.ls L(last_bytes)
45
cmp limit, 160
46
b.hs L(loop_align)
47
sub limit, limit, 32
48
49
.p2align 4
50
L(loop32):
51
ldp data1, data3, [src1, 16]
52
ldp data2, data4, [src2, 16]
53
cmp data1, data2
54
ccmp data3, data4, 0, eq
55
b.ne L(return2)
56
cmp limit, 16
57
b.ls L(last_bytes)
58
59
ldp data1, data3, [src1, 32]
60
ldp data2, data4, [src2, 32]
61
cmp data1, data2
62
ccmp data3, data4, 0, eq
63
b.ne L(return2)
64
add src1, src1, 32
65
add src2, src2, 32
66
L(last64):
67
subs limit, limit, 32
68
b.hi L(loop32)
69
70
/* Compare last 1-16 bytes using unaligned access. */
71
L(last_bytes):
72
ldp data1, data3, [src1end, -16]
73
ldp data2, data4, [src2end, -16]
74
L(return2):
75
cmp data1, data2
76
csel data1, data1, data3, ne
77
csel data2, data2, data4, ne
78
79
/* Compare data bytes and set return value to 0, -1 or 1. */
80
L(return):
81
#ifndef __AARCH64EB__
82
rev data1, data1
83
rev data2, data2
84
#endif
85
cmp data1, data2
86
cset result, ne
87
cneg result, result, lo
88
ret
89
90
.p2align 4
91
L(less16):
92
add src1end, src1, limit
93
add src2end, src2, limit
94
tbz limit, 3, L(less8)
95
ldr data1, [src1]
96
ldr data2, [src2]
97
ldr data3, [src1end, -8]
98
ldr data4, [src2end, -8]
99
b L(return2)
100
101
.p2align 4
102
L(less8):
103
tbz limit, 2, L(less4)
104
ldr data1w, [src1]
105
ldr data2w, [src2]
106
ldr data3w, [src1end, -4]
107
ldr data4w, [src2end, -4]
108
b L(return2)
109
110
L(less4):
111
tbz limit, 1, L(less2)
112
ldrh data1w, [src1]
113
ldrh data2w, [src2]
114
cmp data1w, data2w
115
b.ne L(return)
116
L(less2):
117
mov result, 0
118
tbz limit, 0, L(return_zero)
119
ldrb data1w, [src1end, -1]
120
ldrb data2w, [src2end, -1]
121
sub result, data1w, data2w
122
L(return_zero):
123
ret
124
125
L(loop_align):
126
ldp data1, data3, [src1, 16]
127
ldp data2, data4, [src2, 16]
128
cmp data1, data2
129
ccmp data3, data4, 0, eq
130
b.ne L(return2)
131
132
/* Align src2 and adjust src1, src2 and limit. */
133
and tmp, src2, 15
134
sub tmp, tmp, 16
135
sub src2, src2, tmp
136
add limit, limit, tmp
137
sub src1, src1, tmp
138
sub limit, limit, 64 + 16
139
140
.p2align 4
141
L(loop64):
142
ldr q0, [src1, 16]
143
ldr q1, [src2, 16]
144
subs limit, limit, 64
145
ldr q2, [src1, 32]
146
ldr q3, [src2, 32]
147
eor v0.16b, v0.16b, v1.16b
148
eor v1.16b, v2.16b, v3.16b
149
ldr q2, [src1, 48]
150
ldr q3, [src2, 48]
151
umaxp v0.16b, v0.16b, v1.16b
152
ldr q4, [src1, 64]!
153
ldr q5, [src2, 64]!
154
eor v1.16b, v2.16b, v3.16b
155
eor v2.16b, v4.16b, v5.16b
156
umaxp v1.16b, v1.16b, v2.16b
157
umaxp v0.16b, v0.16b, v1.16b
158
umaxp v0.16b, v0.16b, v0.16b
159
fmov tmp, d0
160
ccmp tmp, 0, 0, hi
161
b.eq L(loop64)
162
163
/* If equal, process last 1-64 bytes using scalar loop. */
164
add limit, limit, 64 + 16
165
cbz tmp, L(last64)
166
167
/* Determine the 8-byte aligned offset of the first difference. */
168
#ifdef __AARCH64EB__
169
rev16 tmp, tmp
170
#endif
171
rev tmp, tmp
172
clz tmp, tmp
173
bic tmp, tmp, 7
174
sub tmp, tmp, 48
175
ldr data1, [src1, tmp]
176
ldr data2, [src2, tmp]
177
#ifndef __AARCH64EB__
178
rev data1, data1
179
rev data2, data2
180
#endif
181
mov result, 1
182
cmp data1, data2
183
cneg result, result, lo
184
ret
185
186
END (__memcmp_aarch64)
187
188