Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/lib/libc/amd64/string/strcmp.S
39491 views
1
/*-
2
* Copyright (c) 2023, The FreeBSD Foundation
3
*
4
* SPDX-License-Expression: BSD-2-Clause
5
*
6
* Portions of this software were developed by Robert Clausecker
7
* <[email protected]> under sponsorship from the FreeBSD Foundation.
8
*
9
* Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcmp.S
10
* written by J.T. Conklin <[email protected]> that was originally
11
* dedicated to the public domain.
12
*/
13
14
#include <machine/asm.h>
15
#include <machine/param.h>
16
17
#if 0
18
RCSID("$NetBSD: strcmp.S,v 1.3 2004/07/19 20:04:41 drochner Exp $")
19
#endif
20
21
#include "amd64_archlevel.h"
22
23
#define ALIGN_TEXT .p2align 4, 0x90
24
25
ARCHFUNCS(strcmp)
26
ARCHFUNC(strcmp, scalar)
27
ARCHFUNC(strcmp, baseline)
28
ENDARCHFUNCS(strcmp)
29
30
ARCHENTRY(strcmp, scalar)
31
/*
32
* Align s1 to word boundary.
33
* Consider unrolling loop?
34
*/
35
.Ls1align:
36
testb $7,%dil
37
je .Ls1aligned
38
movb (%rdi),%al
39
incq %rdi
40
movb (%rsi),%dl
41
incq %rsi
42
testb %al,%al
43
je .Ldone
44
cmpb %al,%dl
45
je .Ls1align
46
jmp .Ldone
47
48
/*
49
* Check whether s2 is aligned to a word boundary. If it is, we
50
* can compare by words. Otherwise we have to compare by bytes.
51
*/
52
.Ls1aligned:
53
testb $7,%sil
54
jne .Lbyte_loop
55
56
movabsq $0x0101010101010101,%r8
57
subq $8,%rdi
58
movabsq $0x8080808080808080,%r9
59
subq $8,%rsi
60
61
ALIGN_TEXT
62
.Lword_loop:
63
movq 8(%rdi),%rax
64
addq $8,%rdi
65
movq 8(%rsi),%rdx
66
addq $8,%rsi
67
cmpq %rax,%rdx
68
jne .Lbyte_loop
69
subq %r8,%rdx
70
notq %rax
71
andq %rax,%rdx
72
testq %r9,%rdx
73
je .Lword_loop
74
75
ALIGN_TEXT
76
.Lbyte_loop:
77
movb (%rdi),%al
78
incq %rdi
79
movb (%rsi),%dl
80
incq %rsi
81
testb %al,%al
82
je .Ldone
83
cmpb %al,%dl
84
je .Lbyte_loop
85
86
.Ldone:
87
movzbq %al,%rax
88
movzbq %dl,%rdx
89
subq %rdx,%rax
90
ret
91
ARCHEND(strcmp, scalar)
92
93
ARCHENTRY(strcmp, baseline)
94
/* check if either string crosses a page in the head */
95
lea 15(%rdi), %r8d # end of head
96
lea 15(%rsi), %r9d
97
mov %edi, %eax
98
mov %esi, %edx
99
xor %edi, %r8d # bits that changed between first and last byte
100
xor %esi, %r9d
101
and $~0xf, %rdi # align heads to 16 bytes
102
and $~0xf, %rsi
103
or %r8d, %r9d # in either RSI or RDI
104
and $0xf, %eax # offset from alignment
105
and $0xf, %edx
106
pxor %xmm1, %xmm1
107
test $PAGE_SIZE, %r9d # did the page change?
108
jz 0f # if not, take fast path
109
110
/* heads may cross page boundary, avoid unmapped loads */
111
movdqa (%rdi), %xmm0 # load aligned heads
112
movdqa (%rsi), %xmm2
113
mov $-1, %r8d
114
mov $-1, %r9d
115
mov %eax, %ecx
116
shl %cl, %r8d # string head in XMM0
117
mov %edx, %ecx
118
shl %cl, %r9d # string head in XMM2
119
movdqa %xmm0, -40(%rsp) # stash copies of the heads on the stack
120
movdqa %xmm2, -24(%rsp)
121
pcmpeqb %xmm1, %xmm0
122
pcmpeqb %xmm1, %xmm2
123
pmovmskb %xmm0, %r10d
124
pmovmskb %xmm2, %r11d
125
test %r8d, %r10d # NUL byte present in first string?
126
lea -40(%rsp), %r8
127
cmovz %rdi, %r8
128
test %r9d, %r11d # NUL byte present in second string?
129
lea -24(%rsp), %r9
130
cmovz %rsi, %r9
131
movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads
132
movdqu (%r9, %rdx, 1), %xmm4
133
jmp 1f
134
135
0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads
136
movdqu (%rsi, %rdx, 1), %xmm4
137
1: pxor %xmm2, %xmm2
138
pcmpeqb %xmm0, %xmm2 # NUL byte present?
139
pcmpeqb %xmm0, %xmm4 # which bytes match?
140
pandn %xmm4, %xmm2 # match and not NUL byte?
141
pmovmskb %xmm2, %r9d
142
xor $0xffff, %r9d # mismatch or NUL byte?
143
jnz .Lhead_mismatch
144
145
/* load head and second chunk */
146
movdqa 16(%rdi), %xmm2 # load second chunks
147
movdqa 16(%rsi), %xmm3
148
sub %rdx, %rax # is a&0xf >= b&0xf?
149
jb .Lswapped # if not, proceed with swapped operands
150
151
neg %rax
152
movdqu 16(%rsi, %rax, 1), %xmm0
153
sub %rdi, %rsi # express RSI as distance from RDI
154
lea (%rsi, %rax, 1), %rdx # point RDX to offset in second string
155
neg %rax
156
pcmpeqb %xmm3, %xmm1 # ... corresponding to RDI
157
pcmpeqb %xmm2, %xmm0
158
pmovmskb %xmm1, %r8d
159
pmovmskb %xmm0, %r9d
160
add $16, %rdi
161
test %r8d, %r8d
162
jnz .Lnul_found
163
xor $0xffff, %r9d
164
jnz .Lmismatch
165
add $16, %rdi # advance aligned pointers
166
167
/*
168
* During the main loop, the layout of the two strings is something like:
169
*
170
* v ------1------ v ------2------ v
171
* RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
172
* RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
173
*
174
* where v indicates the alignment boundaries and corresponding chunks
175
* of the strings have the same letters. Chunk A has been checked in
176
* the previous iteration. This iteration, we first check that string
177
* RSI doesn't end within region 2, then we compare chunk B between the
178
* two strings. As RSI is known not to hold a NUL byte in regsions 1
179
* and 2 at this point, this also ensures that RDI has not ended yet.
180
*/
181
ALIGN_TEXT
182
0: movdqu (%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
183
pxor %xmm1, %xmm1
184
pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI?
185
pcmpeqb (%rdi), %xmm0 # where do the chunks match?
186
pmovmskb %xmm1, %r8d
187
pmovmskb %xmm0, %r9d
188
test %r8d, %r8d
189
jnz .Lnul_found
190
xor $0xffff, %r9d # any mismatches?
191
jnz .Lmismatch
192
193
/* main loop unrolled twice */
194
movdqu 16(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
195
pxor %xmm1, %xmm1
196
pcmpeqb 16(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
197
pcmpeqb 16(%rdi), %xmm0 # where do the chunks match?
198
pmovmskb %xmm1, %r8d
199
pmovmskb %xmm0, %r9d
200
add $32, %rdi
201
test %r8d, %r8d
202
jnz .Lnul_found2
203
xor $0xffff, %r9d # any mismatches?
204
jz 0b
205
206
sub $16, %rdi # roll back second increment
207
208
/* a mismatch has been found between RDX and RSI */
209
.Lmismatch:
210
tzcnt %r9d, %r9d # where is the mismatch?
211
add %rdi, %rdx # turn RDX from offset to pointer
212
movzbl (%rdx, %r9, 1), %ecx
213
movzbl (%rdi, %r9, 1), %eax
214
sub %ecx, %eax # difference of the mismatching chars
215
ret
216
217
/* mismatch in true heads */
218
.Lhead_mismatch:
219
tzcnt %r9d, %r9d # where is the mismatch?
220
add %rax, %rdi # return to true heads
221
add %rdx, %rsi
222
movzbl (%rdi, %r9, 1), %eax # mismatching characters
223
movzbl (%rsi, %r9, 1), %ecx
224
sub %ecx, %eax
225
ret
226
227
.Lnul_found2:
228
sub $16, %rdi # roll back second increment
229
230
/* a NUL has been found in RSI */
231
.Lnul_found:
232
mov %eax, %ecx
233
mov %r8d, %r10d
234
shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX
235
xor $0xffff, %r9d # mask of mismatches
236
or %r8d, %r9d # NUL bytes also count as mismatches
237
jnz .Lmismatch
238
239
/*
240
* (RDI) == (RSI) and NUL is past the string.
241
* Compare (RSI) with the corresponding part
242
* of the other string until the NUL byte.
243
*/
244
movdqu (%rdi, %rax, 1), %xmm0
245
pcmpeqb (%rdi, %rsi, 1), %xmm0
246
add %rdi, %rsi # restore RSI pointer
247
add %rax, %rdi # point RDI to chunk corresponding to (RSI)
248
pmovmskb %xmm0, %ecx # mask of matches
249
not %ecx # mask of mismatches
250
or %r10d, %ecx # mask of mismatches or NUL bytes
251
tzcnt %ecx, %ecx # location of first mismatch
252
movzbl (%rdi, %rcx, 1), %eax
253
movzbl (%rsi, %rcx, 1), %ecx
254
sub %ecx, %eax
255
ret
256
257
/*
258
* If (a&0xf) < (b&0xf), we do the same thing but with swapped
259
* operands. I found that this performs slightly better than
260
* using conditional moves to do the swap branchless.
261
*/
262
.Lswapped:
263
movdqu 16(%rdi, %rax, 1), %xmm0
264
sub %rsi, %rdi # express RDI as distance from RSI
265
lea (%rdi, %rax, 1), %rdx # point RDX to offset in RDI corresponding to RSI
266
neg %rax # make difference positive
267
pcmpeqb %xmm2, %xmm1
268
pcmpeqb %xmm3, %xmm0
269
pmovmskb %xmm1, %r8d
270
pmovmskb %xmm0, %r9d
271
add $16, %rsi # advance aligned pointers
272
test %r8d, %r8d
273
jnz .Lnul_founds
274
xor $0xffff, %r9d
275
jnz .Lmismatchs
276
add $16, %rsi
277
278
/*
279
* During the main loop, the layout of the two strings is something like:
280
*
281
* v ------1------ v ------2------ v
282
* RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
283
* RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
284
*
285
* where v indicates the alignment boundaries and corresponding chunks
286
* of the strings have the same letters. Chunk A has been checked in
287
* the previous iteration. This iteration, we first check that string
288
* RSI doesn't end within region 2, then we compare chunk B between the
289
* two strings. As RSI is known not to hold a NUL byte in regsions 1
290
* and 2 at this point, this also ensures that RDI has not ended yet.
291
*/
292
ALIGN_TEXT
293
0: movdqu (%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
294
pxor %xmm1, %xmm1
295
pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RSI?
296
pcmpeqb (%rsi), %xmm0 # where do the chunks match?
297
pmovmskb %xmm1, %r8d
298
pmovmskb %xmm0, %r9d
299
test %r8d, %r8d
300
jnz .Lnul_founds
301
xor $0xffff, %r9d # any mismatches?
302
jnz .Lmismatchs
303
304
/* main loop unrolled twice */
305
movdqu 16(%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
306
pxor %xmm1, %xmm1
307
pcmpeqb 16(%rsi, %rdi, 1), %xmm1 # end of string in RSI?
308
pcmpeqb 16(%rsi), %xmm0 # where do the chunks match?
309
pmovmskb %xmm1, %r8d
310
pmovmskb %xmm0, %r9d
311
add $32, %rsi
312
test %r8d, %r8d
313
jnz .Lnul_found2s
314
xor $0xffff, %r9d # any mismatches?
315
jz 0b
316
317
sub $16, %rsi # roll back second increment
318
319
/* a mismatch has been found between RDX and RDI */
320
.Lmismatchs:
321
tzcnt %r9d, %r9d # where is the mismatch?
322
add %rsi, %rdx # turn RDX from offset to pointer
323
movzbl (%rdx, %r9, 1), %eax
324
movzbl (%rsi, %r9, 1), %ecx
325
sub %ecx, %eax # difference of the mismatching chars
326
ret
327
328
.Lnul_found2s:
329
sub $16, %rsi # roll back second increment
330
331
/* a NUL has been found in RSI */
332
.Lnul_founds:
333
mov %eax, %ecx
334
mov %r8d, %r10d
335
shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX
336
xor $0xffff, %r9d # mask of mismatches
337
or %r8d, %r9d # NUL bytes also count as mismatches
338
jnz .Lmismatchs
339
340
/*
341
* (RDI) == (RSI) and NUL is past the string.
342
* Compare (RSI) with the corresponding part
343
* of the other string until the NUL byte.
344
*/
345
movdqu (%rsi, %rax, 1), %xmm0
346
pcmpeqb (%rsi, %rdi, 1), %xmm0
347
add %rsi, %rdi # restore RDI pointer
348
add %rax, %rsi # point RSI to chunk corresponding to (RDI)
349
pmovmskb %xmm0, %ecx # mask of matches
350
not %ecx # mask of mismatches
351
or %r10d, %ecx # mask of mismatches or NUL bytes
352
tzcnt %ecx, %ecx # location of first mismatch
353
movzbl (%rdi, %rcx, 1), %eax
354
movzbl (%rsi, %rcx, 1), %ecx
355
sub %ecx, %eax
356
ret
357
ARCHEND(strcmp, baseline)
358
359
.section .note.GNU-stack,"",%progbits
360
361