GitHub Repository: freebsd/freebsd-src
Path: blob/main/lib/libc/amd64/string/strspn.S

/*-
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <[email protected]>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>
#include <machine/param.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */

ARCHFUNCS(strspn)
	ARCHFUNC(strspn, scalar)
	NOARCHFUNC
	ARCHFUNC(strspn, x86_64_v2)
ENDARCHFUNCS(strspn)
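
/*
 * For reference, the scalar kernel and the >=33 byte fallback below
 * follow the classic table-driven strspn, as in this C sketch
 * (illustration only, not part of this file):
 *
 *	#include <stddef.h>
 *
 *	size_t
 *	strspn(const char *s, const char *set)
 *	{
 *		unsigned char table[256] = { 0 };
 *		size_t i;
 *
 *		while (*set != '\0')
 *			table[(unsigned char)*set++] = 1;
 *
 *		for (i = 0; table[(unsigned char)s[i]] != 0; i++)
 *			;
 *
 *		return (i);	// table[0] == 0, so the NUL ends the scan
 *	}
 */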

ARCHENTRY(strspn, scalar)
	push	%rbp			# align stack to enable function call
	mov	%rsp, %rbp
	sub	$256, %rsp		# allocate space for lookup table

	/* check for special cases */
	movzbl	(%rsi), %edx		# first character in the set
	test	%edx, %edx
	jz	.Lzero			# empty set always returns 0

	movzbl	1(%rsi), %eax		# second character in the set
	test	%eax, %eax
	jz	.Lsingle

	/* no special case matches -- prepare lookup table */
	xor	%r8d, %r8d
	mov	$28, %ecx
0:	mov	%r8, (%rsp, %rcx, 8)
	mov	%r8, 8(%rsp, %rcx, 8)
	mov	%r8, 16(%rsp, %rcx, 8)
	mov	%r8, 24(%rsp, %rcx, 8)
	sub	$4, %ecx
	jnc	0b
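
	/*
	 * The loop above clears the 256-byte table in 32-byte strides:
	 * %ecx steps through 28, 24, ..., 0 (scaled by 8 in the address
	 * computation) and the final sub $4 borrows, so jnc falls
	 * through after the %ecx == 0 iteration.
	 */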

	movb	$1, (%rsp, %rdx, 1)	# register first char in set
	add	$2, %rsi

	/* process remaining chars in set */
	ALIGN_TEXT
0:	movb	$1, (%rsp, %rax, 1)	# register previous char
	movzbl	(%rsi), %eax		# next char in set
	test	%eax, %eax		# end of string?
	jz	1f

	movb	$1, (%rsp, %rax, 1)
	add	$2, %rsi
	movzbl	-1(%rsi), %eax
	test	%eax, %eax
	jnz	0b

1:	mov	%rdi, %rax		# a copy of the source to iterate over

	/* find mismatch */
	ALIGN_TEXT
0:	movzbl	(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	2f

	movzbl	1(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	3f

	movzbl	2(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	4f

	movzbl	3(%rax), %ecx
	add	$4, %rax
	cmpb	$0, (%rsp, %rcx, 1)
	jne	0b

	sub	$3, %rax
4:	dec	%rdi
3:	inc	%rax
2:	sub	%rdi, %rax		# number of characters preceding mismatch
	leave
	ret
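
	/*
	 * The landing pads above share one subtraction: a mismatch at
	 * byte 0 jumps straight to 2 (count = %rax - %rdi), byte 1
	 * enters at 3 (one extra inc), byte 2 at 4 (dec %rdi adds
	 * another one to the difference), and byte 3 falls through
	 * with %rax already advanced by 4 and pulled back by 3.
	 */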

	/* empty set never matches */
.Lzero:	xor	%eax, %eax
	leave
	ret

	/* find repeated single character */
	ALIGN_TEXT
.Lsingle:
	cmpb	%dl, (%rdi, %rax, 1)
	jne	1f

	cmpb	%dl, 1(%rdi, %rax, 1)
	jne	2f

	cmpb	%dl, 2(%rdi, %rax, 1)
	jne	3f

	cmpb	%dl, 3(%rdi, %rax, 1)
	lea	4(%rax), %rax
	je	.Lsingle

	sub	$3, %rax
3:	inc	%rax
2:	inc	%rax
1:	leave
	ret
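
	/*
	 * The single-character case above amounts to this C sketch
	 * (illustration only; %rax enters as 0 and %dl holds the one
	 * set member c):
	 *
	 *	size_t i = 0;
	 *
	 *	while (s[i] == c)	// c != '\0', so the NUL stops the loop
	 *		i++;
	 *	return (i);
	 */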
ARCHEND(strspn, scalar)

/*
 * This kernel uses pcmpistri to do the heavy lifting.
 * We provide three code paths, depending on set size:
 *
 *  0--16: one pcmpistri per 16 bytes of input
 * 17--32: two pcmpistri per 16 bytes of input
 *  >=33: fall back to lookup table
 */
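
/*
 * A note on the imm8 used below: 0x10 selects unsigned byte
 * comparisons, "equal any" aggregation, and negative polarity, so
 * each instruction reports the string bytes that do NOT occur in
 * the set operand.  CF is set when such a byte exists in the
 * 16-byte chunk; the string's NUL terminator sets it, too, as the
 * NUL is never part of the set.  pcmpistri returns the index of the
 * first such byte in %ecx, pcmpistrm a bit mask of them in %xmm0.
 */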
ARCHENTRY(strspn, x86_64_v2)
	push	%rbp
	mov	%rsp, %rbp
	sub	$256, %rsp

	/* find set size and copy up to 32 bytes to (%rsp) */
	mov	%esi, %ecx
	and	$~0xf, %rsi		# align set pointer
	movdqa	(%rsi), %xmm0
	pxor	%xmm1, %xmm1
	and	$0xf, %ecx		# number of bytes rsi is past alignment
	xor	%edx, %edx
	pcmpeqb	%xmm0, %xmm1		# end of string reached?
	movdqa	%xmm0, 32(%rsp)		# transfer head of set to stack
	pmovmskb %xmm1, %eax
	shr	%cl, %eax		# clear out junk before string
	test	%eax, %eax		# end of set reached?
	jnz	0f

	movdqa	16(%rsi), %xmm0		# second chunk of the set
	mov	$16, %edx
	sub	%ecx, %edx		# length of set preceding xmm0
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1
	movdqa	%xmm0, 48(%rsp)
	movdqu	32(%rsp, %rcx, 1), %xmm2 # head of set
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	1f

	movdqa	32(%rsi), %xmm0		# third chunk
	add	$16, %edx
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1
	movdqa	%xmm0, 64(%rsp)
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# still not done?
	jz	.Lgt32v2

0:	movdqu	32(%rsp, %rcx, 1), %xmm2 # head of set
1:	tzcnt	%eax, %eax
	add	%eax, %edx		# length of set (excluding NUL byte)
	cmp	$32, %edx		# above 32 bytes?
	ja	.Lgt32v2
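
	/*
	 * Note that the set is only ever read in aligned 16-byte
	 * chunks, so none of the loads above can stray into an
	 * unmapped page; pcmpeqb against zero locates the NUL, and
	 * the unaligned movdqu loads re-read the head of the set from
	 * the aligned stack copy at 32(%rsp).
	 */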

	/*
	 * At this point we know that we want to use pcmpistri.
	 * One last problem obtains: the head of the string is not
	 * aligned and may cross a page boundary.  If this is the
	 * case, we take the part before the page boundary and repeat
	 * the last byte to fill up the xmm register.
	 */
	mov	%rdi, %rax		# save original string pointer
	lea	15(%rdi), %esi		# last byte of the head
	xor	%edi, %esi
	test	$PAGE_SIZE, %esi	# does the head cross a page?
	jz	0f

	/* head crosses page: copy to stack to fix up */
	and	$~0xf, %rax		# align head pointer temporarily
	movzbl	15(%rax), %esi		# last head byte on the page
	movdqa	(%rax), %xmm0
	movabs	$0x0101010101010101, %r8
	imul	%r8, %rsi		# repeated 8 times
	movdqa	%xmm0, (%rsp)		# head word on stack
	mov	%rsi, 16(%rsp)		# followed by filler (last byte x8)
	mov	%rsi, 24(%rsp)
	mov	%edi, %eax
	and	$0xf, %eax		# offset of head from alignment
	add	%rsp, %rax		# pointer to fake head
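
	/*
	 * In C terms, the fixup above builds a readable 32-byte fake
	 * head on the stack (an illustrative sketch; buf stands for
	 * the scratch space at (%rsp)):
	 *
	 *	#include <stdint.h>
	 *	#include <string.h>
	 *
	 *	char buf[32];
	 *	const char *base = (const char *)((uintptr_t)s & ~(uintptr_t)0xf);
	 *
	 *	memcpy(buf, base, 16);		// rest of the page
	 *	memset(buf + 16, base[15], 16);	// last in-page byte x 16
	 *	const char *fake = buf + ((uintptr_t)s & 0xf);
	 *
	 * A filler byte can never cause an earlier mismatch or NUL than
	 * the real byte it copies, so the computed prefix length is
	 * unaffected.
	 */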

0:	movdqu	(%rax), %xmm1		# load head (fake or real)
	lea	16(%rdi), %rax
	and	$~0xf, %rax		# second 16 bytes of string (aligned)
1:	cmp	$16, %edx		# set of 17--32 bytes?
	ja	.Lgt16v2

	/* set is 2--16 bytes in size */

	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT|_SIDD_NEGATIVE_POLARITY */
	pcmpistri $0x10, %xmm1, %xmm2	# mismatch in head?
	jc	.Lheadmismatchv2

	ALIGN_TEXT
0:	pcmpistri $0x10, (%rax), %xmm2
	jc	1f			# mismatch or end of string?
	pcmpistri $0x10, 16(%rax), %xmm2
	lea	32(%rax), %rax
	jnc	0b			# neither mismatch nor string end?

	sub	$16, %rax		# go back to second half
1:	sub	%rdi, %rax		# offset of (%rax) from beginning of string
	add	%rcx, %rax		# prefix length before mismatch/NUL
	leave
	ret

.Lheadmismatchv2:
	mov	%ecx, %eax		# prefix length before mismatch/NUL
	leave
	ret
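
	/*
	 * In both return paths above, pcmpistri has left the in-chunk
	 * index of the mismatch (or of the NUL) in %ecx; adding it to
	 * the chunk's offset from the start of the string yields the
	 * prefix length.
	 */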

	/* set is 17--32 bytes in size */
.Lgt16v2:
	movdqu	48(%rsp, %rcx, 1), %xmm3 # second part of set

	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_BIT_MASK|_SIDD_NEGATIVE_POLARITY */
	pcmpistrm $0x10, %xmm1, %xmm2	# any mismatch in first half?
	movdqa	%xmm0, %xmm4
	pcmpistrm $0x10, %xmm1, %xmm3	# any mismatch in the second half?
	ptest	%xmm0, %xmm4		# any entry that doesn't match either?
	jnz	2f

	ALIGN_TEXT
0:	movdqa	(%rax), %xmm1
	pcmpistrm $0x10, %xmm1, %xmm2
	movdqa	%xmm0, %xmm4
	pcmpistrm $0x10, %xmm1, %xmm3
	ptest	%xmm0, %xmm4
	jnz	1f
	movdqa	16(%rax), %xmm1
	add	$32, %rax
	pcmpistrm $0x10, %xmm1, %xmm2
	movdqa	%xmm0, %xmm4
	pcmpistrm $0x10, %xmm1, %xmm3
	ptest	%xmm0, %xmm4
	jz	0b

	sub	$16, %rax
1:	pand	%xmm4, %xmm0		# bit mask of mismatches (string end counts)
	movd	%xmm0, %ecx
	sub	%rdi, %rax		# offset of %xmm1 from beginning of string
	tzcnt	%ecx, %ecx
	add	%rcx, %rax		# prefix length before mismatch/NUL
	leave
	ret

	/* mismatch or string end in head */
2:	pand	%xmm4, %xmm0		# bit mask of mismatches (end of string counts)
	movd	%xmm0, %eax
	tzcnt	%eax, %eax		# prefix length before mismatch/NUL
	leave
	ret
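
	/*
	 * The two-mask trick above, sketched with C intrinsics
	 * (set_lo/set_hi and chunk stand for the values loaded by the
	 * surrounding code; off is the chunk's offset in the string):
	 *
	 *	#include <immintrin.h>
	 *
	 *	#define F	(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | \
	 *			_SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY)
	 *
	 *	__m128i m0 = _mm_cmpistrm(set_lo, chunk, F); // not in 1st half
	 *	__m128i m1 = _mm_cmpistrm(set_hi, chunk, F); // not in 2nd half
	 *	int mask = _mm_cvtsi128_si32(_mm_and_si128(m0, m1));
	 *
	 *	if (mask != 0)	// some byte in neither half (or the NUL)
	 *		return (off + __builtin_ctz(mask));
	 */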

	/* set is >=33 bytes in size */
.Lgt32v2:
	xorps	%xmm0, %xmm0
	mov	$256-64, %edx

	/* clear out lookup table */
0:	movaps	%xmm0, (%rsp, %rdx, 1)
	movaps	%xmm0, 16(%rsp, %rdx, 1)
	movaps	%xmm0, 32(%rsp, %rdx, 1)
	movaps	%xmm0, 48(%rsp, %rdx, 1)
	sub	$64, %edx
	jnc	0b
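
	/*
	 * Same idea as the scalar table clear, but 64 bytes per
	 * iteration: %edx steps through 192, 128, 64, 0 and jnc
	 * falls through once the subtraction borrows.
	 */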

	add	%rcx, %rsi		# restore set pointer
	mov	%rdi, %rax		# keep a copy of the string

	/* initialise lookup table */
	movzbl	(%rsi), %ecx		# set is known not to be empty

	ALIGN_TEXT
0:	movb	$1, (%rsp, %rcx, 1)
	movzbl	1(%rsi), %ecx
	test	%ecx, %ecx
	jz	1f

	movb	$1, (%rsp, %rcx, 1)
	movzbl	2(%rsi), %ecx
	test	%ecx, %ecx
	jz	1f

	movb	$1, (%rsp, %rcx, 1)
	movzbl	3(%rsi), %ecx
	add	$4, %rsi
	test	%ecx, %ecx
	jz	1f

	movb	$1, (%rsp, %rcx, 1)
	movzbl	(%rsi), %ecx
	test	%ecx, %ecx
	jnz	0b

	/* find mismatch (same unrolled scan as the scalar kernel) */
	ALIGN_TEXT
1:	movzbl	(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	2f

	movzbl	1(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	3f

	movzbl	2(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	4f

	movzbl	3(%rax), %ecx
	add	$4, %rax
	cmpb	$0, (%rsp, %rcx, 1)
	jne	1b

	sub	$3, %rax
4:	dec	%rdi
3:	inc	%rax
2:	sub	%rdi, %rax		# number of characters preceding mismatch
	leave
	ret
ARCHEND(strspn, x86_64_v2)

	.section .note.GNU-stack,"",%progbits