/*1* Copyright (c) 2023 The FreeBSD Foundation2*3* This software was developed by Robert Clausecker <[email protected]>4* under sponsorship from the FreeBSD Foundation.5*6* Redistribution and use in source and binary forms, with or without7* modification, are permitted provided that the following conditions8* are met:9* 1. Redistributions of source code must retain the above copyright10* notice, this list of conditions and the following disclaimer.11* 2. Redistributions in binary form must reproduce the above copyright12* notice, this list of conditions and the following disclaimer in the13* documentation and/or other materials provided with the distribution.14*15* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND16* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE17* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE18* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE19* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL20* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS21* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)22* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT23* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY24* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF25* SUCH DAMAGE26*/2728#include <machine/asm.h>29#include <machine/param.h>3031#include "amd64_archlevel.h"3233#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */3435.weak strcspn36.set strcspn, __strcspn37ARCHFUNCS(__strcspn)38ARCHFUNC(__strcspn, scalar)39NOARCHFUNC40ARCHFUNC(__strcspn, x86_64_v2)41ENDARCHFUNCS(__strcspn)4243ARCHENTRY(__strcspn, scalar)44push %rbp # align stack to enable function call45mov %rsp, %rbp46sub $256, %rsp # allocate space for lookup table4748/* check for special cases */49movzbl (%rsi), %eax # first character in the set50test %eax, %eax51jz .Lstrlen5253movzbl 1(%rsi), %edx # second character in the set54test %edx, %edx55jz .Lstrchr5657/* no special case matches -- prepare lookup table */58xor %r8d, %r8d59mov $28, %ecx600: mov %r8, (%rsp, %rcx, 8)61mov %r8, 8(%rsp, %rcx, 8)62mov %r8, 16(%rsp, %rcx, 8)63mov %r8, 24(%rsp, %rcx, 8)64sub $4, %ecx65jnc 0b6667add $2, %rsi68movb $1, (%rsp, %rax, 1) # register first chars in set69movb $1, (%rsp, %rdx, 1)70mov %rdi, %rax # a copy of the source to iterate over7172/* process remaining chars in set */73ALIGN_TEXT740: movzbl (%rsi), %ecx75movb $1, (%rsp, %rcx, 1)76test %ecx, %ecx77jz 1f7879movzbl 1(%rsi), %ecx80movb $1, (%rsp, %rcx, 1)81test %ecx, %ecx82jz 1f8384add $2, %rsi85jmp 0b8687/* find match */88ALIGN_TEXT891: movzbl (%rax), %ecx90cmpb $0, (%rsp, %rcx, 1)91jne 2f9293movzbl 1(%rax), %ecx94cmpb $0, (%rsp, %rcx, 1)95jne 3f9697movzbl 2(%rax), %ecx98cmpb $0, (%rsp, %rcx, 1)99jne 4f100101movzbl 3(%rax), %ecx102add $4, %rax103cmpb $0, (%rsp, %rcx, 1)104je 1b105106sub $3, %rax1074: dec %rdi1083: inc %rax1092: sub %rdi, %rax # number of characters preceding match110leave111ret112113/* set is empty, degrades to strlen */114.Lstrlen:115leave116jmp CNAME(strlen)117118/* just one character in set, degrades to strchr */119.Lstrchr:120mov %rdi, (%rsp) # stash a copy of the string121mov %eax, %esi # find the character in the set122call CNAME(strchrnul)123sub (%rsp), %rax # length of prefix before match124leave125ret126ARCHEND(__strcspn, scalar)127128/*129* This kernel uses pcmpistri to do the heavy lifting.130* We provide five code paths, depending on set size:131*132* 0: call strlen()133* 1: call strchr()134* 2--16: one pcmpistri per 16 bytes of input135* 17--32: two pcmpistri per 16 bytes of input136* >=33: fall back to look up table137*/138ARCHENTRY(__strcspn, x86_64_v2)139push %rbp140mov %rsp, %rbp141sub $256, %rsp142143/* check for special cases */144movzbl (%rsi), %eax145test %eax, %eax # empty string?146jz .Lstrlenv2147148cmpb $0, 1(%rsi) # single character string?149jz .Lstrchrv2150151/* find set size and copy up to 32 bytes to (%rsp) */152mov %esi, %ecx153and $~0xf, %rsi # align set pointer154movdqa (%rsi), %xmm0155pxor %xmm1, %xmm1156and $0xf, %ecx # amount of bytes rsi is past alignment157xor %edx, %edx158pcmpeqb %xmm0, %xmm1 # end of string reached?159movdqa %xmm0, 32(%rsp) # transfer head of set to stack160pmovmskb %xmm1, %eax161shr %cl, %eax # clear out junk before string162test %eax, %eax # end of set reached?163jnz 0f164165movdqa 16(%rsi), %xmm0 # second chunk of the set166mov $16, %edx167sub %ecx, %edx # length of set preceding xmm0168pxor %xmm1, %xmm1169pcmpeqb %xmm0, %xmm1170movdqa %xmm0, 48(%rsp)171movdqu 32(%rsp, %rcx, 1), %xmm2 # head of set172pmovmskb %xmm1, %eax173test %eax, %eax174jnz 1f175176movdqa 32(%rsi), %xmm0 # third chunk177add $16, %edx178pxor %xmm1, %xmm1179pcmpeqb %xmm0, %xmm1180movdqa %xmm0, 64(%rsp)181pmovmskb %xmm1, %eax182test %eax, %eax # still not done?183jz .Lgt32v21841850: movdqu 32(%rsp, %rcx, 1), %xmm2 # head of set1861: tzcnt %eax, %eax187add %eax, %edx # length of set (excluding NUL byte)188cmp $32, %edx # above 32 bytes?189ja .Lgt32v2190191/*192* At this point we know that we want to use pcmpistri.193* one last problem obtains: the head of the string is not194* aligned and may cross a cacheline. If this is the case,195* we take the part before the page boundary and repeat the196* last byte to fill up the xmm register.197*/198mov %rdi, %rax # save original string pointer199lea 15(%rdi), %esi # last byte of the head200xor %edi, %esi201test $PAGE_SIZE, %esi # does the head cross a page?202jz 0f203204/* head crosses page: copy to stack to fix up */205and $~0xf, %rax # align head pointer temporarily206movzbl 15(%rax), %esi # last head byte on the page207movdqa (%rax), %xmm0208movabs $0x0101010101010101, %r8209imul %r8, %rsi # repeated 8 times210movdqa %xmm0, (%rsp) # head word on stack211mov %rsi, 16(%rsp) # followed by filler (last byte x8)212mov %rsi, 24(%rsp)213mov %edi, %eax214and $0xf, %eax # offset of head from alignment215add %rsp, %rax # pointer to fake head2162170: movdqu (%rax), %xmm0 # load head (fake or real)218lea 16(%rdi), %rax219and $~0xf, %rax # second 16 bytes of string (aligned)2201: cmp $16, %edx # 16--32 bytes?221ja .Lgt16v2222223224/* set is 2--16 bytes in size */225226/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT */227pcmpistri $0, %xmm0, %xmm2 # match in head?228jbe .Lheadmatchv2229230ALIGN_TEXT2310: pcmpistri $0, (%rax), %xmm2232jbe 1f # match or end of string?233pcmpistri $0, 16(%rax), %xmm2234lea 32(%rax), %rax235ja 0b # match or end of string?2362373: lea -16(%rax), %rax # go back to second half2381: jc 2f # jump if match found239movdqa (%rax), %xmm0 # reload string piece240pxor %xmm1, %xmm1241pcmpeqb %xmm1, %xmm0 # where is the NUL byte?242pmovmskb %xmm0, %ecx243tzcnt %ecx, %ecx # location of NUL byte in (%rax)2442: sub %rdi, %rax # offset of %xmm0 from beginning of string245add %rcx, %rax # prefix length before match/NUL246leave247ret248249.Lheadmatchv2:250jc 2f # jump if match found251pxor %xmm1, %xmm1252pcmpeqb %xmm1, %xmm0253pmovmskb %xmm0, %ecx254tzcnt %ecx, %ecx # location of NUL byte2552: mov %ecx, %eax # prefix length before match/NUL256leave257ret258259/* match in first set half during head */260.Lheadmatchv2first:261mov %ecx, %eax262pcmpistri $0, %xmm0, %xmm3 # match in second set half?263cmp %ecx, %eax # before the first half match?264cmova %ecx, %eax # use the earlier match265leave266ret267268.Lgt16v2:269movdqu 48(%rsp, %rcx, 1), %xmm3 # second part of set270271/* set is 17--32 bytes in size */272pcmpistri $0, %xmm0, %xmm2 # match in first set half?273jb .Lheadmatchv2first274pcmpistri $0, %xmm0, %xmm3 # match in second set half or end of string?275jbe .Lheadmatchv2276277ALIGN_TEXT2780: movdqa (%rax), %xmm0279pcmpistri $0, %xmm0, %xmm2280jb 4f # match in first set half?281pcmpistri $0, %xmm0, %xmm3282jbe 1f # match in second set half or end of string?283movdqa 16(%rax), %xmm0284add $32, %rax285pcmpistri $0, %xmm0, %xmm2286jb 3f # match in first set half?287pcmpistri $0, %xmm0, %xmm3288ja 0b # neither match in 2nd half nor string end?289290/* match in second half or NUL */291lea -16(%rax), %rax # go back to second half2921: jc 2f # jump if match found293pxor %xmm1, %xmm1294pcmpeqb %xmm1, %xmm0 # where is the NUL byte?295pmovmskb %xmm0, %ecx296tzcnt %ecx, %ecx # location of NUL byte in (%rax)2972: sub %rdi, %rax # offset of %xmm0 from beginning of string298add %rcx, %rax # prefix length before match/NUL299leave300ret301302/* match in first half */3033: sub $16, %rax # go back to second half3044: sub %rdi, %rax # offset of %xmm0 from beginning of string305mov %ecx, %edx306pcmpistri $0, %xmm0, %xmm3 # match in second set half?307cmp %ecx, %edx # before the first half match?308cmova %ecx, %edx # use the earlier match309add %rdx, %rax # return full ofset310leave311ret312313/* set is empty, degrades to strlen */314.Lstrlenv2:315leave316jmp CNAME(strlen)317318/* just one character in set, degrades to strchr */319.Lstrchrv2:320mov %rdi, (%rsp) # stash a copy of the string321mov %eax, %esi # find this character322call CNAME(strchrnul)323sub (%rsp), %rax # length of prefix before match324leave325ret326327/* set is >=33 bytes in size */328.Lgt32v2:329xorps %xmm0, %xmm0330mov $256-64, %edx331332/* clear out look up table */3330: movaps %xmm0, (%rsp, %rdx, 1)334movaps %xmm0, 16(%rsp, %rdx, 1)335movaps %xmm0, 32(%rsp, %rdx, 1)336movaps %xmm0, 48(%rsp, %rdx, 1)337sub $64, %edx338jnc 0b339340add %rcx, %rsi # restore string pointer341mov %rdi, %rax # keep a copy of the string342343/* initialise look up table */344ALIGN_TEXT3450: movzbl (%rsi), %ecx346movb $1, (%rsp, %rcx, 1)347test %ecx, %ecx348jz 1f349350movzbl 1(%rsi), %ecx351movb $1, (%rsp, %rcx, 1)352test %ecx, %ecx353jz 1f354355movzbl 2(%rsi), %ecx356movb $1, (%rsp, %rcx, 1)357test %ecx, %ecx358jz 1f359360movzbl 3(%rsi), %ecx361movb $1, (%rsp, %rcx, 1)362test %ecx, %ecx363jz 1f364365add $4, %rsi366jmp 0b367368/* find match */369ALIGN_TEXT3701: movzbl (%rax), %ecx371cmpb $0, (%rsp, %rcx, 1)372jne 2f373374movzbl 1(%rax), %ecx375cmpb $0, (%rsp, %rcx, 1)376jne 3f377378movzbl 2(%rax), %ecx379cmpb $0, (%rsp, %rcx, 1)380jne 4f381382movzbl 3(%rax), %ecx383add $4, %rax384cmpb $0, (%rsp, %rcx, 1)385je 1b386387sub $3, %rax3884: dec %rdi3893: inc %rax3902: sub %rdi, %rax # number of characters preceding match391leave392ret393ARCHEND(__strcspn, x86_64_v2)394395.section .note.GNU-stack,"",%progbits396397398