/*-
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <[email protected]>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>
#include <machine/param.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */

ARCHFUNCS(strspn)
	ARCHFUNC(strspn, scalar)
	NOARCHFUNC
	ARCHFUNC(strspn, x86_64_v2)
ENDARCHFUNCS(strspn)

ARCHENTRY(strspn, scalar)
	push	%rbp			# align stack to enable function call
	mov	%rsp, %rbp
	sub	$256, %rsp		# allocate space for lookup table

	/* check for special cases */
	movzbl	(%rsi), %edx		# first character in the set
	test	%edx, %edx
	jz	.Lzero			# empty set always returns 0

	movzbl	1(%rsi), %eax		# second character in the set
	test	%eax, %eax
	jz	.Lsingle

	/* no special case matches -- prepare lookup table */
	xor	%r8d, %r8d
	mov	$28, %ecx
0:	mov	%r8, (%rsp, %rcx, 8)
	mov	%r8, 8(%rsp, %rcx, 8)
	mov	%r8, 16(%rsp, %rcx, 8)
	mov	%r8, 24(%rsp, %rcx, 8)
	sub	$4, %ecx
	jnc	0b

	movb	$1, (%rsp, %rdx, 1)	# register first char in set
	add	$2, %rsi

	/* process remaining chars in set */
	ALIGN_TEXT
0:	movb	$1, (%rsp, %rax, 1)	# register previous char
	movzbl	(%rsi), %eax		# next char in set
	test	%eax, %eax		# end of string?
	jz	1f

	movb	$1, (%rsp, %rax, 1)
	add	$2, %rsi
	movzbl	-1(%rsi), %eax
	test	%eax, %eax
	jnz	0b

1:	mov	%rdi, %rax		# a copy of the source to iterate over

	/* find mismatch */
	ALIGN_TEXT
0:	movzbl	(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	2f

	movzbl	1(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	3f

	movzbl	2(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	4f

	movzbl	3(%rax), %ecx
	add	$4, %rax
	cmpb	$0, (%rsp, %rcx, 1)
	jne	0b

	sub	$3, %rax
4:	dec	%rdi
3:	inc	%rax
2:	sub	%rdi, %rax		# number of characters preceding match
	leave
	ret

	/* empty set never matches */
.Lzero:	xor	%eax, %eax
	leave
	ret

	/* find repeated single character */
	ALIGN_TEXT
.Lsingle:
	cmpb	%dl, (%rdi, %rax, 1)
	jne	1f

	cmpb	%dl, 1(%rdi, %rax, 1)
	jne	2f

	cmpb	%dl, 2(%rdi, %rax, 1)
	jne	3f

	cmpb	%dl, 3(%rdi, %rax, 1)
	lea	4(%rax), %rax
	je	.Lsingle

	sub	$3, %rax
3:	inc	%rax
2:	inc	%rax
1:	leave
	ret
ARCHEND(strspn, scalar)
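
/*
 * The scalar kernel above is an assembly rendition of a conventional
 * lookup-table strspn.  For reference, a minimal C sketch of the same
 * algorithm (illustrative only, not the code this was assembled from):
 *
 *	#include <stddef.h>
 *
 *	size_t
 *	strspn(const char *s, const char *set)
 *	{
 *		char table[256] = { 0 };	// 0: not in set, 1: in set
 *		size_t i;
 *
 *		// register every byte of the set in the table
 *		for (; *set != '\0'; set++)
 *			table[(unsigned char)*set] = 1;
 *
 *		// count leading bytes of s that are in the set;
 *		// table[0] == 0, so the NUL terminator stops the scan
 *		for (i = 0; table[(unsigned char)s[i]] != 0; i++)
 *			;
 *
 *		return (i);
 *	}
 *
 * On top of this, the assembly special-cases empty and one-character
 * sets and unrolls both loops four ways.
 */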

/*
 * This kernel uses pcmpistri to do the heavy lifting.
 * We provide three code paths, depending on set size:
 *
 *	 0--16: one pcmpistri per 16 bytes of input
 *	17--32: two pcmpistri per 16 bytes of input
 *	>=33:	fall back to lookup table
 */
ARCHENTRY(strspn, x86_64_v2)
	push	%rbp
	mov	%rsp, %rbp
	sub	$256, %rsp

	/* find set size and copy up to 32 bytes to (%rsp) */
	mov	%esi, %ecx
	and	$~0xf, %rsi		# align set pointer
	movdqa	(%rsi), %xmm0
	pxor	%xmm1, %xmm1
	and	$0xf, %ecx		# amount of bytes rsi is past alignment
	xor	%edx, %edx
	pcmpeqb	%xmm0, %xmm1		# end of string reached?
	movdqa	%xmm0, 32(%rsp)		# transfer head of set to stack
	pmovmskb %xmm1, %eax
	shr	%cl, %eax		# clear out junk before string
	test	%eax, %eax		# end of set reached?
	jnz	0f

	movdqa	16(%rsi), %xmm0		# second chunk of the set
	mov	$16, %edx
	sub	%ecx, %edx		# length of set preceding xmm0
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1
	movdqa	%xmm0, 48(%rsp)
	movdqu	32(%rsp, %rcx, 1), %xmm2 # head of set
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	1f

	movdqa	32(%rsi), %xmm0		# third chunk
	add	$16, %edx
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1
	movdqa	%xmm0, 64(%rsp)
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# still not done?
	jz	.Lgt32v2

0:	movdqu	32(%rsp, %rcx, 1), %xmm2 # head of set
1:	tzcnt	%eax, %eax
	add	%eax, %edx		# length of set (excluding NUL byte)
	cmp	$32, %edx		# above 32 bytes?
	ja	.Lgt32v2

	/*
	 * At this point we know that we want to use pcmpistri.
	 * One last problem remains: the head of the string is not
	 * aligned and may cross a page boundary.  If this is the
	 * case, we take the part before the page boundary and repeat
	 * the last byte to fill up the xmm register.
	 */
	mov	%rdi, %rax		# save original string pointer
	lea	15(%rdi), %esi		# last byte of the head
	xor	%edi, %esi
	test	$PAGE_SIZE, %esi	# does the head cross a page?
	jz	0f

	/* head crosses page: copy to stack to fix up */
	and	$~0xf, %rax		# align head pointer temporarily
	movzbl	15(%rax), %esi		# last head byte on the page
	movdqa	(%rax), %xmm0
	movabs	$0x0101010101010101, %r8
	imul	%r8, %rsi		# repeated 8 times
	movdqa	%xmm0, (%rsp)		# head word on stack
	mov	%rsi, 16(%rsp)		# followed by filler (last byte x8)
	mov	%rsi, 24(%rsp)
	mov	%edi, %eax
	and	$0xf, %eax		# offset of head from alignment
	add	%rsp, %rax		# pointer to fake head

0:	movdqu	(%rax), %xmm1		# load head (fake or real)
	lea	16(%rdi), %rax
	and	$~0xf, %rax		# second 16 bytes of string (aligned)
1:	cmp	$16, %edx		# 16--32 bytes?
	ja	.Lgt16v2

	/* set is 2--16 bytes in size */

	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT|_SIDD_NEGATIVE_POLARITY */
	pcmpistri $0x10, %xmm1, %xmm2	# match in head?
	jc	.Lheadmismatchv2

	ALIGN_TEXT
0:	pcmpistri $0x10, (%rax), %xmm2
	jc	1f			# match or end of string?
	pcmpistri $0x10, 16(%rax), %xmm2
	lea	32(%rax), %rax
	jnc	0b			# match or end of string?

	sub	$16, %rax		# go back to second half
1:	sub	%rdi, %rax		# offset of (%rax) from beginning of string
	add	%rcx, %rax		# prefix length before match/NUL
	leave
	ret

.Lheadmismatchv2:
	mov	%ecx, %eax		# prefix length before mismatch/NUL
	leave
	ret
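
/*
 * The pcmpistri loop above corresponds to the following intrinsics
 * sketch (a hypothetical helper for illustration only; it glosses
 * over the page-crossing head fix-up performed above):
 *
 *	#include <nmmintrin.h>	// SSE4.2
 *	#include <stddef.h>
 *
 *	// same mode bits as the $0x10 immediate used above
 *	#define MODE	(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | \
 *	    _SIDD_LEAST_SIGNIFICANT | _SIDD_NEGATIVE_POLARITY)
 *
 *	static size_t
 *	strspn_le16(const char *s, __m128i set)
 *	{
 *		size_t i = 0;
 *
 *		for (;;) {
 *			__m128i chunk =
 *			    _mm_loadu_si128((const __m128i *)(s + i));
 *
 *			// with negative polarity, CF is set once some
 *			// byte is absent from the set or is the NUL
 *			if (_mm_cmpistrc(set, chunk, MODE))
 *				return (i + _mm_cmpistri(set, chunk, MODE));
 *			i += 16;
 *		}
 *	}
 */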

	/* set is 17--32 bytes in size */
.Lgt16v2:
	movdqu	48(%rsp, %rcx, 1), %xmm3 # second part of set

	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_BIT_MASK|_SIDD_NEGATIVE_POLARITY */
	pcmpistrm $0x10, %xmm1, %xmm2	# any mismatch in first half?
	movdqa	%xmm0, %xmm4
	pcmpistrm $0x10, %xmm1, %xmm3	# any mismatch in the second half?
	ptest	%xmm0, %xmm4		# any entry that doesn't match either?
	jnz	2f

	ALIGN_TEXT
0:	movdqa	(%rax), %xmm1
	pcmpistrm $0x10, %xmm1, %xmm2
	movdqa	%xmm0, %xmm4
	pcmpistrm $0x10, %xmm1, %xmm3
	ptest	%xmm0, %xmm4
	jnz	1f
	movdqa	16(%rax), %xmm1
	add	$32, %rax
	pcmpistrm $0x10, %xmm1, %xmm2
	movdqa	%xmm0, %xmm4
	pcmpistrm $0x10, %xmm1, %xmm3
	ptest	%xmm0, %xmm4
	jz	0b

	sub	$16, %rax
1:	pand	%xmm4, %xmm0
	movd	%xmm0, %ecx
	sub	%rdi, %rax		# offset of %xmm1 from beginning of string
	tzcnt	%ecx, %ecx
	add	%rcx, %rax		# prefix length before match/NUL
	leave
	ret

	/* mismatch or string end in head */
2:	pand	%xmm4, %xmm0		# bit mask of mismatches (end of string counts)
	movd	%xmm0, %eax
	tzcnt	%eax, %eax		# prefix length before mismatch/NUL
	leave
	ret

	/* set is >=33 bytes in size */
.Lgt32v2:
	xorps	%xmm0, %xmm0
	mov	$256-64, %edx

	/* clear out lookup table */
0:	movaps	%xmm0, (%rsp, %rdx, 1)
	movaps	%xmm0, 16(%rsp, %rdx, 1)
	movaps	%xmm0, 32(%rsp, %rdx, 1)
	movaps	%xmm0, 48(%rsp, %rdx, 1)
	sub	$64, %edx
	jnc	0b

	add	%rcx, %rsi		# restore set pointer
	mov	%rdi, %rax		# keep a copy of the string

	/* initialise lookup table */
	movzbl	(%rsi), %ecx		# set is known not to be empty

	ALIGN_TEXT
0:	movb	$1, (%rsp, %rcx, 1)
	movzbl	1(%rsi), %ecx
	test	%ecx, %ecx
	jz	1f

	movb	$1, (%rsp, %rcx, 1)
	movzbl	2(%rsi), %ecx
	test	%ecx, %ecx
	jz	1f

	movb	$1, (%rsp, %rcx, 1)
	movzbl	3(%rsi), %ecx
	add	$4, %rsi
	test	%ecx, %ecx
	jz	1f

	movb	$1, (%rsp, %rcx, 1)
	movzbl	(%rsi), %ecx
	test	%ecx, %ecx
	jnz	0b

	/* find match */
	ALIGN_TEXT
1:	movzbl	(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	2f

	movzbl	1(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	3f

	movzbl	2(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	4f

	movzbl	3(%rax), %ecx
	add	$4, %rax
	cmpb	$0, (%rsp, %rcx, 1)
	jne	1b

	sub	$3, %rax
4:	dec	%rdi
3:	inc	%rax
2:	sub	%rdi, %rax		# number of characters preceding match
	leave
	ret
ARCHEND(strspn, x86_64_v2)

	.section .note.GNU-stack,"",%progbits