/*-
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <[email protected]>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */

/*
 * memchr is exported as a weak alias of __memchr.  Two implementations
 * of __memchr are registered below: "scalar", which uses only general
 * purpose registers, and "baseline", which uses SSE2 vector instructions.
 *
 * NOTE(review): how one of the two is selected is decided by the
 * ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS macros from amd64_archlevel.h, which
 * are not visible in this file.
 *
 * Syntax: AT&T (GAS), preprocessed by cpp.  Do not start a line with a
 * '#' comment; cpp would treat it as a directive.
 */
	.weak	memchr
	.set	memchr, __memchr
ARCHFUNCS(__memchr)
	ARCHFUNC(__memchr, scalar)
	ARCHFUNC(__memchr, baseline)
ENDARCHFUNCS(__memchr)

/*
 * void *memchr(const void *buf, int c, size_t len) -- scalar variant
 *
 * In:   %rdi = buf, %sil = c (higher bits of %esi are ignored),
 *       %rdx = len
 * Out:  %rax = address of the first byte equal to c, or 0 if there is none
 * Uses: %rcx, %r8, %r9, %r10, %r11, flags.  No stack use, no calls.
 *
 * The buffer is scanned in aligned 8-byte words.  Each word is XORed with
 * c replicated into every byte, so matching bytes become 0x00, and zero
 * bytes are then located with (x - 0x01..01) & ~x & 0x80..80.  That test
 * can report spurious hits only in bytes above a genuine zero byte, which
 * is harmless because only the lowest set bit is used.
 *
 * Because whole aligned words are loaded, bytes before buf (in the first
 * word) and at or after buf + len (in the last word) may be read.  They
 * are neutralised with a head mask and a tail mask respectively.
 *
 * Register roles after setup:
 *   %rdi  address just past the word currently held in %rax
 *   %rdx  end of buffer (buf + len), clamped to all-ones if the sum wraps
 *   %rsi  c replicated into all 8 bytes
 *   %r8   -0x0101010101010101 (so that lea performs the subtraction)
 *   %r9   0x8080808080808080
 *   %r10  head mask: 0x01 in every byte preceding buf in the first word
 *   %r11  tail mask: 0x80 in every byte of the last word that is in bounds
 */
ARCHENTRY(__memchr, scalar)
	test	%rdx, %rdx		# empty input?
	je	.Lnomatch

	lea	(, %rdi, 8), %ecx	# cl = 8 * (buf & 7) once reduced mod 64
	mov	$-1, %rax
	add	%rdi, %rdx		# pointer to end of buffer or to end of
	cmovc	%rax, %rdx		# address space (whichever comes first)
	and	$~7, %rdi		# align to 8 bytes
	mov	(%rdi), %rax		# load first word
	movzbl	%sil, %esi		# clear stray high bits
	movabs	$0x0101010101010101, %r8
	imul	%r8, %rsi		# replicate char 8 times

	/* compute head and tail masks */
	mov	%r8, %r10
	movabs	$0x8080808080808080, %r9
	shl	%cl, %r10		# 0x01 where string head is
	lea	(, %rdx, 8), %ecx	# 8 * end; negated below for the tail shift
	xor	%r8, %r10		# 0x01 where it is not
	neg	%r8			# negate 01..01 so we can use lea
	mov	%r9, %r11
	xor	%rsi, %rax		# str ^ c (0x00 where str[i] == c)
	neg	%ecx			# cl = 8 * (-end & 7) once reduced mod 64
	or	%r10, %rax		# except before the string
	shr	%cl, %r11		# 0x80 where string tail is

	add	$8, %rdi		# advance to next 8 bytes
	cmp	%rdx, %rdi		# end of buffer reached during head?
	jae	.Ltail			# if so, the first word is also the last

	/* main loop, unrolled twice */
	ALIGN_TEXT
0:	lea	(%rax, %r8, 1), %rcx	# (str ^ c) - 0x01..01
	not	%rax			# ~(str ^ c)
	and	%r9, %rax		# ~(str ^ c) & 0x80..80
	and	%rcx, %rax		# nonzero iff some byte of the word is c
	jnz	.Lmatch

	mov	(%rdi), %rax		# fetch next word
	add	$8, %rdi
	xor	%rsi, %rax		# str ^ c
	cmp	%rdx, %rdi		# was that the last word?
	jae	.Ltail

	lea	(%rax, %r8, 1), %rcx	# (str ^ c) - 0x01..01
	not	%rax			# ~(str ^ c)
	and	%r9, %rax		# ~(str ^ c) & 0x80..80
	and	%rcx, %rax		# nonzero iff some byte of the word is c
	jnz	.Lmatch

	mov	(%rdi), %rax		# fetch next word
	add	$8, %rdi
	xor	%rsi, %rax		# str ^ c
	cmp	%rdx, %rdi		# more full words to go?
	jb	0b

	/* last word: same test, but restricted to in-bounds bytes by %r11 */
.Ltail:
	lea	(%rax, %r8, 1), %rcx	# (str ^ c) - 0x01..01
	not	%rax			# ~(str ^ c)
	and	%r11, %rax		# ~(str ^ c) & tail mask
	and	%rcx, %rax		# matches, not including bytes past buffer
	jz	.Lnomatch

	/* %rax is nonzero here; its lowest set bit marks the first match */
.Lmatch:
	tzcnt	%rax, %rax		# first match
	shr	$3, %eax		# scale from bit to byte index
	lea	-8(%rdi, %rax), %rax	# pointer to found c
	ret

	/* no match found */
.Lnomatch:
	xor	%eax, %eax		# return null pointer
	ret
ARCHEND(__memchr, scalar)

/*
 * void *memchr(const void *buf, int c, size_t len) -- baseline (SSE2) variant
 *
 * In:   %rdi = buf, %esi = c (only the low byte ends up in the pattern),
 *       %rdx = len
 * Out:  %rax = address of the first byte equal to c, or 0 if there is none
 * Uses: %rcx, %rdx, %rsi, %r8, %r9, %xmm0, %xmm1, %xmm2, flags.
 *       No stack use, no calls.
 *
 * The buffer is scanned in 32-byte aligned chunks, each loaded as two
 * aligned 16-byte vectors.  pcmpeqb/pmovmskb turn a chunk into a 32-bit
 * mask with bit i set iff byte i of the chunk equals c.  As in the scalar
 * variant, bytes outside [buf, buf + len) within the first and last chunk
 * are read but masked out.
 *
 * Register roles after setup:
 *   %rdi  address just past the chunk currently being examined
 *   %rdx  end of buffer (buf + len), clamped to all-ones if the sum wraps
 *   %xmm2 c replicated into all 16 bytes
 *   %r9d  head mask: zero bits for the chunk bytes that precede buf
 *   %r8d  tail mask: one bits for the bytes of the last chunk in bounds
 */
ARCHENTRY(__memchr, baseline)
	test		%rdx, %rdx		# empty input?
	je		.Lnomatchb

	movd		%esi, %xmm2
	mov		%edi, %ecx		# low bits of buf: head shift count
	mov		$-1, %r9
	add		%rdi, %rdx		# pointer to end of buffer or to end of
	cmovc		%r9, %rdx		# address space (whichever comes first)
	and		$~0x1f, %rdi		# align to 32 bytes
	movdqa		(%rdi), %xmm0		# load first 32 bytes
	movdqa		16(%rdi), %xmm1

	punpcklbw	%xmm2, %xmm2		# c -> cc

	shl		%cl, %r9d		# mask with zeroes before the string

	punpcklwd	%xmm2, %xmm2		# cc -> cccc

	mov		$-1, %r8d
	xor		%ecx, %ecx
	sub		%edx, %ecx		# ecx = -edx (shift uses its low 5 bits)
	shr		%cl, %r8d		# bytes in tail that are part of the buffer

	pshufd		$0, %xmm2, %xmm2	# cccc -> cccccccccccccccc

	add		$32, %rdi		# advance to next 32 bytes
	mov		$-1, %eax
	cmp		%rdx, %rdi		# end of buffer reached during head?
	cmovae		%r8d, %eax		# if yes, do combined head/tail processing
	and		%r9d, %eax		# mask of bytes in head part of string

	/* process head */
	pcmpeqb		%xmm2, %xmm1
	pcmpeqb		%xmm2, %xmm0
	pmovmskb	%xmm1, %esi
	pmovmskb	%xmm0, %ecx
	shl		$16, %esi
	or		%esi, %ecx		# locations of matches
	and		%ecx, %eax		# any match inside buffer?
	jnz		.Lprecisematchb

	cmp		%rdx, %rdi		# did the buffer end here?
	jae		.Lnomatchb		# if yes we are done

	/* main loop */
	ALIGN_TEXT
0:	movdqa		(%rdi), %xmm0		# load next string chunk
	movdqa		16(%rdi), %xmm1
	add		$32, %rdi
	cmp		%rdx, %rdi		# was this the final chunk?
	jae		.Ltailb			# if yes, it needs the tail mask

	pcmpeqb		%xmm2, %xmm0
	pcmpeqb		%xmm2, %xmm1
	por		%xmm1, %xmm0		# match in either half?
	pmovmskb	%xmm0, %eax
	test		%eax, %eax
	jz		0b

	/*
	 * A full chunk contains a match.  %xmm1 still holds the comparison
	 * result for the upper 16 bytes, but por overwrote the one for the
	 * lower 16 bytes, so that half is compared again.  This destroys the
	 * pattern in %xmm2, which is no longer needed.
	 */
.Lmatchb:
	pcmpeqb		-32(%rdi), %xmm2	# redo comparison of first 16 bytes
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm2, %eax
	shl		$16, %ecx
	or		%ecx, %eax		# location of matches

	/* %eax is nonzero here; its lowest set bit marks the first match */
.Lprecisematchb:
	tzcnt		%eax, %eax		# find location of match
	lea		-32(%rdi, %rax, 1), %rax # point to matching byte
	ret

	/*
	 * Final chunk.  If no in-bounds byte matches, %eax ends up zero, bsf
	 * sets ZF, and the cmovnz below leaves %rax = 0 as the return value.
	 * lea does not modify flags, so ZF from bsf is still valid there.
	 */
.Ltailb:
	pcmpeqb		%xmm2, %xmm1
	pcmpeqb		%xmm2, %xmm0
	pmovmskb	%xmm1, %edx
	pmovmskb	%xmm0, %eax
	shl		$16, %edx
	or		%edx, %eax		# location of matches
	and		%r8d, %eax		# mask out matches beyond buffer
	bsf		%eax, %edx		# location of match
	lea		-32(%rdi, %rdx, 1), %rdx # pointer to match (if any)
	cmovnz		%rdx, %rax		# point to match if present,
	ret					# else null pointer

.Lnomatchb:
	xor		%eax, %eax		# return null pointer
	ret
ARCHEND(__memchr, baseline)

	.section .note.GNU-stack,"",%progbits