/*-
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <[email protected]>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4,0x90	# 16-byte alignment, nop-filled

	.weak	strchrnul
	.set	strchrnul, __strchrnul

/*
 * Implementations of __strchrnul provided by this file.  The macros
 * come from amd64_archlevel.h; see that header for how one of the
 * listed variants is selected.
 */
ARCHFUNCS(__strchrnul)
	ARCHFUNC(__strchrnul, scalar)
	ARCHFUNC(__strchrnul, baseline)
ENDARCHFUNCS(__strchrnul)

/*
 * strchrnul(str, c)
 *
 * This is implemented like strlen(str), but we check for the
 * presence of both NUL and c in each iteration.
 *
 * In:	%rdi = str, %esi = c (only the low byte is used)
 * Out:	%rax = pointer to the first byte of str that is c or NUL
 *
 * Both variants are leaf functions that touch no memory besides the
 * string and use caller-saved registers only.  They read the string
 * in naturally aligned units, so the first load may include bytes
 * located before str; such bytes are masked out and never reported.
 */

/*
 * Scalar variant: processes 8 bytes per step using the
 * (x - 0x01..01) & ~x & 0x80..80 zero-byte test, applied both to the
 * string word (finds NUL) and to the word xored with c (finds c).
 * Bytes above the lowest real match may be flagged spuriously, but
 * only the lowest flagged byte is ever used.
 *
 * Register roles:
 *	%rdi	aligned read pointer (see labels 1 and 2 below)
 *	%rsi	c replicated into all 8 bytes
 *	%r8	-0x0101010101010101 (after the neg)
 *	%r9	0x8080808080808080
 *	%r10	0x01 in every byte of the first word preceding str
 *	%rax, %rcx, %rdx, %r11	scratch
 */
ARCHENTRY(__strchrnul, scalar)
	mov	%edi, %ecx		# keep the misalignment of str
	and	$~7, %rdi		# align to 8 byte
	movzbl	%sil, %esi		# clear stray high bits
	movabs	$0x0101010101010101, %r8
	mov	(%rdi), %rax		# load first word
	imul	%r8, %rsi		# replicate char 8 times

	/*
	 * Unaligned input: align to 8 bytes.  Then proceed the same
	 * way as with aligned input, but prevent matches before the
	 * beginning of the string.  This is achieved by oring 0x01
	 * into each byte of the buffer before the string: such a
	 * byte is then nonzero, so it neither matches nor borrows
	 * from the byte above it.  For aligned input the mask is 0.
	 */
	shl	$3, %ecx		# misalignment in bits (count taken mod 64)
	mov	%r8, %r10
	add	$8, %rdi		# advance past the head word
	shl	%cl, %r10		# 0x01 where the string is
	xor	%r8, %r10		# 0x01 where it is not
	neg	%r8			# negate 01..01 so we can use lea
	movabs	$0x8080808080808080, %r9

	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	or	%r10, %rax		# str without NUL bytes before it
	or	%r10, %rcx		# (str ^ c) without matches before it
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# keep bit 7 of each byte, drop junk bits
	jnz	1f

	/*
	 * main loop unrolled twice
	 *
	 * On entry to 0, %rdi points to the next unexamined word.
	 */
	ALIGN_TEXT
0:	mov	(%rdi), %rax		# str
	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# keep bit 7 of each byte, drop junk bits
	jnz	2f			# %rdi still points to this word

	mov	8(%rdi), %rax		# str
	add	$16, %rdi		# now 8 bytes past the word just loaded
	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# keep bit 7 of each byte, drop junk bits
	jz	0b

	/*
	 * NUL or c found.  At 1, %rdi is 8 bytes past the word holding
	 * the match; at 2, it points to that word.  %rax is nonzero on
	 * every path to this point, so the bit scan result is the same
	 * on CPUs that execute tzcnt as bsf.
	 */
1:	sub	$8, %rdi		# undo advance past buffer
2:	tzcnt	%rax, %rax		# first NUL or c byte match
	shr	$3, %eax		# scale from bit to byte index
	add	%rdi, %rax		# pointer to found c or NUL
	ret
ARCHEND(__strchrnul, scalar)

/*
 * Baseline (SSE2) variant: processes 16 bytes per step.
 *
 * Register roles:
 *	%rdi	16-byte aligned read pointer (see labels 1 and 2 below)
 *	%xmm0	c replicated into all 16 bytes
 *	%xmm1	current chunk, then the combined match bytes
 *	%xmm2	NUL comparison result
 *	%edx	head only: bit i set if byte i of the chunk is part of str
 *	%eax	one match bit per byte of the chunk
 */
ARCHENTRY(__strchrnul, baseline)
	mov	%edi, %ecx		# keep the misalignment of str
	and	$~0xf, %rdi		# align to 16 byte
	movdqa	(%rdi), %xmm1		# load head chunk
	movd	%esi, %xmm0
	and	$0xf, %ecx		# distance from (%rdi) to start of string
	pxor	%xmm2, %xmm2
	mov	$-1, %edx
	punpcklbw %xmm0, %xmm0		# c -> cc
	shl	%cl, %edx		# bits corresponding to bytes in the string
	punpcklwd %xmm0, %xmm0		# cc -> cccc
	add	$16, %rdi		# advance past the head chunk

	/* check for match in head */
	pcmpeqb	%xmm1, %xmm2		# NUL bytes present?
	pshufd	$0, %xmm0, %xmm0	# cccc -> cccccccccccccccc
	pcmpeqb	%xmm0, %xmm1		# c present?
	por	%xmm2, %xmm1		# either present?
	pmovmskb %xmm1, %eax
	and	%edx, %eax		# match in the string?
	jnz	1f

	/*
	 * main loop unrolled twice
	 *
	 * On entry to 0, %rdi points to the next unexamined chunk.
	 */
	ALIGN_TEXT
0:	movdqa	(%rdi), %xmm1
	pxor	%xmm2, %xmm2
	pcmpeqb	%xmm1, %xmm2		# NUL bytes present?
	pcmpeqb	%xmm0, %xmm1		# c present?
	por	%xmm2, %xmm1		# either present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# match in the string?
	jnz	2f			# %rdi still points to this chunk

	movdqa	16(%rdi), %xmm1
	add	$32, %rdi		# now 16 bytes past the chunk just loaded
	pxor	%xmm2, %xmm2
	pcmpeqb	%xmm1, %xmm2		# NUL bytes present?
	pcmpeqb	%xmm0, %xmm1		# c present?
	por	%xmm2, %xmm1		# either present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# match in the string?
	jz	0b

	/*
	 * NUL or c found.  At 1, %rdi is 16 bytes past the chunk holding
	 * the match; at 2, it points to that chunk.  %eax is nonzero on
	 * every path to this point, so the bit scan result is the same
	 * on CPUs that execute tzcnt as bsf.
	 */
1:	sub	$16, %rdi		# undo advance past buffer
2:	tzcnt	%eax, %eax		# where is the match?
	add	%rdi, %rax		# pointer to found c or NUL
	ret
ARCHEND(__strchrnul, baseline)

	.section .note.GNU-stack,"",%progbits