/*-1* Copyright (c) 2023 The FreeBSD Foundation2*3* This software was developed by Robert Clausecker <[email protected]>4* under sponsorship from the FreeBSD Foundation.5*6* Redistribution and use in source and binary forms, with or without7* modification, are permitted provided that the following conditions8* are met:9* 1. Redistributions of source code must retain the above copyright10* notice, this list of conditions and the following disclaimer.11* 2. Redistributions in binary form must reproduce the above copyright12* notice, this list of conditions and the following disclaimer in the13* documentation and/or other materials provided with the distribution.14*15* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND16* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE17* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE18* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE19* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL20* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS21* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)22* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT23* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY24* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF25* SUCH DAMAGE26*/2728#include <machine/asm.h>2930#include "amd64_archlevel.h"3132#define ALIGN_TEXT .p2align 4,0x90 # 16-byte alignment, nop-filled3334.weak rindex35.set rindex, strrchr3637ARCHFUNCS(strrchr)38ARCHFUNC(strrchr, scalar)39ARCHFUNC(strrchr, baseline)40ENDARCHFUNCS(strrchr)4142ARCHENTRY(strrchr, scalar)43mov %edi, %ecx44and $~7, %rdi # align to 8 byte45movzbl %sil, %esi # clear stray high bits46movabs $0x0101010101010101, %r847mov (%rdi), %rax # load first word48imul %r8, %rsi # replicate char 8 times4950/*51* Unaligned input: align to 8 bytes. Then proceed the same52* way as with aligned input, but prevent matches before the53* beginning of the string. This is achieved by oring 0x0154* into each byte of the buffer before the string55*/56shl $3, %ecx57mov %r8, %r1058shl %cl, %r10 # 0x01 where the string is59xor %r8, %r10 # 0x01 where it is not60neg %r8 # negate 01..01 so we can use lea61movabs $0x8080808080808080, %r96263mov %rsi, %rcx64xor %rax, %rcx # str ^ c65or %r10, %rax # ensure str != 0 before string66or %r10, %rcx # ensure str^c != 0 before string67bswap %rcx # in reverse order, to find last match68mov %rdi, %r10 # location of initial mismatch (if any)69xor %r11, %r11 # initial mismatch (none)70add $8, %rdi # advance to next iteration71lea (%rax, %r8, 1), %rdx # str - 0x01..0172not %rax # ~str73and %rdx, %rax # (str - 0x01..01) & ~str74and %r9, %rax # not including junk bits75jnz 1f # end of string?7677lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..0178not %rcx # ~(str ^ c)79and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c)80and %r9, %rcx # not including junk bits81mov %rcx, %r11 # remember mismatch in head82jmp 0f8384/* main loop unrolled twice */85ALIGN_TEXT863: lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..0187not %rcx # ~(str ^ c)88and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c)89and %r9, %rcx # not including junk bits90lea -8(%rdi), %rdx91cmovnz %rdx, %r10 # remember location of current mismatch92cmovnz %rcx, %r1193940: mov (%rdi), %rax # str95mov %rsi, %rcx96xor %rax, %rcx # str ^ c97bswap %rcx # in reverse order, to find last match98lea (%rax, %r8, 1), %rdx # str - 0x01..0199not %rax # ~str100and %rdx, %rax # (str - 0x01..01) & ~str101and %r9, %rax # not including junk bits102jnz 2f # end of string?103104lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01105not %rcx # ~(str ^ c)106and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c)107and %r9, %rcx # not including junk bits108cmovnz %rdi, %r10 # remember location of current mismatch109cmovnz %rcx, %r11110111mov 8(%rdi), %rax # str112add $16, %rdi113mov %rsi, %rcx114xor %rax, %rcx # str ^ c115bswap %rcx116lea (%rax, %r8, 1), %rdx # str - 0x01..01117not %rax # ~str118and %rdx, %rax # (str - 0x01..01) & ~str119and %r9, %rax # not including junk bits120jz 3b # end of string?121122/* NUL found */1231: sub $8, %rdi # undo advance past buffer1242: lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01125not %rcx # ~(str ^ c)126and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c)127and %r9, %rcx # not including junk bits128lea -1(%rax), %rdx129xor %rdx, %rax # mask of bytes in the string130bswap %rdx # in reverse order131and %rdx, %rcx # c found in the tail?132cmovnz %rdi, %r10133cmovnz %rcx, %r11134bswap %r11 # unreverse byte order135bsr %r11, %rcx # last location of c in (R10)136shr $3, %rcx # as byte offset137lea (%r10, %rcx, 1), %rax # pointer to match138test %r11, %r11 # was there actually a match?139cmovz %r11, %rax # if not, return null pointer140ret141ARCHEND(strrchr, scalar)142143ARCHENTRY(strrchr, baseline)144mov %edi, %ecx145and $~0xf, %rdi # align to 16 bytes146movdqa (%rdi), %xmm1147movd %esi, %xmm0148and $0xf, %ecx # offset from alignment149pxor %xmm2, %xmm2150mov $-1, %edx151punpcklbw %xmm0, %xmm0 # c -> cc152shl %cl, %edx # bits corresponding to bytes in the string153punpcklwd %xmm0, %xmm0 # cc -> cccc154xor %r8, %r8 # address of latest match155mov $1, %esi # bit mask of latest match156mov %rdi, %r9 # candidate location for next match157add $16, %rdi # advance to next chunk158159/* check for match in head */160pcmpeqb %xmm1, %xmm2 # NUL byte present?161pshufd $0, %xmm0, %xmm0 # cccc -> cccccccccccccccc162pcmpeqb %xmm0, %xmm1 # c present?163pmovmskb %xmm2, %eax164pmovmskb %xmm1, %ecx165and %edx, %ecx # c present in the string?166and %edx, %eax # NUL present in the string?167jnz .Lend2168169/* main loop unrolled twice */170ALIGN_TEXT1710: movdqa (%rdi), %xmm1172test %ecx, %ecx # was there a match in the last iter.?173cmovnz %r9, %r8 # remember match if any174cmovnz %ecx, %esi175pxor %xmm2, %xmm2176pcmpeqb %xmm1, %xmm2 # NUL byte present?177pcmpeqb %xmm0, %xmm1 # c present?178pmovmskb %xmm2, %eax179pmovmskb %xmm1, %ecx180test %eax, %eax # end of string in first half?181jnz .Lend182183movdqa 16(%rdi), %xmm1184test %ecx, %ecx # was there a match in the last iter.?185cmovnz %rdi, %r8 # remember match if any186cmovnz %ecx, %esi187pxor %xmm2, %xmm2188pcmpeqb %xmm1, %xmm2 # NUL byte present?189pcmpeqb %xmm0, %xmm1 # c present?190pmovmskb %xmm2, %eax191pmovmskb %xmm1, %ecx192lea 16(%rdi), %r9193add $32, %rdi194test %eax, %eax # end of string in second half?195jz 0b196197ALIGN_TEXT198.Lend2: sub $16, %rdi199.Lend: lea -1(%rax), %edx200xor %edx, %eax # mask of bytes in the string201and %eax, %ecx # c found in the tail?202cmovnz %rdi, %r8203cmovnz %ecx, %esi204bsr %esi, %esi # last location of c in (R8)205lea (%r8, %rsi, 1), %rax # pointer to match206ret207ARCHEND(strrchr, baseline)208.section .note.GNU-stack,"",%progbits209210211