/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023, 2025 Robert Clausecker <[email protected]>
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

/* pad to the next 16 byte boundary using 0x90 (nop) bytes */
#define	ALIGN_TEXT	.p2align 4, 0x90

/*
 * Implementations of memrchr provided by this file.  The ARCHFUNCS,
 * ARCHFUNC, ARCHENTRY and ARCHEND macros come from amd64_archlevel.h;
 * presumably they set up selection between the variants listed here --
 * confirm against that header.
 */
ARCHFUNCS(memrchr)
	ARCHFUNC(memrchr, scalar)
	ARCHFUNC(memrchr, baseline)
ENDARCHFUNCS(memrchr)

/*
 * memrchr, scalar variant.
 *
 * In:	%rdi = start of buffer
 *	%sil = byte to search for
 *	%rdx = length of buffer in bytes
 * Out:	%rax = pointer to the last matching byte in the buffer,
 *	       or 0 if no byte matches
 *
 * Only %rax, %rdx and the flags are modified.
 *
 * The buffer is walked backwards from its last byte, four bytes per
 * iteration of the main loop.  Throughout, %rax points to the last
 * byte not yet examined and %rdx holds the number of bytes not yet
 * examined minus 4, so the borrow of "sub $4, %rdx" indicates that
 * fewer than four bytes remain.
 */
ARCHENTRY(memrchr, scalar)
	lea	-1(%rdi, %rdx, 1), %rax	# point to last char in buffer
	sub	$4, %rdx		# 4 bytes left to process?
	jb	.Ltail

	ALIGN_TEXT
0:	cmp	%sil, (%rax)		# match at last entry?
	je	1f

	cmp	%sil, -1(%rax)		# match at second to last entry?
	je	2f

	cmp	%sil, -2(%rax)		# match at third to last entry?
	je	3f

	cmp	%sil, -3(%rax)		# match at fourth to last entry?
	je	4f

	sub	$4, %rax		# step back over the 4 bytes just checked
	sub	$4, %rdx		# another 4 bytes left to process?
	jae	0b

	/*
	 * 0--3 bytes remain, so %rdx is one of -4, -3, -2, -1 and its
	 * low half %edx is sufficient to tell these apart.  Each
	 * unsigned comparison below bails out as soon as the bytes
	 * that remain have all been examined.
	 */
.Ltail:	cmp	$-3, %edx		# at least one character left to process?
	jb	.Lnotfound

	cmp	%sil, (%rax)
	je	1f

	cmp	$-2, %edx		# at least two characters left to process?
	jb	.Lnotfound

	cmp	%sil, -1(%rax)
	je	2f

	cmp	$-1, %edx		# at least three characters left to process?
	jb	.Lnotfound

	cmp	%sil, -2(%rax)
	je	3f

.Lnotfound:
	xor	%eax, %eax		# no match: return null pointer
	ret

	/*
	 * match found -- adjust rax to point to matching byte
	 *
	 * The entry points fall through into each other, so entering
	 * at label N subtracts N - 1 from %rax before returning.
	 */
4:	dec	%rax
3:	dec	%rax
2:	dec	%rax
1:	ret
ARCHEND(memrchr, scalar)

/*
 * memrchr, baseline variant (SSE2 instructions on %xmm registers).
 *
 * In:	%rdi = start of buffer
 *	%esi = byte to search for (only the low byte ends up being used)
 *	%rdx = length of buffer in bytes
 * Out:	%rax = pointer to the last matching byte in the buffer,
 *	       or 0 if no byte matches or the length is zero
 *
 * Modifies %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %xmm0--%xmm2 and
 * the flags.
 *
 * The buffer is processed backwards in 32 byte chunks aligned to 32
 * bytes, so all loads can be aligned loads.  The first and the last
 * chunk processed may extend beyond the buffer; matches in the excess
 * bytes are discarded with bit masks that have one bit per chunk byte.
 *
 * Register use after the setup code:
 *	%rdx  = address of the aligned chunk under examination
 *	%xmm2 = search byte replicated into all 16 byte lanes
 *	%r8d  = mask with bits clear for chunk bytes past the buffer end
 *	%r9d  = mask with bits clear for chunk bytes before the buffer start
 */
ARCHENTRY(memrchr, baseline)
	test		%rdx, %rdx		# empty input?
	je		.Lnomatchb

	/*
	 * Only the low bits of the end pointer are needed as it merely
	 * serves as a shift count below, hence the 32 bit destination.
	 */
	lea		(%rdi, %rdx, 1), %ecx	# pointer to end of buffer
	lea		-1(%rdi, %rdx, 1), %rdx	# pointer to last char in buffer
	movd		%esi, %xmm2
	and		$~0x1f, %rdx		# pointer to final 32 buffer bytes
	movdqa		(%rdx), %xmm0		# load last 32 bytes
	movdqa		16(%rdx), %xmm1

	punpcklbw	%xmm2, %xmm2		# c -> cc

	/*
	 * 32 bit shifts use the low five bits of %cl only, so the
	 * shift counts below are the negated end address and the start
	 * address, each reduced modulo 32.
	 */
	mov		$-1, %r8d
	neg		%ecx
	mov		%r8d, %r9d
	shr		%cl, %r8d		# mask with zeroes after the string

	punpcklwd	%xmm2, %xmm2		# cc -> cccc

	mov		%edi, %ecx
	mov		%r9d, %eax		# all ones: chunk starts inside the buffer
	shl		%cl, %r9d		# mask with zeroes before the string

	pshufd		$0, %xmm2, %xmm2	# cccc -> cccccccccccccccc

	cmp		%rdx, %rdi		# tail is beginning of buffer?
	cmovae		%r9d, %eax		# if yes, do combined head/tail processing
	and		%r8d, %eax		# mask of bytes in tail part of string

	/* process tail */
	pcmpeqb		%xmm2, %xmm1
	pcmpeqb		%xmm2, %xmm0
	pmovmskb	%xmm1, %esi		# matches in upper 16 bytes
	pmovmskb	%xmm0, %ecx		# matches in lower 16 bytes
	shl		$16, %esi
	or		%esi, %ecx		# locations of matches
	and		%ecx, %eax		# any match inside buffer?
	jnz		.Lprecisematchb

	cmp		%rdx, %rdi		# did the buffer begin here?
	jae		.Lnomatchb		# if yes, we are done

	/*
	 * main loop
	 *
	 * The loop is only entered and repeated while %rdi < %rdx, so
	 * the chunk preceding %rdx always holds at least one byte of
	 * the buffer and can be loaded before the bounds check.  If
	 * the buffer starts within that chunk, it is the final one and
	 * is handled with the start mask at .Ltailb.
	 */
	ALIGN_TEXT
0:	movdqa		-32(%rdx), %xmm0	# load previous string chunk
	movdqa		-16(%rdx), %xmm1
	sub		$32, %rdx		# beginning of string reached?
	cmp		%rdx, %rdi
	jae		.Ltailb

	pcmpeqb		%xmm2, %xmm0
	pcmpeqb		%xmm2, %xmm1
	por		%xmm1, %xmm0		# match in either half?
	pmovmskb	%xmm0, %eax
	test		%eax, %eax
	jz		0b

	/*
	 * A match lies somewhere in this chunk.  The por above has
	 * overwritten the comparison result for the lower half in
	 * %xmm0, so it is recomputed from memory.  This destroys the
	 * search pattern in %xmm2, which is no longer needed.  %xmm1
	 * still holds the comparison result for the upper half.
	 */
.Lmatchb:
	pcmpeqb		(%rdx), %xmm2		# redo comparison of first 16 bytes
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm2, %eax
	shl		$16, %ecx
	or		%ecx, %eax		# location of matches

	/* %eax = nonzero match mask for the chunk at %rdx */
.Lprecisematchb:
	bsr		%eax, %eax		# find location of match
	add		%rdx, %rax		# point to matching byte
	ret

	/*
	 * Final chunk holding the start of the buffer.  bsr sets ZF if
	 * its source operand is zero and lea leaves the flags alone,
	 * so the cmovnz acts on the outcome of the bsr.  If there was
	 * no match, %rax keeps the zero left behind by the and.
	 */
.Ltailb:
	pcmpeqb		%xmm2, %xmm1
	pcmpeqb		%xmm2, %xmm0
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm0, %eax
	shl		$16, %ecx
	or		%ecx, %eax		# location of matches
	and		%r9d, %eax		# mask out matches before buffer
	bsr		%eax, %edi		# location of match
	lea		(%rdx, %rdi, 1), %rdx	# pointer to match (if any)
	cmovnz		%rdx, %rax		# point to match if present,
	ret					# else null pointer

.Lnomatchb:
	xor		%eax, %eax		# return null pointer
	ret
ARCHEND(memrchr, baseline)

	.section .note.GNU-stack, "", %progbits