/*
 * Copyright (c) 2023, 2024 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <[email protected]>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

/* pad to a 16 byte boundary with NOP (0x90) bytes */
#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * void *memccpy(void *dest, const void *src, int c, size_t len)
 *
 * Syntax:  AT&T / GAS, preprocessed by cpp
 * ABI:     System V AMD64
 * In:      rdi = dest, rsi = src, edx = c, rcx = len
 * Out:     rax = pointer just past the copy of c in dest, or NULL if
 *          c does not occur in the first len bytes of src
 *
 * memccpy is a weak alias of __memccpy.  The ARCHFUNCS table lists two
 * variants, scalar and baseline.
 * NOTE(review): the ARCH* macros come from amd64_archlevel.h, which is
 * not part of this file; presumably they select one variant at run
 * time -- confirm against that header.
 */
	.weak memccpy
	.set memccpy, __memccpy
ARCHFUNCS(__memccpy)
	ARCHFUNC(__memccpy, scalar)
	ARCHFUNC(__memccpy, baseline)
ENDARCHFUNCS(__memccpy)

/*
 * Scalar variant: locate c with __memchr, then let memcpy move the bytes.
 *
 * Stack: rbp frame plus four pushes.  Together with the return address
 * this leaves rsp 16 byte aligned at the __memchr call; two pops later
 * it is 16 byte aligned again at the memcpy call.
 * rbx (callee-saved) carries len across the first call and the return
 * value across the second.
 */
ARCHENTRY(__memccpy, scalar)
	push	%rbp			# set up frame pointer
	mov	%rsp, %rbp
	push	%rax			# padding only: keeps rsp aligned at the calls
	push	%rbx			# callee-saved, used as scratch below
	push	%rdi			# dest, needed again for memcpy
	push	%rsi			# src, needed again for memcpy

	mov	%rcx, %rbx		# rbx = len, survives the call
	mov	%rsi, %rdi		# arg 1 = src
	mov	%edx, %esi		# arg 2 = c
	mov	%rcx, %rdx		# arg 3 = len
	call	CNAME(__memchr)		# rax = position of c in src or NULL

	pop	%rsi			# rsi = src
	pop	%rdi			# rdi = dest
	lea	1(%rax), %rdx
	sub	%rsi, %rdx		# rdx = rax - src + 1 (used only if c was found)
	mov	%rbx, %rcx		# rcx = len
	lea	(%rdi, %rdx), %rbx	# rbx = dest + rdx (used only if c was found)
	test	%rax, %rax		# c not found?
	cmovz	%rcx, %rdx		# then copy all len bytes ...
	cmovz	%rax, %rbx		# ... and return NULL (rax is 0 here)
	call	CNAME(memcpy)

	mov	%rbx, %rax		# hand back the result computed above
	pop	%rbx
	leave
	ret
ARCHEND(__memccpy, scalar)

/*
 * Baseline variant: SSE2 search and copy in 16 byte chunks.
 *
 * Leaf routine, no stack use, only caller-saved registers are written.
 *
 * Register roles after the setup sequence:
 *   xmm4  c replicated into all 16 bytes
 *   r9    original (possibly unaligned) src
 *   rsi   src rounded down to a 16 byte boundary; later the chunk cursor
 *   rcx   src & 15, the misalignment of src within its first chunk
 *   rdx   len - 1; later a remaining-length counter or a copy length - 1
 *   r11   misalignment + len - 1 - 32
 *   r8d   byte match mask of the first aligned chunk
 *   eax   mask with bits set for chunk bytes at or after src
 *   rdi   dest; inside the main loop it holds dest expressed as a
 *         distance from rsi
 *
 * All source loads that may extend past the requested bytes are aligned
 * 16 byte loads of a chunk that holds at least one byte of the buffer.
 *
 * Every tzcnt operates on a mask that is non-zero at that point (it
 * follows either a non-zero test or a bts).
 * NOTE(review): presumably this is what makes tzcnt acceptable in the
 * baseline variant on CPUs without BMI1 -- confirm.
 */
ARCHENTRY(__memccpy, baseline)
	sub		$1, %rcx		# rcx = len - 1, index of the last buffer byte
	jb		.L0			# borrow means len was 0

	movd		%edx, %xmm4
	mov		%rcx, %rdx		# rdx = len - 1
	punpcklbw	%xmm4, %xmm4		# c -> cc
	mov		%esi, %ecx
	punpcklwd	%xmm4, %xmm4		# cc -> cccc
	mov		%rsi, %r9		# r9 = unaligned src for the short copies
	pshufd		$0, %xmm4, %xmm4	# cccc -> c in every byte of xmm4
	and		$-16, %rsi		# rsi = src rounded down to 16 bytes
	movdqa		%xmm4, %xmm1
	pcmpeqb		(%rsi), %xmm1		# compare first aligned chunk with c
	and		$0xf, %ecx		# rcx = misalignment of src (0--15)
	mov		$-1, %eax
	pmovmskb	%xmm1, %r8d		# r8d = match mask of first chunk
	lea		-32(%rcx), %r11
	shl		%cl, %eax		# eax = mask of chunk bytes at or after src
	add		%rdx, %r11		# r11 = misalignment + len - 1 - 32
	jnc		.Lrunt			# no carry: buffer ends within the first two chunks

	and		%r8d, %eax		# ignore matches in front of src
	jz		.Lchunk2		# nothing found in the first chunk

	/* c is in the first chunk */
	tzcnt		%eax, %edx		# chunk offset of c
	sub		%ecx, %edx		# rdx = index of c relative to src
	lea		1(%rdi, %rdx), %rax	# result: just past c in dest
	jmp		.L0116

.Lchunk2:
	movdqa		16(%rsi), %xmm3		# xmm3 = second aligned chunk
	movdqu		(%r9), %xmm2		# xmm2 = first 16 bytes at src
	movdqa		%xmm4, %xmm1
	pcmpeqb		%xmm3, %xmm1		# compare second chunk with c

	/* evaluate the second chunk */
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jz		.Lprep

	/* c is in the second chunk */
	tzcnt		%eax, %edx		# offset of c within the second chunk
	sub		$16, %ecx
	sub		%ecx, %edx		# rdx = offset + 16 - misalignment = index of c
	lea		1(%rdi, %rdx), %rax	# result: just past c in dest
	jmp		.L0132

	/* no c in the first two chunks: store them and set up the loop */
.Lprep:
	movdqa		32(%rsi), %xmm0		# xmm0 = third aligned chunk
	movdqa		%xmm4, %xmm1
	movdqu		%xmm2, (%rdi)		# store the unaligned head
	sub		%rcx, %rdi		# make rdi line up with the aligned rsi
	mov		%r11, %rdx		# rdx = offset of last buffer byte in third chunk
	movdqu		%xmm3, 16(%rdi)		# store the second chunk
	sub		%rsi, %rdi		# rdi = dest expressed as distance from rsi
	add		$32, %rsi		# rsi now addresses the chunk held in xmm0
	sub		$16, %rdx		# does the buffer reach past this chunk?
	jb		.Ltail_undo

	/*
	 * Main loop, two chunks per iteration.  On entry xmm0 holds the
	 * chunk at rsi, xmm1 holds the pattern, and the last buffer byte
	 * lies rdx + 16 bytes after rsi.
	 */
	ALIGN_TEXT
.Lloop:	pcmpeqb		%xmm0, %xmm1		# compare chunk with c
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		.Lfound

	movdqu		%xmm0, (%rsi, %rdi)	# store chunk into dest
	movdqa		16(%rsi), %xmm0		# fetch the following chunk
	movdqa		%xmm4, %xmm1
	cmp		$16, %rdx		# does the buffer end within that chunk?
	jb		.Ltail

	add		$32, %rsi		# step rsi two chunks ahead
	pcmpeqb		%xmm0, %xmm1		# compare chunk with c
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		.Lfound_undo

	movdqu		%xmm0, -16(%rsi, %rdi)	# store chunk into dest
	movdqa		(%rsi), %xmm0		# fetch the following chunk
	movdqa		%xmm4, %xmm1
	sub		$32, %rdx
	jae		.Lloop

.Ltail_undo:
	sub		$16, %rsi		# rsi back to one chunk before the one in xmm0
	add		$16, %edx		# edx = offset of last buffer byte in that chunk

	/*
	 * The buffer ends inside the chunk held in xmm0 (1--16 bytes of it
	 * belong to the buffer) and c has not been seen so far.
	 * rsi addresses the chunk before it.
	 */
.Ltail:	pcmpeqb		%xmm1, %xmm0		# compare final chunk with c
	pmovmskb	%xmm0, %r8d
	mov		%r8d, %ecx		# ecx = genuine matches
	bts		%edx, %r8d		# pretend c sits at the last buffer byte
	tzcnt		%r8d, %r8d		# r8 = offset of c or of the buffer end
	add		%rsi, %rdi		# turn rdi back into a pointer
	movdqu		1(%rsi, %r8), %xmm0	# 16 bytes ending at that offset
	movdqu		%xmm0, 1(%rdi, %r8)	# store them into dest
	lea		17(%rdi, %r8), %rsi	# candidate result: just past c in dest
	xor		%eax, %eax		# default result: NULL
	bt		%r8d, %ecx		# was c really at that offset?
	cmovc		%rsi, %rax		# then return the pointer
	ret

.Lfound_undo:
	sub		$16, %rsi		# rsi back to the chunk held in xmm0

	/* c found in the chunk at rsi, which lies fully inside the buffer */
.Lfound:
	tzcnt		%eax, %eax		# rax = offset of c in the chunk
	movdqu		-15(%rsi, %rax), %xmm0	# 16 bytes ending with c
	add		%rsi, %rdi		# turn rdi back into a pointer
	movdqu		%xmm0, -15(%rdi, %rax)	# store them into dest
	lea		1(%rdi, %rax), %rax	# result: just past c in dest
	ret

	/* the buffer ends within the first two aligned chunks (1--32 bytes) */
	ALIGN_TEXT
.Lrunt:	add		$32, %r11d		# r11d = chunk offset of last buffer byte (0--31)
	mov		%r8d, %r10d		# r10d = genuine matches in the first chunk
	bts		%r11d, %r8d		# pretend c sits at the last buffer byte
	and		%ax, %r8w		# drop matches before src; hit in first chunk?
	jnz		.Lrunt_len		# then the second chunk is not needed

	pcmpeqb		16(%rsi), %xmm4		# compare second chunk with c
	pmovmskb	%xmm4, %r8d
	shl		$16, %r8d		# move second chunk matches to bits 16--31
	mov		%r8d, %r10d		# r10d = genuine matches
	bts		%r11d, %r8d		# pretend c sits at the last buffer byte

.Lrunt_len:
	xor		%eax, %eax		# default result: NULL
	tzcnt		%r8d, %edx		# offset of c or buffer end from the alignment boundary
	lea		1(%rdi, %rdx), %r8	# dest + offset + 1 ...
	sub		%rcx, %r8		# ... less misalignment: just past c in dest
	bt		%edx, %r10d		# was c really at that offset?
	cmovc		%r8, %rax		# then return the pointer
	sub		%ecx, %edx		# rdx = number of bytes to copy - 1

	/* copy rdx + 1 bytes (at most 32) from r9 to rdi; rax = return value */
	ALIGN_TEXT
.L0132:	cmp		$16, %rdx		# 17 or more bytes?
	jb		.L0116

	/* 17--32 bytes: two possibly overlapping 16 byte moves */
	movdqu		(%r9), %xmm0		# first 16 bytes
	movdqu		-15(%r9, %rdx), %xmm1	# last 16 bytes
	movdqu		%xmm0, (%rdi)
	movdqu		%xmm1, -15(%rdi, %rdx)
	ret

	/*
	 * copy 1--16 bytes
	 * (rdx: number of bytes to copy - 1, rax: return value)
	 */
	ALIGN_TEXT
.L0116:	cmp		$8, %rdx		# 9 or more bytes?
	jae		.L0916

	cmp		$4, %rdx		# 5 or more bytes?
	jae		.L0508

	cmp		$2, %rdx		# 3 or more bytes?
	jae		.L0304

	/* 1--2 bytes: first and last byte (the same byte if only one) */
	movzbl		(%r9), %ecx		# first byte
	movzbl		(%r9, %rdx), %esi	# last byte
	mov		%cl, (%rdi)
	mov		%sil, (%rdi, %rdx)
	ret

	/* 3--4 bytes: two possibly overlapping 2 byte moves */
.L0304:	movzwl		(%r9), %ecx
	movzwl		-1(%r9, %rdx), %esi
	mov		%cx, (%rdi)
	mov		%si, -1(%rdi, %rdx)
	ret

	/* 5--8 bytes: two possibly overlapping 4 byte moves */
.L0508:	mov		(%r9), %ecx
	mov		-3(%r9, %rdx), %esi
	mov		%ecx, (%rdi)
	mov		%esi, -3(%rdi, %rdx)
	ret

	/* 9--16 bytes: two possibly overlapping 8 byte moves */
.L0916:	mov		(%r9), %rcx
	mov		-7(%r9, %rdx), %rsi
	mov		%rcx, (%rdi)
	mov		%rsi, -7(%rdi, %rdx)
	ret

	/* len == 0: nothing is copied, the result is a null pointer */
.L0:	xor		%eax, %eax
	ret
ARCHEND(__memccpy, baseline)

	.section .note.GNU-stack,"",%progbits