/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <[email protected]>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

/* pad to a 16 byte boundary with NOP (0x90) bytes */
#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * stpncpy is a weak alias of __stpncpy.  The implementation actually
 * used is one of the variants listed between ARCHFUNCS and ENDARCHFUNCS.
 */
	.weak	stpncpy
	.set	stpncpy, __stpncpy
ARCHFUNCS(__stpncpy)
	ARCHFUNC(__stpncpy, scalar)
	ARCHFUNC(__stpncpy, baseline)
ENDARCHFUNCS(__stpncpy)

/*
 * char *stpncpy(char *restrict rdi, const char *rsi, size_t rdx)
 *
 * Scalar variant, built from memchr, memcpy and memset:
 *
 *   1. memchr(src, '\0', len) locates the end of the string, if any.
 *   2. If there is no NUL byte in the first len bytes, copy len bytes
 *      and return dest + len.
 *   3. Otherwise copy the bytes in front of the NUL byte and tail call
 *      memset to clear the rest of the buffer.  memset returns its
 *      first argument, which is the address the NUL byte ends up at.
 *
 * Stack frame (relative to RBP):
 *    -8(%rbp)	len, later the number of bytes left to clear
 *   -16(%rbp)	dest, later the address where clearing begins
 *   -24(%rbp)	src
 *   -32(%rbp)	padding keeping RSP 16 byte aligned for the calls
 */
ARCHENTRY(__stpncpy, scalar)
	push	%rbp			# establish stack frame
	mov	%rsp, %rbp

	push	%rdx			# len  -> -8(%rbp)
	push	%rdi			# dest -> -16(%rbp)
	push	%rsi			# src  -> -24(%rbp)
	push	%rax			# dummy push for alignment

	mov	%rsi, %rdi		# RDX still holds len
	xor	%esi, %esi
	call	CNAME(__memchr)		# memchr(src, '\0', len)
	pop	%rcx			# dummy pop
	pop	%rsi			# reload src
	mov	-16(%rbp), %rdi		# reload dest, leave it on the stack

	test	%rax, %rax		# NUL found?
	jz	.Lfullcopy

	mov	%rax, %rdx
	sub	%rsi, %rdx		# copy until the NUL byte
	add	%rdx, -16(%rbp)		# advance destination by string length
	sub	%rdx, -8(%rbp)		# and shorten buffer size by string length
	call	CNAME(memcpy)

	pop	%rdi			# dest + string length
	pop	%rdx			# len - string length
	xor	%esi, %esi
	pop	%rbp
	jmp	CNAME(memset)		# clear remaining buffer

.Lfullcopy:
	mov	-8(%rbp), %rdx
	call	CNAME(memcpy)		# copy whole string
	add	-8(%rbp), %rax		# point to dest[n]
	leave
	ret
ARCHEND(__stpncpy, scalar)

/*
 * this mask allows us to generate masks of 16-n 0xff bytes
 * followed by n 0x00 bytes by loading from .Lmask+n.
 */
	.section .rodata
.Lmask:	.quad	0xffffffffffffffff
	.quad	0xffffffffffffffff
	.quad	0x0000000000000000
	.quad	0x0000000000000000

/*
 * stpncpy(char *restrict rdi, const char *rsi, size_t rdx)
 *
 * SSE2 variant.  The source is read in 16 byte chunks aligned to 16
 * bytes; the destination is written with unaligned stores.
 *
 * Register roles after the common prologue:
 *   RSI	source, rounded down to a 16 byte boundary
 *   RCX	offset of the real source start within its chunk (0--15)
 *   RDI	destination minus RCX, so that RDI and RSI correspond
 *   RDX	buffer length as passed by the caller
 *   R10	RDX + RCX, buffer length counted from the alignment boundary
 *   R9D	all ones shifted left by RCX: bytes of the head chunk that
 *		belong to the string
 *   R8D	bit mask of NUL bytes in the head chunk (not yet masked)
 *   XMM1	all zeroes
 *   RAX	return value once set: address of the NUL byte written to
 *		the destination, or dest + len if the string did not end
 *
 * The function makes no calls and keeps its scratch data in a 48 byte
 * bounce buffer below RSP at bounce(%rsp).
 *
 * Buffers with RDX + RCX <= 32 are handled by .Lrunt; everything else
 * goes through the head processing and the main loop.  This split must
 * include RDX + RCX == 32 in the runt case: otherwise the main loop is
 * entered with no bytes left after its first chunk and the tail code
 * at 1: would load a source chunk that lies entirely behind the buffer.
 */
ARCHENTRY(__stpncpy, baseline)
#define bounce		(-3*16-8)		/* location of on-stack bounce buffer */

	test		%rdx, %rdx		# no bytes to copy?
	jz		.L0

	mov		%esi, %ecx
	and		$~0xf, %rsi		# align source to 16 bytes
	movdqa		(%rsi), %xmm0		# load head
	and		$0xf, %ecx		# offset from alignment
	mov		$-1, %r9d
	lea		-33(%rcx), %rax		# set up overflow-proof comparison rdx+rcx<=32
	shl		%cl, %r9d		# mask of bytes belonging to the string
	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
	pxor		%xmm1, %xmm1
	movdqa		%xmm0, bounce(%rsp)	# stash copy of head on the stack
	pcmpeqb		%xmm1, %xmm0
	pmovmskb	%xmm0, %r8d

	lea		(%rdx, %rcx, 1), %r10	# buffer length from alignment boundary
	add		%rdx, %rax		# at most 2 chunks (32 bytes) to play with?
	jnc		.Lrunt			# if yes, use special runt processing

	movdqu		%xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination
	and		%r9d, %r8d		# end of string within head?
	jnz		.Lheadnul

	movdqu		(%rsi, %rcx, 1), %xmm2	# load head from source buffer
	movdqu		%xmm2, (%rdi, %rcx, 1)	# and deposit

	add		$16, %rsi
	add		$16, %rdi
	sub		$32, %r10		# R10 is now at least 1

	/*
	 * main loop unrolled twice
	 *
	 * At label 0, R10 is the number of buffer bytes following the
	 * chunk at (%rsi).
	 */
	ALIGN_TEXT
0:	movdqa		(%rsi), %xmm0
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		3f

	movdqu		%xmm0, (%rdi)
	cmp		$16, %r10		# more than a full chunk left?
	jbe		1f

	movdqa		16(%rsi), %xmm0
	add		$32, %rdi		# advance pointers to next chunk
	add		$32, %rsi
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		2f

	movdqu		%xmm0, -16(%rdi)
	sub		$32, %r10		# more than another full chunk left?
	ja		0b

	sub		$16, %rdi		# undo second advancement
	sub		$16, %rsi
	add		$16, %r10d		# restore number of remaining bytes

	/*
	 * 1--16 bytes left but string has not ended yet
	 *
	 * The 16 bytes ending at the NUL byte or at the end of the
	 * buffer are copied with one unaligned load/store pair that
	 * overlaps the chunk stored last.  RAX is pointed to the NUL
	 * byte or to the end of the buffer.
	 */
1:	pxor		%xmm1, %xmm1
	pcmpeqb		16(%rsi), %xmm1		# NUL byte in source tail?
	pmovmskb	%xmm1, %r8d
	bts		%r10d, %r8d		# treat end of buffer as NUL
	tzcnt		%r8d, %r8d		# where is the NUL byte?
	movdqu		(%rsi, %r8, 1), %xmm0	# load source tail before NUL
	lea		16(%rdi, %r8, 1), %rax	# point return value to NUL byte
	movdqu		%xmm0, (%rdi, %r8, 1)	# store tail into the buffer
	ret

2:	sub		$16, %rdi		# undo second advancement
	sub		$16, %rsi
	sub		$16, %r10

	/*
	 * string has ended and buffer has not
	 *
	 * Loading from .Lmask+16-n yields a mask that is FF where the
	 * string is and 00 where it is not.
	 */
3:	tzcnt		%r8d, %r8d		# where did the string end?
	lea		.Lmask+16(%rip), %rcx
	lea		(%rdi, %r8, 1), %rax	# where the NUL byte will be
	neg		%r8
	movdqu		(%rcx, %r8, 1), %xmm1	# mask selecting the string bytes
	pand		%xmm1, %xmm0		# mask out bytes after the string
	movdqu		%xmm0, (%rdi)		# store masked current chunk
	pxor		%xmm1, %xmm1
	sub		$16, %r10		# another full chunk left?
	jbe		1f

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu		%xmm1, 16(%rdi)
	cmp		$16, %r10
	jbe		1f

	movdqu		%xmm1, 32(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		0b

1:	ret

	/*
	 * more than two chunks to play with and NUL while processing head
	 *
	 * The 16 bytes deposited from the bounce buffer extend past the
	 * NUL byte; everything behind the NUL byte is overwritten with
	 * zeroes by the two stores that follow.
	 */
.Lheadnul:
	movdqu		bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack
	tzcnt		%r8d, %r8d		# find location of NUL byte
	movdqu		%xmm0, (%rdi, %rcx, 1)	# deposit head in the destination
	movdqu		%xmm1, (%rdi, %r8, 1)	# clear out following bytes
	movdqu		%xmm1, 16(%rdi)		# clear out second chunk
	lea		(%rdi, %r8, 1), %rax	# make RAX point to the NUL byte

	add		$32, %rdi		# advance past first two chunks
	sub		$32+16, %r10		# advance past first three chunks
	jbe		1f			# did we pass the end of the buffer?

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu		%xmm1, (%rdi)		# clear out buffer chunk
	cmp		$16, %r10
	jbe		1f

	movdqu		%xmm1, 16(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		0b

1:	ret

	/*
	 * 1--32 bytes to copy, bounce through the stack
	 *
	 * Here 1 <= R10 <= 32.  R8 becomes a bit mask with one bit per
	 * byte counted from the alignment boundary; a set bit marks a
	 * NUL byte or the end of the buffer.  As the end of the buffer
	 * may be bit 32, R8 is handled as a 64 bit quantity once that
	 * bit has been set.
	 *
	 * The second source chunk is only inspected if neither a NUL
	 * byte nor the end of the buffer occurs in bits 0--16.  Bit 16
	 * is included in this test: if the buffer ends exactly at the
	 * chunk boundary, the second chunk is not part of the buffer
	 * and must not be read.
	 */
.Lrunt:	movdqa		%xmm1, bounce+16(%rsp)	# clear out rest of on-stack copy
	and		%r9d, %r8d		# discard NUL bytes before the string
	bts		%r10, %r8		# treat end of buffer as end of string
	test		$0x1ffff, %r8d		# end within first chunk or right after?
	jnz		0f			# if yes, do not inspect second buffer

	movdqa		16(%rsi), %xmm0		# load second chunk of input
	movdqa		%xmm0, bounce+16(%rsp)	# stash copy on stack
	pcmpeqb		%xmm1, %xmm0		# NUL in second chunk?
	pmovmskb	%xmm0, %r9d
	shl		$16, %r9d
	or		%r9, %r8		# merge found NUL bytes into NUL mask

	/*
	 * End of string has been found.  The on-stack copy is cleared
	 * from there on and RDX bytes are then transferred from the
	 * stack to the destination using a pair of possibly overlapping
	 * moves selected by the size of RDX.
	 */
0:	tzcnt		%r8, %r8		# location of end of string
	movdqu		%xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string
	lea		bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack
	lea		(%rdi, %r8, 1), %rax	# return pointer to NUL byte

	cmp		$16, %edx		# at least 16 bytes to transfer?
	jae		.L1631

	mov		(%rsi), %r8		# load string head
	cmp		$8, %edx		# at least 8 bytes to transfer?
	jae		.L0815

	cmp		$4, %edx		# at least 4 bytes to transfer?
	jae		.L0407

	movzwl		-2(%rsi, %rdx, 1), %esi	# load last two bytes of string
	mov		%r8b, (%rdi, %rcx, 1)	# store first byte

	cmp		$2, %edx		# at least 2 bytes to transfer?
	jb		.L1

	mov		%si, -2(%rdi, %r10, 1)	# store last two bytes of string
.L1:	ret

	/* 16--32 bytes to transfer */
.L1631:	movdqu		(%rsi), %xmm0		# load first 16 bytes of string
	movdqu		-16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string
	movdqu		%xmm0, (%rdi, %rcx, 1)
	movdqu		%xmm1, -16(%rdi, %r10, 1)
	ret

	/* 8--15 bytes to transfer */
.L0815:	mov		-8(%rsi, %rdx, 1), %rdx	# load last 8 bytes of string
	mov		%r8, (%rdi, %rcx, 1)
	mov		%rdx, -8(%rdi, %r10, 1)
	ret

	/* 4--7 bytes to transfer */
.L0407:	mov		-4(%rsi, %rdx, 1), %edx	# load last four bytes of string
	mov		%r8d, (%rdi, %rcx, 1)
	mov		%edx, -4(%rdi, %r10, 1)
	ret

	/* length 0 buffer: just return dest */
.L0:	mov		%rdi, %rax
	ret
ARCHEND(__stpncpy, baseline)

	.section .note.GNU-stack,"",%progbits