/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <[email protected]>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

	.weak	stpncpy
	.set	stpncpy, __stpncpy
ARCHFUNCS(__stpncpy)
	ARCHFUNC(__stpncpy, scalar)
	ARCHFUNC(__stpncpy, baseline)
ENDARCHFUNCS(__stpncpy)

/*
 * stpncpy(char *restrict rdi, const char *rsi, size_t rdx), scalar variant.
 *
 * Composed from three library calls:
 *   1. __memchr(src, 0, len) locates the terminator, if any.
 *   2. Without a terminator, memcpy(dest, src, len) copies everything
 *      and the result is the memcpy return value plus len.
 *   3. With a terminator, memcpy copies the bytes in front of it and
 *      the function tail calls memset on the rest of the buffer; the
 *      memset return value is passed through to the caller.
 *
 * Frame layout while the arguments are saved:
 *   -8(%rbp)  len      -16(%rbp)  dest
 *   -24(%rbp) src      -32(%rbp)  padding
 */
ARCHENTRY(__stpncpy, scalar)
	push	%rbp			# frame pointer, used to reach the
	mov	%rsp, %rbp		# saved arguments below

	push	%rdx			# save len
	push	%rdi			# save dest
	push	%rsi			# save src
	push	%rax			# padding push for stack alignment

	mov	%rsi, %rdi
	xor	%esi, %esi
	call	CNAME(__memchr)		# RAX = memchr(src, 0, len)
	pop	%rcx			# discard the padding slot
	pop	%rsi			# RSI = src
	mov	-16(%rbp), %rdi		# RDI = dest (slot stays on the stack)

	test	%rax, %rax		# terminator found?
	jz	.Lnonul

	mov	%rax, %rdx
	sub	%rsi, %rdx		# RDX = number of bytes before the NUL
	add	%rdx, -16(%rbp)		# saved dest += string length
	sub	%rdx, -8(%rbp)		# saved len  -= string length
	call	CNAME(memcpy)

	pop	%rdi			# RDI = dest + string length
	pop	%rdx			# RDX = len - string length
	xor	%esi, %esi		# fill value 0
	pop	%rbp
	jmp	CNAME(memset)		# tail call: zero the rest of the buffer

.Lnonul:
	mov	-8(%rbp), %rdx
	call	CNAME(memcpy)		# no NUL within len bytes: copy them all
	add	-8(%rbp), %rax		# memcpy result + len, i.e. dest[len]
	leave
	ret
ARCHEND(__stpncpy, scalar)

/*
 * Sixteen 0xff bytes followed by sixteen 0x00 bytes.  A 16-byte load
 * from .Lmask+n therefore yields 16-n bytes of 0xff followed by
 * n bytes of 0x00.
 */
	.section .rodata
.Lmask:	.quad	0xffffffffffffffff
	.quad	0xffffffffffffffff
	.quad	0x0000000000000000
	.quad	0x0000000000000000

/*
 * stpncpy(char *restrict rdi, const char *rsi, size_t rdx), SSE2 variant.
 *
 * The source is inspected in 16-byte chunks aligned to 16 bytes.
 * Register roles after the setup code:
 *   rsi  aligned source chunk currently being processed
 *   rdi  destination address matching rsi (dest minus misalignment)
 *   rcx  misalignment of the original source pointer (0..15)
 *   r10  buffer length counted from the alignment boundary, later
 *        the number of bytes left behind the current chunk
 *   r8   NUL bit mask, then the offset derived from it
 *   rax  return value
 *
 * The function makes no calls and does not move RSP; its scratch
 * buffer lives below RSP at the offset given by the bounce macro.
 */
ARCHENTRY(__stpncpy, baseline)
#define bounce		(-3*16-8)		/* location of on-stack bounce buffer */
	test		%rdx, %rdx		# zero-length buffer?
	jz		.Lempty

	mov		%esi, %ecx
	and		$~0xf, %rsi		# round source down to a chunk boundary
	movdqa		(%rsi), %xmm0		# first aligned chunk (the head)
	and		$0xf, %ecx		# RCX = source misalignment
	mov		$-1, %r9d
	lea		-33(%rcx), %rax		# RAX = RCX - 33 for the carry test below
	shl		%cl, %r9d		# R9D = bits of chunk bytes inside the string
	sub		%rcx, %rdi		# make RDI line up with the aligned RSI
	pxor		%xmm1, %xmm1
	movdqa		%xmm0, bounce(%rsp)	# keep a copy of the head on the stack
	pcmpeqb		%xmm1, %xmm0
	pmovmskb	%xmm0, %r8d		# R8D = NUL positions within the head

	lea		(%rdx, %rcx, 1), %r10	# R10 = buffer length from the boundary
	add		%rdx, %rax		# carry set iff RDX + RCX > 32
	jnc		.Lrunt			# at most 32 bytes: take the runt path

	movdqu		%xmm1, -16(%rdi, %r10, 1) # zero the last 16 bytes of dest up front
	and		%r9d, %r8d		# drop NUL bits in front of the string
	jnz		.Lheadnul		# string ends within the head

	movdqu		(%rsi, %rcx, 1), %xmm2	# 16 bytes starting at the source itself
	movdqu		%xmm2, (%rdi, %rcx, 1)	# and deposit them at dest

	add		$16, %rsi
	add		$16, %rdi
	sub		$32, %r10		# R10 = bytes left behind the chunk at RSI

	/* main loop, handling two chunks per iteration */
	ALIGN_TEXT
.Lloop:
	movdqa		(%rsi), %xmm0
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# any NUL in the first chunk?
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		.Lnul

	movdqu		%xmm0, (%rdi)
	cmp		$16, %r10		# more than one chunk left after this?
	jbe		.Ltail

	movdqa		16(%rsi), %xmm0
	add		$32, %rdi		# move both pointers two chunks ahead
	add		$32, %rsi
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# any NUL in the second chunk?
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		.Lnul2

	movdqu		%xmm0, -16(%rdi)
	sub		$32, %r10		# still bytes left after the next chunk?
	ja		.Lloop

	sub		$16, %rdi		# step back onto the second chunk
	sub		$16, %rsi
	add		$16, %r10d		# R10 = bytes left behind that chunk

	/* 1--16 bytes left and the string has not ended yet */
.Ltail:
	pxor		%xmm1, %xmm1
	pcmpeqb		16(%rsi), %xmm1		# NUL positions in the last source chunk
	pmovmskb	%xmm1, %r8d
	bts		%r10d, %r8d		# the end of the buffer counts as a NUL
	tzcnt		%r8d, %r8d		# R8 = offset of whichever comes first
	movdqu		(%rsi, %r8, 1), %xmm0	# 16 source bytes ending right before it
	lea		16(%rdi, %r8, 1), %rax	# return the NUL address
						# or the end of the buffer
	movdqu		%xmm0, (%rdi, %r8, 1)	# store the tail into the buffer
	ret

	/* NUL seen in the second chunk of an iteration */
.Lnul2:
	sub		$16, %rdi		# step back onto the second chunk
	sub		$16, %rsi
	sub		$16, %r10

	/* the string ends in the chunk held in XMM0, the buffer does not */
.Lnul:
	tzcnt		%r8d, %r8d		# R8 = offset of the NUL in the chunk
	lea		.Lmask+16(%rip), %rcx
	lea		(%rdi, %r8, 1), %rax	# return value: address of the NUL in dest
	neg		%r8
	movdqu		(%rcx, %r8, 1), %xmm1	# 0xff over the string bytes,
						# 0x00 from the NUL onwards
	pand		%xmm1, %xmm0		# wipe whatever follows the string
	movdqu		%xmm0, (%rdi)		# store the masked chunk
	pxor		%xmm1, %xmm1
	sub		$16, %r10		# a whole further chunk to clear?
	jbe		.Lclr_done

	/* zero the rest of the destination (its tail was cleared earlier) */
	ALIGN_TEXT
.Lclr:
	movdqu		%xmm1, 16(%rdi)
	cmp		$16, %r10
	jbe		.Lclr_done

	movdqu		%xmm1, 32(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		.Lclr

.Lclr_done:
	ret

	/* more than 32 bytes from the boundary and a NUL within the head */
.Lheadnul:
	movdqu		bounce(%rsp, %rcx, 1), %xmm0 # string start from the stack copy
	tzcnt		%r8d, %r8d		# R8 = offset of the NUL
	movdqu		%xmm0, (%rdi, %rcx, 1)	# write the head to the destination
	movdqu		%xmm1, (%rdi, %r8, 1)	# zero 16 bytes from the NUL onwards
	movdqu		%xmm1, 16(%rdi)		# zero the second chunk
	lea		(%rdi, %r8, 1), %rax	# return value: address of the NUL in dest

	add		$32, %rdi		# skip the two chunks just written
	sub		$32+16, %r10		# account for them and one more chunk
	jbe		.Lhclr_done		# already past the end of the buffer?

	/* zero the rest of the destination (its tail was cleared earlier) */
	ALIGN_TEXT
.Lhclr:
	movdqu		%xmm1, (%rdi)		# zero one chunk
	cmp		$16, %r10
	jbe		.Lhclr_done

	movdqu		%xmm1, 16(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		.Lhclr

.Lhclr_done:
	ret

	/* 1--32 bytes from the boundary: assemble the result on the stack */
.Lrunt:
	movdqa		%xmm1, bounce+16(%rsp)	# zero the second half of the stack copy
	and		%r9d, %r8d		# drop NUL bits in front of the string
	bts		%r10, %r8		# the end of the buffer counts as a NUL
	test		$0x1ffff, %r8d		# string over within the first chunk
						# or right behind it?
	jnz		.Lrunt_end		# then leave the second chunk alone

	movdqa		16(%rsi), %xmm0		# second chunk of the source
	movdqa		%xmm0, bounce+16(%rsp)	# goes into the stack copy too
	pcmpeqb		%xmm1, %xmm0		# NUL positions in the second chunk
	pmovmskb	%xmm0, %r9d
	shl		$16, %r9d
	or		%r9, %r8		# merge them into the NUL mask

	/* R8 holds the complete NUL mask */
.Lrunt_end:
	tzcnt		%r8, %r8		# R8 = offset of the NUL or buffer end
	movdqu		%xmm1, bounce(%rsp, %r8, 1) # zero the stack copy behind the string
	lea		bounce(%rsp, %rcx, 1), %rsi # RSI = string start in the stack copy
	lea		(%rdi, %r8, 1), %rax	# return value: NUL or end of buffer

	cmp		$16, %edx		# 16 bytes or more to move?
	jae		.Lrunt_16to32

	mov		(%rsi), %r8		# first 8 bytes of the stack copy
	cmp		$8, %edx		# 8 bytes or more to move?
	jae		.Lrunt_8to15

	cmp		$4, %edx		# 4 bytes or more to move?
	jae		.Lrunt_4to7

	movzwl		-2(%rsi, %rdx, 1), %esi	# last two bytes of the copy
	mov		%r8b, (%rdi, %rcx, 1)	# store the first byte

	cmp		$2, %edx		# 2 bytes or more to move?
	jb		.Lrunt_done

	mov		%si, -2(%rdi, %r10, 1)	# store the last two bytes
.Lrunt_done:
	ret

	/* 16--32 bytes: two 16-byte moves that may overlap */
.Lrunt_16to32:
	movdqu		(%rsi), %xmm0		# first 16 bytes
	movdqu		-16(%rsi, %rdx, 1), %xmm1 # last 16 bytes
	movdqu		%xmm0, (%rdi, %rcx, 1)
	movdqu		%xmm1, -16(%rdi, %r10, 1)
	ret

	/* 8--15 bytes: two 8-byte moves that may overlap */
.Lrunt_8to15:
	mov		-8(%rsi, %rdx, 1), %rdx	# last 8 bytes
	mov		%r8, (%rdi, %rcx, 1)
	mov		%rdx, -8(%rdi, %r10, 1)
	ret

	/* 4--7 bytes: two 4-byte moves that may overlap */
.Lrunt_4to7:
	mov		-4(%rsi, %rdx, 1), %edx	# last 4 bytes
	mov		%r8d, (%rdi, %rcx, 1)
	mov		%edx, -4(%rdi, %r10, 1)
	ret

	/* zero-length buffer: return dest unchanged */
.Lempty:
	mov		%rdi, %rax
	ret
ARCHEND(__stpncpy, baseline)

	.section .note.GNU-stack,"",%progbits