/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <[email protected]>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

/* Align the next instruction to 16 bytes, padding with 0x90 (NOP). */
#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * strlcpy -- size-bounded string copy.
 *
 * Syntax: AT&T (GNU as), run through the C preprocessor.
 *
 * Both implementations below use the same register contract:
 *   In:   RDI = destination buffer
 *         RSI = NUL-terminated source string
 *         RDX = size of the destination buffer in bytes
 *   Out:  RAX = length of the source string, whether or not all of it fit
 *
 * At most RDX - 1 bytes are copied and the destination is NUL-terminated
 * whenever RDX is nonzero.  If RDX is zero the destination is not written.
 *
 * strlcpy is a weak alias for __strlcpy.
 *
 * NOTE(review): ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS and ARCHENTRY/ARCHEND are
 * defined in amd64_archlevel.h, which is not visible here.  Presumably they
 * declare the list of implementations from which one is selected for the
 * running CPU -- confirm against that header.
 */
	.weak	strlcpy
	.set	strlcpy, __strlcpy
ARCHFUNCS(__strlcpy)
	ARCHFUNC(__strlcpy, scalar)
	ARCHFUNC(__strlcpy, baseline)
ENDARCHFUNCS(__strlcpy)

/*
 * Scalar implementation: measure the source with strlen(), then copy
 * min(strlen(src), len - 1) bytes with memcpy().
 *
 * Stack frame, as offsets from RBP:
 *    -8  src   (reloaded before the memcpy call)
 *   -16  saved RBX (callee-saved; carries strlen(src) across memcpy)
 *   -24  dst   \ popped again right after strlen returns
 *   -32  len   /
 *
 * Four 8-byte pushes follow the frame setup and two are popped before
 * memcpy, so RSP has the same 16-byte alignment at both call sites as it
 * has right after "push %rbp".
 */
ARCHENTRY(__strlcpy, scalar)
	push	%rbp			# establish stack frame
	mov	%rsp, %rbp
	push	%rsi			# src, kept at -8(%rbp)
	push	%rbx			# preserve callee-saved register
	push	%rdi			# dst
	push	%rdx			# len
	mov	%rsi, %rdi
	call	CNAME(strlen)		# strlen(src)
	pop	%rdx			# len
	pop	%rdi			# dst
	mov	-8(%rbp), %rsi		# src
	mov	%rax, %rbx		# remember string length for return value
	sub	$1, %rdx		# do not copy into the final byte of the buffer
	jc	0f			# skip copying altogether if buffer was empty
	cmp	%rax, %rdx		# is the buffer longer than the input?
	cmova	%rax, %rdx		# if yes, only copy the part that fits
	movb	$0, (%rdi, %rdx, 1)	# NUL-terminate output buffer
	call	CNAME(memcpy)		# copy string to output
0:	mov	%rbx, %rax		# restore return value
	pop	%rbx
	leave
	ret
ARCHEND(__strlcpy, scalar)

/*
 * Baseline implementation using 16-byte SSE2 vector operations.
 *
 * The source is scanned in 16-byte chunks aligned to 16 bytes, so every
 * chunk load uses movdqa on an aligned address.  The unaligned head of the
 * string is only loaded after the aligned head chunk has been shown to
 * contain no NUL byte inside the string.
 *
 * Register roles once past the prologue:
 *   R9    original source pointer, never modified
 *   RSI   source pointer rounded down to a 16-byte boundary, then advanced
 *   RCX   misalignment of the source (src & 15)
 *   RDX   buffer space still available, not counting the terminator
 *   RDI   destination; in the main loop it holds dst - src, so that
 *         (%rsi, %rdi) addresses the destination matching (%rsi)
 *   XMM1  zero comparand for pcmpeqb, cleared again after each use
 *
 * Every tzcnt below operates on a value known to be nonzero: it either
 * follows a jnz on that mask or a bts that sets a bit in it.
 */
ARCHENTRY(__strlcpy, baseline)
	sub	$1, %rdx		# do not count NUL byte in buffer length
	jb	.L0			# go to special code path if len was 0

	mov	%esi, %ecx
	pxor	%xmm1, %xmm1
	mov	%rsi, %r9		# stash a copy of the source pointer for later
	and	$~0xf, %rsi		# round source down to a 16-byte boundary
	pcmpeqb	(%rsi), %xmm1		# NUL found in head?
	mov	$-1, %r8d
	and	$0xf, %ecx		# ecx = misalignment of the source
	shl	%cl, %r8d		# mask of bytes in the string
	pmovmskb %xmm1, %eax
	and	%r8d, %eax		# ignore matches before the string start
	jnz	.Lhead_nul

	movdqa	16(%rsi), %xmm3		# load second string chunk
	movdqu	(%r9), %xmm2		# load unaligned string head
	mov	$32, %r8d
	sub	%ecx, %r8d		# head length + length of second chunk
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm3, %xmm1		# NUL found in second chunk?

	sub	%r8, %rdx		# enough space left for the second chunk?
	jbe	.Lhead_buf_end

	/* process second chunk */
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	.Lsecond_nul

	/* string didn't end in second chunk and neither did buffer -- not a runt! */
	movdqa	32(%rsi), %xmm0		# load next string chunk
	pxor	%xmm1, %xmm1
	movdqu	%xmm2, (%rdi)		# deposit head into buffer
	sub	%rcx, %rdi		# adjust RDI to correspond to RSI
	movdqu	%xmm3, 16(%rdi)		# deposit second chunk
	sub	%rsi, %rdi		# express RDI as distance from RSI
	add	$32, %rsi		# advance RSI past first two chunks
	sub	$16, %rdx		# enough left for another round?
	jbe	1f

	/*
	 * main loop unrolled twice
	 *
	 * Invariant at 0: XMM0 holds the chunk at (%rsi), XMM1 is zero and
	 * RDX + 16 bytes of buffer space remain for that chunk and beyond,
	 * with RDX > 0.
	 */
	ALIGN_TEXT
0:	pcmpeqb	%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	3f

	movdqu	%xmm0, (%rsi, %rdi)
	movdqa	16(%rsi), %xmm0		# load next string chunk
	pxor	%xmm1, %xmm1
	cmp	$16, %rdx		# more than a full chunk left?
	jbe	2f

	add	$32, %rsi		# advance pointers to next chunk
	pcmpeqb	%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	4f

	movdqu	%xmm0, -16(%rsi, %rdi)
	movdqa	(%rsi), %xmm0		# load next string chunk
	pxor	%xmm1, %xmm1
	sub	$32, %rdx
	ja	0b

1:	sub	$16, %rsi		# undo second advancement
	add	$16, %edx		# edx = space left for the pending chunk

	/*
	 * 1--16 bytes left in the buffer but string has not ended yet
	 *
	 * Here the pending chunk in XMM0 lives at 16(%rsi).  The tail is
	 * copied with one 16-byte move that ends just before the position
	 * of the terminator and overlaps bytes that were already copied.
	 */
2:	pcmpeqb	%xmm1, %xmm0		# NUL byte encountered?
	pmovmskb %xmm0, %r8d
	mov	%r8d, %eax		# keep the plain NUL mask for the length scan
	bts	%edx, %r8d		# treat end of buffer as end of string
	tzcnt	%r8d, %r8d		# find tail length
	add	%rsi, %rdi		# restore RDI
	movdqu	(%rsi, %r8, 1), %xmm0	# load string tail
	movdqu	%xmm0, (%rdi, %r8, 1)	# store string tail
	movb	$0, 16(%rdi, %r8, 1)	# NUL terminate

	/* continue to find the end of the string */
	test	%eax, %eax		# end of string already reached?
	jnz	1f

	ALIGN_TEXT
0:	pcmpeqb	32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jnz	2f

	pcmpeqb	48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	add	$32, %rsi
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jz	0b

1:	sub	$16, %rsi		# undo second advancement
2:	tzcnt	%eax, %eax		# where is the NUL byte?
	sub	%r9, %rsi
	lea	32(%rsi, %rax, 1), %rax	# return string length
	ret

4:	sub	$16, %rsi		# undo second advancement
	add	$16, %rdx		# restore number of remaining bytes

	/* string has ended but buffer has not */
3:	tzcnt	%eax, %eax		# find length of string tail
	movdqu	-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL)
	add	%rsi, %rdi		# restore destination pointer
	movdqu	%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL)
	sub	%r9, %rsi		# string length to current chunk
	add	%rsi, %rax		# plus length of current chunk
	ret

	/*
	 * The buffer ends within the head or the second chunk.  Bit positions
	 * in R8 count bytes from the alignment boundary: the NUL mask of the
	 * second chunk occupies bits 16--31 and the end of the buffer is
	 * marked with one extra bit.
	 */
.Lhead_buf_end:
	pmovmskb %xmm1, %r8d
	add	$32, %edx		# restore edx to (len-1) + ecx
	mov	%r8d, %eax		# keep the plain NUL mask for the length scan
	shl	$16, %r8d		# place 2nd chunk NUL mask into bits 16--31
	bts	%rdx, %r8		# treat end of buffer as end of string
	tzcnt	%r8, %rdx		# find string/buffer len from alignment boundary
	sub	%ecx, %edx		# find actual string/buffer len
	movb	$0, (%rdi, %rdx, 1)	# write NUL terminator

	/* continue to find the end of the string */
	test	%eax, %eax		# end of string already reached?
	jnz	1f

	ALIGN_TEXT
0:	pcmpeqb	32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jnz	2f

	pcmpeqb	48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	add	$32, %rsi
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jz	0b

1:	sub	$16, %rsi
2:	tzcnt	%eax, %eax
	sub	%r9, %rsi
	lea	32(%rsi, %rax, 1), %rax	# return string length
	jmp	.L0031			# copy RDX bytes, terminator is in place

.Lsecond_nul:
	add	%r8, %rdx		# restore buffer length
	tzcnt	%eax, %eax		# where is the NUL byte?
	lea	-16(%rcx), %r8d
	sub	%r8d, %eax		# string length
	cmp	%rax, %rdx		# is the string shorter than the buffer?
	cmova	%rax, %rdx		# copy only min(buflen, srclen) bytes
	movb	$0, (%rdi, %rdx, 1)	# write NUL terminator
.L0031:	cmp	$16, %rdx		# at least 16 bytes to copy (not incl NUL)?
	jb	.L0015

	/*
	 * copy 16--32 bytes with two overlapping 16-byte moves (32 is
	 * reached from .Lhead_buf_end when the source is aligned)
	 */
	movdqu	(%r9), %xmm0		# load first 16 bytes
	movdqu	-16(%r9, %rdx, 1), %xmm1 # load last 16 bytes
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx, 1)
	ret

.Lhead_nul:
	tzcnt	%eax, %eax		# where is the NUL byte?
	sub	%ecx, %eax		# ... from the beginning of the string?
	cmp	%rax, %rdx		# is the string shorter than the buffer?
	cmova	%rax, %rdx		# copy only min(buflen, srclen) bytes
	movb	$0, (%rdi, %rdx, 1)	# write NUL terminator

	/*
	 * process strings of 0--15 bytes (rdx: min(buflen, srclen), rax: srclen)
	 *
	 * Each size class below copies the first and the last N bytes of the
	 * range, which may overlap, so no loop is needed.
	 */
.L0015:	cmp	$8, %rdx		# at least 8 bytes to copy?
	jae	.L0815

	cmp	$4, %rdx		# at least 4 bytes to copy?
	jae	.L0407

	cmp	$2, %rdx		# at least 2 bytes to copy?
	jae	.L0203

	/* 0 or 1 bytes: for 0 the byte store lands on the terminator */
	movzbl	(%r9), %ecx		# load first byte from src
	mov	%cl, (%rdi)		# deposit into destination
	movb	$0, (%rdi, %rdx, 1)	# add NUL terminator (again)
	ret

.L0203:	movzwl	(%r9), %ecx
	movzwl	-2(%r9, %rdx, 1), %esi
	mov	%cx, (%rdi)
	mov	%si, -2(%rdi, %rdx, 1)
	ret

.L0407:	mov	(%r9), %ecx
	mov	-4(%r9, %rdx, 1), %esi
	mov	%ecx, (%rdi)
	mov	%esi, -4(%rdi, %rdx, 1)
	ret

.L0815:	mov	(%r9), %rcx
	mov	-8(%r9, %rdx, 1), %rsi
	mov	%rcx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx, 1)
	ret

	/* length zero destination: just return the string length */
.L0:	mov	%rsi, %rdi
	jmp	CNAME(strlen)		# tail call, strlen returns to our caller
ARCHEND(__strlcpy, baseline)

	.section .note.GNU-stack,"",%progbits