/*-
 * Copyright (c) 2023, The FreeBSD Foundation
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Portions of this software were developed by Robert Clausecker
 * <[email protected]> under sponsorship from the FreeBSD Foundation.
 *
 * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
 * written by J.T. Conklin <[email protected]> and
 * adapted by Guillaume Morin <[email protected]> to implement stpcpy
 * that was originally dedicated to the public domain
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

	.weak	stpcpy
	.set	stpcpy, __stpcpy
ARCHFUNCS(__stpcpy)
	ARCHFUNC(__stpcpy, scalar)
	ARCHFUNC(__stpcpy, baseline)
ENDARCHFUNCS(__stpcpy)

/*
 * char *stpcpy(char *restrict dst (%rdi), const char *restrict src (%rsi))
 * Returns a pointer to the terminating NUL written to dst (in %rax).
 *
 * This stpcpy implementation copies a byte at a time until the
 * source pointer is aligned to a word boundary, it then copies by
 * words until it finds a word containing a zero byte, and finally
 * copies by bytes until the end of the string is reached.
 *
 * While this may result in unaligned stores if the source and
 * destination pointers are unaligned with respect to each other,
 * it is still faster than either byte copies or the overhead of
 * an implementation suitable for machines with strict alignment
 * requirements.
 */

ARCHENTRY(__stpcpy, scalar)
	movabsq	$0x0101010101010101,%r8	/* constants for the "word has a */
	movabsq	$0x8080808080808080,%r9	/* zero byte" test below */

	/*
	 * Align source to a word boundary.
	 * Consider unrolling loop?
	 */
.Lalign:
	testb	$7,%sil
	je	.Lword_aligned
	movb	(%rsi),%dl
	incq	%rsi
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl
	jne	.Lalign
	movq	%rdi,%rax		/* NUL copied: return &dst[len] */
	dec	%rax
	ret

	ALIGN_TEXT
.Lloop:
	movq	%rdx,(%rdi)
	addq	$8,%rdi
.Lword_aligned:
	movq	(%rsi),%rdx
	movq	%rdx,%rcx
	addq	$8,%rsi
	subq	%r8,%rcx		/* (w - 0x01..01) & 0x80..80 != 0 */
	testq	%r9,%rcx		/* iff some byte of w may be 0 */
	je	.Lloop

	/*
	 * In rare cases, the above loop may exit prematurely. We must
	 * return to the loop if none of the bytes in the word equal 0.
	 */

	movb	%dl,(%rdi)
	testb	%dl,%dl			/* 1st byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl			/* 2nd byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl			/* 3rd byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl			/* 4th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl			/* 5th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl			/* 6th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl			/* 7th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl			/* 8th byte == 0? */
	jne	.Lword_aligned
	decq	%rdi

.Ldone:
	movq	%rdi,%rax		/* return pointer to the NUL byte */
	ret
ARCHEND(__stpcpy, scalar)

/*
 * SSE2 variant: process the string in aligned 16 byte chunks from a
 * 16-byte-aligned copy of src, masking off matches before the true
 * beginning of the string, and store each chunk unaligned to dst.
 */
ARCHENTRY(__stpcpy, baseline)
	mov	%esi, %ecx
	mov	%rdi, %rdx		# remember dst for the return value
	sub	%rsi, %rdi		# express destination as distance to source
	and	$~0xf, %rsi		# align source to 16 bytes
	movdqa	(%rsi), %xmm0		# head of string with junk before
	pxor	%xmm1, %xmm1
	and	$0xf, %ecx		# misalignment in bytes
	pcmpeqb	%xmm1, %xmm0		# NUL byte present?
	pmovmskb %xmm0, %eax
	shr	%cl, %eax		# clear out matches in junk bytes
	bsf	%eax, %eax		# find match if any
	jnz	.Lrunt

	/* first normal iteration: write head back if it succeeds */
	movdqa	16(%rsi), %xmm0		# 16 bytes of current iteration
	movdqu	(%rsi, %rcx, 1), %xmm2	# first 16 bytes of the string
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# find match if any
	jnz	.Lshorty

	movdqu	%xmm2, (%rdx)		# store beginning of string

	/* main loop, unrolled twice */
	ALIGN_TEXT
0:	movdqa	32(%rsi), %xmm2		# load current iteration
	movdqu	%xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
	pxor	%xmm1, %xmm1
	add	$32, %rsi
	pcmpeqb	%xmm2, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	1f

	movdqa	16(%rsi), %xmm0		# load current iteration
	movdqu	%xmm2, (%rsi, %rdi, 1)	# write back previous iteration
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jz	0b

	/* end of string after main loop has iterated */
	add	$16, %rsi		# advance rsi to second unrolled half
1:	tzcnt	%eax, %eax		# find location of match
					# (behaves as bsf on pre-x86-64-v3 CPUs)
	add	%rsi, %rax		# point to NUL byte
	movdqu	-15(%rax), %xmm0	# last 16 bytes of string
	movdqu	%xmm0, -15(%rax, %rdi, 1) # copied to destination
	add	%rdi, %rax		# point to destination's NUL byte
	ret

	/* NUL encountered in second iteration */
.Lshorty:
	tzcnt	%eax, %eax
	add	$16, %eax		# account for length of first iteration
	sub	%ecx, %eax		# but not the parts before the string

	/* NUL encountered in first iteration */
.Lrunt:	lea	1(%rax), %edi		# string length including NUL byte
	add	%rcx, %rsi		# point to beginning of string
	add	%rdx, %rax		# point to NUL byte

	/* transfer 16--32 bytes */
.L1632:	cmp	$16, %edi
	jb	.L0815

	movdqu	-16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
	movdqu	%xmm2, (%rdx)		# store first 16 bytes
	movdqu	%xmm0, -15(%rax)	# store last 16 bytes
	ret

	/* transfer 8--15 bytes */
.L0815:	cmp	$8, %edi
	jb	.L0407

	mov	(%rsi), %rcx		# load first 8 bytes
	mov	-8(%rsi, %rdi, 1), %rdi	# load last 8 bytes
	mov	%rcx, (%rdx)		# store to dst
	mov	%rdi, -7(%rax)		# ditto
	ret

	/* transfer 4--7 bytes */
.L0407:	cmp	$4, %edi
	jb	.L0203

	mov	(%rsi), %ecx
	mov	-4(%rsi, %rdi, 1), %edi
	mov	%ecx, (%rdx)
	mov	%edi, -3(%rax)
	ret

	/* transfer 2--3 bytes */
.L0203:	cmp	$2, %edi
	jb	.L0101

	movzwl	(%rsi), %ecx
	mov	%cx, (%rdx)		# store first two bytes

	/* transfer 0 bytes (last byte is always NUL) */
.L0101:	movb	$0, (%rax)		# store terminating NUL byte
	ret
ARCHEND(__stpcpy, baseline)

	.section .note.GNU-stack,"",%progbits