/*-
 * Copyright (c) 2023, The FreeBSD Foundation
 *
 * SPDX-License-Expression: BSD-2-Clause
 *
 * Portions of this software were developed by Robert Clausecker
 * <[email protected]> under sponsorship from the FreeBSD Foundation.
 *
 * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcat.S
 * written by J.T. Conklin <[email protected]>
 * that was originally dedicated to the public domain
 */

#include <machine/asm.h>
#if 0
	RCSID("$NetBSD: strcat.S,v 1.4 2004/07/26 18:51:21 drochner Exp $")
#endif

#include "amd64_archlevel.h"

/*
 * Syntax: AT&T (GNU as), preprocessed with cpp.
 *
 * This file provides two strcat variants, "scalar" and "baseline",
 * registered through the ARCHFUNCS/ARCHFUNC macros.
 * NOTE(review): those macros come from amd64_archlevel.h, which is not
 * visible here; presumably they build the table used to pick a variant
 * at run time -- confirm against that header.
 */
ARCHFUNCS(strcat)
	ARCHFUNC(strcat, scalar)
	ARCHFUNC(strcat, baseline)
ENDARCHFUNCS(strcat)

/*
 * strcat, scalar variant.
 *
 * In:    %rdi = destination string, %rsi = source string
 * Out:   %rax = the destination pointer as passed in
 * Uses:  %rcx, %rdx, %r8, %r9 and the flags; no stack, no calls.
 *
 * Phase 1 (.Lscan*) advances %rdi to the NUL byte that terminates the
 * destination.  Phase 2 (.Lcopy*) copies the source, including its
 * terminating NUL, to that position.
 *
 * Both phases process eight bytes at a time once the pointer being
 * read from is 8-byte aligned, using the test
 *
 *	(word - 0x0101010101010101) & 0x8080808080808080
 *
 * A zero result proves that the word holds no NUL byte.  A non-zero
 * result only means that there may be one: bytes with a value of 0x81
 * or above trip the test as well, so every hit is followed by a
 * byte-by-byte check.
 */
ARCHENTRY(strcat, scalar)
	movq	%rdi,%rax		/* return value: original dest */
	movabsq	$0x0101010101010101,%r8	/* subtracted from each word */
	movabsq	$0x8080808080808080,%r9	/* mask for the high bit of each byte */

	/*
	 * Align destination to word boundary.
	 * Consider unrolling loop?
	 */
.Lscan:
.Lscan_align:
	testb	$7,%dil
	je	.Lscan_aligned
	cmpb	$0,(%rdi)		/* end of dest found while aligning? */
	je	.Lcopy
	incq	%rdi
	jmp	.Lscan_align

	/*
	 * NOTE(review): with GNU as on x86 ELF targets .align takes a
	 * byte count, so this requests 4-byte alignment.  If 16-byte
	 * alignment of the loop head was intended, .p2align 4 would be
	 * needed -- confirm before changing.
	 */
	.align	4
.Lscan_aligned:
.Lscan_loop:
	movq	(%rdi),%rdx
	addq	$8,%rdi			/* %rdi now points past the word */
	subq	%r8,%rdx
	testq	%r9,%rdx
	je	.Lscan_loop		/* certainly no NUL in this word */

	/*
	 * In rare cases, the above loop may exit prematurely. We must
	 * return to the loop if none of the bytes in the word equal 0.
	 *
	 * %rdi was already advanced by 8, hence the negative offsets.
	 * On a match %rdi is moved back so that it points at the NUL.
	 */

	cmpb	$0,-8(%rdi)	/* 1st byte == 0? */
	jne	1f
	subq	$8,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-7(%rdi)	/* 2nd byte == 0? */
	jne	1f
	subq	$7,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-6(%rdi)	/* 3rd byte == 0? */
	jne	1f
	subq	$6,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-5(%rdi)	/* 4th byte == 0? */
	jne	1f
	subq	$5,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-4(%rdi)	/* 5th byte == 0? */
	jne	1f
	subq	$4,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-3(%rdi)	/* 6th byte == 0? */
	jne	1f
	subq	$3,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-2(%rdi)	/* 7th byte == 0? */
	jne	1f
	subq	$2,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-1(%rdi)	/* 8th byte == 0? */
	jne	.Lscan_loop	/* false positive: keep scanning */
	subq	$1,%rdi

	/*
	 * Align source to a word boundary.
	 * Consider unrolling loop?
	 *
	 * From here on %rdi points at the NUL that ends the destination.
	 * Bytes are copied one at a time until %rsi is 8-byte aligned;
	 * if the byte just copied was the NUL, we are done.
	 */
.Lcopy:
.Lcopy_align:
	testb	$7,%sil
	je	.Lcopy_aligned
	movb	(%rsi),%dl
	incq	%rsi
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl
	jne	.Lcopy_align
	ret

	/*
	 * Word-at-a-time copy.  The loop is entered at .Lcopy_aligned;
	 * .Lcopy_loop stores the word that was just proven NUL-free and
	 * then falls through to load the next one.  Only the loads from
	 * the source are aligned; the stores to (%rdi) need not be.
	 */
	.align	4
.Lcopy_loop:
	movq	%rdx,(%rdi)
	addq	$8,%rdi
.Lcopy_aligned:
	movq	(%rsi),%rdx
	movq	%rdx,%rcx		/* keep the word intact in %rdx */
	addq	$8,%rsi
	subq	%r8,%rcx
	testq	%r9,%rcx
	je	.Lcopy_loop		/* certainly no NUL in this word */

	/*
	 * In rare cases, the above loop may exit prematurely. We must
	 * return to the loop if none of the bytes in the word equal 0.
	 *
	 * The word in %rdx is written out one byte at a time, lowest
	 * byte first, shifting the next byte into %dl after each store.
	 * Copying stops right after the NUL byte has been stored.
	 */

	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 1st byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 2nd byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 3rd byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 4th byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 5th byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 6th byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 7th byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 8th byte == 0? */
	jne	.Lcopy_aligned	/* false positive: all 8 bytes copied, go on */

.Ldone:
	ret
ARCHEND(strcat, scalar)

/*
 * Call into strlen + strcpy if we have any SIMD at all.
 * The scalar implementation above is better for the scalar
 * case as it avoids the function call overhead, but pessimal
 * if we could call SIMD routines instead.
 *
 * In:    %rdi = destination string, %rsi = source string
 * Out:   %rax = the destination pointer as passed in
 *
 * %rbx (saved and restored here) carries the destination pointer
 * across both calls; the source pointer is parked on the stack at
 * -8(%rbp).  Together with the return address, the three pushes
 * occupy 32 bytes, so %rsp keeps its 16-byte alignment at both call
 * sites, given the usual alignment at function entry.
 */
ARCHENTRY(strcat, baseline)
	push	%rbp
	mov	%rsp, %rbp
	push	%rsi			# save source at -8(%rbp)
	push	%rbx
	mov	%rdi, %rbx		# remember destination for later
	call	CNAME(strlen)		# strlen(dest)
	mov	-8(%rbp), %rsi		# reload source
	lea	(%rbx, %rax, 1), %rdi	# dest + strlen(dest)
	call	CNAME(__stpcpy)		# stpcpy(dest + strlen(dest), src)
	mov	%rbx, %rax		# return dest
	pop	%rbx
	leave				# also drops the saved source slot
	ret
ARCHEND(strcat, baseline)

	.section .note.GNU-stack,"",%progbits