/*-
 * Written by Mateusz Guzik <[email protected]>
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * Portions of this software were developed by Robert Clausecker
 * <[email protected]> under sponsorship from the FreeBSD Foundation.
 *
 * Public domain.
 */

#include <machine/asm.h>
#include "amd64_archlevel.h"

/*
 * Note: the scalar routine was written with kernel use in mind (read: no
 * simd); it is only present in userspace as a temporary measure until
 * something better gets imported.  The baseline routine further down is
 * the one that uses SSE registers.
 */

#define ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */

/* The strlen variants provided by this file. */
ARCHFUNCS(strlen)
	ARCHFUNC(strlen, scalar)
	ARCHFUNC(strlen, baseline)
ENDARCHFUNCS(strlen)

/*
 * strlen(string)
 *	  %rdi
 *
 * Scalar variant: scans the string one aligned 8-byte word at a time and
 * detects a NUL byte with the
 *
 *	((x - 0x01....01) & ~x & 0x80....80)
 *
 * trick.  The subtraction is expressed as an addition of
 * 0x0 - 0x01....01 (= 0xfefefefefefefeff) so that a single leaq can
 * perform it.
 *
 * For a description of the trick see either:
 * - "Hacker's Delight" by Henry S. Warren, Jr.
 * - "Optimizing subroutines in assembly language: An optimization guide
 *   for x86 platforms" by Agner Fog
 *
 * The latter contains a 32-bit variant of the same algorithm coded in
 * assembly for i386.
 *
 * Register roles:
 *	%r8	0x0 - 0x01....01, the addend for leaq
 *	%r9	0x80....80, the per-byte high bit mask
 *	%r10	the string pointer as passed in
 *	%rdi	address of the 8-byte word being examined
 *	%r11	the word loaded from (%rdi), later its complement
 *	%rcx	scratch; non-zero after the test iff a NUL byte was seen
 *	%rdx	head only: mask covering the bytes before the string
 *	%rax	result
 */
ARCHENTRY(strlen, scalar)
	movabsq	$0xfefefefefefefeff,%r8
	movabsq	$0x8080808080808080,%r9

	movq	%rdi,%r10		/* kept for the final subtraction */
	movq	%rdi,%rcx
	testb	$7,%dil
	jz	.Lscalar_check		/* already 8-byte aligned */

	/*
	 * Misaligned start: round the pointer down to a multiple of 8,
	 * load the whole word and set every bit of the bytes that lie
	 * before the string, so that they cannot be taken for a NUL.
	 */
	andq	$~7,%rdi
	movq	(%rdi),%r11
	shlq	$3,%rcx			/* address * 8; the shift below only uses the low bits of %cl */
	movq	$-1,%rdx
	shlq	%cl,%rdx		/* zero bits now cover the leading bytes */
	notq	%rdx			/* ... one bits after the inversion */
	orq	%rdx,%r11

	leaq	(%r11,%r8),%rcx		/* x - 0x01....01 */
	notq	%r11			/* ~x */
	andq	%r11,%rcx
	andq	%r9,%rcx		/* & 0x80....80 */
	jnz	.Lscalar_found

	/*
	 * Main loop: one aligned word per iteration.  Aligned input
	 * enters at .Lscalar_check so that its first word is examined
	 * before the pointer is advanced.
	 */
	ALIGN_TEXT
.Lscalar_next:
	leaq	8(%rdi),%rdi
.Lscalar_check:
	movq	(%rdi),%r11
	leaq	(%r11,%r8),%rcx		/* x - 0x01....01 */
	notq	%r11			/* ~x */
	andq	%r11,%rcx
	andq	%r9,%rcx		/* & 0x80....80 */
	jz	.Lscalar_next

	/*
	 * %rcx has a high bit set in the word at (%rdi).  The index of
	 * the lowest set bit divided by 8 is the byte offset within the
	 * word; add the word address and subtract the start of the
	 * string to obtain the length.
	 */
.Lscalar_found:
	bsfq	%rcx,%rcx
	shrq	$3,%rcx
	leaq	(%rcx,%rdi),%rax
	subq	%r10,%rax
	ret
ARCHEND(strlen, scalar)

/*
 * strlen(string)
 *	  %rdi
 *
 * Baseline variant: compares 16 bytes at a time against zero with
 * pcmpeqb and turns the result into a bit mask with pmovmskb.  The first
 * chunk is read from the string pointer rounded down to a multiple of
 * 16; matches in the bytes preceding the string are shifted out of the
 * mask.  The loop that follows handles two chunks per iteration.
 *
 * Register roles:
 *	%rsi	the string pointer as passed in
 *	%rcx	copy of the pointer, then its low four bits
 *	%rdi	aligned cursor; inside the loop it points 16 bytes past
 *		the first of the two chunks
 *	%xmm1	zero before, byte-wise comparison result after pcmpeqb
 *	%eax	NUL bit mask, then the result
 */
ARCHENTRY(strlen, baseline)
	movq	%rdi, %rcx
	pxor	%xmm1, %xmm1
	andq	$~0xf, %rdi		/* align string */
	pcmpeqb	(%rdi), %xmm1		/* compare head (with junk before string) */
	movq	%rcx, %rsi		/* string pointer copy for later */
	andl	$0xf, %ecx		/* amount of bytes rdi is past 16 byte alignment */
	pmovmskb %xmm1, %eax
	addq	$32, %rdi		/* advance to next iteration */
	shrl	%cl, %eax		/* clear out matches in junk bytes */
	testl	%eax, %eax		/* any match? (can't use ZF from SHR as CL=0 is possible) */
	jnz	.Lbaseline_head

	ALIGN_TEXT
.Lbaseline_loop:
	pxor	%xmm1, %xmm1
	pcmpeqb	-16(%rdi), %xmm1	/* find NUL bytes in the first chunk */
	pmovmskb %xmm1, %eax
	testl	%eax, %eax		/* were any NUL bytes present? */
	jnz	.Lbaseline_found

	/* the same once more for the second chunk */
	pxor	%xmm1, %xmm1
	pcmpeqb	(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	addq	$32, %rdi		/* advance to next iteration */
	testl	%eax, %eax
	jz	.Lbaseline_loop

	/*
	 * Match in the second chunk: take back half of the advancement
	 * so that, as for a match in the first chunk, %rdi is 16 bytes
	 * past the chunk holding the NUL.
	 */
	subq	$16, %rdi
.Lbaseline_found:
	tzcntl	%eax, %eax		/* offset of the first NUL within its chunk */
	subq	%rsi, %rdi		/* distance from the string start to (%rdi) */
	leaq	-16(%rdi, %rax, 1), %rax /* minus the 16 byte bias, plus the offset */
	ret

	/*
	 * Match in the head: after the shift, bit 0 of the mask belongs
	 * to the first byte of the string, so the bit index is the length.
	 */
.Lbaseline_head:
	tzcntl	%eax, %eax
	ret
ARCHEND(strlen, baseline)

	.section .note.GNU-stack,"",%progbits