/*-1* Copyright (c) 2023 The FreeBSD Foundation2*3* This software was developed by Robert Clausecker <[email protected]>4* under sponsorship from the FreeBSD Foundation.5*6* Redistribution and use in source and binary forms, with or without7* modification, are permitted provided that the following conditions8* are met:9* 1. Redistributions of source code must retain the above copyright10* notice, this list of conditions and the following disclaimer.11* 2. Redistributions in binary form must reproduce the above copyright12* notice, this list of conditions and the following disclaimer in the13* documentation and/or other materials provided with the distribution.14*15* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND16* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE17* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE18* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE19* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL20* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS21* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)22* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT23* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY24* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF25* SUCH DAMAGE26*/2728#include <machine/asm.h>29#include <machine/param.h>3031#include "amd64_archlevel.h"3233#define ALIGN_TEXT .p2align 4, 0x903435ARCHFUNCS(strncmp)36ARCHFUNC(strncmp, scalar)37ARCHFUNC(strncmp, baseline)38ENDARCHFUNCS(strncmp)3940/*41* This is just the scalar loop unrolled a bunch of times.42*/43ARCHENTRY(strncmp, scalar)44xor %eax, %eax45sub $4, %rdx # 4 chars left to compare?46jbe 1f4748ALIGN_TEXT490: movzbl (%rdi), %ecx50test %ecx, %ecx # NUL char in first string?51jz .L052cmpb (%rsi), %cl # mismatch between strings?53jnz .L05455movzbl 1(%rdi), %ecx56test %ecx, %ecx57jz .L158cmpb 1(%rsi), %cl59jnz .L16061movzbl 2(%rdi), %ecx62test %ecx, %ecx63jz .L264cmpb 2(%rsi), %cl65jnz .L26667movzbl 3(%rdi), %ecx68test %ecx, %ecx69jz .L370cmpb 3(%rsi), %cl71jnz .L37273add $4, %rdi # advance to next iteration74add $4, %rsi75sub $4, %rdx76ja 0b7778/* end of string within the next 4 characters */791: cmp $-4, %edx # end of string reached immediately?80jz .Leq81movzbl (%rdi), %ecx82test %ecx, %ecx83jz .L084cmpb (%rsi), %cl85jnz .L08687cmp $-3, %edx # end of string reached after 1 char?88jz .Leq89movzbl 1(%rdi), %ecx90test %ecx, %ecx91jz .L192cmpb 1(%rsi), %cl93jnz .L19495cmp $-2, %edx96jz .Leq97movzbl 2(%rdi), %ecx98test %ecx, %ecx99jz .L2100cmpb 2(%rsi), %cl101jnz .L2102103cmp $-1, %edx # either end of string after 3 chars,104jz .Leq # or it boils down to the last char105106.L3: inc %eax107.L2: inc %eax108.L1: inc %eax109.L0: movzbl (%rsi, %rax, 1), %ecx110movzbl (%rdi, %rax, 1), %eax111sub %ecx, %eax112.Leq: ret113ARCHEND(strncmp, scalar)114115ARCHENTRY(strncmp, baseline)116push %rbx117sub $1, %rdx # RDX--, so RDX points to the last byte to compare118jb .Lempty # where there any bytes to compare at all?119120lea 15(%rdi), %r8d # end of head121lea 15(%rsi), %r9d122mov %edi, %eax123mov %esi, %ebx124xor %edi, %r8d # bits that changed between first and last byte125xor %esi, %r9d126and $~0xf, %rdi # align heads to 16 bytes127and $~0xf, %rsi128or %r8d, %r9d129and $0xf, %eax # offset from alignment130and $0xf, %ebx131movdqa (%rdi), %xmm0 # load aligned heads132movdqa (%rsi), %xmm2133pxor %xmm1, %xmm1134cmp $16, %rdx # end of buffer within the first 32 bytes?135jb .Llt16136137test $PAGE_SIZE, %r9d # did the page change?138jz 0f # if not, take fast path139140141/* heads may cross page boundary, avoid unmapped loads */142movdqa %xmm0, -32(%rsp) # stash copies of the heads on the stack143movdqa %xmm2, -16(%rsp)144mov $-1, %r8d145mov $-1, %r9d146mov %eax, %ecx147shl %cl, %r8d # string head in XMM0148mov %ebx, %ecx149shl %cl, %r9d # string head in XMM2150pcmpeqb %xmm1, %xmm0151pcmpeqb %xmm1, %xmm2152pmovmskb %xmm0, %r10d153pmovmskb %xmm2, %r11d154test %r8d, %r10d # NUL byte present in first string?155lea -32(%rsp), %r8156cmovz %rdi, %r8157test %r9d, %r11d # NUL byte present in second string?158lea -16(%rsp), %r9159cmovz %rsi, %r9160movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads161movdqu (%r9, %rbx, 1), %xmm4162jmp 1f163164/* rdx == 0 */165.Lempty:166xor %eax, %eax # zero-length buffers compare equal167pop %rbx168ret1691700: movdqu (%rdi, %rax, 1), %xmm0 # load true heads171movdqu (%rsi, %rbx, 1), %xmm41721: pxor %xmm2, %xmm2173pcmpeqb %xmm0, %xmm2 # NUL byte present?174pcmpeqb %xmm0, %xmm4 # which bytes match?175pandn %xmm4, %xmm2 # match and not NUL byte?176pmovmskb %xmm2, %r9d177xor $0xffff, %r9d # mismatch or NUL byte?178jnz .Lhead_mismatch179180/* load head and second chunk */181movdqa 16(%rdi), %xmm2 # load second chunks182movdqa 16(%rsi), %xmm3183lea -16(%rdx, %rbx, 1), %rdx # account for length of RSI chunk184sub %rbx, %rax # is a&0xf >= b&0xf?185jb .Lswapped # if not, proceed with swapped operands186jmp .Lnormal187188/* buffer ends within the first 16 bytes */189.Llt16: test $PAGE_SIZE, %r9d # did the page change?190jz 0f # if not, take fast path191192/* heads may cross page boundary */193movdqa %xmm0, -32(%rsp) # stash copies of the heads on the stack194movdqa %xmm2, -16(%rsp)195mov $-1, %r8d196mov $-1, %r9d197mov %eax, %ecx198shl %cl, %r8d # string head in XMM0199mov %ebx, %ecx200shl %cl, %r9d # string head in XMM2201pcmpeqb %xmm1, %xmm0202pcmpeqb %xmm1, %xmm2203pmovmskb %xmm0, %r10d204pmovmskb %xmm2, %r11d205lea (%rdx, %rax, 1), %ecx # location of last buffer byte in xmm0206bts %ecx, %r10d # treat as if NUL byte present207lea (%rdx, %rbx, 1), %ecx208bts %ecx, %r11d209test %r8w, %r10w # NUL byte present in first string head?210lea -32(%rsp), %r8211cmovz %rdi, %r8212test %r9w, %r11w # NUL byte present in second string head?213lea -16(%rsp), %r9214cmovz %rsi, %r9215movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads216movdqu (%r9, %rbx, 1), %xmm4217jmp 1f2182190: movdqu (%rdi, %rax, 1), %xmm0 # load true heads220movdqu (%rsi, %rbx, 1), %xmm42211: pxor %xmm2, %xmm2222pcmpeqb %xmm0, %xmm2 # NUL byte present?223pcmpeqb %xmm0, %xmm4 # which bytes match?224pandn %xmm4, %xmm2 # match and not NUL byte?225pmovmskb %xmm2, %r9d226btr %edx, %r9d # induce mismatch in last byte of buffer227not %r9d # mismatch or NUL byte?228229/* mismatch in true heads */230ALIGN_TEXT231.Lhead_mismatch:232tzcnt %r9d, %r9d # where is the mismatch?233add %rax, %rdi # return to true heads234add %rbx, %rsi235movzbl (%rdi, %r9, 1), %eax # mismatching characters236movzbl (%rsi, %r9, 1), %ecx237sub %ecx, %eax238pop %rbx239ret240241/* rax >= 0 */242ALIGN_TEXT243.Lnormal:244neg %rax245movdqu 16(%rsi, %rax, 1), %xmm0246sub %rdi, %rsi # express RSI as distance from RDI247lea (%rsi, %rax, 1), %rbx # point RBX to offset in second string248neg %rax # ... corresponding to RDI249pcmpeqb %xmm3, %xmm1 # NUL present?250pcmpeqb %xmm2, %xmm0 # Mismatch between chunks?251pmovmskb %xmm1, %r8d252pmovmskb %xmm0, %r9d253mov $16, %ecx254cmp %rcx, %rdx # does the buffer end within (RDI,RSI,1)?255cmovb %edx, %ecx # ECX = min(16, RDX)256add $32, %rdi # advance to next iteration257bts %ecx, %r8d # mark end-of-buffer as if there was a NUL byte258test %r8w, %r8w # NUL or end of buffer found?259jnz .Lnul_found2260xor $0xffff, %r9d261jnz .Lmismatch2262sub $48, %rdx # end of buffer within first main loop iteration?263jb .Ltail # if yes, process tail264265/*266* During the main loop, the layout of the two strings is something like:267*268* v ------1------ v ------2------ v269* RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...270* RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...271*272* where v indicates the alignment boundaries and corresponding chunks273* of the strings have the same letters. Chunk A has been checked in274* the previous iteration. This iteration, we first check that string275* RSI doesn't end within region 2, then we compare chunk B between the276* two strings. As RSI is known not to hold a NUL byte in regsions 1277* and 2 at this point, this also ensures that RDI has not ended yet.278*/279ALIGN_TEXT2800: movdqu (%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI281pxor %xmm1, %xmm1282pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI?283pcmpeqb (%rdi), %xmm0 # where do the chunks match?284pmovmskb %xmm1, %r8d285pmovmskb %xmm0, %r9d286test %r8d, %r8d287jnz .Lnul_found288xor $0xffff, %r9d # any mismatches?289jnz .Lmismatch290291/* main loop unrolled twice */292movdqu 16(%rdi, %rbx, 1), %xmm0293pxor %xmm1, %xmm1294pcmpeqb 16(%rdi, %rsi, 1), %xmm1295pcmpeqb 16(%rdi), %xmm0296pmovmskb %xmm1, %r8d297pmovmskb %xmm0, %r9d298add $32, %rdi299test %r8d, %r8d300jnz .Lnul_found2301xor $0xffff, %r9d302jnz .Lmismatch2303sub $32, %rdx # end of buffer within next iteration?304jae 0b305306/* end of buffer will occur in next 32 bytes */307.Ltail: movdqu (%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI308pxor %xmm1, %xmm1309pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI?310pcmpeqb (%rdi), %xmm0 # where do the chunks match?311pmovmskb %xmm1, %r8d312pmovmskb %xmm0, %r9d313bts %edx, %r8d # indicate NUL byte at last byte in buffer314test %r8w, %r8w # NUL byte in first chunk?315jnz .Lnul_found316xor $0xffff, %r9d # any mismatches?317jnz .Lmismatch318319/* main loop unrolled twice */320movdqu 16(%rdi, %rbx, 1), %xmm0321pxor %xmm1, %xmm1322pcmpeqb 16(%rdi, %rsi, 1), %xmm1323pcmpeqb 16(%rdi), %xmm0324pmovmskb %xmm1, %r8d325pmovmskb %xmm0, %r9d326sub $16, %edx # take first half into account327bts %edx, %r8d # indicate NUL byte at last byte in buffer328add $32, %rdi329330.Lnul_found2:331sub $16, %rdi332333.Lnul_found:334mov %eax, %ecx335mov %r8d, %r10d336shl %cl, %r8d # adjust NUL mask to positions in RDI/RBX337not %r9d # mask of mismatches338or %r8w, %r9w # NUL bytes als count as mismatches339jnz .Lmismatch340341/*342* (RDI) == (RSI) and NUL is past the string.343* compare (RSI) with the corresponding part344* of the other string until the NUL byte.345*/346movdqu (%rdi, %rax, 1), %xmm0347pcmpeqb (%rdi, %rsi, 1), %xmm0348add %rdi, %rsi # restore RSI pointer349add %rax, %rdi # point RDI to chunk corresponding to (RSI)350pmovmskb %xmm0, %ecx # mask of matches351not %ecx # mask of mismatches352or %r10d, %ecx # mask of mismatches or NUL bytes353tzcnt %ecx, %ecx # location of first mismatch354movzbl (%rdi, %rcx, 1), %eax355movzbl (%rsi, %rcx, 1), %ecx356sub %ecx, %eax357pop %rbx358ret359360.Lmismatch2:361sub $16, %rdi362363/* a mismatch has been found between RBX and RSI */364.Lmismatch:365tzcnt %r9d, %r9d # where is the mismatch?366add %rdi, %rbx # turn RBX from offset into pointer367movzbl (%rbx, %r9, 1), %ecx368movzbl (%rdi, %r9, 1), %eax369sub %ecx, %eax370pop %rbx371ret372373/* rax < 0 */374ALIGN_TEXT375.Lswapped:376movdqu 16(%rdi, %rax, 1), %xmm0377sub %rsi, %rdi # express RDI as distance from RDI378lea (%rdi, %rax, 1), %rbx # point RBX to offset in first string379pcmpeqb %xmm2, %xmm1 # NUL present?380pcmpeqb %xmm3, %xmm0 # mismatch between chunks?381pmovmskb %xmm1, %r8d382pmovmskb %xmm0, %r9d383add %rax, %rdx # RDX points to buffer end in RSI384neg %rax # ... corresponding to RSI385mov $16, %ecx386cmp %rcx, %rdx # does the buffer end within (RSI,RDI,1)?387cmovb %edx, %ecx # ECX = min(16, RDX)388add $32, %rsi389bts %ecx, %r8d # mark end-of-buffer as if there was a NUL byte390test %r8w, %r8w # NUL or end of buffer found?391jnz .Lnul_found2s392xor $0xffff, %r9d393jnz .Lmismatch2s394sub $48, %rdx # end of buffer within first main loop iteration?395jb .Ltails # if yes, process tail396397ALIGN_TEXT3980: movdqu (%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI399pxor %xmm1, %xmm1400pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RDI?401pcmpeqb (%rsi), %xmm0 # where do the chunks match?402pmovmskb %xmm1, %r8d403pmovmskb %xmm0, %r9d404test %r8d, %r8d405jnz .Lnul_founds406xor $0xffff, %r9d # any mismatches?407jnz .Lmismatchs408409/* main loop unrolled twice */410movdqu 16(%rsi, %rbx, 1), %xmm0411pxor %xmm1, %xmm1412pcmpeqb 16(%rsi, %rdi, 1), %xmm1413pcmpeqb 16(%rsi), %xmm0414pmovmskb %xmm1, %r8d415pmovmskb %xmm0, %r9d416add $32, %rsi417test %r8d, %r8d418jnz .Lnul_found2s419xor $0xffff, %r9d420jnz .Lmismatch2s421sub $32, %rdx # end of buffer within next iteration?422jae 0b423424/* end of buffer will occur in next 32 bytes */425.Ltails:426movdqu (%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI427pxor %xmm1, %xmm1428pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RDI?429pcmpeqb (%rsi), %xmm0 # where do the chunks match?430pmovmskb %xmm1, %r8d431pmovmskb %xmm0, %r9d432bts %edx, %r8d # indicate NUL byte at laste byte in buffer433test %r8w, %r8w # NUL byte in first chunk?434jnz .Lnul_founds435xor $0xffff, %r9d # any mismatches?436jnz .Lmismatchs437438/* main loop unrolled twice */439movdqu 16(%rsi, %rbx, 1), %xmm0440pxor %xmm1, %xmm1441pcmpeqb 16(%rsi, %rdi, 1), %xmm1442pcmpeqb 16(%rsi), %xmm0443pmovmskb %xmm1, %r8d444pmovmskb %xmm0, %r9d445sub $16, %edx # take first half into account446bts %edx, %r8d # indicate NUL byte at laste byte in buffer447add $32, %rsi448449.Lnul_found2s:450sub $16, %rsi451452.Lnul_founds:453mov %eax, %ecx454mov %r8d, %r10d455shl %cl, %r8d # adjust NUL mask to positions in RSI/RBX456not %r9d # mask of mismatches457or %r8w, %r9w # NUL bytes also count as mismatches458jnz .Lmismatchs459460movdqu (%rsi, %rax, 1), %xmm0461pcmpeqb (%rsi, %rdi, 1), %xmm0462add %rsi, %rdi # restore RDI pointer463add %rax, %rsi # point RSI to chunk corresponding to (RDI)464pmovmskb %xmm0, %ecx # mask of matches465not %ecx # mask of mismatches466or %r10d, %ecx # mask of mismatches or NUL bytes467tzcnt %ecx, %ecx # location of first mismatch468movzbl (%rdi, %rcx, 1), %eax469movzbl (%rsi, %rcx, 1), %ecx470sub %ecx, %eax471pop %rbx472ret473474.Lmismatch2s:475sub $16, %rsi476477.Lmismatchs:478tzcnt %r9d, %r9d # where is the mismatch?479add %rsi, %rbx # turn RBX from offset into pointer480movzbl (%rbx, %r9, 1), %eax481movzbl (%rsi, %r9, 1), %ecx482sub %ecx, %eax483pop %rbx484ret485ARCHEND(strncmp, baseline)486487.section .note.GNU-stack,"",%progbits488489490