/*-1* Copyright (c) 2023, The FreeBSD Foundation2*3* SPDX-License-Expression: BSD-2-Clause4*5* Portions of this software were developed by Robert Clausecker6* <[email protected]> under sponsorship from the FreeBSD Foundation.7*8* Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcmp.S9* written by J.T. Conklin <[email protected]> that was originally10* dedicated to the public domain.11*/1213#include <machine/asm.h>14#include <machine/param.h>1516#if 017RCSID("$NetBSD: strcmp.S,v 1.3 2004/07/19 20:04:41 drochner Exp $")18#endif1920#include "amd64_archlevel.h"2122#define ALIGN_TEXT .p2align 4, 0x902324ARCHFUNCS(strcmp)25ARCHFUNC(strcmp, scalar)26ARCHFUNC(strcmp, baseline)27ENDARCHFUNCS(strcmp)2829ARCHENTRY(strcmp, scalar)30/*31* Align s1 to word boundary.32* Consider unrolling loop?33*/34.Ls1align:35testb $7,%dil36je .Ls1aligned37movb (%rdi),%al38incq %rdi39movb (%rsi),%dl40incq %rsi41testb %al,%al42je .Ldone43cmpb %al,%dl44je .Ls1align45jmp .Ldone4647/*48* Check whether s2 is aligned to a word boundary. If it is, we49* can compare by words. Otherwise we have to compare by bytes.50*/51.Ls1aligned:52testb $7,%sil53jne .Lbyte_loop5455movabsq $0x0101010101010101,%r856subq $8,%rdi57movabsq $0x8080808080808080,%r958subq $8,%rsi5960ALIGN_TEXT61.Lword_loop:62movq 8(%rdi),%rax63addq $8,%rdi64movq 8(%rsi),%rdx65addq $8,%rsi66cmpq %rax,%rdx67jne .Lbyte_loop68subq %r8,%rdx69notq %rax70andq %rax,%rdx71testq %r9,%rdx72je .Lword_loop7374ALIGN_TEXT75.Lbyte_loop:76movb (%rdi),%al77incq %rdi78movb (%rsi),%dl79incq %rsi80testb %al,%al81je .Ldone82cmpb %al,%dl83je .Lbyte_loop8485.Ldone:86movzbq %al,%rax87movzbq %dl,%rdx88subq %rdx,%rax89ret90ARCHEND(strcmp, scalar)9192ARCHENTRY(strcmp, baseline)93/* check if either string crosses a page in the head */94lea 15(%rdi), %r8d # end of head95lea 15(%rsi), %r9d96mov %edi, %eax97mov %esi, %edx98xor %edi, %r8d # bits that changed between first and last byte99xor %esi, %r9d100and $~0xf, %rdi # align heads to 16 bytes101and $~0xf, %rsi102or %r8d, %r9d # in either RSI or RDI103and $0xf, %eax # offset from alignment104and $0xf, %edx105pxor %xmm1, %xmm1106test $PAGE_SIZE, %r9d # did the page change?107jz 0f # if not, take fast path108109/* heads may cross page boundary, avoid unmapped loads */110movdqa (%rdi), %xmm0 # load aligned heads111movdqa (%rsi), %xmm2112mov $-1, %r8d113mov $-1, %r9d114mov %eax, %ecx115shl %cl, %r8d # string head in XMM0116mov %edx, %ecx117shl %cl, %r9d # string head in XMM2118movdqa %xmm0, -40(%rsp) # stash copies of the heads on the stack119movdqa %xmm2, -24(%rsp)120pcmpeqb %xmm1, %xmm0121pcmpeqb %xmm1, %xmm2122pmovmskb %xmm0, %r10d123pmovmskb %xmm2, %r11d124test %r8d, %r10d # NUL byte present in first string?125lea -40(%rsp), %r8126cmovz %rdi, %r8127test %r9d, %r11d # NUL byte present in second string?128lea -24(%rsp), %r9129cmovz %rsi, %r9130movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads131movdqu (%r9, %rdx, 1), %xmm4132jmp 1f1331340: movdqu (%rdi, %rax, 1), %xmm0 # load true heads135movdqu (%rsi, %rdx, 1), %xmm41361: pxor %xmm2, %xmm2137pcmpeqb %xmm0, %xmm2 # NUL byte present?138pcmpeqb %xmm0, %xmm4 # which bytes match?139pandn %xmm4, %xmm2 # match and not NUL byte?140pmovmskb %xmm2, %r9d141xor $0xffff, %r9d # mismatch or NUL byte?142jnz .Lhead_mismatch143144/* load head and second chunk */145movdqa 16(%rdi), %xmm2 # load second chunks146movdqa 16(%rsi), %xmm3147sub %rdx, %rax # is a&0xf >= b&0xf?148jb .Lswapped # if not, proceed with swapped operands149150neg %rax151movdqu 16(%rsi, %rax, 1), %xmm0152sub %rdi, %rsi # express RSI as distance from RDI153lea (%rsi, %rax, 1), %rdx # point RDX to offset in second string154neg %rax155pcmpeqb %xmm3, %xmm1 # ... corresponding to RDI156pcmpeqb %xmm2, %xmm0157pmovmskb %xmm1, %r8d158pmovmskb %xmm0, %r9d159add $16, %rdi160test %r8d, %r8d161jnz .Lnul_found162xor $0xffff, %r9d163jnz .Lmismatch164add $16, %rdi # advance aligned pointers165166/*167* During the main loop, the layout of the two strings is something like:168*169* v ------1------ v ------2------ v170* RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...171* RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...172*173* where v indicates the alignment boundaries and corresponding chunks174* of the strings have the same letters. Chunk A has been checked in175* the previous iteration. This iteration, we first check that string176* RSI doesn't end within region 2, then we compare chunk B between the177* two strings. As RSI is known not to hold a NUL byte in regsions 1178* and 2 at this point, this also ensures that RDI has not ended yet.179*/180ALIGN_TEXT1810: movdqu (%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?182pxor %xmm1, %xmm1183pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI?184pcmpeqb (%rdi), %xmm0 # where do the chunks match?185pmovmskb %xmm1, %r8d186pmovmskb %xmm0, %r9d187test %r8d, %r8d188jnz .Lnul_found189xor $0xffff, %r9d # any mismatches?190jnz .Lmismatch191192/* main loop unrolled twice */193movdqu 16(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?194pxor %xmm1, %xmm1195pcmpeqb 16(%rdi, %rsi, 1), %xmm1 # end of string in RSI?196pcmpeqb 16(%rdi), %xmm0 # where do the chunks match?197pmovmskb %xmm1, %r8d198pmovmskb %xmm0, %r9d199add $32, %rdi200test %r8d, %r8d201jnz .Lnul_found2202xor $0xffff, %r9d # any mismatches?203jz 0b204205sub $16, %rdi # roll back second increment206207/* a mismatch has been found between RDX and RSI */208.Lmismatch:209tzcnt %r9d, %r9d # where is the mismatch?210add %rdi, %rdx # turn RDX from offset to pointer211movzbl (%rdx, %r9, 1), %ecx212movzbl (%rdi, %r9, 1), %eax213sub %ecx, %eax # difference of the mismatching chars214ret215216/* mismatch in true heads */217.Lhead_mismatch:218tzcnt %r9d, %r9d # where is the mismatch?219add %rax, %rdi # return to true heads220add %rdx, %rsi221movzbl (%rdi, %r9, 1), %eax # mismatching characters222movzbl (%rsi, %r9, 1), %ecx223sub %ecx, %eax224ret225226.Lnul_found2:227sub $16, %rdi # roll back second increment228229/* a NUL has been found in RSI */230.Lnul_found:231mov %eax, %ecx232mov %r8d, %r10d233shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX234xor $0xffff, %r9d # mask of mismatches235or %r8d, %r9d # NUL bytes also count as mismatches236jnz .Lmismatch237238/*239* (RDI) == (RSI) and NUL is past the string.240* Compare (RSI) with the corresponding part241* of the other string until the NUL byte.242*/243movdqu (%rdi, %rax, 1), %xmm0244pcmpeqb (%rdi, %rsi, 1), %xmm0245add %rdi, %rsi # restore RSI pointer246add %rax, %rdi # point RDI to chunk corresponding to (RSI)247pmovmskb %xmm0, %ecx # mask of matches248not %ecx # mask of mismatches249or %r10d, %ecx # mask of mismatches or NUL bytes250tzcnt %ecx, %ecx # location of first mismatch251movzbl (%rdi, %rcx, 1), %eax252movzbl (%rsi, %rcx, 1), %ecx253sub %ecx, %eax254ret255256/*257* If (a&0xf) < (b&0xf), we do the same thing but with swapped258* operands. I found that this performs slightly better than259* using conditional moves to do the swap branchless.260*/261.Lswapped:262movdqu 16(%rdi, %rax, 1), %xmm0263sub %rsi, %rdi # express RDI as distance from RSI264lea (%rdi, %rax, 1), %rdx # point RDX to offset in RDI corresponding to RSI265neg %rax # make difference positive266pcmpeqb %xmm2, %xmm1267pcmpeqb %xmm3, %xmm0268pmovmskb %xmm1, %r8d269pmovmskb %xmm0, %r9d270add $16, %rsi # advance aligned pointers271test %r8d, %r8d272jnz .Lnul_founds273xor $0xffff, %r9d274jnz .Lmismatchs275add $16, %rsi276277/*278* During the main loop, the layout of the two strings is something like:279*280* v ------1------ v ------2------ v281* RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...282* RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...283*284* where v indicates the alignment boundaries and corresponding chunks285* of the strings have the same letters. Chunk A has been checked in286* the previous iteration. This iteration, we first check that string287* RSI doesn't end within region 2, then we compare chunk B between the288* two strings. As RSI is known not to hold a NUL byte in regsions 1289* and 2 at this point, this also ensures that RDI has not ended yet.290*/291ALIGN_TEXT2920: movdqu (%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?293pxor %xmm1, %xmm1294pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RSI?295pcmpeqb (%rsi), %xmm0 # where do the chunks match?296pmovmskb %xmm1, %r8d297pmovmskb %xmm0, %r9d298test %r8d, %r8d299jnz .Lnul_founds300xor $0xffff, %r9d # any mismatches?301jnz .Lmismatchs302303/* main loop unrolled twice */304movdqu 16(%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?305pxor %xmm1, %xmm1306pcmpeqb 16(%rsi, %rdi, 1), %xmm1 # end of string in RSI?307pcmpeqb 16(%rsi), %xmm0 # where do the chunks match?308pmovmskb %xmm1, %r8d309pmovmskb %xmm0, %r9d310add $32, %rsi311test %r8d, %r8d312jnz .Lnul_found2s313xor $0xffff, %r9d # any mismatches?314jz 0b315316sub $16, %rsi # roll back second increment317318/* a mismatch has been found between RDX and RDI */319.Lmismatchs:320tzcnt %r9d, %r9d # where is the mismatch?321add %rsi, %rdx # turn RDX from offset to pointer322movzbl (%rdx, %r9, 1), %eax323movzbl (%rsi, %r9, 1), %ecx324sub %ecx, %eax # difference of the mismatching chars325ret326327.Lnul_found2s:328sub $16, %rsi # roll back second increment329330/* a NUL has been found in RSI */331.Lnul_founds:332mov %eax, %ecx333mov %r8d, %r10d334shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX335xor $0xffff, %r9d # mask of mismatches336or %r8d, %r9d # NUL bytes also count as mismatches337jnz .Lmismatchs338339/*340* (RDI) == (RSI) and NUL is past the string.341* Compare (RSI) with the corresponding part342* of the other string until the NUL byte.343*/344movdqu (%rsi, %rax, 1), %xmm0345pcmpeqb (%rsi, %rdi, 1), %xmm0346add %rsi, %rdi # restore RDI pointer347add %rax, %rsi # point RSI to chunk corresponding to (RDI)348pmovmskb %xmm0, %ecx # mask of matches349not %ecx # mask of mismatches350or %r10d, %ecx # mask of mismatches or NUL bytes351tzcnt %ecx, %ecx # location of first mismatch352movzbl (%rdi, %rcx, 1), %eax353movzbl (%rsi, %rcx, 1), %ecx354sub %ecx, %eax355ret356ARCHEND(strcmp, baseline)357358.section .note.GNU-stack,"",%progbits359360361