/*-
 * Copyright (c) 2018, 2023 The FreeBSD Foundation
 *
 * This software was developed by Mateusz Guzik <[email protected]>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Portions of this software were developed by Robert Clausecker
 * <[email protected]> under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>
#include <machine/param.h>

#include "amd64_archlevel.h"

/*
 * Note: this routine was written with kernel use in mind (read: no SIMD);
 * it is only present in userspace as a temporary measure until something
 * better gets imported.
 */

#define ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */

#ifdef BCMP
#define memcmp bcmp
#endif

ARCHFUNCS(memcmp)
	ARCHFUNC(memcmp, scalar)
	ARCHFUNC(memcmp, baseline)
ENDARCHFUNCS(memcmp)

/*
 * The scalar kernel dispatches on buffer length.  The numeric labels
 * encode the length class handled (e.g. 100816 for 9--16 byte buffers,
 * 103200 for 33 and more); on a mismatch, control branches to a handler
 * whose label appends the nominal offset of the load that differed
 * (e.g. 10163208 for the load at offset 8 in the 101632 class).
 */
ARCHENTRY(memcmp, scalar)
	xorl	%eax,%eax
10:
	cmpq	$16,%rdx
	ja	101632f

	cmpb	$8,%dl
	jg	100816f

	cmpb	$4,%dl
	jg	100408f

	cmpb	$2,%dl
	jge	100204f

	cmpb	$1,%dl
	jl	100000f
	movzbl	(%rdi),%eax
	movzbl	(%rsi),%r8d
	subl	%r8d,%eax
100000:
	ret

	ALIGN_TEXT
100816:				/* 9--16 bytes */
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	movq	-8(%rdi,%rdx),%r8
	movq	-8(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10081608f
	ret
	ALIGN_TEXT
100408:				/* 5--8 bytes */
	movl	(%rdi),%r8d
	movl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	80f
	movl	-4(%rdi,%rdx),%r8d
	movl	-4(%rsi,%rdx),%r9d
	cmpl	%r8d,%r9d
	jne	10040804f
	ret
	ALIGN_TEXT
100204:				/* 2--4 bytes */
	movzwl	(%rdi),%r8d
	movzwl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	movzwl	-2(%rdi,%rdx),%r8d
	movzwl	-2(%rsi,%rdx),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	ret
	ALIGN_TEXT
101632:				/* 17--32 bytes */
	cmpq	$32,%rdx
	ja	103200f
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	movq	8(%rdi),%r8
	movq	8(%rsi),%r9
	cmpq	%r8,%r9
	jne	10163208f
	movq	-16(%rdi,%rdx),%r8
	movq	-16(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10163216f
	movq	-8(%rdi,%rdx),%r8
	movq	-8(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10163224f
	ret
	ALIGN_TEXT
103200:				/* 33+ bytes, 32 per iteration */
	movq	(%rdi),%r8
	movq	8(%rdi),%r9
	subq	(%rsi),%r8
	subq	8(%rsi),%r9
	orq	%r8,%r9
	jnz	10320000f

	movq	16(%rdi),%r8
	movq	24(%rdi),%r9
	subq	16(%rsi),%r8
	subq	24(%rsi),%r9
	orq	%r8,%r9
	jnz	10320016f

	leaq	32(%rdi),%rdi
	leaq	32(%rsi),%rsi
	subq	$32,%rdx
	cmpq	$32,%rdx
	jae	103200b
	cmpb	$0,%dl
	jne	10b
	ret
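/*
 * Illustrative sketch, not part of the build (p, q, and n stand for
 * %rdi, %rsi, and %rdx; locate_difference() is a hypothetical stand-in
 * for the handlers below): the length classes above avoid byte loops
 * by comparing two overlapping words anchored at either end of the
 * buffer.  For the 9--16 byte class this is roughly the following C:
 *
 *	uint64_t a0, b0, a1, b1;
 *
 *	memcpy(&a0, p, 8);
 *	memcpy(&b0, q, 8);
 *	memcpy(&a1, p + n - 8, 8);
 *	memcpy(&b1, q + n - 8, 8);
 *	if (a0 != b0 || a1 != b1)
 *		return (locate_difference());
 *	return (0);
 */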
/*
 * Mismatch was found.
 */
#ifdef BCMP
	ALIGN_TEXT
10320016:
10320000:
10081608:
10163224:
10163216:
10163208:
10040804:
80:
1:
	leal	1(%eax),%eax
	ret
#else
/*
 * We need to compute the difference between the strings.
 * Start with narrowing the range down (16 -> 8 -> 4 bytes).
 */
	ALIGN_TEXT
10320016:
	leaq	16(%rdi),%rdi
	leaq	16(%rsi),%rsi
10320000:
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	leaq	8(%rdi),%rdi
	leaq	8(%rsi),%rsi
	jmp	80f
	ALIGN_TEXT
10081608:
10163224:
	leaq	-8(%rdi,%rdx),%rdi
	leaq	-8(%rsi,%rdx),%rsi
	jmp	80f
	ALIGN_TEXT
10163216:
	leaq	-16(%rdi,%rdx),%rdi
	leaq	-16(%rsi,%rdx),%rsi
	jmp	80f
	ALIGN_TEXT
10163208:
	leaq	8(%rdi),%rdi
	leaq	8(%rsi),%rsi
	jmp	80f
	ALIGN_TEXT
10040804:
	leaq	-4(%rdi,%rdx),%rdi
	leaq	-4(%rsi,%rdx),%rsi
	jmp	1f

	ALIGN_TEXT
80:
	movl	(%rdi),%r8d
	movl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	leaq	4(%rdi),%rdi
	leaq	4(%rsi),%rsi

/*
 * We have up to 4 bytes to inspect.
 */
1:
	movzbl	(%rdi),%eax
	movzbl	(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	1(%rdi),%eax
	movzbl	1(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	2(%rdi),%eax
	movzbl	2(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	3(%rdi),%eax
	movzbl	3(%rsi),%r8d
2:
	subl	%r8d,%eax
	ret
#endif
ARCHEND(memcmp, scalar)

ARCHENTRY(memcmp, baseline)
	cmp	$32, %rdx		# enough to permit use of the long kernel?
	ja	.Llong

	test	%rdx, %rdx		# zero-byte buffer?
	je	.L0

	/*
	 * Compare strings of 1--32 bytes.  We want to do this by
	 * loading into two xmm registers and then comparing.  To avoid
	 * crossing into unmapped pages, we either load 32 bytes from
	 * the start of the buffer or 32 bytes before its end, depending
	 * on whether the overread area crosses a page boundary.
	 */

	/* check for page-boundary overreads */
	lea	31(%rdi), %eax		# end of overread
	lea	31(%rsi), %r8d
	lea	-1(%rdi, %rdx, 1), %ecx	# last character in buffer
	lea	-1(%rsi, %rdx, 1), %r9d
	xor	%ecx, %eax
	xor	%r9d, %r8d
	test	$PAGE_SIZE, %eax	# are they on different pages?
	jz	0f

	/* fix up rdi */
	movdqu	-32(%rdi, %rdx, 1), %xmm0
	movdqu	-16(%rdi, %rdx, 1), %xmm1
	lea	-8(%rsp), %rdi		# end of replacement buffer
	sub	%rdx, %rdi		# start of replacement buffer
	movdqa	%xmm0, -40(%rsp)	# copy to replacement buffer
	movdqa	%xmm1, -24(%rsp)

0:	test	$PAGE_SIZE, %r8d
	jz	0f

	/* fix up rsi */
	movdqu	-32(%rsi, %rdx, 1), %xmm0
	movdqu	-16(%rsi, %rdx, 1), %xmm1
	lea	-40(%rsp), %rsi		# end of replacement buffer
	sub	%rdx, %rsi		# start of replacement buffer
	movdqa	%xmm0, -72(%rsp)	# copy to replacement buffer
	movdqa	%xmm1, -56(%rsp)

	/* load data and compare properly */
0:	movdqu	16(%rdi), %xmm1
	movdqu	16(%rsi), %xmm3
	movdqu	(%rdi), %xmm0
	movdqu	(%rsi), %xmm2
	mov	%edx, %ecx
	mov	$-1, %edx
	shl	%cl, %rdx		# ones where the buffer is not
	pcmpeqb	%xmm3, %xmm1
	pcmpeqb	%xmm2, %xmm0
	pmovmskb %xmm1, %ecx
	pmovmskb %xmm0, %eax
	shl	$16, %ecx
	or	%ecx, %eax		# ones where the buffers match
	or	%edx, %eax		# including where the buffer is not
	not	%eax			# ones where there is a mismatch
#ifndef BCMP
	bsf	%eax, %edx		# location of the first mismatch
	cmovz	%eax, %edx		# including if there is no mismatch
	movzbl	(%rdi, %rdx, 1), %eax	# mismatching bytes
	movzbl	(%rsi, %rdx, 1), %edx
	sub	%edx, %eax
#endif
	ret

	/* empty input */
.L0:	xor	%eax, %eax
	ret
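	/*
	 * Strategy for the long (33+ byte) kernel below: %rdi is
	 * rounded down to a 16-byte boundary and %rsi is rewritten as
	 * the distance rsi - rdi, so a single induction variable walks
	 * both buffers.  Each iteration then compares 32 bytes using
	 * loads that are aligned on the rdi side; the unaligned head
	 * and tail are covered by overlapping 16-byte loads.
	 */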
	/* compare 33+ bytes */
	ALIGN_TEXT
.Llong:	movdqu	(%rdi), %xmm0		# load head
	movdqu	(%rsi), %xmm2
	mov	%rdi, %rcx
	sub	%rdi, %rsi		# express rsi as distance from rdi
	and	$~0xf, %rdi		# align rdi to 16 bytes
	movdqu	16(%rsi, %rdi, 1), %xmm1
	pcmpeqb	16(%rdi), %xmm1		# compare second half of this iteration
	add	%rcx, %rdx		# pointer just past the buffer's last byte
	jc	.Loverflow		# did this overflow?
0:	pcmpeqb	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	xor	$0xffff, %eax		# any mismatch?
	jne	.Lmismatch_head
	add	$64, %rdi		# advance to next iteration
	jmp	1f			# and get going with the loop

	/*
	 * If we got here, a buffer length was passed to memcmp(a, b, len)
	 * such that a + len < a.  While this sort of usage is illegal,
	 * it is plausible that a caller tries to do something like
	 * memcmp(a, b, SIZE_MAX) if a and b are known to differ, intending
	 * for memcmp() to stop comparing at the first mismatch.  This
	 * behaviour is not guaranteed by any version of ISO/IEC 9899,
	 * but usually works out in practice.  Let's try to make this
	 * case work by comparing until the end of the address space.
	 */
.Loverflow:
	mov	$-1, %rdx		# compare until the end of memory
	jmp	0b

	/* process buffer 32 bytes at a time */
	ALIGN_TEXT
0:	movdqu	-32(%rsi, %rdi, 1), %xmm0
	movdqu	-16(%rsi, %rdi, 1), %xmm1
	pcmpeqb	-32(%rdi), %xmm0
	pcmpeqb	-16(%rdi), %xmm1
	add	$32, %rdi		# advance to next iteration
1:	pand	%xmm0, %xmm1		# 0xff where both halves matched
	pmovmskb %xmm1, %eax
	cmp	$0xffff, %eax		# all bytes matched?
	jne	.Lmismatch
	cmp	%rdx, %rdi		# end of buffer reached?
	jb	0b

	/* less than 32 bytes left to compare */
	movdqu	-16(%rdx), %xmm1	# load 32-byte tail through end pointer
	movdqu	-16(%rdx, %rsi, 1), %xmm3
	movdqu	-32(%rdx), %xmm0
	movdqu	-32(%rdx, %rsi, 1), %xmm2
	pcmpeqb	%xmm3, %xmm1
	pcmpeqb	%xmm2, %xmm0
	pmovmskb %xmm1, %ecx
	pmovmskb %xmm0, %eax
	shl	$16, %ecx
	or	%ecx, %eax		# ones where the buffers match
	not	%eax			# ones where there is a mismatch
#ifndef BCMP
	bsf	%eax, %ecx		# location of the first mismatch
	cmovz	%eax, %ecx		# including if there is no mismatch
	add	%rcx, %rdx		# pointer to potential mismatch
	movzbl	-32(%rdx), %eax		# mismatching bytes
	movzbl	-32(%rdx, %rsi, 1), %edx
	sub	%edx, %eax
#endif
	ret

#ifdef BCMP
.Lmismatch:
	mov	$1, %eax
.Lmismatch_head:
	ret
#else /* memcmp */
.Lmismatch_head:
	tzcnt	%eax, %eax		# location of mismatch
	add	%rax, %rcx		# pointer to mismatch
	movzbl	(%rcx), %eax		# mismatching bytes
	movzbl	(%rcx, %rsi, 1), %ecx
	sub	%ecx, %eax
	ret

.Lmismatch:
	movdqu	-48(%rsi, %rdi, 1), %xmm1
	pcmpeqb	-48(%rdi), %xmm1	# reconstruct xmm1 before PAND
	pmovmskb %xmm0, %eax		# mismatches in first 16 bytes
	pmovmskb %xmm1, %edx		# mismatches in second 16 bytes
	shl	$16, %edx
	or	%edx, %eax		# mismatches in both
	not	%eax			# matches in both
	tzcnt	%eax, %eax		# location of mismatch
	add	%rax, %rdi		# pointer to mismatch
	movzbl	-64(%rdi), %eax		# mismatching bytes
	movzbl	-64(%rdi, %rsi, 1), %ecx
	sub	%ecx, %eax		# difference of mismatching bytes
	ret
#endif
ARCHEND(memcmp, baseline)

	/* signal that this object does not need an executable stack */
	.section .note.GNU-stack,"",%progbits
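/*
 * For reference, an illustrative C model of the contract both kernels
 * implement; bcmp() need only return zero or non-zero, while the
 * scalar and baseline memcmp() paths return the byte difference:
 *
 *	int
 *	memcmp(const void *s1, const void *s2, size_t n)
 *	{
 *		const unsigned char *p = s1, *q = s2;
 *		size_t i;
 *
 *		for (i = 0; i < n; i++)
 *			if (p[i] != q[i])
 *				return (p[i] - q[i]);
 *		return (0);
 *	}
 */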