Path: blob/main/lib/libc/amd64/string/timingsafe_bcmp.S
39486 views
/*-
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <[email protected]>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define	ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */

/*
 * timingsafe_bcmp: compare two buffers of equal length for equality.
 *
 * Syntax: AT&T (GNU as), run through the C preprocessor.
 *
 * In:     %rdi = first buffer
 *         %rsi = second buffer
 *         %rdx = number of bytes to compare
 * Out:    %eax = 0 if all bytes are equal (or the length is 0),
 *                nonzero otherwise
 * Clobb:  %rcx, %r8, %r9, flags (scalar variant)
 *         %rcx, %xmm0-%xmm4, flags (baseline variant)
 *
 * Every conditional branch below tests only the length, never the
 * buffer contents.  Mismatches are accumulated with XOR/OR (scalar)
 * or PCMPEQB/PAND (baseline) and inspected once at the end.
 *
 * Short inputs are handled without a loop by loading one chunk from
 * the start of the buffer and one chunk ending at the last byte; the
 * two chunks may overlap, which is harmless for an equality test.
 */

/* the implementations provided by this file */
ARCHFUNCS(timingsafe_bcmp)
	ARCHFUNC(timingsafe_bcmp, scalar)
	ARCHFUNC(timingsafe_bcmp, baseline)
ENDARCHFUNCS(timingsafe_bcmp)

/*
 * Scalar variant: general purpose registers only.
 */
ARCHENTRY(timingsafe_bcmp, scalar)
	/* dispatch on length; after the first test %rdx fits in %edx */
	cmpq	$16, %rdx		/* 17 bytes or more? */
	ja	.Lscalar_17up

	cmpl	$8, %edx		/* 9 to 16 bytes? */
	ja	.Lscalar_9to16

	cmpl	$4, %edx		/* 5 to 8 bytes? */
	ja	.Lscalar_5to8

	cmpl	$2, %edx		/* 3 or 4 bytes? */
	ja	.Lscalar_3to4

	testl	%edx, %edx		/* anything to compare at all? */
	jne	.Lscalar_1to2

	xorl	%eax, %eax		/* zero-length buffers compare equal */
	ret

	/* 1 or 2 bytes: compare the first byte and the last byte */
.Lscalar_1to2:
	movzbl	(%rdi), %eax		/* first byte of first buffer */
	movzbl	-1(%rdi,%rdx), %ecx	/* last byte of first buffer */
	xorb	(%rsi), %al		/* differences against second buffer */
	xorb	-1(%rsi,%rdx), %cl
	orl	%ecx, %eax		/* nonzero iff either byte differs */
	ret

	/* 3 or 4 bytes: compare the first word and the last word */
.Lscalar_3to4:
	movzwl	(%rdi), %eax
	movzwl	-2(%rdi,%rdx), %ecx
	xorw	(%rsi), %ax
	xorw	-2(%rsi,%rdx), %cx
	orl	%ecx, %eax
	ret

	/* 5 to 8 bytes: compare the first and the last 4 bytes */
.Lscalar_5to8:
	movl	(%rdi), %eax
	movl	-4(%rdi,%rdx), %ecx
	xorl	(%rsi), %eax
	xorl	-4(%rsi,%rdx), %ecx
	orl	%ecx, %eax
	ret

	/* 9 to 16 bytes: compare the first and the last 8 bytes */
.Lscalar_9to16:
	movq	(%rdi), %rax
	movq	-8(%rdi,%rdx), %rcx
	xorq	(%rsi), %rax
	xorq	-8(%rsi,%rdx), %rcx
	orq	%rcx, %rax
	/*
	 * Only %eax is returned.  If the difference sits solely in the
	 * upper half of %rax, %eax would be zero, so fold the ZF result
	 * of the OR into %al to make %eax nonzero on any mismatch.
	 */
	setne	%al
	ret

	/*
	 * 17 bytes or more.  %rax accumulates differences; %rcx is the
	 * end offset of the next 16-byte chunk the loop would process.
	 */
.Lscalar_17up:
	movq	(%rdi), %rax		/* bytes 0..15 */
	movq	8(%rdi), %r9
	movl	$32, %ecx
	xorq	(%rsi), %rax
	xorq	8(%rsi), %r9
	orq	%r9, %rax

	cmpq	%rdx, %rcx		/* no full chunk left before the tail? */
	jae	.Lscalar_tail

	/* main loop: 16 bytes per iteration, runs while %rcx < %rdx */
	ALIGN_TEXT
.Lscalar_loop:
	movq	-16(%rdi,%rcx), %r8
	movq	-8(%rdi,%rcx), %r9
	xorq	-16(%rsi,%rcx), %r8
	xorq	-8(%rsi,%rcx), %r9
	addq	$16, %rcx
	orq	%r9, %r8
	orq	%r8, %rax		/* merge into the running difference */

	cmpq	%rdx, %rcx
	jb	.Lscalar_loop

	/* final 16 bytes, addressed from the end of the buffers */
.Lscalar_tail:
	movq	-16(%rdi,%rdx), %r8
	movq	-8(%rdi,%rdx), %r9
	xorq	-16(%rsi,%rdx), %r8
	xorq	-8(%rsi,%rdx), %r9
	orq	%r9, %r8
	orq	%r8, %rax
	setne	%al			/* make %eax nonzero on any mismatch */
	ret
ARCHEND(timingsafe_bcmp, scalar)

/*
 * Baseline variant: same structure as the scalar one, but inputs
 * longer than 16 bytes are compared with 16-byte SSE2 operations.
 */
ARCHENTRY(timingsafe_bcmp, baseline)
	/* dispatch on length; after the first test %rdx fits in %edx */
	cmpq	$32, %rdx		/* 33 bytes or more? */
	ja	.Lbase_33up

	cmpl	$16, %edx		/* 17 to 32 bytes? */
	ja	.Lbase_17to32

	cmpl	$8, %edx		/* 9 to 16 bytes? */
	ja	.Lbase_9to16

	cmpl	$4, %edx		/* 5 to 8 bytes? */
	ja	.Lbase_5to8

	cmpl	$2, %edx		/* 3 or 4 bytes? */
	ja	.Lbase_3to4

	testl	%edx, %edx		/* anything to compare at all? */
	jne	.Lbase_1to2

	xorl	%eax, %eax		/* zero-length buffers compare equal */
	ret

	/* 1 or 2 bytes: compare the first byte and the last byte */
.Lbase_1to2:
	movzbl	(%rdi), %eax		/* first byte of first buffer */
	movzbl	-1(%rdi,%rdx), %ecx	/* last byte of first buffer */
	xorb	(%rsi), %al		/* differences against second buffer */
	xorb	-1(%rsi,%rdx), %cl
	orl	%ecx, %eax		/* nonzero iff either byte differs */
	ret

	/* 3 or 4 bytes: compare the first word and the last word */
.Lbase_3to4:
	movzwl	(%rdi), %eax
	movzwl	-2(%rdi,%rdx), %ecx
	xorw	(%rsi), %ax
	xorw	-2(%rsi,%rdx), %cx
	orl	%ecx, %eax
	ret

	/* 5 to 8 bytes: compare the first and the last 4 bytes */
.Lbase_5to8:
	movl	(%rdi), %eax
	movl	-4(%rdi,%rdx), %ecx
	xorl	(%rsi), %eax
	xorl	-4(%rsi,%rdx), %ecx
	orl	%ecx, %eax
	ret

	/* 9 to 16 bytes: compare the first and the last 8 bytes */
.Lbase_9to16:
	movq	(%rdi), %rax
	movq	-8(%rdi,%rdx), %rcx
	xorq	(%rsi), %rax
	xorq	-8(%rsi,%rdx), %rcx
	orq	%rcx, %rax
	/* a difference confined to the upper half must still show in %eax */
	setne	%al
	ret

	/* 17 to 32 bytes: compare the first and the last 16 bytes */
.Lbase_17to32:
	movdqu	(%rdi), %xmm0
	movdqu	(%rsi), %xmm2
	movdqu	-16(%rdi,%rdx), %xmm1
	movdqu	-16(%rsi,%rdx), %xmm3
	pcmpeqb	%xmm2, %xmm0		/* 0xff in every byte that matches */
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm0		/* matches in both chunks */
	pmovmskb %xmm0, %eax		/* one bit per byte, set where equal */
	xorl	$0xffff, %eax		/* flip: set where a byte differs */
	ret

	/*
	 * 33 bytes or more.  %xmm4 accumulates the "all equal so far"
	 * byte mask; %rcx is the end offset of the next 32-byte chunk
	 * the loop would process.
	 */
.Lbase_33up:
	movdqu	(%rdi), %xmm4		/* bytes 0..31 */
	movdqu	(%rsi), %xmm2
	movdqu	16(%rdi), %xmm1
	movdqu	16(%rsi), %xmm3
	movl	$64, %ecx
	pcmpeqb	%xmm2, %xmm4
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm4
	cmpq	%rdx, %rcx		/* no full chunk left before the tail? */
	jae	.Lbase_tail

	/* main loop: 32 bytes per iteration, runs while %rcx < %rdx */
	ALIGN_TEXT
.Lbase_loop:
	movdqu	-32(%rdi,%rcx), %xmm0
	movdqu	-32(%rsi,%rcx), %xmm2
	movdqu	-16(%rdi,%rcx), %xmm1
	movdqu	-16(%rsi,%rcx), %xmm3
	addq	$32, %rcx
	pcmpeqb	%xmm2, %xmm0
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm0
	pand	%xmm0, %xmm4		/* merge into the running mask */
	cmpq	%rdx, %rcx
	jb	.Lbase_loop

	/* final 32 bytes, addressed from the end of the buffers */
.Lbase_tail:
	movdqu	-32(%rdi,%rdx), %xmm0
	movdqu	-32(%rsi,%rdx), %xmm2
	movdqu	-16(%rdi,%rdx), %xmm1
	movdqu	-16(%rsi,%rdx), %xmm3
	pcmpeqb	%xmm2, %xmm0
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm0
	pand	%xmm4, %xmm0		/* combine with everything seen earlier */
	pmovmskb %xmm0, %eax		/* one bit per byte, set where equal */
	xorl	$0xffff, %eax		/* flip: nonzero iff some byte differs */
	ret
ARCHEND(timingsafe_bcmp, baseline)

	.section .note.GNU-stack,"",%progbits