Path: blob/main/contrib/bionic-x86_64-string/sse2-memmove-slm.S
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.

* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE        memmove
#endif

#ifndef L
# define L(label)       .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc  .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc    .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)       .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)       .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)     .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
        .type name, @function; \
        .globl name; \
        .p2align 4; \
name: \
        cfi_startproc
#endif

#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
        .globl alias; \
        .equ alias, original
#endif

#ifndef END
# define END(name) \
        cfi_endproc; \
        .size name, .-name
#endif

#define CFI_PUSH(REG) \
        cfi_adjust_cfa_offset (4); \
        cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
        cfi_adjust_cfa_offset (-4); \
        cfi_restore (REG)

#define PUSH(REG)       push REG;
#define POP(REG)        pop REG;

#define ENTRANCE        PUSH (%rbx);
#define RETURN_END      POP (%rbx); ret
#define RETURN          RETURN_END;

        .section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
        ENTRANCE
        mov %rdi, %rax

/* Check whether we should copy backward or forward. */
        cmp %rsi, %rdi
        je L(mm_return)
        jg L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   bytes separately. */
        cmp $16, %rdx
        jbe L(mm_len_0_16_bytes_forward)

        cmp $32, %rdx
        ja L(mm_len_32_or_more_forward)

/* Copy [0..32] and return. */
        movdqu (%rsi), %xmm0
        movdqu -16(%rsi, %rdx), %xmm1
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_32_or_more_forward):
        cmp $64, %rdx
        ja L(mm_len_64_or_more_forward)

/* Copy [0..64] and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu -16(%rsi, %rdx), %xmm2
        movdqu -32(%rsi, %rdx), %xmm3
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, -16(%rdi, %rdx)
        movdqu %xmm3, -32(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_64_or_more_forward):
        cmp $128, %rdx
        ja L(mm_len_128_or_more_forward)

/* Copy [0..128] and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu 32(%rsi), %xmm2
        movdqu 48(%rsi), %xmm3
        movdqu -64(%rsi, %rdx), %xmm4
        movdqu -48(%rsi, %rdx), %xmm5
        movdqu -32(%rsi, %rdx), %xmm6
        movdqu -16(%rsi, %rdx), %xmm7
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, 32(%rdi)
        movdqu %xmm3, 48(%rdi)
        movdqu %xmm4, -64(%rdi, %rdx)
        movdqu %xmm5, -48(%rdi, %rdx)
        movdqu %xmm6, -32(%rdi, %rdx)
        movdqu %xmm7, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_128_or_more_forward):
/* Align the destination address. */
/* Save the first (possibly unaligned) 64 bytes. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu 32(%rsi), %xmm2
        movdqu 48(%rsi), %xmm3

        lea 64(%rdi), %r8
        and $-64, %r8 /* r8 now aligned to next 64 byte boundary */
        sub %rdi, %rsi /* rsi = src - dst = diff */

        movdqu (%r8, %rsi), %xmm4
        movdqu 16(%r8, %rsi), %xmm5
        movdqu 32(%r8, %rsi), %xmm6
        movdqu 48(%r8, %rsi), %xmm7

        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, 32(%rdi)
        movdqu %xmm3, 48(%rdi)
        movdqa %xmm4, (%r8)
        movaps %xmm5, 16(%r8)
        movaps %xmm6, 32(%r8)
        movaps %xmm7, 48(%r8)
        add $64, %r8

        lea (%rdi, %rdx), %rbx
        and $-64, %rbx
        cmp %r8, %rbx
        jbe L(mm_copy_remaining_forward)

        cmp $SHARED_CACHE_SIZE_HALF, %rdx
        jae L(mm_large_page_loop_forward)

        .p2align 4
L(mm_main_loop_forward):

        prefetcht0 128(%r8, %rsi)

        movdqu (%r8, %rsi), %xmm0
        movdqu 16(%r8, %rsi), %xmm1
        movdqu 32(%r8, %rsi), %xmm2
        movdqu 48(%r8, %rsi), %xmm3
        movdqa %xmm0, (%r8)
        movaps %xmm1, 16(%r8)
        movaps %xmm2, 32(%r8)
        movaps %xmm3, 48(%r8)
        lea 64(%r8), %r8
        cmp %r8, %rbx
        ja L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
        add %rdi, %rdx
        sub %r8, %rdx
/* Everything below %r8 in the destination has been copied.
   %rdx now holds the number of bytes left to copy.
   Compute the matching source position in %r9. */
        lea (%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
        cmp $32, %rdx
        ja L(mm_remaining_33_64_bytes_forward)
        cmp $16, %rdx
        ja L(mm_remaining_17_32_bytes_forward)
        test %rdx, %rdx
        .p2align 4,,2
        je L(mm_return)

        cmpb $8, %dl
        ja L(mm_remaining_9_16_bytes_forward)
        cmpb $4, %dl
        .p2align 4,,5
        ja L(mm_remaining_5_8_bytes_forward)
        cmpb $2, %dl
        .p2align 4,,1
        ja L(mm_remaining_3_4_bytes_forward)
        movzbl -1(%r9,%rdx), %esi
        movzbl (%r9), %ebx
        movb %sil, -1(%r8,%rdx)
        movb %bl, (%r8)
        jmp L(mm_return)

L(mm_remaining_33_64_bytes_forward):
        movdqu (%r9), %xmm0
        movdqu 16(%r9), %xmm1
        movdqu -32(%r9, %rdx), %xmm2
        movdqu -16(%r9, %rdx), %xmm3
        movdqu %xmm0, (%r8)
        movdqu %xmm1, 16(%r8)
        movdqu %xmm2, -32(%r8, %rdx)
        movdqu %xmm3, -16(%r8, %rdx)
        jmp L(mm_return)

L(mm_remaining_17_32_bytes_forward):
        movdqu (%r9), %xmm0
        movdqu -16(%r9, %rdx), %xmm1
        movdqu %xmm0, (%r8)
        movdqu %xmm1, -16(%r8, %rdx)
        jmp L(mm_return)

L(mm_remaining_5_8_bytes_forward):
        movl (%r9), %esi
        movl -4(%r9,%rdx), %ebx
        movl %esi, (%r8)
        movl %ebx, -4(%r8,%rdx)
        jmp L(mm_return)

L(mm_remaining_9_16_bytes_forward):
        mov (%r9), %rsi
        mov -8(%r9, %rdx), %rbx
        mov %rsi, (%r8)
        mov %rbx, -8(%r8, %rdx)
        jmp L(mm_return)

L(mm_remaining_3_4_bytes_forward):
        movzwl -2(%r9,%rdx), %esi
        movzwl (%r9), %ebx
        movw %si, -2(%r8,%rdx)
        movw %bx, (%r8)
        jmp L(mm_return)

L(mm_len_0_16_bytes_forward):
        testb $24, %dl
        jne L(mm_len_9_16_bytes_forward)
        testb $4, %dl
        .p2align 4,,5
        jne L(mm_len_5_8_bytes_forward)
        test %rdx, %rdx
        .p2align 4,,2
        je L(mm_return)
        testb $2, %dl
        .p2align 4,,1
        jne L(mm_len_2_4_bytes_forward)
        movzbl -1(%rsi,%rdx), %ebx
        movzbl (%rsi), %esi
        movb %bl, -1(%rdi,%rdx)
        movb %sil, (%rdi)
        jmp L(mm_return)

L(mm_len_2_4_bytes_forward):
        movzwl -2(%rsi,%rdx), %ebx
        movzwl (%rsi), %esi
        movw %bx, -2(%rdi,%rdx)
        movw %si, (%rdi)
        jmp L(mm_return)

L(mm_len_5_8_bytes_forward):
        movl (%rsi), %ebx
        movl -4(%rsi,%rdx), %esi
        movl %ebx, (%rdi)
        movl %esi, -4(%rdi,%rdx)
        jmp L(mm_return)

L(mm_len_9_16_bytes_forward):
        mov (%rsi), %rbx
        mov -8(%rsi, %rdx), %rsi
        mov %rbx, (%rdi)
        mov %rsi, -8(%rdi, %rdx)
        jmp L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
   the main loop stops. */
        mov %rbx, %rdx
        sub %rdi, %rdx
/* The code for copying backward. */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   bytes separately. */
        cmp $16, %rdx
        jbe L(mm_len_0_16_bytes_backward)

        cmp $32, %rdx
        ja L(mm_len_32_or_more_backward)

/* Copy [0..32] and return. */
        movdqu (%rsi), %xmm0
        movdqu -16(%rsi, %rdx), %xmm1
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_32_or_more_backward):
        cmp $64, %rdx
        ja L(mm_len_64_or_more_backward)

/* Copy [0..64] and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu -16(%rsi, %rdx), %xmm2
        movdqu -32(%rsi, %rdx), %xmm3
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, -16(%rdi, %rdx)
        movdqu %xmm3, -32(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_64_or_more_backward):
        cmp $128, %rdx
        ja L(mm_len_128_or_more_backward)

/* Copy [0..128] and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu 32(%rsi), %xmm2
        movdqu 48(%rsi), %xmm3
        movdqu -64(%rsi, %rdx), %xmm4
        movdqu -48(%rsi, %rdx), %xmm5
        movdqu -32(%rsi, %rdx), %xmm6
        movdqu -16(%rsi, %rdx), %xmm7
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, 32(%rdi)
        movdqu %xmm3, 48(%rdi)
        movdqu %xmm4, -64(%rdi, %rdx)
        movdqu %xmm5, -48(%rdi, %rdx)
        movdqu %xmm6, -32(%rdi, %rdx)
        movdqu %xmm7, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_128_or_more_backward):
/* Align the destination address. We need to save the last 64 bytes of
   the source so that they are not overwritten before being copied. */
        movdqu -16(%rsi, %rdx), %xmm0
        movdqu -32(%rsi, %rdx), %xmm1
        movdqu -48(%rsi, %rdx), %xmm2
        movdqu -64(%rsi, %rdx), %xmm3

        lea (%rdi, %rdx), %r9
        and $-64, %r9 /* r9 = aligned dst */

        mov %rsi, %r8
        sub %rdi, %r8 /* r8 = src - dst, diff */

        movdqu -16(%r9, %r8), %xmm4
        movdqu -32(%r9, %r8), %xmm5
        movdqu -48(%r9, %r8), %xmm6
        movdqu -64(%r9, %r8), %xmm7

        movdqu %xmm0, -16(%rdi, %rdx)
        movdqu %xmm1, -32(%rdi, %rdx)
        movdqu %xmm2, -48(%rdi, %rdx)
        movdqu %xmm3, -64(%rdi, %rdx)
        movdqa %xmm4, -16(%r9)
        movaps %xmm5, -32(%r9)
        movaps %xmm6, -48(%r9)
        movaps %xmm7, -64(%r9)
        lea -64(%r9), %r9

        lea 64(%rdi), %rbx
        and $-64, %rbx

        cmp %r9, %rbx
        jae L(mm_recalc_len)

        cmp $SHARED_CACHE_SIZE_HALF, %rdx
        jae L(mm_large_page_loop_backward)

        .p2align 4
L(mm_main_loop_backward):

        prefetcht0 -128(%r9, %r8)

        movdqu -64(%r9, %r8), %xmm0
        movdqu -48(%r9, %r8), %xmm1
        movdqu -32(%r9, %r8), %xmm2
        movdqu -16(%r9, %r8), %xmm3
        movdqa %xmm0, -64(%r9)
        movaps %xmm1, -48(%r9)
        movaps %xmm2, -32(%r9)
        movaps %xmm3, -16(%r9)
        lea -64(%r9), %r9
        cmp %r9, %rbx
        jb L(mm_main_loop_backward)
        jmp L(mm_recalc_len)

/* Copy [0..16] and return. */
L(mm_len_0_16_bytes_backward):
        testb $24, %dl
        jnz L(mm_len_9_16_bytes_backward)
        testb $4, %dl
        .p2align 4,,5
        jnz L(mm_len_5_8_bytes_backward)
        test %rdx, %rdx
        .p2align 4,,2
        je L(mm_return)
        testb $2, %dl
        .p2align 4,,1
        jne L(mm_len_3_4_bytes_backward)
        movzbl -1(%rsi,%rdx), %ebx
        movzbl (%rsi), %ecx
        movb %bl, -1(%rdi,%rdx)
        movb %cl, (%rdi)
        jmp L(mm_return)

L(mm_len_3_4_bytes_backward):
        movzwl -2(%rsi,%rdx), %ebx
        movzwl (%rsi), %ecx
        movw %bx, -2(%rdi,%rdx)
        movw %cx, (%rdi)
        jmp L(mm_return)

L(mm_len_9_16_bytes_backward):
        movl -4(%rsi,%rdx), %ebx
        movl -8(%rsi,%rdx), %ecx
        movl %ebx, -4(%rdi,%rdx)
        movl %ecx, -8(%rdi,%rdx)
        sub $8, %rdx
        jmp L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
        movl (%rsi), %ebx
        movl -4(%rsi,%rdx), %ecx
        movl %ebx, (%rdi)
        movl %ecx, -4(%rdi,%rdx)

L(mm_return):
        RETURN

/* Big length copy forward part. */

        .p2align 4
L(mm_large_page_loop_forward):
        movdqu (%r8, %rsi), %xmm0
        movdqu 16(%r8, %rsi), %xmm1
        movdqu 32(%r8, %rsi), %xmm2
        movdqu 48(%r8, %rsi), %xmm3
        movntdq %xmm0, (%r8)
        movntdq %xmm1, 16(%r8)
        movntdq %xmm2, 32(%r8)
        movntdq %xmm3, 48(%r8)
        lea 64(%r8), %r8
        cmp %r8, %rbx
        ja L(mm_large_page_loop_forward)
        sfence
        jmp L(mm_copy_remaining_forward)

/* Big length copy backward part. */
        .p2align 4
L(mm_large_page_loop_backward):
        movdqu -64(%r9, %r8), %xmm0
        movdqu -48(%r9, %r8), %xmm1
        movdqu -32(%r9, %r8), %xmm2
        movdqu -16(%r9, %r8), %xmm3
        movntdq %xmm0, -64(%r9)
        movntdq %xmm1, -48(%r9)
        movntdq %xmm2, -32(%r9)
        movntdq %xmm3, -16(%r9)
        lea -64(%r9), %r9
        cmp %r9, %rbx
        jb L(mm_large_page_loop_backward)
        sfence
        jmp L(mm_recalc_len)

END (MEMMOVE)

ALIAS_SYMBOL(memcpy, MEMMOVE)
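For readers following the assembly, below is a rough C-level sketch of the control flow above: pick the copy direction from the relative position of source and destination, and for the small-size paths load both ends of the region before storing anything, which is what makes those paths safe for overlapping buffers. This is a minimal illustration only, under assumptions: the function name memmove_sketch is hypothetical, the plain byte loops stand in for the 16-byte SSE loads/stores, and it does not model the 64-byte destination alignment, prefetching, or the switch to non-temporal movntdq stores once the length reaches SHARED_CACHE_SIZE_HALF (a constant provided by cache.h).

/* memmove_sketch: illustrative C outline of the routine above; not the
   actual implementation. */
#include <stddef.h>
#include <string.h>

void *memmove_sketch(void *dst, const void *src, size_t n) {
    unsigned char *d = dst;
    const unsigned char *s = src;
    unsigned char head[16], tail[16];

    if (d == s || n == 0)
        return dst;

    /* Small case (16..32 bytes): read both ends into temporaries before
       writing anything, mirroring the paired movdqu loads and stores in
       the 17..32-byte paths; correct even when the regions overlap. */
    if (n >= 16 && n <= 32) {
        memcpy(head, s, 16);
        memcpy(tail, s + n - 16, 16);
        memcpy(d, head, 16);
        memcpy(d + n - 16, tail, 16);
        return dst;
    }

    if (d < s) {
        /* Forward copy: walk from low to high addresses. */
        for (size_t i = 0; i < n; i++)
            d[i] = s[i];
    } else {
        /* Backward copy: walk from high to low addresses so each source
           byte is read before the overlapping destination reaches it. */
        for (size_t i = n; i > 0; i--)
            d[i - 1] = s[i - 1];
    }
    return dst;
}

The key design point the sketch preserves is the direction check: copying forward when the destination lies below the source and backward otherwise guarantees that no source byte is overwritten before it has been read, which is also why the assembly can alias memcpy to this memmove.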