/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax		/* return value: original destination */

	movl %edx, %ecx
	shrl $3, %ecx		/* ecx = count of whole qwords */
	andl $7, %edx		/* edx = 0..7 remaining tail bytes */
	rep movsq		/* bulk copy, 8 bytes per iteration */
	movl %edx, %ecx
	rep movsb		/* copy the tail byte by byte */
	ret
.Lmemcpy_e:
	.previous

/*
 * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
 * memcpy_c. Use memcpy_c_e when possible.
 *
 * On ERMS hardware a single REP MOVSB handles any length efficiently.
 * NOTE(review): only the low 32 bits of the count reach ecx here — assumes
 * kernel memcpy lengths fit in 32 bits, as in the other variants.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c_e:
	movq %rdi, %rax		/* return value: original destination */

	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e_e:
	.previous

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi, %rax		/* return value: original destination */

	/*
	 * Use 32bit CMP here to avoid long NOP padding.
	 */
	cmp $0x20, %edx
	jb .Lhandle_tail	/* fewer than 32 bytes: skip the unrolled loop */

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to corresponding copy mode.
	 * (Signed compare of the low address bytes of dst/src is a cheap
	 * heuristic for choosing the forward vs. backward direction.)
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subl $0x20, %edx
.Lcopy_forward_loop:
	subq $0x20, %rdx	/* CF is set once the biased count underflows */

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi	/* lea advances the pointer without touching flags */

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop	/* loop while the subq above did not borrow */
	addq $0x20, %rdx	/* undo bias: rdx = remaining 0..31 bytes */
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPS in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx	/* CF is set once the biased count underflows */
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi	/* lea preserves flags for the jae below */
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addq $0x20, %rdx	/* undo bias: rdx = remaining 0..31 bytes */
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpq $16, %rdx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 * (head and tail 16-byte moves; the two pairs may overlap)
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpq $8, %rdx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 * (head and tail qword moves; may overlap)
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpq $4, %rdx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 * (head and tail dword moves; may overlap)
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	cmpl $0, %edx
	je .Lend
	/*
	 * Move data from 1 to 3 bytes, one byte at a time.
	 */
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %edx
	jnz .Lloop_1

.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
 * If the feature is supported, memcpy_c_e() is the first choice.
 * If enhanced rep movsb copy is not available, use fast string copy
 * memcpy_c() when possible. This is faster and code is simpler than
 * original memcpy().
 * Otherwise, original memcpy() is used.
 * In .altinstructions section, ERMS feature is placed after REP_GOOD
 * feature to implement the right patch order.
 *
 * Replace only beginning, memcpy is used to apply alternatives,
 * so it is silly to overwrite itself with nops - reboot is the
 * only outcome...
 */
	.section .altinstructions, "a"
	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
	.previous