/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string operations to get better performance than the original function. The
 * code is simpler and shorter than the original function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c:
	movq %rdi,%r9
	movl %edx,%r8d
	andl $7,%r8d
	movl %edx,%ecx
	shrl $3,%ecx
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	mulq %rsi			/* with rax, clobbers rdx */
	rep stosq
	movl %r8d,%ecx
	rep stosb
	movq %r9,%rax
	ret
.Lmemset_e:
	.previous

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c_e:
	movq %rdi,%r9
	movb %sil,%al
	movl %edx,%ecx
	rep stosb
	movq %r9,%rax
	ret
.Lmemset_e_e:
	.previous

ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC
	movq %rdi,%r10
	movq %rdx,%r11

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	mul %rcx			/* with rax, clobbers rdx */

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz .Lbad_alignment
	CFI_REMEMBER_STATE
.Lafter_bad_alignment:

	movl %r11d,%ecx
	shrl $6,%ecx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	decl %ecx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/* Handle the tail in loops. The loops should be faster than
	   hard-to-predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl %r11d,%ecx
	andl $63&(~7),%ecx
	jz .Lhandle_7
	shrl $3,%ecx
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	movl %r11d,%ecx
	andl $7,%ecx
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %ecx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax
	ret

	CFI_RESTORE_STATE
.Lbad_alignment:
	cmpq $7,%r11
	jbe .Lhandle_7
	movq %rax,(%rdi)		/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%r11
	jmp .Lafter_bad_alignment
.Lfinal:
	CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)

	/* Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature.
	 * It is recommended to use this when possible.
	 *
	 * If the enhanced REP MOVSB/STOSB feature is not available, use the
	 * fast string instructions.
	 *
	 * Otherwise, use the original memset function.
	 *
	 * In the .altinstructions section, the ERMS feature is placed after the
	 * REP_GOOD feature to implement the right patch order.
	 */
	.section .altinstructions,"a"
	altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
			     .Lfinal-memset,.Lmemset_e-.Lmemset_c
	altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
			     .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
	.previous
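
/*
 * For reference only (not assembled): a minimal C-level sketch of what the
 * fast-string replacement .Lmemset_c above computes. The helper name
 * memset_fast_string_sketch is hypothetical; the real entry points remain
 * memset/__memset, patched at boot through the .altinstructions entries.
 *
 *	#include <stddef.h>
 *
 *	static inline void *memset_fast_string_sketch(void *dst, int c, size_t n)
 *	{
 *		void *ret = dst;		// rax preserves the original destination
 *		// expand the byte value into all eight lanes of a qword
 *		unsigned long pattern = 0x0101010101010101UL * (unsigned char)c;
 *		size_t qwords = n >> 3;		// whole 8-byte chunks
 *		size_t tail   = n & 7;		// leftover bytes
 *		asm volatile("rep stosq"	// store RAX, RCX times, at [RDI]
 *			     : "+D" (dst), "+c" (qwords) : "a" (pattern) : "memory");
 *		asm volatile("rep stosb"	// store AL for the remaining bytes
 *			     : "+D" (dst), "+c" (tail) : "a" (c) : "memory");
 *		return ret;
 *	}
 */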