GitHub Repository: torvalds/linux
Path: blob/master/tools/arch/x86/lib/memset_64.S
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

.section .noinstr.text, "ax"
/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * fast string operations to get better performance than the original
 * function, and the code is simpler and shorter as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 *
 * The FSRS alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep stosb' itself is small enough to replace the call, but all
 * the register moves blow up the code. And two of them are "needed"
 * only for the return value that is the same as the source input,
 * which the compiler could/should do much better anyway.
 */
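/*
 * Illustrative only (not part of the upstream file): given the register
 * convention above, the fast path below behaves like this C sketch, with
 * the whole loop collapsed into a single 'rep stosb':
 *
 *	void *__memset(void *dest, int c, size_t n)
 *	{
 *		unsigned char *p = dest;
 *
 *		while (n--)
 *			*p++ = (unsigned char)c;
 *		return dest;
 *	}
 */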
SYM_TYPED_FUNC_START(__memset)
	ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
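	/*
	 * Patched at boot via the alternatives mechanism: on CPUs with
	 * X86_FEATURE_FSRS ("Fast Short REP STOSB") the 'jmp memset_orig'
	 * above is replaced with NOPs, so execution falls through to the
	 * 'rep stosb' path below; otherwise the jump to the unrolled
	 * fallback stays in place.
	 */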

	movq %rdi,%r9		/* save destination for the return value */
	movb %sil,%al		/* fill byte for stosb */
	movq %rdx,%rcx		/* count for rep */
	rep stosb
	movq %r9,%rax		/* return the original destination */
	RET
SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)

SYM_FUNC_ALIAS_MEMFUNC(memset, __memset)
SYM_PIC_ALIAS(memset)
EXPORT_SYMBOL(memset)
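/*
 * Annotation (not in the upstream file): keeping __memset distinct from
 * the plain 'memset' alias lets instrumented builds (e.g. KASAN) interpose
 * on memset while an uninstrumented __memset remains available for
 * internal callers.
 */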

SYM_FUNC_START_LOCAL(memset_orig)
	movq %rdi,%r10

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax
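	/*
	 * Worked example: for the byte value 0x41 the multiply yields
	 * 0x41 * 0x0101010101010101 = 0x4141414141414141, i.e. the byte
	 * replicated into all eight lanes of %rax.
	 */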

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d		/* low 3 bits = misalignment within 8 bytes */
	jnz .Lbad_alignment
.Lafter_bad_alignment:

	movq %rdx,%rcx
	shrq $6,%rcx		/* number of 64-byte blocks */
	jz .Lhandle_tail
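
	/*
	 * Unrolled store loop: each pass writes eight qwords (64 bytes).
	 * The 'decq' comes first; mov and lea do not touch the flags, so
	 * the trailing 'jnz' still tests the result of the decrement.
	 */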
66
.p2align 4
67
.Lloop_64:
68
decq %rcx
69
movq %rax,(%rdi)
70
movq %rax,8(%rdi)
71
movq %rax,16(%rdi)
72
movq %rax,24(%rdi)
73
movq %rax,32(%rdi)
74
movq %rax,40(%rdi)
75
movq %rax,48(%rdi)
76
movq %rax,56(%rdi)
77
leaq 64(%rdi),%rdi
78
jnz .Lloop_64
79

	/*
	 * Handle the tail in loops. The loops should be faster than
	 * hard-to-predict jump tables.
	 */
	.p2align 4
.Lhandle_tail:
	movl %edx,%ecx
	andl $63&(~7),%ecx	/* remaining bytes, rounded down to qwords */
	jz .Lhandle_7
	shrl $3,%ecx		/* byte count -> qword count */
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8
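
	/*
	 * Worked example: for a 200-byte count, .Lloop_64 writes three
	 * 64-byte blocks (192 bytes), 200 & (63 & ~7) = 8 gives one qword
	 * for .Lloop_8, and 200 & 7 = 0 leaves nothing for .Lloop_1.
	 */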

.Lhandle_7:
	andl $7,%edx		/* trailing bytes (0..7) */
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %edx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax		/* return the original destination */
	RET

.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8		/* r8 = 8 - (dst & 7) */
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
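	/*
	 * The fixup above stores 8 bytes unaligned at the original dst,
	 * then advances dst by 8 - (dst & 7) so that all later stores are
	 * aligned. The overlapping bytes are written twice, which is
	 * harmless because every store writes the same value. The 'jbe'
	 * guard ensures the count is at least 8 before the wide store.
	 */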
.Lfinal:
SYM_FUNC_END(memset_orig)
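
/*
 * Illustrative C sketch (not part of the upstream file) of the overall
 * shape of memset_orig above:
 *
 *	void *memset_orig_sketch(void *dest, int c, size_t n)
 *	{
 *		unsigned char *p = dest;
 *		unsigned long v = (unsigned char)c * 0x0101010101010101UL;
 *
 *		if (((unsigned long)p & 7) && n > 7) {
 *			size_t skip = 8 - ((unsigned long)p & 7);
 *
 *			__builtin_memcpy(p, &v, 8);
 *			p += skip;
 *			n -= skip;
 *		}
 *		for (; n >= 64; n -= 64, p += 64)
 *			for (int i = 0; i < 8; i++)
 *				__builtin_memcpy(p + 8 * i, &v, 8);
 *		for (; n >= 8; n -= 8, p += 8)
 *			__builtin_memcpy(p, &v, 8);
 *		while (n--)
 *			*p++ = (unsigned char)c;
 *		return dest;
 *	}
 */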