GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/lib/memcpy_64.S
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

.section .noinstr.text, "ax"
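
/*
 * memcpy and its slow path below live in .noinstr.text: they may run in
 * non-instrumentable contexts (such as early entry code), so this file
 * must stay free of tracing and other instrumentation hooks.
 */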
/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value that is the same as the destination input,
 * which the compiler could/should do much better anyway.
 */
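/*
 * In C terms this implements the usual contract (the kernel prototype
 * is declared via the string headers):
 *
 *	void *memcpy(void *dest, const void *src, size_t count);
 *
 * with the return value being 'dest' itself.
 */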
SYM_TYPED_FUNC_START(__memcpy)
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM
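	/*
	 * On CPUs with FSRM (Fast Short REP MOVSB) the alternatives code
	 * patches the 'jmp memcpy_orig' above out to NOPs at boot, so
	 * execution falls through to this fast path: save the return
	 * value (the original destination), move the count into %rcx and
	 * let 'rep movsb' copy %rcx bytes from (%rsi) to (%rdi).
	 */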
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
SYM_PIC_ALIAS(memcpy)
EXPORT_SYMBOL(memcpy)
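/*
 * 'memcpy' is an alias of '__memcpy', so both names resolve to the code
 * above; the PIC alias keeps references from position-independent
 * (early boot) code working, and EXPORT_SYMBOL makes both names
 * available to loadable modules.
 */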

SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail
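	/* Copies shorter than 0x20 (32) bytes go straight to the tail handler. */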

	/*
	 * We check whether a memory false dependence could occur, then
	 * jump to the corresponding copy mode.
	 */
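	/*
	 * Only the low address bytes (%sil, %dil) are compared: the
	 * hardware disambiguates loads against earlier stores using
	 * partial address bits, so this is a cheap aliasing heuristic
	 * rather than a full overlap check. When it triggers, the copy
	 * runs backward from the end of the buffers instead.
	 */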
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
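	/*
	 * The count is biased down by 0x20 so that the 'subq' at the top
	 * of the loop borrows (sets CF) on the final iteration. 'jae'
	 * keeps looping while CF is clear; the movq/leaq instructions in
	 * between do not touch the flags. The 'addl' after the loop
	 * undoes the bias, leaving 0..0x1f bytes for the tail.
	 */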
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate the copy position to start from the tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
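	/*
	 * %rsi and %rdi now point one past the end of the buffers; the
	 * loop below walks backward with negative offsets, using the
	 * same biased count and CF-based 'jae' exit as the forward loop.
	 */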
	/*
	 * At most 3 ALU operations issue in one cycle, so align the
	 * loop to the next 16-byte boundary with NOPs.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Undo the count bias and calculate the copy position back at
	 * the head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes
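	/*
	 * The tail cases below cascade by size (16..31, 8..15, 4..7 and
	 * 1..3 bytes); each covers its whole range with a few overlapping
	 * loads and stores from both ends instead of a byte loop.
	 */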

	/*
	 * Copy from 16 up to 31 bytes.
	 */
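	/*
	 * Two 8-byte loads from the start and two from the end: for any
	 * count in [16, 31] the windows overlap, so the four stores
	 * cover every byte without a loop.
	 */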
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Copy from 8 up to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Copy from 4 up to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Copy from 1 up to 3 bytes.
	 */
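	/*
	 * 'movzbl' does not modify the flags, so the 'jz' below still
	 * tests the 'subl' above: a zero count already exited via 'jb',
	 * and 'jz' fires when the original count was exactly 1. For 2 or
	 * 3 bytes, the decremented count in %rdx indexes the last byte.
	 */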
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)