GitHub Repository: torvalds/linux
Path: blob/master/tools/arch/x86/lib/memcpy_64.S
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

.section .noinstr.text, "ax"

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value that is the same as the source input,
 * which the compiler could/should do much better anyway.
 */
SYM_TYPED_FUNC_START(__memcpy)
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)
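
/*
 * A minimal C sketch of what the FSRM fast path above amounts to,
 * assuming the usual memcpy contract; it is illustrative only and the
 * name fsrm_memcpy_sketch is made up. On FSRM hardware the whole copy
 * is a single 'rep movsb' with %rdi/%rsi/%rcx as dst/src/count, and
 * the original destination is returned in %rax.
 *
 *	#include <stddef.h>
 *
 *	static void *fsrm_memcpy_sketch(void *dst, const void *src, size_t count)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		while (count--)		// the hardware does this as one 'rep movsb'
 *			*d++ = *s++;
 *		return dst;		// memcpy returns the original destination
 *	}
 */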

SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
SYM_PIC_ALIAS(memcpy)
EXPORT_SYMBOL(memcpy)

SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail
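
/*
 * A minimal C sketch, illustrative only, of the forward path above:
 * memcpy_orig copies 32 bytes per iteration while at least 32 bytes
 * remain and leaves the rest for .Lhandle_tail. The helper name
 * copy_forward_sketch is made up; the 8-byte memcpy() calls stand in
 * for the unaligned movq loads and stores.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static size_t copy_forward_sketch(unsigned char *d,
 *					  const unsigned char *s, size_t count)
 *	{
 *		while (count >= 32) {
 *			uint64_t r0, r1, r2, r3;
 *
 *			memcpy(&r0, s +  0, 8);	// movq 0*8(%rsi), %r8
 *			memcpy(&r1, s +  8, 8);	// movq 1*8(%rsi), %r9
 *			memcpy(&r2, s + 16, 8);	// movq 2*8(%rsi), %r10
 *			memcpy(&r3, s + 24, 8);	// movq 3*8(%rsi), %r11
 *			memcpy(d +  0, &r0, 8);
 *			memcpy(d +  8, &r1, 8);
 *			memcpy(d + 16, &r2, 8);
 *			memcpy(d + 24, &r3, 8);
 *			s += 32;
 *			d += 32;
 *			count -= 32;
 *		}
 *		return count;		// < 32 bytes left for the tail code
 *	}
 */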

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
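
/*
 * A minimal C sketch, illustrative only, of the backward path above.
 * Copying from the top down when the source's low address byte
 * compares below the destination's avoids memory false dependences
 * between the loads and earlier stores. The name copy_backward_sketch
 * is made up; as in the assembly, the remaining tail (< 32 bytes) is
 * then copied from the front by the code below.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static size_t copy_backward_sketch(unsigned char *d,
 *					   const unsigned char *s, size_t count)
 *	{
 *		unsigned char *dp = d + count;		// one past the last byte
 *		const unsigned char *sp = s + count;
 *
 *		while (count >= 32) {
 *			uint64_t r0, r1, r2, r3;
 *
 *			memcpy(&r0, sp -  8, 8);	// movq -1*8(%rsi), %r8
 *			memcpy(&r1, sp - 16, 8);
 *			memcpy(&r2, sp - 24, 8);
 *			memcpy(&r3, sp - 32, 8);
 *			memcpy(dp -  8, &r0, 8);
 *			memcpy(dp - 16, &r1, 8);
 *			memcpy(dp - 24, &r2, 8);
 *			memcpy(dp - 32, &r3, 8);
 *			sp -= 32;
 *			dp -= 32;
 *			count -= 32;
 *		}
 *		return count;		// tail handled from the front
 *	}
 */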
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	RET
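
/*
 * A minimal C sketch, illustrative only, of the 16-31 byte case above:
 * load 16 bytes from the head and 16 bytes ending at the last byte,
 * then store both windows. They may overlap, which is harmless because
 * every load happens before any store. copy_tail_16_31_sketch is a
 * made-up name.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void copy_tail_16_31_sketch(unsigned char *d,
 *					   const unsigned char *s, size_t count)
 *	{
 *		uint64_t h0, h1, t0, t1;	// 16 <= count <= 31
 *
 *		memcpy(&h0, s, 8);
 *		memcpy(&h1, s + 8, 8);
 *		memcpy(&t0, s + count - 16, 8);	// movq -2*8(%rsi, %rdx), %r10
 *		memcpy(&t1, s + count - 8, 8);	// movq -1*8(%rsi, %rdx), %r11
 *		memcpy(d, &h0, 8);
 *		memcpy(d + 8, &h1, 8);
 *		memcpy(d + count - 16, &t0, 8);
 *		memcpy(d + count - 8, &t1, 8);
 *	}
 */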
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
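
/*
 * The 8-15 and 4-7 byte cases above use the same head/tail trick with
 * a narrower window. A minimal C sketch, illustrative only, with a
 * made-up name; width is 8 for counts in [8, 15] and 4 for counts in
 * [4, 7], so the two stores overlap whenever count < 2 * width.
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *
 *	static void copy_tail_small_sketch(unsigned char *d,
 *					   const unsigned char *s,
 *					   size_t count, size_t width)
 *	{
 *		unsigned char head[8], tail[8];
 *
 *		memcpy(head, s, width);			// load from the front
 *		memcpy(tail, s + count - width, width);	// load ending at the last byte
 *		memcpy(d, head, width);
 *		memcpy(d + count - width, tail, width);
 *	}
 */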
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)
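
/*
 * A minimal C sketch, illustrative only, of the 1-3 byte case above
 * (the 'subl $1, %edx' has already ruled out count == 0). The first
 * byte is always stored; for 2 or 3 bytes, byte 1 and the last byte
 * are stored as well, and they coincide when count == 2. The name
 * copy_tail_1_3_sketch is made up.
 *
 *	#include <stddef.h>
 *
 *	static void copy_tail_1_3_sketch(unsigned char *d,
 *					 const unsigned char *s, size_t count)
 *	{
 *		unsigned char first = s[0];	// movzbl (%rsi), %ecx
 *
 *		if (count > 1) {
 *			unsigned char second = s[1];		// movzbq 1(%rsi), %r8
 *			unsigned char last = s[count - 1];	// movzbq (%rsi, %rdx), %r9
 *
 *			d[1] = second;
 *			d[count - 1] = last;
 *		}
 *		d[0] = first;			// movb %cl, (%rdi)
 *	}
 */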