Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/entry/vdso/vgetrandom-chacha.S
26493 views
1
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */
5
6
#include <linux/linkage.h>
7
#include <asm/frame.h>
8
9
.section	.rodata, "a"
/* 16-byte alignment: the function below loads this with movaps, which faults on unaligned memory. */
.align 16
/*
 * The ChaCha constant row. Stored little-endian, the 16 bytes of this value
 * spell the ASCII string "expand 32-byte k".
 */
CONSTANTS:	.octa 0x6b20657479622d323320646e61707865
.text
13
14
/*
 * Very basic SSE2 implementation of ChaCha20 (AT&T operand order: source
 * first, destination last). Produces a given positive number of blocks of
 * output with a nonce of 0, taking an input key and 8-byte counter.
 * Importantly does not spill to the stack. Its arguments are:
 *
 *	rdi: output bytes (64 * rcx bytes are written, using unaligned stores)
 *	rsi: 32-byte key input (read with unaligned loads, never written)
 *	rdx: 8-byte counter input/output (read on entry; on exit it holds the
 *	     entry value plus the number of blocks produced)
 *	rcx: number of 64-byte blocks to write to output
 *
 * rcx must be nonzero: the block count is only tested after a block has been
 * produced (decq/jnz at the bottom of .Lblock), so a count of 0 would wrap
 * around rather than return immediately.
 *
 * Clobbers rax, rcx, rdi, xmm0-xmm9 and the flags. On return xmm0-xmm4, xmm6
 * and xmm7 have been zeroed.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* Symbolic names for the registers used below. */
.set	output,		%rdi
.set	key,		%rsi
.set	counter,	%rdx
.set	nblocks,	%rcx
.set	i,		%al
/* xmm registers are *not* callee-save. */
.set	temp,		%xmm0
.set	state0,		%xmm1
.set	state1,		%xmm2
.set	state2,		%xmm3
.set	state3,		%xmm4
.set	copy0,		%xmm5
.set	copy1,		%xmm6
.set	copy2,		%xmm7
.set	copy3,		%xmm8
.set	one,		%xmm9

	/*
	 * copy0..copy3 hold the four 16-byte rows of the input state for the
	 * current block; they are kept intact across the permutation so they
	 * can be added back into the result afterwards.
	 */

	/* copy0 = "expand 32-byte k" */
	movaps		CONSTANTS(%rip),copy0
	/* copy1,copy2 = key */
	movups		0x00(key),copy1
	movups		0x10(key),copy2
	/* copy3 = counter || zero nonce (movq clears the upper 64 bits) */
	movq		0x00(counter),copy3
	/* one = 1 || 0, so that adding it only ever changes the counter half */
	movq		$1,%rax
	movq		%rax,one

	/* Outer loop: one 64-byte output block per iteration. */
.Lblock:
	/* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */
	movdqa		copy0,state0
	movdqa		copy1,state1
	movdqa		copy2,state2
	movdqa		copy3,state3

	/* 10 iterations of two rounds each = the 20 rounds of ChaCha20. */
	movb		$10,i
.Lpermute:
	/*
	 * First round of the pair. Each instruction works on four 32-bit
	 * lanes at once, so this performs four quarter-rounds in parallel.
	 * SSE2 has no 32-bit rotate, so each rotl32(x, n) is built from
	 * (x << n) | (x >> (32 - n)) using temp.
	 */

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$16,temp
	psrld		$16,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$12,temp
	psrld		$20,state1
	por		temp,state1

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$8,temp
	psrld		$24,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$7,temp
	psrld		$25,state1
	por		temp,state1

	/*
	 * Rotate the lanes of rows 1..3 so that the same lane-wise code
	 * below mixes a different grouping of words in the second round.
	 */

	/* state1[0,1,2,3] = state1[1,2,3,0] */
	pshufd		$0x39,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[3,0,1,2] */
	pshufd		$0x93,state3,state3

	/* Second round of the pair: same sequence on the rotated rows. */

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$16,temp
	psrld		$16,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$12,temp
	psrld		$20,state1
	por		temp,state1

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$8,temp
	psrld		$24,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$7,temp
	psrld		$25,state1
	por		temp,state1

	/* Undo the lane rotation applied before the second round. */

	/* state1[0,1,2,3] = state1[3,0,1,2] */
	pshufd		$0x93,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[1,2,3,0] */
	pshufd		$0x39,state3,state3

	decb		i
	jnz		.Lpermute

	/* Add the input rows back into the permuted rows and store the block. */

	/* output0 = state0 + copy0 */
	paddd		copy0,state0
	movups		state0,0x00(output)
	/* output1 = state1 + copy1 */
	paddd		copy1,state1
	movups		state1,0x10(output)
	/* output2 = state2 + copy2 */
	paddd		copy2,state2
	movups		state2,0x20(output)
	/* output3 = state3 + copy3 */
	paddd		copy3,state3
	movups		state3,0x30(output)

	/* ++copy3.counter (64-bit add; the upper half of one is 0) */
	paddq		one,copy3

	/* output += 64, --nblocks */
	addq		$64,output
	decq		nblocks
	jnz		.Lblock

	/* counter = copy3.counter */
	movq		copy3,0x00(counter)

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. copy0, copy3 and one are left alone: they hold the constant
	 * row, the counter that was just stored to memory, and the value 1.
	 */
	pxor		state0,state0
	pxor		state1,state1
	pxor		state2,state2
	pxor		state3,state3
	pxor		copy1,copy1
	pxor		copy2,copy2
	pxor		temp,temp

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
179
180