/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
.section .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC: .octa 0x00000003000000020000000100000000
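
/*
 * For reference (illustrative, not part of the build): ROT8 and ROT16 above
 * are pshufb control masks that rotate every 32-bit lane left by 8 and 16
 * bits by permuting bytes, and CTRINC holds the per-lane offsets 0..3 that
 * the 4-block function adds to the block counter.  In rough C terms:
 *
 *        // pshufb: dst[i] = src[mask[i] & 15] for each of the 16 bytes
 *        // ROT8  turns each 32-bit lane v into (v <<  8) | (v >> 24)
 *        // ROT16 turns each 32-bit lane v into (v << 16) | (v >> 16)
 */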

.text

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round. 8/16-bit word rotation is
 * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
 * rotation uses traditional shift+OR.
 *
 * The round count is given in %r8d.
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
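
/*
 * For reference only (illustrative C, not part of the build): the scalar
 * double round that chacha_permute computes four columns at a time, with
 * %xmm0-%xmm3 holding rows 0-3 of the state.  Names below are made up.
 *
 *        #include <stdint.h>
 *
 *        static uint32_t rotl32(uint32_t v, int n)
 *        {
 *                return (v << n) | (v >> (32 - n));
 *        }
 *
 *        static void quarterround(uint32_t x[16], int a, int b, int c, int d)
 *        {
 *                x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
 *                x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
 *                x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
 *                x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
 *        }
 *
 *        static void doubleround(uint32_t x[16])
 *        {
 *                // column round: one quarterround per column, in parallel
 *                quarterround(x, 0, 4,  8, 12);
 *                quarterround(x, 1, 5,  9, 13);
 *                quarterround(x, 2, 6, 10, 14);
 *                quarterround(x, 3, 7, 11, 15);
 *                // diagonal round: set up below by pshufd-rotating rows 1-3
 *                quarterround(x, 0, 5, 10, 15);
 *                quarterround(x, 1, 6, 11, 12);
 *                quarterround(x, 2, 7,  8, 13);
 *                quarterround(x, 3, 4,  9, 14);
 *        }
 */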
SYM_FUNC_START_LOCAL(chacha_permute)

        movdqa ROT8(%rip),%xmm4
        movdqa ROT16(%rip),%xmm5

.Ldoubleround:
        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
        pshufb %xmm5,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm6
        pslld $12,%xmm6
        psrld $20,%xmm1
        por %xmm6,%xmm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
        pshufb %xmm4,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm7
        pslld $7,%xmm7
        psrld $25,%xmm1
        por %xmm7,%xmm1

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        pshufd $0x39,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        pshufd $0x93,%xmm3,%xmm3

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
        pshufb %xmm5,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm6
        pslld $12,%xmm6
        psrld $20,%xmm1
        por %xmm6,%xmm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
        pshufb %xmm4,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm7
        pslld $7,%xmm7
        psrld $25,%xmm1
        por %xmm7,%xmm1

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        pshufd $0x93,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        pshufd $0x39,%xmm3,%xmm3

        sub $2,%r8d
        jnz .Ldoubleround

        RET
SYM_FUNC_END(chacha_permute)

SYM_FUNC_START(chacha_block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: up to 1 data block output, o
        # %rdx: up to 1 data block input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds
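        #
        # In SysV AMD64 order these are the first five C arguments; the
        # declaration on the C side is roughly of the form below (illustrative
        # only, the exact parameter types live in the C glue header):
        #
        #       void chacha_block_xor_ssse3(u32 *state, u8 *dst,
        #                                   const u8 *src,
        #                                   unsigned int len, int nrounds);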
        FRAME_BEGIN

        # x0..3 = s0..3
        movdqu 0x00(%rdi),%xmm0
        movdqu 0x10(%rdi),%xmm1
        movdqu 0x20(%rdi),%xmm2
        movdqu 0x30(%rdi),%xmm3
        movdqa %xmm0,%xmm8
        movdqa %xmm1,%xmm9
        movdqa %xmm2,%xmm10
        movdqa %xmm3,%xmm11

        mov %rcx,%rax
        call chacha_permute

        # o0 = i0 ^ (x0 + s0)
        paddd %xmm8,%xmm0
        cmp $0x10,%rax
        jl .Lxorpart
        movdqu 0x00(%rdx),%xmm4
        pxor %xmm4,%xmm0
        movdqu %xmm0,0x00(%rsi)
        # o1 = i1 ^ (x1 + s1)
        paddd %xmm9,%xmm1
        movdqa %xmm1,%xmm0
        cmp $0x20,%rax
        jl .Lxorpart
        movdqu 0x10(%rdx),%xmm0
        pxor %xmm1,%xmm0
        movdqu %xmm0,0x10(%rsi)
        # o2 = i2 ^ (x2 + s2)
        paddd %xmm10,%xmm2
        movdqa %xmm2,%xmm0
        cmp $0x30,%rax
        jl .Lxorpart
        movdqu 0x20(%rdx),%xmm0
        pxor %xmm2,%xmm0
        movdqu %xmm0,0x20(%rsi)
        # o3 = i3 ^ (x3 + s3)
        paddd %xmm11,%xmm3
        movdqa %xmm3,%xmm0
        cmp $0x40,%rax
        jl .Lxorpart
        movdqu 0x30(%rdx),%xmm0
        pxor %xmm3,%xmm0
        movdqu %xmm0,0x30(%rsi)

.Ldone:
        FRAME_END
        RET

.Lxorpart:
        # xor remaining bytes from partial register into output
        mov %rax,%r9
        and $0x0f,%r9
        jz .Ldone
        and $~0x0f,%rax

        mov %rsi,%r11

        lea 8(%rsp),%r10
        sub $0x10,%rsp
        and $~31,%rsp

        lea (%rdx,%rax),%rsi
        mov %rsp,%rdi
        mov %r9,%rcx
        rep movsb

        pxor 0x00(%rsp),%xmm0
        movdqa %xmm0,0x00(%rsp)

        mov %rsp,%rsi
        lea (%r11,%rax),%rdi
        mov %r9,%rcx
        rep movsb

        lea -8(%r10),%rsp
        jmp .Ldone
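
        # A rough C equivalent of the .Lxorpart tail path above (names are
        # illustrative, not kernel code): the last 1..15 bytes are bounced
        # through an aligned stack buffer so a full 16-byte pxor can be used.
        #
        #       size_t full = len & ~(size_t)15;   // bytes already written
        #       size_t rem  = len & 15;            // trailing bytes
        #       u8 buf[16] = { 0 };                // 16-byte aligned scratch
        #
        #       memcpy(buf, src + full, rem);      // first rep movsb
        #       for (int i = 0; i < 16; i++)       // pxor/movdqa on (%rsp)
        #               buf[i] ^= keystream[i];    // keystream held in %xmm0
        #       memcpy(dst + full, buf, rem);      // second rep movsb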

SYM_FUNC_END(chacha_block_xor_ssse3)

SYM_FUNC_START(hchacha_block_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: output (8 32-bit words)
        # %edx: nrounds
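        #
        # HChaCha runs the permutation only: there is no feed-forward add of
        # the input state, and only rows 0 and 3 of the permuted state (words
        # 0..3 and 12..15, i.e. %xmm0 and %xmm3) are written out.  Roughly,
        # on the C side (illustrative only, exact types live in the glue
        # header):
        #
        #       void hchacha_block_ssse3(const u32 *state, u32 out[8],
        #                                int nrounds);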
        FRAME_BEGIN

        movdqu 0x00(%rdi),%xmm0
        movdqu 0x10(%rdi),%xmm1
        movdqu 0x20(%rdi),%xmm2
        movdqu 0x30(%rdi),%xmm3

        mov %edx,%r8d
        call chacha_permute

        movdqu %xmm0,0x00(%rsi)
        movdqu %xmm3,0x10(%rsi)

        FRAME_END
        RET
SYM_FUNC_END(hchacha_block_ssse3)

SYM_FUNC_START(chacha_4block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: up to 4 data blocks output, o
        # %rdx: up to 4 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds

        # This function encrypts four consecutive ChaCha blocks by loading
        # the state matrix into SSE registers four times. As we need some
        # scratch registers, we save the first four registers on the stack.
        # The algorithm performs each operation on the corresponding word of
        # each state matrix, hence requires no word shuffling. For the final
        # XOR step we transpose the matrix by interleaving 32- and then
        # 64-bit words, which allows us to do the XOR in SSE registers.
        # 8/16-bit word rotation is done with the slightly better performing
        # SSSE3 byte shuffling, 7/12-bit word rotation uses traditional
        # shift+OR.
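        #
        # Illustrative data layout in rough C (not kernel code, names made
        # up): x[i] holds word i of the state for all four blocks, one block
        # per 32-bit lane, so a single paddd/pxor/rotate advances word i of
        # blocks 0..3 at once.
        #
        #       typedef struct { u32 lane[4]; } v4;      // one SSE register
        #
        #       v4 x[16];        // x[0..3] live on the stack,
        #                        // x[4..15] in %xmm4..%xmm15
        #       for (int i = 0; i < 16; i++)
        #               for (int b = 0; b < 4; b++)
        #                       x[i].lane[b] = state[i]; // movq + pshufd
        #       for (int b = 0; b < 4; b++)
        #               x[12].lane[b] += b;              // paddd CTRINC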

        lea 8(%rsp),%r10
        sub $0x80,%rsp
        and $~63,%rsp
        mov %rcx,%rax

        # x0..15[0-3] = s0..3[0..3]
        movq 0x00(%rdi),%xmm1
        pshufd $0x00,%xmm1,%xmm0
        pshufd $0x55,%xmm1,%xmm1
        movq 0x08(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        movq 0x10(%rdi),%xmm5
        pshufd $0x00,%xmm5,%xmm4
        pshufd $0x55,%xmm5,%xmm5
        movq 0x18(%rdi),%xmm7
        pshufd $0x00,%xmm7,%xmm6
        pshufd $0x55,%xmm7,%xmm7
        movq 0x20(%rdi),%xmm9
        pshufd $0x00,%xmm9,%xmm8
        pshufd $0x55,%xmm9,%xmm9
        movq 0x28(%rdi),%xmm11
        pshufd $0x00,%xmm11,%xmm10
        pshufd $0x55,%xmm11,%xmm11
        movq 0x30(%rdi),%xmm13
        pshufd $0x00,%xmm13,%xmm12
        pshufd $0x55,%xmm13,%xmm13
        movq 0x38(%rdi),%xmm15
        pshufd $0x00,%xmm15,%xmm14
        pshufd $0x55,%xmm15,%xmm15
        # x0..3 on stack
        movdqa %xmm0,0x00(%rsp)
        movdqa %xmm1,0x10(%rsp)
        movdqa %xmm2,0x20(%rsp)
        movdqa %xmm3,0x30(%rsp)

        movdqa CTRINC(%rip),%xmm1
        movdqa ROT8(%rip),%xmm2
        movdqa ROT16(%rip),%xmm3

        # x12 += counter values 0-3
        paddd %xmm1,%xmm12

.Ldoubleround4:
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        movdqa 0x00(%rsp),%xmm0
        paddd %xmm4,%xmm0
        movdqa %xmm0,0x00(%rsp)
        pxor %xmm0,%xmm12
        pshufb %xmm3,%xmm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        movdqa 0x10(%rsp),%xmm0
        paddd %xmm5,%xmm0
        movdqa %xmm0,0x10(%rsp)
        pxor %xmm0,%xmm13
        pshufb %xmm3,%xmm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        movdqa 0x20(%rsp),%xmm0
        paddd %xmm6,%xmm0
        movdqa %xmm0,0x20(%rsp)
        pxor %xmm0,%xmm14
        pshufb %xmm3,%xmm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        movdqa 0x30(%rsp),%xmm0
        paddd %xmm7,%xmm0
        movdqa %xmm0,0x30(%rsp)
        pxor %xmm0,%xmm15
        pshufb %xmm3,%xmm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        paddd %xmm12,%xmm8
        pxor %xmm8,%xmm4
        movdqa %xmm4,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm4
        por %xmm0,%xmm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        paddd %xmm13,%xmm9
        pxor %xmm9,%xmm5
        movdqa %xmm5,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm5
        por %xmm0,%xmm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        paddd %xmm14,%xmm10
        pxor %xmm10,%xmm6
        movdqa %xmm6,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm6
        por %xmm0,%xmm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        paddd %xmm15,%xmm11
        pxor %xmm11,%xmm7
        movdqa %xmm7,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm7
        por %xmm0,%xmm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        movdqa 0x00(%rsp),%xmm0
        paddd %xmm4,%xmm0
        movdqa %xmm0,0x00(%rsp)
        pxor %xmm0,%xmm12
        pshufb %xmm2,%xmm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        movdqa 0x10(%rsp),%xmm0
        paddd %xmm5,%xmm0
        movdqa %xmm0,0x10(%rsp)
        pxor %xmm0,%xmm13
        pshufb %xmm2,%xmm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        movdqa 0x20(%rsp),%xmm0
        paddd %xmm6,%xmm0
        movdqa %xmm0,0x20(%rsp)
        pxor %xmm0,%xmm14
        pshufb %xmm2,%xmm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        movdqa 0x30(%rsp),%xmm0
        paddd %xmm7,%xmm0
        movdqa %xmm0,0x30(%rsp)
        pxor %xmm0,%xmm15
        pshufb %xmm2,%xmm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        paddd %xmm12,%xmm8
        pxor %xmm8,%xmm4
        movdqa %xmm4,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm4
        por %xmm0,%xmm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        paddd %xmm13,%xmm9
        pxor %xmm9,%xmm5
        movdqa %xmm5,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm5
        por %xmm0,%xmm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        paddd %xmm14,%xmm10
        pxor %xmm10,%xmm6
        movdqa %xmm6,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm6
        por %xmm0,%xmm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        paddd %xmm15,%xmm11
        pxor %xmm11,%xmm7
        movdqa %xmm7,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm7
        por %xmm0,%xmm7

        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        movdqa 0x00(%rsp),%xmm0
        paddd %xmm5,%xmm0
        movdqa %xmm0,0x00(%rsp)
        pxor %xmm0,%xmm15
        pshufb %xmm3,%xmm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        movdqa 0x10(%rsp),%xmm0
        paddd %xmm6,%xmm0
        movdqa %xmm0,0x10(%rsp)
        pxor %xmm0,%xmm12
        pshufb %xmm3,%xmm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        movdqa 0x20(%rsp),%xmm0
        paddd %xmm7,%xmm0
        movdqa %xmm0,0x20(%rsp)
        pxor %xmm0,%xmm13
        pshufb %xmm3,%xmm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        movdqa 0x30(%rsp),%xmm0
        paddd %xmm4,%xmm0
        movdqa %xmm0,0x30(%rsp)
        pxor %xmm0,%xmm14
        pshufb %xmm3,%xmm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        paddd %xmm15,%xmm10
        pxor %xmm10,%xmm5
        movdqa %xmm5,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm5
        por %xmm0,%xmm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        paddd %xmm12,%xmm11
        pxor %xmm11,%xmm6
        movdqa %xmm6,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm6
        por %xmm0,%xmm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        paddd %xmm13,%xmm8
        pxor %xmm8,%xmm7
        movdqa %xmm7,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm7
        por %xmm0,%xmm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        paddd %xmm14,%xmm9
        pxor %xmm9,%xmm4
        movdqa %xmm4,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm4
        por %xmm0,%xmm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        movdqa 0x00(%rsp),%xmm0
        paddd %xmm5,%xmm0
        movdqa %xmm0,0x00(%rsp)
        pxor %xmm0,%xmm15
        pshufb %xmm2,%xmm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        movdqa 0x10(%rsp),%xmm0
        paddd %xmm6,%xmm0
        movdqa %xmm0,0x10(%rsp)
        pxor %xmm0,%xmm12
        pshufb %xmm2,%xmm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        movdqa 0x20(%rsp),%xmm0
        paddd %xmm7,%xmm0
        movdqa %xmm0,0x20(%rsp)
        pxor %xmm0,%xmm13
        pshufb %xmm2,%xmm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        movdqa 0x30(%rsp),%xmm0
        paddd %xmm4,%xmm0
        movdqa %xmm0,0x30(%rsp)
        pxor %xmm0,%xmm14
        pshufb %xmm2,%xmm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        paddd %xmm15,%xmm10
        pxor %xmm10,%xmm5
        movdqa %xmm5,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm5
        por %xmm0,%xmm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        paddd %xmm12,%xmm11
        pxor %xmm11,%xmm6
        movdqa %xmm6,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm6
        por %xmm0,%xmm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        paddd %xmm13,%xmm8
        pxor %xmm8,%xmm7
        movdqa %xmm7,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm7
        por %xmm0,%xmm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        paddd %xmm14,%xmm9
        pxor %xmm9,%xmm4
        movdqa %xmm4,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm4
        por %xmm0,%xmm4

        sub $2,%r8d
        jnz .Ldoubleround4

        # x0[0-3] += s0[0]
        # x1[0-3] += s0[1]
        movq 0x00(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd 0x00(%rsp),%xmm2
        movdqa %xmm2,0x00(%rsp)
        paddd 0x10(%rsp),%xmm3
        movdqa %xmm3,0x10(%rsp)
        # x2[0-3] += s0[2]
        # x3[0-3] += s0[3]
        movq 0x08(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd 0x20(%rsp),%xmm2
        movdqa %xmm2,0x20(%rsp)
        paddd 0x30(%rsp),%xmm3
        movdqa %xmm3,0x30(%rsp)

        # x4[0-3] += s1[0]
        # x5[0-3] += s1[1]
        movq 0x10(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd %xmm2,%xmm4
        paddd %xmm3,%xmm5
        # x6[0-3] += s1[2]
        # x7[0-3] += s1[3]
        movq 0x18(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd %xmm2,%xmm6
        paddd %xmm3,%xmm7

        # x8[0-3] += s2[0]
        # x9[0-3] += s2[1]
        movq 0x20(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd %xmm2,%xmm8
        paddd %xmm3,%xmm9
        # x10[0-3] += s2[2]
        # x11[0-3] += s2[3]
        movq 0x28(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd %xmm2,%xmm10
        paddd %xmm3,%xmm11

        # x12[0-3] += s3[0]
        # x13[0-3] += s3[1]
        movq 0x30(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd %xmm2,%xmm12
        paddd %xmm3,%xmm13
        # x14[0-3] += s3[2]
        # x15[0-3] += s3[3]
        movq 0x38(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd %xmm2,%xmm14
        paddd %xmm3,%xmm15

        # x12 += counter values 0-3
        paddd %xmm1,%xmm12
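
        # Illustrative 4x4 transpose in rough C (not kernel code): below,
        # each group of four rows a,b,c,d (one state word across blocks
        # 0..3) is interleaved at 32-bit and then 64-bit granularity, so
        # that each resulting vector holds four consecutive words of one
        # block and can be XORed against 16 contiguous bytes of input.
        # Using the v4 type from the sketch above:
        #
        #       // punpckldq/punpckhdq
        #       v4 ab_lo = { a.lane[0], b.lane[0], a.lane[1], b.lane[1] };
        #       v4 ab_hi = { a.lane[2], b.lane[2], a.lane[3], b.lane[3] };
        #       v4 cd_lo = { c.lane[0], d.lane[0], c.lane[1], d.lane[1] };
        #       v4 cd_hi = { c.lane[2], d.lane[2], c.lane[3], d.lane[3] };
        #       // punpcklqdq/punpckhqdq
        #       v4 blk0 = { ab_lo.lane[0], ab_lo.lane[1], cd_lo.lane[0], cd_lo.lane[1] };
        #       v4 blk1 = { ab_lo.lane[2], ab_lo.lane[3], cd_lo.lane[2], cd_lo.lane[3] };
        #       v4 blk2 = { ab_hi.lane[0], ab_hi.lane[1], cd_hi.lane[0], cd_hi.lane[1] };
        #       v4 blk3 = { ab_hi.lane[2], ab_hi.lane[3], cd_hi.lane[2], cd_hi.lane[3] };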

        # interleave 32-bit words in state n, n+1
        movdqa 0x00(%rsp),%xmm0
        movdqa 0x10(%rsp),%xmm1
        movdqa %xmm0,%xmm2
        punpckldq %xmm1,%xmm2
        punpckhdq %xmm1,%xmm0
        movdqa %xmm2,0x00(%rsp)
        movdqa %xmm0,0x10(%rsp)
        movdqa 0x20(%rsp),%xmm0
        movdqa 0x30(%rsp),%xmm1
        movdqa %xmm0,%xmm2
        punpckldq %xmm1,%xmm2
        punpckhdq %xmm1,%xmm0
        movdqa %xmm2,0x20(%rsp)
        movdqa %xmm0,0x30(%rsp)
        movdqa %xmm4,%xmm0
        punpckldq %xmm5,%xmm4
        punpckhdq %xmm5,%xmm0
        movdqa %xmm0,%xmm5
        movdqa %xmm6,%xmm0
        punpckldq %xmm7,%xmm6
        punpckhdq %xmm7,%xmm0
        movdqa %xmm0,%xmm7
        movdqa %xmm8,%xmm0
        punpckldq %xmm9,%xmm8
        punpckhdq %xmm9,%xmm0
        movdqa %xmm0,%xmm9
        movdqa %xmm10,%xmm0
        punpckldq %xmm11,%xmm10
        punpckhdq %xmm11,%xmm0
        movdqa %xmm0,%xmm11
        movdqa %xmm12,%xmm0
        punpckldq %xmm13,%xmm12
        punpckhdq %xmm13,%xmm0
        movdqa %xmm0,%xmm13
        movdqa %xmm14,%xmm0
        punpckldq %xmm15,%xmm14
        punpckhdq %xmm15,%xmm0
        movdqa %xmm0,%xmm15

        # interleave 64-bit words in state n, n+2
        movdqa 0x00(%rsp),%xmm0
        movdqa 0x20(%rsp),%xmm1
        movdqa %xmm0,%xmm2
        punpcklqdq %xmm1,%xmm2
        punpckhqdq %xmm1,%xmm0
        movdqa %xmm2,0x00(%rsp)
        movdqa %xmm0,0x20(%rsp)
        movdqa 0x10(%rsp),%xmm0
        movdqa 0x30(%rsp),%xmm1
        movdqa %xmm0,%xmm2
        punpcklqdq %xmm1,%xmm2
        punpckhqdq %xmm1,%xmm0
        movdqa %xmm2,0x10(%rsp)
        movdqa %xmm0,0x30(%rsp)
        movdqa %xmm4,%xmm0
        punpcklqdq %xmm6,%xmm4
        punpckhqdq %xmm6,%xmm0
        movdqa %xmm0,%xmm6
        movdqa %xmm5,%xmm0
        punpcklqdq %xmm7,%xmm5
        punpckhqdq %xmm7,%xmm0
        movdqa %xmm0,%xmm7
        movdqa %xmm8,%xmm0
        punpcklqdq %xmm10,%xmm8
        punpckhqdq %xmm10,%xmm0
        movdqa %xmm0,%xmm10
        movdqa %xmm9,%xmm0
        punpcklqdq %xmm11,%xmm9
        punpckhqdq %xmm11,%xmm0
        movdqa %xmm0,%xmm11
        movdqa %xmm12,%xmm0
        punpcklqdq %xmm14,%xmm12
        punpckhqdq %xmm14,%xmm0
        movdqa %xmm0,%xmm14
        movdqa %xmm13,%xmm0
        punpcklqdq %xmm15,%xmm13
        punpckhqdq %xmm15,%xmm0
        movdqa %xmm0,%xmm15

        # xor with corresponding input, write to output
        movdqa 0x00(%rsp),%xmm0
        cmp $0x10,%rax
        jl .Lxorpart4
        movdqu 0x00(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0x00(%rsi)

        movdqu %xmm4,%xmm0
        cmp $0x20,%rax
        jl .Lxorpart4
        movdqu 0x10(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0x10(%rsi)

        movdqu %xmm8,%xmm0
        cmp $0x30,%rax
        jl .Lxorpart4
        movdqu 0x20(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0x20(%rsi)

        movdqu %xmm12,%xmm0
        cmp $0x40,%rax
        jl .Lxorpart4
        movdqu 0x30(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0x30(%rsi)

        movdqa 0x20(%rsp),%xmm0
        cmp $0x50,%rax
        jl .Lxorpart4
        movdqu 0x40(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0x40(%rsi)

        movdqu %xmm6,%xmm0
        cmp $0x60,%rax
        jl .Lxorpart4
        movdqu 0x50(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0x50(%rsi)

        movdqu %xmm10,%xmm0
        cmp $0x70,%rax
        jl .Lxorpart4
        movdqu 0x60(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0x60(%rsi)

        movdqu %xmm14,%xmm0
        cmp $0x80,%rax
        jl .Lxorpart4
        movdqu 0x70(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0x70(%rsi)

        movdqa 0x10(%rsp),%xmm0
        cmp $0x90,%rax
        jl .Lxorpart4
        movdqu 0x80(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0x80(%rsi)

        movdqu %xmm5,%xmm0
        cmp $0xa0,%rax
        jl .Lxorpart4
        movdqu 0x90(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0x90(%rsi)

        movdqu %xmm9,%xmm0
        cmp $0xb0,%rax
        jl .Lxorpart4
        movdqu 0xa0(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0xa0(%rsi)

        movdqu %xmm13,%xmm0
        cmp $0xc0,%rax
        jl .Lxorpart4
        movdqu 0xb0(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0xb0(%rsi)

        movdqa 0x30(%rsp),%xmm0
        cmp $0xd0,%rax
        jl .Lxorpart4
        movdqu 0xc0(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0xc0(%rsi)

        movdqu %xmm7,%xmm0
        cmp $0xe0,%rax
        jl .Lxorpart4
        movdqu 0xd0(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0xd0(%rsi)

        movdqu %xmm11,%xmm0
        cmp $0xf0,%rax
        jl .Lxorpart4
        movdqu 0xe0(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0xe0(%rsi)

        movdqu %xmm15,%xmm0
        cmp $0x100,%rax
        jl .Lxorpart4
        movdqu 0xf0(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0xf0(%rsi)

.Ldone4:
        lea -8(%r10),%rsp
        RET

.Lxorpart4:
        # xor remaining bytes from partial register into output
        mov %rax,%r9
        and $0x0f,%r9
        jz .Ldone4
        and $~0x0f,%rax

        mov %rsi,%r11

        lea (%rdx,%rax),%rsi
        mov %rsp,%rdi
        mov %r9,%rcx
        rep movsb

        pxor 0x00(%rsp),%xmm0
        movdqa %xmm0,0x00(%rsp)

        mov %rsp,%rsi
        lea (%r11,%rax),%rdi
        mov %r9,%rcx
        rep movsb

        jmp .Ldone4

SYM_FUNC_END(chacha_4block_xor_ssse3)