Path: blob/master/arch/x86/entry/vdso/vgetrandom-chacha.S
26493 views
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2022-2024 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.section	.rodata, "a"
.align 16
/* First row of the ChaCha state: the ASCII bytes of "expand 32-byte k". */
CONSTANTS:	.octa 0x6b20657479622d323320646e61707865
.text

/*
 * One add/xor/rotate step, applied to all four 32-bit lanes in parallel:
 *
 *	sum   += addend
 *	mixed  = rotl32(mixed ^ sum, amt)
 *
 * The rotate is built from a left shift, a right shift by (32 - amt) and an
 * OR of the two halves. Overwrites the `scratch' register alias.
 */
.macro chacha_add_xor_rotl sum, addend, mixed, amt
	paddd		\addend,\sum
	pxor		\sum,\mixed
	movdqa		\mixed,scratch
	pslld		$\amt,scratch
	psrld		$(32-\amt),\mixed
	por		scratch,\mixed
.endm

/*
 * Four quarter rounds at once, one per 32-bit lane of state0..state3, using
 * the rotation amounts 16, 12, 8, 7 in that order.
 */
.macro chacha_round
	chacha_add_xor_rotl	state0, state1, state3, 16
	chacha_add_xor_rotl	state2, state3, state1, 12
	chacha_add_xor_rotl	state0, state1, state3, 8
	chacha_add_xor_rotl	state2, state3, state1, 7
.endm

/*
 * Minimal SSE2 ChaCha20 block function that keeps everything in registers and
 * never touches the stack. The nonce is fixed at zero; the caller supplies the
 * key and an 8-byte block counter, and must request at least one block.
 *
 * Arguments:
 *	rdi: destination buffer, receives 64 bytes per block
 *	rsi: 32-byte key
 *	rdx: pointer to the 8-byte counter; read on entry, and the counter
 *	     advanced by the number of blocks is stored back on exit
 *	rcx: number of 64-byte blocks to generate (must be positive)
 *
 * Overwrites rax, rcx, rdi, the flags and xmm0-xmm9.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

.set	output,		%rdi
.set	key,		%rsi
.set	counter,	%rdx
.set	nblocks,	%rcx
.set	drounds,	%al
/* xmm registers are caller-saved, so none of them need preserving. */
.set	scratch,	%xmm0
.set	state0,		%xmm1
.set	state1,		%xmm2
.set	state2,		%xmm3
.set	state3,		%xmm4
.set	copy0,		%xmm5
.set	copy1,		%xmm6
.set	copy2,		%xmm7
.set	copy3,		%xmm8
.set	one,		%xmm9

	/* Row 0 of the initial state: the "expand 32-byte k" constant. */
	movaps		CONSTANTS(%rip),copy0
	/* Rows 1 and 2: the 256-bit key. */
	movups		0x00(key),copy1
	movups		0x10(key),copy2
	/* Row 3: 64-bit counter in the low half, all-zero nonce in the high half. */
	movq		0x00(counter),copy3
	/* Constant 1 in the low 64 bits, used to step the counter per block. */
	movq		$1,%rax
	movq		%rax,one

.Lnext_block:
	/* Start each block from the saved initial state. */
	movdqa		copy0,state0
	movdqa		copy1,state1
	movdqa		copy2,state2
	movdqa		copy3,state3

	/* 10 double rounds = 20 ChaCha rounds. */
	movb		$10,drounds
.Ldouble_round:
	/* Column round: lane n of each row forms one quarter round. */
	chacha_round

	/*
	 * Rotate rows 1, 2 and 3 left by 1, 2 and 3 lanes so that the
	 * diagonals line up as columns.
	 */
	/* state1[0,1,2,3] = state1[1,2,3,0] */
	pshufd		$0x39,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[3,0,1,2] */
	pshufd		$0x93,state3,state3

	/* Diagonal round: same arithmetic on the re-aligned rows. */
	chacha_round

	/* Undo the lane rotation to restore the column layout. */
	/* state1[0,1,2,3] = state1[3,0,1,2] */
	pshufd		$0x93,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[1,2,3,0] */
	pshufd		$0x39,state3,state3

	decb		drounds
	jnz		.Ldouble_round

	/* Feed-forward: add the initial state row by row and emit 64 bytes. */
	paddd		copy0,state0
	movups		state0,0x00(output)
	paddd		copy1,state1
	movups		state1,0x10(output)
	paddd		copy2,state2
	movups		state2,0x20(output)
	paddd		copy3,state3
	movups		state3,0x30(output)

	/* Advance the 64-bit block counter held in the low half of copy3. */
	paddq		one,copy3

	/* Move on to the next 64-byte slot; loop while blocks remain. */
	addq		$64,output
	decq		nblocks
	jnz		.Lnext_block

	/* Hand the advanced counter back to the caller. */
	movq		copy3,0x00(counter)

	/*
	 * Scrub the registers that held key material or keystream, in case
	 * nothing else overwrites them afterwards.
	 */
	pxor		state0,state0
	pxor		state1,state1
	pxor		state2,state2
	pxor		state3,state3
	pxor		copy1,copy1
	pxor		copy2,copy2
	pxor		scratch,scratch

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)