.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
.section .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC: .octa 0x00000003000000020000000100000000
.text
SYM_FUNC_START_LOCAL(chacha_permute)
movdqa ROT8(%rip),%xmm4
movdqa ROT16(%rip),%xmm5
.Ldoubleround:
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm5,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm6
pslld $12,%xmm6
psrld $20,%xmm1
por %xmm6,%xmm1
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm4,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm7
pslld $7,%xmm7
psrld $25,%xmm1
por %xmm7,%xmm1
pshufd $0x39,%xmm1,%xmm1
pshufd $0x4e,%xmm2,%xmm2
pshufd $0x93,%xmm3,%xmm3
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm5,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm6
pslld $12,%xmm6
psrld $20,%xmm1
por %xmm6,%xmm1
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm4,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm7
pslld $7,%xmm7
psrld $25,%xmm1
por %xmm7,%xmm1
pshufd $0x93,%xmm1,%xmm1
pshufd $0x4e,%xmm2,%xmm2
pshufd $0x39,%xmm3,%xmm3
sub $2,%r8d
jnz .Ldoubleround
RET
SYM_FUNC_END(chacha_permute)
SYM_FUNC_START(chacha_block_xor_ssse3)
FRAME_BEGIN
movdqu 0x00(%rdi),%xmm0
movdqu 0x10(%rdi),%xmm1
movdqu 0x20(%rdi),%xmm2
movdqu 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
movdqa %xmm3,%xmm11
mov %rcx,%rax
call chacha_permute
paddd %xmm8,%xmm0
cmp $0x10,%rax
jl .Lxorpart
movdqu 0x00(%rdx),%xmm4
pxor %xmm4,%xmm0
movdqu %xmm0,0x00(%rsi)
paddd %xmm9,%xmm1
movdqa %xmm1,%xmm0
cmp $0x20,%rax
jl .Lxorpart
movdqu 0x10(%rdx),%xmm0
pxor %xmm1,%xmm0
movdqu %xmm0,0x10(%rsi)
paddd %xmm10,%xmm2
movdqa %xmm2,%xmm0
cmp $0x30,%rax
jl .Lxorpart
movdqu 0x20(%rdx),%xmm0
pxor %xmm2,%xmm0
movdqu %xmm0,0x20(%rsi)
paddd %xmm11,%xmm3
movdqa %xmm3,%xmm0
cmp $0x40,%rax
jl .Lxorpart
movdqu 0x30(%rdx),%xmm0
pxor %xmm3,%xmm0
movdqu %xmm0,0x30(%rsi)
.Ldone:
FRAME_END
RET
.Lxorpart:
mov %rax,%r9
and $0x0f,%r9
jz .Ldone
and $~0x0f,%rax
mov %rsi,%r11
lea 8(%rsp),%r10
sub $0x10,%rsp
and $~31,%rsp
lea (%rdx,%rax),%rsi
mov %rsp,%rdi
mov %r9,%rcx
rep movsb
pxor 0x00(%rsp),%xmm0
movdqa %xmm0,0x00(%rsp)
mov %rsp,%rsi
lea (%r11,%rax),%rdi
mov %r9,%rcx
rep movsb
lea -8(%r10),%rsp
jmp .Ldone
SYM_FUNC_END(chacha_block_xor_ssse3)
SYM_FUNC_START(hchacha_block_ssse3)
FRAME_BEGIN
movdqu 0x00(%rdi),%xmm0
movdqu 0x10(%rdi),%xmm1
movdqu 0x20(%rdi),%xmm2
movdqu 0x30(%rdi),%xmm3
mov %edx,%r8d
call chacha_permute
movdqu %xmm0,0x00(%rsi)
movdqu %xmm3,0x10(%rsi)
FRAME_END
RET
SYM_FUNC_END(hchacha_block_ssse3)
SYM_FUNC_START(chacha_4block_xor_ssse3)
lea 8(%rsp),%r10
sub $0x80,%rsp
and $~63,%rsp
mov %rcx,%rax
movq 0x00(%rdi),%xmm1
pshufd $0x00,%xmm1,%xmm0
pshufd $0x55,%xmm1,%xmm1
movq 0x08(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
movq 0x10(%rdi),%xmm5
pshufd $0x00,%xmm5,%xmm4
pshufd $0x55,%xmm5,%xmm5
movq 0x18(%rdi),%xmm7
pshufd $0x00,%xmm7,%xmm6
pshufd $0x55,%xmm7,%xmm7
movq 0x20(%rdi),%xmm9
pshufd $0x00,%xmm9,%xmm8
pshufd $0x55,%xmm9,%xmm9
movq 0x28(%rdi),%xmm11
pshufd $0x00,%xmm11,%xmm10
pshufd $0x55,%xmm11,%xmm11
movq 0x30(%rdi),%xmm13
pshufd $0x00,%xmm13,%xmm12
pshufd $0x55,%xmm13,%xmm13
movq 0x38(%rdi),%xmm15
pshufd $0x00,%xmm15,%xmm14
pshufd $0x55,%xmm15,%xmm15
movdqa %xmm0,0x00(%rsp)
movdqa %xmm1,0x10(%rsp)
movdqa %xmm2,0x20(%rsp)
movdqa %xmm3,0x30(%rsp)
movdqa CTRINC(%rip),%xmm1
movdqa ROT8(%rip),%xmm2
movdqa ROT16(%rip),%xmm3
paddd %xmm1,%xmm12
.Ldoubleround4:
movdqa 0x00(%rsp),%xmm0
paddd %xmm4,%xmm0
movdqa %xmm0,0x00(%rsp)
pxor %xmm0,%xmm12
pshufb %xmm3,%xmm12
movdqa 0x10(%rsp),%xmm0
paddd %xmm5,%xmm0
movdqa %xmm0,0x10(%rsp)
pxor %xmm0,%xmm13
pshufb %xmm3,%xmm13
movdqa 0x20(%rsp),%xmm0
paddd %xmm6,%xmm0
movdqa %xmm0,0x20(%rsp)
pxor %xmm0,%xmm14
pshufb %xmm3,%xmm14
movdqa 0x30(%rsp),%xmm0
paddd %xmm7,%xmm0
movdqa %xmm0,0x30(%rsp)
pxor %xmm0,%xmm15
pshufb %xmm3,%xmm15
paddd %xmm12,%xmm8
pxor %xmm8,%xmm4
movdqa %xmm4,%xmm0
pslld $12,%xmm0
psrld $20,%xmm4
por %xmm0,%xmm4
paddd %xmm13,%xmm9
pxor %xmm9,%xmm5
movdqa %xmm5,%xmm0
pslld $12,%xmm0
psrld $20,%xmm5
por %xmm0,%xmm5
paddd %xmm14,%xmm10
pxor %xmm10,%xmm6
movdqa %xmm6,%xmm0
pslld $12,%xmm0
psrld $20,%xmm6
por %xmm0,%xmm6
paddd %xmm15,%xmm11
pxor %xmm11,%xmm7
movdqa %xmm7,%xmm0
pslld $12,%xmm0
psrld $20,%xmm7
por %xmm0,%xmm7
movdqa 0x00(%rsp),%xmm0
paddd %xmm4,%xmm0
movdqa %xmm0,0x00(%rsp)
pxor %xmm0,%xmm12
pshufb %xmm2,%xmm12
movdqa 0x10(%rsp),%xmm0
paddd %xmm5,%xmm0
movdqa %xmm0,0x10(%rsp)
pxor %xmm0,%xmm13
pshufb %xmm2,%xmm13
movdqa 0x20(%rsp),%xmm0
paddd %xmm6,%xmm0
movdqa %xmm0,0x20(%rsp)
pxor %xmm0,%xmm14
pshufb %xmm2,%xmm14
movdqa 0x30(%rsp),%xmm0
paddd %xmm7,%xmm0
movdqa %xmm0,0x30(%rsp)
pxor %xmm0,%xmm15
pshufb %xmm2,%xmm15
paddd %xmm12,%xmm8
pxor %xmm8,%xmm4
movdqa %xmm4,%xmm0
pslld $7,%xmm0
psrld $25,%xmm4
por %xmm0,%xmm4
paddd %xmm13,%xmm9
pxor %xmm9,%xmm5
movdqa %xmm5,%xmm0
pslld $7,%xmm0
psrld $25,%xmm5
por %xmm0,%xmm5
paddd %xmm14,%xmm10
pxor %xmm10,%xmm6
movdqa %xmm6,%xmm0
pslld $7,%xmm0
psrld $25,%xmm6
por %xmm0,%xmm6
paddd %xmm15,%xmm11
pxor %xmm11,%xmm7
movdqa %xmm7,%xmm0
pslld $7,%xmm0
psrld $25,%xmm7
por %xmm0,%xmm7
movdqa 0x00(%rsp),%xmm0
paddd %xmm5,%xmm0
movdqa %xmm0,0x00(%rsp)
pxor %xmm0,%xmm15
pshufb %xmm3,%xmm15
movdqa 0x10(%rsp),%xmm0
paddd %xmm6,%xmm0
movdqa %xmm0,0x10(%rsp)
pxor %xmm0,%xmm12
pshufb %xmm3,%xmm12
movdqa 0x20(%rsp),%xmm0
paddd %xmm7,%xmm0
movdqa %xmm0,0x20(%rsp)
pxor %xmm0,%xmm13
pshufb %xmm3,%xmm13
movdqa 0x30(%rsp),%xmm0
paddd %xmm4,%xmm0
movdqa %xmm0,0x30(%rsp)
pxor %xmm0,%xmm14
pshufb %xmm3,%xmm14
paddd %xmm15,%xmm10
pxor %xmm10,%xmm5
movdqa %xmm5,%xmm0
pslld $12,%xmm0
psrld $20,%xmm5
por %xmm0,%xmm5
paddd %xmm12,%xmm11
pxor %xmm11,%xmm6
movdqa %xmm6,%xmm0
pslld $12,%xmm0
psrld $20,%xmm6
por %xmm0,%xmm6
paddd %xmm13,%xmm8
pxor %xmm8,%xmm7
movdqa %xmm7,%xmm0
pslld $12,%xmm0
psrld $20,%xmm7
por %xmm0,%xmm7
paddd %xmm14,%xmm9
pxor %xmm9,%xmm4
movdqa %xmm4,%xmm0
pslld $12,%xmm0
psrld $20,%xmm4
por %xmm0,%xmm4
movdqa 0x00(%rsp),%xmm0
paddd %xmm5,%xmm0
movdqa %xmm0,0x00(%rsp)
pxor %xmm0,%xmm15
pshufb %xmm2,%xmm15
movdqa 0x10(%rsp),%xmm0
paddd %xmm6,%xmm0
movdqa %xmm0,0x10(%rsp)
pxor %xmm0,%xmm12
pshufb %xmm2,%xmm12
movdqa 0x20(%rsp),%xmm0
paddd %xmm7,%xmm0
movdqa %xmm0,0x20(%rsp)
pxor %xmm0,%xmm13
pshufb %xmm2,%xmm13
movdqa 0x30(%rsp),%xmm0
paddd %xmm4,%xmm0
movdqa %xmm0,0x30(%rsp)
pxor %xmm0,%xmm14
pshufb %xmm2,%xmm14
paddd %xmm15,%xmm10
pxor %xmm10,%xmm5
movdqa %xmm5,%xmm0
pslld $7,%xmm0
psrld $25,%xmm5
por %xmm0,%xmm5
paddd %xmm12,%xmm11
pxor %xmm11,%xmm6
movdqa %xmm6,%xmm0
pslld $7,%xmm0
psrld $25,%xmm6
por %xmm0,%xmm6
paddd %xmm13,%xmm8
pxor %xmm8,%xmm7
movdqa %xmm7,%xmm0
pslld $7,%xmm0
psrld $25,%xmm7
por %xmm0,%xmm7
paddd %xmm14,%xmm9
pxor %xmm9,%xmm4
movdqa %xmm4,%xmm0
pslld $7,%xmm0
psrld $25,%xmm4
por %xmm0,%xmm4
sub $2,%r8d
jnz .Ldoubleround4
movq 0x00(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd 0x00(%rsp),%xmm2
movdqa %xmm2,0x00(%rsp)
paddd 0x10(%rsp),%xmm3
movdqa %xmm3,0x10(%rsp)
movq 0x08(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd 0x20(%rsp),%xmm2
movdqa %xmm2,0x20(%rsp)
paddd 0x30(%rsp),%xmm3
movdqa %xmm3,0x30(%rsp)
movq 0x10(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd %xmm2,%xmm4
paddd %xmm3,%xmm5
movq 0x18(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd %xmm2,%xmm6
paddd %xmm3,%xmm7
movq 0x20(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd %xmm2,%xmm8
paddd %xmm3,%xmm9
movq 0x28(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd %xmm2,%xmm10
paddd %xmm3,%xmm11
movq 0x30(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd %xmm2,%xmm12
paddd %xmm3,%xmm13
movq 0x38(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd %xmm2,%xmm14
paddd %xmm3,%xmm15
paddd %xmm1,%xmm12
movdqa 0x00(%rsp),%xmm0
movdqa 0x10(%rsp),%xmm1
movdqa %xmm0,%xmm2
punpckldq %xmm1,%xmm2
punpckhdq %xmm1,%xmm0
movdqa %xmm2,0x00(%rsp)
movdqa %xmm0,0x10(%rsp)
movdqa 0x20(%rsp),%xmm0
movdqa 0x30(%rsp),%xmm1
movdqa %xmm0,%xmm2
punpckldq %xmm1,%xmm2
punpckhdq %xmm1,%xmm0
movdqa %xmm2,0x20(%rsp)
movdqa %xmm0,0x30(%rsp)
movdqa %xmm4,%xmm0
punpckldq %xmm5,%xmm4
punpckhdq %xmm5,%xmm0
movdqa %xmm0,%xmm5
movdqa %xmm6,%xmm0
punpckldq %xmm7,%xmm6
punpckhdq %xmm7,%xmm0
movdqa %xmm0,%xmm7
movdqa %xmm8,%xmm0
punpckldq %xmm9,%xmm8
punpckhdq %xmm9,%xmm0
movdqa %xmm0,%xmm9
movdqa %xmm10,%xmm0
punpckldq %xmm11,%xmm10
punpckhdq %xmm11,%xmm0
movdqa %xmm0,%xmm11
movdqa %xmm12,%xmm0
punpckldq %xmm13,%xmm12
punpckhdq %xmm13,%xmm0
movdqa %xmm0,%xmm13
movdqa %xmm14,%xmm0
punpckldq %xmm15,%xmm14
punpckhdq %xmm15,%xmm0
movdqa %xmm0,%xmm15
movdqa 0x00(%rsp),%xmm0
movdqa 0x20(%rsp),%xmm1
movdqa %xmm0,%xmm2
punpcklqdq %xmm1,%xmm2
punpckhqdq %xmm1,%xmm0
movdqa %xmm2,0x00(%rsp)
movdqa %xmm0,0x20(%rsp)
movdqa 0x10(%rsp),%xmm0
movdqa 0x30(%rsp),%xmm1
movdqa %xmm0,%xmm2
punpcklqdq %xmm1,%xmm2
punpckhqdq %xmm1,%xmm0
movdqa %xmm2,0x10(%rsp)
movdqa %xmm0,0x30(%rsp)
movdqa %xmm4,%xmm0
punpcklqdq %xmm6,%xmm4
punpckhqdq %xmm6,%xmm0
movdqa %xmm0,%xmm6
movdqa %xmm5,%xmm0
punpcklqdq %xmm7,%xmm5
punpckhqdq %xmm7,%xmm0
movdqa %xmm0,%xmm7
movdqa %xmm8,%xmm0
punpcklqdq %xmm10,%xmm8
punpckhqdq %xmm10,%xmm0
movdqa %xmm0,%xmm10
movdqa %xmm9,%xmm0
punpcklqdq %xmm11,%xmm9
punpckhqdq %xmm11,%xmm0
movdqa %xmm0,%xmm11
movdqa %xmm12,%xmm0
punpcklqdq %xmm14,%xmm12
punpckhqdq %xmm14,%xmm0
movdqa %xmm0,%xmm14
movdqa %xmm13,%xmm0
punpcklqdq %xmm15,%xmm13
punpckhqdq %xmm15,%xmm0
movdqa %xmm0,%xmm15
movdqa 0x00(%rsp),%xmm0
cmp $0x10,%rax
jl .Lxorpart4
movdqu 0x00(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x00(%rsi)
movdqu %xmm4,%xmm0
cmp $0x20,%rax
jl .Lxorpart4
movdqu 0x10(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x10(%rsi)
movdqu %xmm8,%xmm0
cmp $0x30,%rax
jl .Lxorpart4
movdqu 0x20(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x20(%rsi)
movdqu %xmm12,%xmm0
cmp $0x40,%rax
jl .Lxorpart4
movdqu 0x30(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x30(%rsi)
movdqa 0x20(%rsp),%xmm0
cmp $0x50,%rax
jl .Lxorpart4
movdqu 0x40(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x40(%rsi)
movdqu %xmm6,%xmm0
cmp $0x60,%rax
jl .Lxorpart4
movdqu 0x50(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x50(%rsi)
movdqu %xmm10,%xmm0
cmp $0x70,%rax
jl .Lxorpart4
movdqu 0x60(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x60(%rsi)
movdqu %xmm14,%xmm0
cmp $0x80,%rax
jl .Lxorpart4
movdqu 0x70(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x70(%rsi)
movdqa 0x10(%rsp),%xmm0
cmp $0x90,%rax
jl .Lxorpart4
movdqu 0x80(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x80(%rsi)
movdqu %xmm5,%xmm0
cmp $0xa0,%rax
jl .Lxorpart4
movdqu 0x90(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x90(%rsi)
movdqu %xmm9,%xmm0
cmp $0xb0,%rax
jl .Lxorpart4
movdqu 0xa0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xa0(%rsi)
movdqu %xmm13,%xmm0
cmp $0xc0,%rax
jl .Lxorpart4
movdqu 0xb0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xb0(%rsi)
movdqa 0x30(%rsp),%xmm0
cmp $0xd0,%rax
jl .Lxorpart4
movdqu 0xc0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xc0(%rsi)
movdqu %xmm7,%xmm0
cmp $0xe0,%rax
jl .Lxorpart4
movdqu 0xd0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xd0(%rsi)
movdqu %xmm11,%xmm0
cmp $0xf0,%rax
jl .Lxorpart4
movdqu 0xe0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xe0(%rsi)
movdqu %xmm15,%xmm0
cmp $0x100,%rax
jl .Lxorpart4
movdqu 0xf0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xf0(%rsi)
.Ldone4:
lea -8(%r10),%rsp
RET
.Lxorpart4:
mov %rax,%r9
and $0x0f,%r9
jz .Ldone4
and $~0x0f,%rax
mov %rsi,%r11
lea (%rdx,%rax),%rsi
mov %rsp,%rdi
mov %r9,%rcx
rep movsb
pxor 0x00(%rsp),%xmm0
movdqa %xmm0,0x00(%rsp)
mov %rsp,%rsi
lea (%r11,%rax),%rdi
mov %r9,%rcx
rep movsb
jmp .Ldone4
SYM_FUNC_END(chacha_4block_xor_ssse3)