/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <[email protected]>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
 */

/*
 * ChaCha block function and HChaCha for MIPS32r2 (uses rotr, wsbh and ins).
 * GNU as syntax, run through the C preprocessor.
 */

/* Mask selecting the byte offset of the last (partial) word: 0, 4, ... 60. */
#define MASK_U32		0x3c
#define CHACHA20_BLOCK_SIZE	64
/* Room for the eight callee-saved registers $s0-$s7. */
#define STACK_SIZE		32

/* Working copy of the 16 state words. */
#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0	$s1
#define T1	$s0
#define T(n)	T ## n
#define X(n)	X ## n

/* Input arguments */
#define STATE		$a0
#define OUT		$a1
#define IN		$a2
#define BYTES		$a3

/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
 * We don't want to touch original value in memory.
 * Must be incremented every loop iteration.
 */
#define NONCE_0		$v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used for handling the last bytes which are not a multiple of 4.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

/* Shares $s7 with SAVED_CA: SAVED_CA is only loaded on the final, partial
 * block, after the last test of IS_UNALIGNED.
 */
#define IS_UNALIGNED	$s7

/* MSB/LSB are the byte offsets used by lwl/lwr and swl/swr for unaligned
 * word access. CPU_TO_LE32 byte-swaps a register on big-endian
 * (wsbh swaps bytes within halfwords, rotr 16 swaps the halfwords) and is
 * empty on little-endian.
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define CPU_TO_LE32(n) \
	wsbh	n, n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define CPU_TO_LE32(n)
#endif

/* Expand x(n) for n = 0..15, ascending. */
#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);

/* Expand x(n) for n = 15..0, descending. */
#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);

/* Preprocessor-time n + 1, needed to build label names by token pasting. */
#define PLUS_ONE_0	1
#define PLUS_ONE_1	2
#define PLUS_ONE_2	3
#define PLUS_ONE_3	4
#define PLUS_ONE_4	5
#define PLUS_ONE_5	6
#define PLUS_ONE_6	7
#define PLUS_ONE_7	8
#define PLUS_ONE_8	9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)

/* Finish word x and emit it: X[x] += STATE[x] (NONCE_0 for word 12, which
 * lives in a register), convert to little-endian, xor with the input word
 * and store. Unaligned variant using lwl/lwr and swl/swr.
 *
 * The label in front is named after x + 1. Because the words are emitted
 * in descending order, entering at label "k_b" stores exactly the k words
 * k-1 .. 0; the jump table relies on this.
 * Clobbers T0 and T1.
 */
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

/* Same as STORE_UNALIGNED, but with plain lw/sw for word-aligned IN/OUT. */
#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);

/* Jump table macro.
 * Used for setup and handling the last bytes, which are not multiple of 4.
 * X15 is free to store Xn
 * Every jumptable entry must be equal in size.
 *
 * Each entry is exactly two instructions (branch + delay slot, 8 bytes):
 * the delay slot computes the keystream word for the partial word x into
 * SAVED_X while the branch enters the store chain at the point that writes
 * the x preceding full words.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/* Add-Xor-Rotate on four independent lanes:
 *   X[A] += X[K]; X[V] ^= X[A]; X[V] = rotl(X[V], S)
 * and likewise for (B, L, W), (C, M, Y) and (D, N, Z).
 * The left rotate by S is written as a right rotate by 32 - S.
 */
#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotr	X(V), 32 - S; \
	rotr	X(W), 32 - S; \
	rotr	X(Y), 32 - S; \
	rotr	X(Z), 32 - S;

/*
 * chacha_crypt_arch
 *
 * In:    $a0 = STATE  16-word state, read as words at offsets 0..60
 *        $a1 = OUT    destination buffer
 *        $a2 = IN     source buffer
 *        $a3 = BYTES  number of bytes to process
 *        16($sp)      number of rounds (presumably the fifth C argument
 *                     under the o32 calling convention - confirm against
 *                     the C prototype)
 * Out:   OUT[0..BYTES) = IN xor keystream. STATE word 12 (offset 48) is
 *        advanced by one per 64-byte block processed, including a final
 *        partial block. Nothing is written when BYTES == 0.
 * Stack: STACK_SIZE bytes, used to save/restore $s0-$s7.
 * Clobb: $at, $v0, $v1, $a1-$a3, $t0-$t9.
 *
 * The round loop subtracts 2 per iteration and exits only when the counter
 * is exactly zero, so the round count must be a positive even number.
 */
.text
.set	reorder
.set	noat
.globl	chacha_crypt_arch
.ent	chacha_crypt_arch
chacha_crypt_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Load number of rounds */
	lw	$at, 16($sp)

	addiu	$sp, -STACK_SIZE

	/* Return early when BYTES == 0. */
	beqz	BYTES, .Lchacha_mips_end

	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0,  0($sp)
	sw	$s1,  4($sp)
	sw	$s2,  8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	b	.Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	/* Advance to the next block. */
	addiu	IN,  CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha_rounds_start:
	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)

	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	/* Word 12 comes from the register copy, not from memory. */
	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_chacha_xor_rounds:
	/* One iteration = two rounds: first four AXR lines work on columns,
	 * last four on diagonals.
	 */
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha_xor_rounds

	/* From here on BYTES is biased by -64: >0 more blocks follow,
	 * ==0 this is the last full block, <0 this block is partial.
	 */
	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* Is data src/dst unaligned? Jump */
	bnez	IS_UNALIGNED, .Loop_chacha_unaligned

	/* Set number rounds here to fill delayslot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_aligned

	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Place this here to fill delay slot */
	addiu	NONCE_0, 1

	/* BYTES < 0? Handle last bytes */
	bltz	BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore used registers */
	lw	$s0,  0($sp)
	lw	$s1,  4($sp)
	lw	$s2,  8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to right location in state */
	sw	NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha_mips_no_full_block_aligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS (as a byte offset: 4 * words) */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Calculate lower half jump table offset.
	 * Inserting the 6 low bits of $at at bit 1 yields 2 * $at, i.e.
	 * 8 bytes (one two-instruction table entry) per full word.
	 */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Read value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
	/* Set number rounds here to fill delayslot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned

	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Write NONCE_0 back to right location in state */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* Fall through to byte handling */
	bgez	BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Place this here to fill delay slot */
	addiu	NONCE_0, 1
	.set reorder

.Lchacha_mips_xor_bytes:
	/* $at = byte offset of the partial word, BYTES = -(1..3) trailing
	 * bytes, SAVED_X = keystream word for that position. The low byte
	 * of SAVED_X is consumed first, then rotated out.
	 */
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	rotr	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	rotr	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS (as a byte offset: 4 * words) */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Calculate lower half jump table offset (2 * $at, see aligned path) */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Read value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/*
 * hchacha_block_arch
 *
 * Input arguments
 * STATE	$a0
 * OUT		$a1
 * NROUND	$a2
 *
 * Runs NROUND rounds over the 16 state words read from STATE and stores
 * words 0-3 and 12-15 of the result to OUT (32 bytes). Unlike
 * chacha_crypt_arch, the input state is not added back.
 * NROUND is decremented by 2 per loop iteration and must be a positive
 * even number.
 * Stack: STACK_SIZE bytes, only the first word is used (saves $s6).
 * Clobb: $at, $v0, $v1, $a0, $a2, $a3, $t0-$t9.
 */

/* Remap X12-X15 onto caller-saved registers so that only X11 ($s6) needs
 * saving. X15 reuses STATE ($a0); this works because word 15 is the last
 * load from STATE.
 */
#undef X12
#undef X13
#undef X14
#undef X15

#define X12	$a3
#define X13	$at
#define X14	$v0
#define X15	STATE

.set noat
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Save X11(s6) */
	sw	X11, 0($sp)

	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)
	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)
	lw	X12, 48(STATE)
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	/* Must stay last: X15 is STATE, so this load destroys the pointer. */
	lw	X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	addiu	$a2, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Restore used register */
	lw	X11, 0($sp)

	sw	X0,  0(OUT)
	sw	X1,  4(OUT)
	sw	X2,  8(OUT)
	sw	X3,  12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	addiu	$sp, STACK_SIZE
	jr	$ra
.end hchacha_block_arch
.set at