/*
 * Path: blob/main/sys/contrib/openzfs/module/icp/algs/skein/skein_block.c
 * 48775 views
 */
// SPDX-License-Identifier: LicenseRef-OpenZFS-ThirdParty-PublicDomain1/*2* Implementation of the Skein block functions.3* Source code author: Doug Whiting, 2008.4* This algorithm and source code is released to the public domain.5* Compile-time switches:6* SKEIN_USE_ASM -- set bits (256/512/1024) to select which7* versions use ASM code for block processing8* [default: use C for all block sizes]9*/10/* Copyright 2013 Doug Whiting. This code is released to the public domain. */1112#include <sys/skein.h>13#include "skein_impl.h"14#include <sys/isa_defs.h> /* for _ILP32 */1516#ifndef SKEIN_USE_ASM17#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */18#endif1920#ifndef SKEIN_LOOP21/*22* The low-level checksum routines use a lot of stack space. On systems where23* small stacks frame are enforced (like 32-bit kernel builds), do not unroll24* checksum calculations to save stack space.25*26* Even with no loops unrolled, we still can exceed the 1k stack frame limit27* in Skein1024_Process_Block() (it hits 1272 bytes on ARM32). We can28* safely ignore it though, since that the checksum functions will be called29* from a worker thread that won't be using much stack. 
That's why we have30* the #pragma here to ignore the warning.31*/32#if defined(_ILP32) || defined(__powerpc) /* Assume small stack */33#if defined(__GNUC__) && !defined(__clang__)34#pragma GCC diagnostic ignored "-Wframe-larger-than="35#endif36/*37* We're running on 32-bit, don't unroll loops to save stack frame space38*39* Due to the ways the calculations on SKEIN_LOOP are done in40* Skein_*_Process_Block(), a value of 111 disables unrolling loops41* in any of those functions.42*/43#define SKEIN_LOOP 11144#else45/* We're compiling with large stacks */46#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */47#endif48#endif4950/* some useful definitions for code here */51#define BLK_BITS (WCNT*64)52#define KW_TWK_BASE (0)53#define KW_KEY_BASE (3)54#define ks (kw + KW_KEY_BASE)55#define ts (kw + KW_TWK_BASE)5657/* no debugging in Illumos version */58#define DebugSaveTweak(ctx)5960/* Skein_256 */61#if !(SKEIN_USE_ASM & 256)62void63Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,64size_t blkCnt, size_t byteCntAdd)65{66enum {67WCNT = SKEIN_256_STATE_WORDS68};69#undef RCNT70#define RCNT (SKEIN_256_ROUNDS_TOTAL / 8)7172#ifdef SKEIN_LOOP /* configure how much to unroll the loop */73#define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)74#else75#define SKEIN_UNROLL_256 (0)76#endif7778#if SKEIN_UNROLL_25679#if (RCNT % SKEIN_UNROLL_256)80#error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */81#endif82size_t r;83/* key schedule words : chaining vars + tweak + "rotation" */84uint64_t kw[WCNT + 4 + RCNT * 2];85#else86uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */87#endif88/* local copy of context vars, for speed */89uint64_t X0, X1, X2, X3;90uint64_t w[WCNT]; /* local copy of input block */91#ifdef SKEIN_DEBUG92/* use for debugging (help compiler put Xn in registers) */93const uint64_t *Xptr[4];94Xptr[0] = &X0;95Xptr[1] = &X1;96Xptr[2] = &X2;97Xptr[3] = &X3;98#endif99Skein_assert(blkCnt != 0); /* never 
call with blkCnt == 0! */100ts[0] = ctx->h.T[0];101ts[1] = ctx->h.T[1];102do {103/*104* this implementation only supports 2**64 input bytes105* (no carry out here)106*/107ts[0] += byteCntAdd; /* update processed length */108109/* precompute the key schedule for this block */110ks[0] = ctx->X[0];111ks[1] = ctx->X[1];112ks[2] = ctx->X[2];113ks[3] = ctx->X[3];114ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;115116ts[2] = ts[0] ^ ts[1];117118/* get input block in little-endian format */119Skein_Get64_LSB_First(w, blkPtr, WCNT);120DebugSaveTweak(ctx);121Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);122123X0 = w[0] + ks[0]; /* do the first full key injection */124X1 = w[1] + ks[1] + ts[0];125X2 = w[2] + ks[2] + ts[1];126X3 = w[3] + ks[3];127128Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,129Xptr); /* show starting state values */130131blkPtr += SKEIN_256_BLOCK_BYTES;132133/* run the rounds */134135#define Round256(p0, p1, p2, p3, ROT, rNum) \136X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \137X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \138139#if SKEIN_UNROLL_256 == 0140#define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \141Round256(p0, p1, p2, p3, ROT, rNum) \142Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);143144#define I256(R) \145X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \146X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3]; \147X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3]; \148X3 += ks[((R) + 4) % 5] + (R) + 1; \149Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);150#else /* looping version */151#define R256(p0, p1, p2, p3, ROT, rNum) \152Round256(p0, p1, p2, p3, ROT, rNum) \153Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);154155#define I256(R) \156X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \157X1 += ks[r + (R) + 1] + ts[r + (R) + 0]; \158X2 += ks[r + (R) + 2] + ts[r + (R) + 1]; \159X3 += ks[r + (R) + 3] + r + (R); 
\160ks[r + (R) + 4] = ks[r + (R) - 1]; /* rotate key schedule */ \161ts[r + (R) + 2] = ts[r + (R) - 1]; \162Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);163164/* loop through it */165for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256)166#endif167{168#define R256_8_rounds(R) \169R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \170R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \171R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \172R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \173I256(2 * (R)); \174R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \175R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \176R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \177R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \178I256(2 * (R) + 1);179180R256_8_rounds(0);181182#define R256_Unroll_R(NN) \183((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \184(SKEIN_UNROLL_256 > (NN)))185186#if R256_Unroll_R(1)187R256_8_rounds(1);188#endif189#if R256_Unroll_R(2)190R256_8_rounds(2);191#endif192#if R256_Unroll_R(3)193R256_8_rounds(3);194#endif195#if R256_Unroll_R(4)196R256_8_rounds(4);197#endif198#if R256_Unroll_R(5)199R256_8_rounds(5);200#endif201#if R256_Unroll_R(6)202R256_8_rounds(6);203#endif204#if R256_Unroll_R(7)205R256_8_rounds(7);206#endif207#if R256_Unroll_R(8)208R256_8_rounds(8);209#endif210#if R256_Unroll_R(9)211R256_8_rounds(9);212#endif213#if R256_Unroll_R(10)214R256_8_rounds(10);215#endif216#if R256_Unroll_R(11)217R256_8_rounds(11);218#endif219#if R256_Unroll_R(12)220R256_8_rounds(12);221#endif222#if R256_Unroll_R(13)223R256_8_rounds(13);224#endif225#if R256_Unroll_R(14)226R256_8_rounds(14);227#endif228#if (SKEIN_UNROLL_256 > 14)229#error "need more unrolling in Skein_256_Process_Block"230#endif231}232/*233* do the final "feedforward" xor, update context chaining vars234*/235ctx->X[0] = X0 ^ w[0];236ctx->X[1] = X1 ^ w[1];237ctx->X[2] = X2 ^ w[2];238ctx->X[3] = X3 ^ w[3];239240Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);241242ts[1] &= ~SKEIN_T1_FLAG_FIRST;243} while 
(--blkCnt);244ctx->h.T[0] = ts[0];245ctx->h.T[1] = ts[1];246}247248#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)249size_t250Skein_256_Process_Block_CodeSize(void)251{252return ((uint8_t *)Skein_256_Process_Block_CodeSize) -253((uint8_t *)Skein_256_Process_Block);254}255256uint_t257Skein_256_Unroll_Cnt(void)258{259return (SKEIN_UNROLL_256);260}261#endif262#endif263264/* Skein_512 */265#if !(SKEIN_USE_ASM & 512)266void267Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,268size_t blkCnt, size_t byteCntAdd)269{270enum {271WCNT = SKEIN_512_STATE_WORDS272};273#undef RCNT274#define RCNT (SKEIN_512_ROUNDS_TOTAL / 8)275276#ifdef SKEIN_LOOP /* configure how much to unroll the loop */277#define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)278#else279#define SKEIN_UNROLL_512 (0)280#endif281282#if SKEIN_UNROLL_512283#if (RCNT % SKEIN_UNROLL_512)284#error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */285#endif286size_t r;287/* key schedule words : chaining vars + tweak + "rotation" */288uint64_t kw[WCNT + 4 + RCNT * 2];289#else290uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */291#endif292/* local copy of vars, for speed */293uint64_t X0, X1, X2, X3, X4, X5, X6, X7;294uint64_t w[WCNT]; /* local copy of input block */295#ifdef SKEIN_DEBUG296/* use for debugging (help compiler put Xn in registers) */297const uint64_t *Xptr[8];298Xptr[0] = &X0;299Xptr[1] = &X1;300Xptr[2] = &X2;301Xptr[3] = &X3;302Xptr[4] = &X4;303Xptr[5] = &X5;304Xptr[6] = &X6;305Xptr[7] = &X7;306#endif307308Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
*/309ts[0] = ctx->h.T[0];310ts[1] = ctx->h.T[1];311do {312/*313* this implementation only supports 2**64 input bytes314* (no carry out here)315*/316ts[0] += byteCntAdd; /* update processed length */317318/* precompute the key schedule for this block */319ks[0] = ctx->X[0];320ks[1] = ctx->X[1];321ks[2] = ctx->X[2];322ks[3] = ctx->X[3];323ks[4] = ctx->X[4];324ks[5] = ctx->X[5];325ks[6] = ctx->X[6];326ks[7] = ctx->X[7];327ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^328ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;329330ts[2] = ts[0] ^ ts[1];331332/* get input block in little-endian format */333Skein_Get64_LSB_First(w, blkPtr, WCNT);334DebugSaveTweak(ctx);335Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);336337X0 = w[0] + ks[0]; /* do the first full key injection */338X1 = w[1] + ks[1];339X2 = w[2] + ks[2];340X3 = w[3] + ks[3];341X4 = w[4] + ks[4];342X5 = w[5] + ks[5] + ts[0];343X6 = w[6] + ks[6] + ts[1];344X7 = w[7] + ks[7];345346blkPtr += SKEIN_512_BLOCK_BYTES;347348Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,349Xptr);350/* run the rounds */351#define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \352X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\353X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\354X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\355X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;356357#if SKEIN_UNROLL_512 == 0358#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */ \359Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \360Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);361362#define I512(R) \363X0 += ks[((R) + 1) % 9]; /* inject the key schedule value */\364X1 += ks[((R) + 2) % 9]; \365X2 += ks[((R) + 3) % 9]; \366X3 += ks[((R) + 4) % 9]; \367X4 += ks[((R) + 5) % 9]; \368X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \369X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \370X7 += ks[((R) + 8) % 9] + (R) + 1; \371Skein_Show_R_Ptr(BLK_BITS, 
&ctx->h, SKEIN_RND_KEY_INJECT, Xptr);372#else /* looping version */373#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \374Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \375Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);376377#define I512(R) \378X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \379X1 += ks[r + (R) + 1]; \380X2 += ks[r + (R) + 2]; \381X3 += ks[r + (R) + 3]; \382X4 += ks[r + (R) + 4]; \383X5 += ks[r + (R) + 5] + ts[r + (R) + 0]; \384X6 += ks[r + (R) + 6] + ts[r + (R) + 1]; \385X7 += ks[r + (R) + 7] + r + (R); \386ks[r + (R)+8] = ks[r + (R) - 1]; /* rotate key schedule */\387ts[r + (R)+2] = ts[r + (R) - 1]; \388Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);389390/* loop through it */391for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512)392#endif /* end of looped code definitions */393{394#define R512_8_rounds(R) /* do 8 full rounds */ \395R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \396R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \397R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \398R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \399I512(2 * (R)); \400R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \401R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \402R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \403R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \404I512(2*(R) + 1); /* and key injection */405406R512_8_rounds(0);407408#define R512_Unroll_R(NN) \409((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \410(SKEIN_UNROLL_512 > (NN)))411412#if R512_Unroll_R(1)413R512_8_rounds(1);414#endif415#if R512_Unroll_R(2)416R512_8_rounds(2);417#endif418#if R512_Unroll_R(3)419R512_8_rounds(3);420#endif421#if R512_Unroll_R(4)422R512_8_rounds(4);423#endif424#if R512_Unroll_R(5)425R512_8_rounds(5);426#endif427#if R512_Unroll_R(6)428R512_8_rounds(6);429#endif430#if R512_Unroll_R(7)431R512_8_rounds(7);432#endif433#if 
R512_Unroll_R(8)434R512_8_rounds(8);435#endif436#if R512_Unroll_R(9)437R512_8_rounds(9);438#endif439#if R512_Unroll_R(10)440R512_8_rounds(10);441#endif442#if R512_Unroll_R(11)443R512_8_rounds(11);444#endif445#if R512_Unroll_R(12)446R512_8_rounds(12);447#endif448#if R512_Unroll_R(13)449R512_8_rounds(13);450#endif451#if R512_Unroll_R(14)452R512_8_rounds(14);453#endif454#if (SKEIN_UNROLL_512 > 14)455#error "need more unrolling in Skein_512_Process_Block"456#endif457}458459/*460* do the final "feedforward" xor, update context chaining vars461*/462ctx->X[0] = X0 ^ w[0];463ctx->X[1] = X1 ^ w[1];464ctx->X[2] = X2 ^ w[2];465ctx->X[3] = X3 ^ w[3];466ctx->X[4] = X4 ^ w[4];467ctx->X[5] = X5 ^ w[5];468ctx->X[6] = X6 ^ w[6];469ctx->X[7] = X7 ^ w[7];470Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);471472ts[1] &= ~SKEIN_T1_FLAG_FIRST;473} while (--blkCnt);474ctx->h.T[0] = ts[0];475ctx->h.T[1] = ts[1];476}477478#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)479size_t480Skein_512_Process_Block_CodeSize(void)481{482return ((uint8_t *)Skein_512_Process_Block_CodeSize) -483((uint8_t *)Skein_512_Process_Block);484}485486uint_t487Skein_512_Unroll_Cnt(void)488{489return (SKEIN_UNROLL_512);490}491#endif492#endif493494/* Skein1024 */495#if !(SKEIN_USE_ASM & 1024)496void497Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,498size_t blkCnt, size_t byteCntAdd)499{500/* do it in C, always looping (unrolled is bigger AND slower!) 
*/501enum {502WCNT = SKEIN1024_STATE_WORDS503};504#undef RCNT505#define RCNT (SKEIN1024_ROUNDS_TOTAL/8)506507#ifdef SKEIN_LOOP /* configure how much to unroll the loop */508#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)509#else510#define SKEIN_UNROLL_1024 (0)511#endif512513#if (SKEIN_UNROLL_1024 != 0)514#if (RCNT % SKEIN_UNROLL_1024)515#error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */516#endif517size_t r;518/* key schedule words : chaining vars + tweak + "rotation" */519uint64_t kw[WCNT + 4 + RCNT * 2];520#else521uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */522#endif523524/* local copy of vars, for speed */525uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11,526X12, X13, X14, X15;527uint64_t w[WCNT]; /* local copy of input block */528#ifdef SKEIN_DEBUG529/* use for debugging (help compiler put Xn in registers) */530const uint64_t *Xptr[16];531Xptr[0] = &X00;532Xptr[1] = &X01;533Xptr[2] = &X02;534Xptr[3] = &X03;535Xptr[4] = &X04;536Xptr[5] = &X05;537Xptr[6] = &X06;538Xptr[7] = &X07;539Xptr[8] = &X08;540Xptr[9] = &X09;541Xptr[10] = &X10;542Xptr[11] = &X11;543Xptr[12] = &X12;544Xptr[13] = &X13;545Xptr[14] = &X14;546Xptr[15] = &X15;547#endif548549Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
*/550ts[0] = ctx->h.T[0];551ts[1] = ctx->h.T[1];552do {553/*554* this implementation only supports 2**64 input bytes555* (no carry out here)556*/557ts[0] += byteCntAdd; /* update processed length */558559/* precompute the key schedule for this block */560ks[0] = ctx->X[0];561ks[1] = ctx->X[1];562ks[2] = ctx->X[2];563ks[3] = ctx->X[3];564ks[4] = ctx->X[4];565ks[5] = ctx->X[5];566ks[6] = ctx->X[6];567ks[7] = ctx->X[7];568ks[8] = ctx->X[8];569ks[9] = ctx->X[9];570ks[10] = ctx->X[10];571ks[11] = ctx->X[11];572ks[12] = ctx->X[12];573ks[13] = ctx->X[13];574ks[14] = ctx->X[14];575ks[15] = ctx->X[15];576ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^577ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^578ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^579ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;580581ts[2] = ts[0] ^ ts[1];582583/* get input block in little-endian format */584Skein_Get64_LSB_First(w, blkPtr, WCNT);585DebugSaveTweak(ctx);586Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);587588X00 = w[0] + ks[0]; /* do the first full key injection */589X01 = w[1] + ks[1];590X02 = w[2] + ks[2];591X03 = w[3] + ks[3];592X04 = w[4] + ks[4];593X05 = w[5] + ks[5];594X06 = w[6] + ks[6];595X07 = w[7] + ks[7];596X08 = w[8] + ks[8];597X09 = w[9] + ks[9];598X10 = w[10] + ks[10];599X11 = w[11] + ks[11];600X12 = w[12] + ks[12];601X13 = w[13] + ks[13] + ts[0];602X14 = w[14] + ks[14] + ts[1];603X15 = w[15] + ks[15];604605Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,606Xptr);607608#define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \609pD, pE, pF, ROT, rNum) \610X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\611X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\612X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\613X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\614X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\615X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\616X##pC += 
X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\617X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE;618619#if SKEIN_UNROLL_1024 == 0620#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \621pE, pF, ROT, rn) \622Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \623pD, pE, pF, ROT, rn) \624Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);625626#define I1024(R) \627X00 += ks[((R) + 1) % 17]; /* inject the key schedule value */\628X01 += ks[((R) + 2) % 17]; \629X02 += ks[((R) + 3) % 17]; \630X03 += ks[((R) + 4) % 17]; \631X04 += ks[((R) + 5) % 17]; \632X05 += ks[((R) + 6) % 17]; \633X06 += ks[((R) + 7) % 17]; \634X07 += ks[((R) + 8) % 17]; \635X08 += ks[((R) + 9) % 17]; \636X09 += ks[((R) + 10) % 17]; \637X10 += ks[((R) + 11) % 17]; \638X11 += ks[((R) + 12) % 17]; \639X12 += ks[((R) + 13) % 17]; \640X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \641X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \642X15 += ks[((R) + 16) % 17] + (R) +1; \643Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);644#else /* looping version */645#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \646pE, pF, ROT, rn) \647Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \648pD, pE, pF, ROT, rn) \649Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);650651#define I1024(R) \652X00 += ks[r + (R) + 0]; /* inject the key schedule value */ \653X01 += ks[r + (R) + 1]; \654X02 += ks[r + (R) + 2]; \655X03 += ks[r + (R) + 3]; \656X04 += ks[r + (R) + 4]; \657X05 += ks[r + (R) + 5]; \658X06 += ks[r + (R) + 6]; \659X07 += ks[r + (R) + 7]; \660X08 += ks[r + (R) + 8]; \661X09 += ks[r + (R) + 9]; \662X10 += ks[r + (R) + 10]; \663X11 += ks[r + (R) + 11]; \664X12 += ks[r + (R) + 12]; \665X13 += ks[r + (R) + 13] + ts[r + (R) + 0]; \666X14 += ks[r + (R) + 14] + ts[r + (R) + 1]; \667X15 += ks[r + (R) + 15] + r + (R); \668ks[r + (R) + 16] = ks[r + (R) - 1]; /* rotate key schedule */\669ts[r + (R) + 2] = ts[r + 
(R) - 1]; \670Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);671672/* loop through it */673for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024)674#endif675{676#define R1024_8_rounds(R) /* do 8 full rounds */ \677R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \67814, 15, R1024_0, 8 * (R) + 1); \679R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \68008, 01, R1024_1, 8 * (R) + 2); \681R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \68210, 09, R1024_2, 8 * (R) + 3); \683R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \68412, 07, R1024_3, 8 * (R) + 4); \685I1024(2 * (R)); \686R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \68714, 15, R1024_4, 8 * (R) + 5); \688R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \68908, 01, R1024_5, 8 * (R) + 6); \690R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \69110, 09, R1024_6, 8 * (R) + 7); \692R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \69312, 07, R1024_7, 8 * (R) + 8); \694I1024(2 * (R) + 1);695696R1024_8_rounds(0);697698#define R1024_Unroll_R(NN) \699((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || \700(SKEIN_UNROLL_1024 > (NN)))701702#if R1024_Unroll_R(1)703R1024_8_rounds(1);704#endif705#if R1024_Unroll_R(2)706R1024_8_rounds(2);707#endif708#if R1024_Unroll_R(3)709R1024_8_rounds(3);710#endif711#if R1024_Unroll_R(4)712R1024_8_rounds(4);713#endif714#if R1024_Unroll_R(5)715R1024_8_rounds(5);716#endif717#if R1024_Unroll_R(6)718R1024_8_rounds(6);719#endif720#if R1024_Unroll_R(7)721R1024_8_rounds(7);722#endif723#if R1024_Unroll_R(8)724R1024_8_rounds(8);725#endif726#if R1024_Unroll_R(9)727R1024_8_rounds(9);728#endif729#if R1024_Unroll_R(10)730R1024_8_rounds(10);731#endif732#if R1024_Unroll_R(11)733R1024_8_rounds(11);734#endif735#if R1024_Unroll_R(12)736R1024_8_rounds(12);737#endif738#if R1024_Unroll_R(13)739R1024_8_rounds(13);740#endif741#if 
R1024_Unroll_R(14)742R1024_8_rounds(14);743#endif744#if (SKEIN_UNROLL_1024 > 14)745#error "need more unrolling in Skein_1024_Process_Block"746#endif747}748/*749* do the final "feedforward" xor, update context chaining vars750*/751752ctx->X[0] = X00 ^ w[0];753ctx->X[1] = X01 ^ w[1];754ctx->X[2] = X02 ^ w[2];755ctx->X[3] = X03 ^ w[3];756ctx->X[4] = X04 ^ w[4];757ctx->X[5] = X05 ^ w[5];758ctx->X[6] = X06 ^ w[6];759ctx->X[7] = X07 ^ w[7];760ctx->X[8] = X08 ^ w[8];761ctx->X[9] = X09 ^ w[9];762ctx->X[10] = X10 ^ w[10];763ctx->X[11] = X11 ^ w[11];764ctx->X[12] = X12 ^ w[12];765ctx->X[13] = X13 ^ w[13];766ctx->X[14] = X14 ^ w[14];767ctx->X[15] = X15 ^ w[15];768769Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);770771ts[1] &= ~SKEIN_T1_FLAG_FIRST;772blkPtr += SKEIN1024_BLOCK_BYTES;773} while (--blkCnt);774ctx->h.T[0] = ts[0];775ctx->h.T[1] = ts[1];776}777778#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)779size_t780Skein1024_Process_Block_CodeSize(void)781{782return ((uint8_t *)Skein1024_Process_Block_CodeSize) -783((uint8_t *)Skein1024_Process_Block);784}785786uint_t787Skein1024_Unroll_Cnt(void)788{789return (SKEIN_UNROLL_1024);790}791#endif792#endif793794795