//1// aes.c code for AES implementation2//3// Copyright (c) Microsoft Corporation. Licensed under the MIT license.4//5// The actual encryption and decryption routines here are not nearly as fast as the6// assembler ones. They are used on platforms that don't have assembler implementations7// and for various testing purposes.8//9// This code derives from the orignal fast AES code that Niels Ferguson wrote10// for BitLocker in Windows Vista.11// The C code is derived from the AES that was already in the RSA32 library,12// the assembler code was created new at that time.13//141516#include "precomp.h"171819///////////////////////////////////////////////////////////////////////////////20// Key expansion uses two functions, a 4-byte S-box lookup and one21// to create a decryption round key from an encryption round key.22// These are the C implementations of these functions23//242526static BYTE g_SymCryptAesRoundConstant[11] =27{280, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,29};3031SYMCRYPT_NOINLINE32SYMCRYPT_ERROR33SYMCRYPT_CALL34SymCryptAesExpandKeyInternal(35_Out_ PSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,36_In_reads_(cbKey) PCBYTE pbKey,37SIZE_T cbKey,38BOOLEAN fCreateDecryptionKeys )39{40UINT32 nRounds;41BYTE * p;42BYTE * q;43UINT32 i;44UINT32 t;4546BOOL UseSimd = FALSE;47SYMCRYPT_ERROR status = SYMCRYPT_NO_ERROR;4849#if SYMCRYPT_CPU_X8650SYMCRYPT_EXTENDED_SAVE_DATA SaveData;5152if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_AESNI_CODE ) )53{54if( SymCryptSaveXmm( &SaveData ) == SYMCRYPT_NO_ERROR )55{56UseSimd = TRUE;57}58}59#elif SYMCRYPT_CPU_AMD6460if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_AESNI_CODE ) )61{62UseSimd = TRUE;63}64#elif SYMCRYPT_CPU_ARM6465if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON_AES ) )66{67UseSimd = TRUE;68}69#endif7071SYMCRYPT_SET_MAGIC( pExpandedKey );7273//74// Separate code for each key size, this is significantly faster.75// We have a number of applications that do frequent key expansions.76//77switch( cbKey )78{79case 16:80nRounds = 10;81pExpandedKey->lastEncRoundKey = &pExpandedKey->RoundKey[nRounds];82pExpandedKey->lastDecRoundKey = &pExpandedKey->RoundKey[2*nRounds];8384memcpy( &pExpandedKey->RoundKey[0], pbKey, 16 );8586p = (BYTE *)&pExpandedKey->RoundKey[1];8788for( i=1; i<=nRounds; i++ )89{90SymCryptAes4Sbox( &p[-4], p, UseSimd );91t = ROR32(SYMCRYPT_LOAD_LSBFIRST32(p), 8) ^ SYMCRYPT_LOAD_LSBFIRST32(p - 16) ^ g_SymCryptAesRoundConstant[i];92SYMCRYPT_STORE_LSBFIRST32( p, t ); // this is a macro that re-evaluates its arguments9394*(UINT32 *)(p+4) = *(UINT32 *) p ^ *(UINT32 *)(p - 12);95*(UINT32 *)(p+8) = *(UINT32 *)(p+4) ^ *(UINT32 *)(p - 8);96*(UINT32 *)(p+12) = *(UINT32 *)(p+8) ^ *(UINT32 *)(p - 4);9798p += 16;99}100101break;102103case 24:104nRounds = 12;105pExpandedKey->lastEncRoundKey = &pExpandedKey->RoundKey[nRounds];106pExpandedKey->lastDecRoundKey = &pExpandedKey->RoundKey[2*nRounds];107108memcpy( &pExpandedKey->RoundKey[0], pbKey, 24 );109110p = (BYTE *)&pExpandedKey->RoundKey[0] + 24;111112//113// We have 12 rounds, 13 round keys, and 13*16 = 208 bytes of encryption key to generate.114// We have 24 already, so we need 184 more.115// Each iteration produces 24 bytes, so we need to loop 8 times.116//117for( i=1; i<=8; i++ )118{119SymCryptAes4Sbox( &p[-4], p, UseSimd );120t = ROR32(SYMCRYPT_LOAD_LSBFIRST32(p), 8) ^ SYMCRYPT_LOAD_LSBFIRST32(p - 24) ^ g_SymCryptAesRoundConstant[i];121SYMCRYPT_STORE_LSBFIRST32( p, t );122123*(UINT32 *)(p+4) = *(UINT32 *) p ^ *(UINT32 *)(p - 20);124*(UINT32 *)(p+8) = *(UINT32 *)(p+ 4) ^ *(UINT32 *)(p - 16);125*(UINT32 *)(p+12) = *(UINT32 *)(p+ 8) ^ *(UINT32 *)(p - 12);126*(UINT32 *)(p+16) = *(UINT32 *)(p+12) ^ *(UINT32 *)(p - 8);127*(UINT32 *)(p+20) = *(UINT32 *)(p+16) ^ *(UINT32 *)(p - 4);128129p += 24;130}131132break;133134case 32:135nRounds = 14;136pExpandedKey->lastEncRoundKey = &pExpandedKey->RoundKey[nRounds];137pExpandedKey->lastDecRoundKey = &pExpandedKey->RoundKey[2*nRounds];138139memcpy( &pExpandedKey->RoundKey[0], pbKey, 32 );140141p = (BYTE *)&pExpandedKey->RoundKey[0] + 32;142143//144// We have 14 rounds, 15 round keys, and 15*16 = 240 bytes of encryption key to generate.145// We have 32 already, so we need 208 more.146// Each iteration produces 32 bytes, so we need to loop 6.5 times.147//148for( i=1; i<=6; i++ )149{150SymCryptAes4Sbox( &p[-4], p, UseSimd );151t = ROR32(SYMCRYPT_LOAD_LSBFIRST32(p), 8) ^ SYMCRYPT_LOAD_LSBFIRST32(p - 32) ^ g_SymCryptAesRoundConstant[i];152SYMCRYPT_STORE_LSBFIRST32( p, t );153154*(UINT32 *)(p+4) = *(UINT32 *) p ^ *(UINT32 *)(p - 28);155*(UINT32 *)(p+8) = *(UINT32 *)(p + 4) ^ *(UINT32 *)(p - 24);156*(UINT32 *)(p+12) = *(UINT32 *)(p + 8) ^ *(UINT32 *)(p - 20);157158SymCryptAes4Sbox( &p[12], &p[16], UseSimd );159*(UINT32 *)(p+16) = *(UINT32 *)(p + 16) ^ *(UINT32 *)(p - 16);160161*(UINT32 *)(p+20) = *(UINT32 *)(p + 16) ^ *(UINT32 *)(p - 12);162*(UINT32 *)(p+24) = *(UINT32 *)(p + 20) ^ *(UINT32 *)(p - 8);163*(UINT32 *)(p+28) = *(UINT32 *)(p + 24) ^ *(UINT32 *)(p - 4);164165p += 32;166}167168// We looped 6 times, so here is the half-loop169170SymCryptAes4Sbox( &p[-4], p, UseSimd );171t = ROR32(SYMCRYPT_LOAD_LSBFIRST32(p), 8) ^ SYMCRYPT_LOAD_LSBFIRST32(p - 32) ^ g_SymCryptAesRoundConstant[i];172SYMCRYPT_STORE_LSBFIRST32( p, t );173174*(UINT32 *)(p+4) = *(UINT32 *) p ^ *(UINT32 *)(p - 28);175*(UINT32 *)(p+8) = *(UINT32 *)(p + 4) ^ *(UINT32 *)(p - 24);176*(UINT32 *)(p+12) = *(UINT32 *)(p + 8) ^ *(UINT32 *)(p - 20);177178break;179180default:181status = SYMCRYPT_WRONG_KEY_SIZE;182goto cleanup;183}184185186if( fCreateDecryptionKeys )187{188p = &pExpandedKey->RoundKey[0][0][0];189q = (PBYTE)(pExpandedKey->lastDecRoundKey);190191// The first encryption round key is the last decryption round key192memcpy( q, p, SYMCRYPT_AES_BLOCK_SIZE );193p += 16;194q -= 16;195196while( p < (PBYTE) pExpandedKey->lastEncRoundKey )197{198SymCryptAesCreateDecryptionRoundKey( p, q, UseSimd );199q -= 16;200p += 16;201}202}203204cleanup:205206#if SYMCRYPT_CPU_X86207if( UseSimd )208{209SymCryptRestoreXmm( &SaveData );210}211#endif212213return status;214}215216SYMCRYPT_ERROR217SYMCRYPT_CALL218SymCryptAesExpandKey(219_Out_ PSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,220_In_reads_(cbKey) PCBYTE pbKey,221SIZE_T cbKey )222223{224return SymCryptAesExpandKeyInternal( pExpandedKey, pbKey, cbKey, TRUE );225}226227SYMCRYPT_ERROR228SYMCRYPT_CALL229SymCryptAesExpandKeyEncryptOnly(230_Out_ PSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,231_In_reads_(cbKey) PCBYTE pbKey,232SIZE_T cbKey )233{234return SymCryptAesExpandKeyInternal( pExpandedKey, pbKey, cbKey, FALSE );235}236237VOID238SYMCRYPT_CALL239SymCryptAesKeyCopy( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pSrc,240_Out_ PSYMCRYPT_AES_EXPANDED_KEY pDst )241{242SYMCRYPT_CHECK_MAGIC( pSrc );243244*pDst = *pSrc;245pDst->lastEncRoundKey = &pDst->RoundKey[0] + (pSrc->lastEncRoundKey - &pSrc->RoundKey[0]);246pDst->lastDecRoundKey = &pDst->RoundKey[0] + (pSrc->lastDecRoundKey - &pSrc->RoundKey[0]);247248SYMCRYPT_SET_MAGIC( pDst );249}250251//252// Self test code253//254255256const BYTE SymCryptAesNistTestVector128Ciphertext[16] = {2570x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,2580xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a,259};260261262263/****************************************************************264* OLD CODE265*266* Old code to generate the AES tables dynamically.267* Kept for future reference.268*269270271//272// Prototype; on some platforms this function is in assembler.273//274VOID275SYMCRYPT_CALL276SymCryptAesCreateRotatedTables( BYTE MatrixMult[4][256][4] );277278VOID279SYMCRYPT_CALL280SymCryptAesCreateRotatedTables( _Inout_ BYTE MatrixMult[4][256][4] )281{282int i,j,k;283284//285// We do this byte-by-byte, which is easiest.286// It would be faster to use UINT32 operations,287// but that is endian-specific, and therefore platform-specific.288// Endian-agnostic UINT32-based code would be a lot more complicated.289// All this is extremely easy to do in assembler, which we do on those290// platforms that have assembler implementations.291//292for( j=1; j<4; j++ ) {293for( i=0; i<256; i++ ) {294for( k=0; k<4; k++ ) {295MatrixMult[j][i][k] = MatrixMult[0][i][(k-j)&3];296}297}298}299}300301302303//304// SymCryptAesInitMatrixMultiplyTable305//306// Initialize a matrix multiplication table.307// Each matrix multiplication table consists of 4 tables of 256 entries of 4 bytes each.308// The four tables are rotated copies of each other.309// This function generates the first of those four tables from the init310// value.311//312// After this call:313// At index i the table contains the four bytes314// i * init[0], i * init[1], i * init[2], i * init[3]315// where multiplication is in GF(2^8).316//317// We do not do a GF(2^8) multiplication for each entry, but rather use the318// relationship (a xor b) * init[.] = a * init[.] xor b * init[.]319// And only compute i*init[.] for i = 1,2,4,8,...,128. This can be done320// using repeated multiplication by x in the finite field.321//322// It is safe to call this function on two separate threads for the same table.323// All invocations will write the same data to the table, and within a tread each entry is written324// before it is read. Doing parallel initializations of the same table can be very inefficient325// as multiple cores will be fighting over the cache lines, but the result will be correct.326// We use this property to initialize the tables lazily.327//328static329VOID330SYMCRYPT_CALL331SymCryptAesInitMatrixMultiplyTable( _Out_ SYMCRYPT_ALIGN BYTE MatrixMult[256][4],332_In_ SYMCRYPT_ALIGN BYTE init[4]333)334{335int i,j;336SYMCRYPT_ALIGN BYTE initCopy[4];337UINT32 initCopyAsUint32;338339//340// We copy the init value so that we can modify it without worrying about multi-threading341// issues.342//343*(UINT32 *)initCopy = *(UINT32 *)init;344345*(UINT32 *)MatrixMult[0] = 0;346for( i=1; i<256; i<<=1 )347{348initCopyAsUint32 = *(UINT32 *)initCopy;349for( j=0; j<i; j++ )350{351*(UINT32 *)MatrixMult[i+j] = *(UINT32 *)MatrixMult[j] ^ initCopyAsUint32;352}353for( j=0; j<4; j++ )354{355initCopy[j] = MULT_BY_X( initCopy[j] );356}357}358}359360361//362// SymCryptAesInitialize363//364// Initialize the static tables for the AES implementation.365// This function is called by the key expansion function if it finds the366// tables not initialized.367//368// This leads to an interesting case where multiple threads running on multiple369// CPUs run this initialization code at the same time.370// This code is carefully structured to allow that. When global data is written it is371// always with the final value, and we never read uninitialized global data.372// Thus, even if two CPUs run this code at the same time, they will both initialize each373// memory location to the same correct value and the end result will be correct.374// (Performance will suffer due to the fact that cache lines will be bounced back and force375// between the two CPUs, but that is not a significant concern as this code is used only once.)376//377// At the end of the initialization the flag is set to indicate that further378// key expansion invocations do not need to re-run the initialization.379// We use memory barriers to keep this multi-thread safe.380//381static382VOID383SYMCRYPT_CALL384SymCryptAesInitialize(void)385{386int i,j;387BYTE S;388BYTE Stimes2;389390//391// We force alignment of these arrays as we sometimes treat them as a UINT32392//393SYMCRYPT_ALIGN BYTE InvMatrixEntry[4] = {0xe, 0x9, 0xd, 0xb};394SYMCRYPT_ALIGN BYTE MatrixEntry[4] = {2, 1, 1, 3};395SYMCRYPT_ALIGN BYTE MatrixScratch[256][4];396397// Generate the forward MDS multiplication table in the scratch space398SymCryptAesInitMatrixMultiplyTable( MatrixScratch, MatrixEntry );399400// Initialize first table of SymCryptAesInvMatrixMult401SymCryptAesInitMatrixMultiplyTable( SymCryptAesInvMatrixMult[0], InvMatrixEntry );402403//404// Build the InvSbox table and the first table of SymCryptAesSboxMatrixMult and405// SymCryptAesInvSboxMatrixMult406//407for( i=0; i<256; i++ ) {408S = SymCryptAesSbox[i];409SymCryptAesInvSbox[S] = (BYTE) i;410*(UINT32 *)SymCryptAesSboxMatrixMult[0][i] = *(UINT32 *)MatrixScratch[S];411*(UINT32 *)SymCryptAesInvSboxMatrixMult[0][S] = *(UINT32 *)SymCryptAesInvMatrixMult[0][i];412}413414//415// Now we generate the byte rotations of the tables416//417SymCryptAesCreateRotatedTables( SymCryptAesSboxMatrixMult );418SymCryptAesCreateRotatedTables( SymCryptAesInvSboxMatrixMult );419SymCryptAesCreateRotatedTables( SymCryptAesInvMatrixMult );420421//422// This is a memory barrier. It ensures that all the memory writes we do before the barrier423// are globally visible to other CPUs before the memory writes we do after the fence.424// In this particular case, it ensures that every CPU sees the completed tables before425// it sees the flag as set.426//427MemoryBarrier();428429//430// Set the flag to signal that the tables are initialized.431//432SymCryptAesTablesInitialized = TRUE;433}434435436*/437438439