/* Path: blob/main/contrib/bearssl/src/symcipher/aes_pwr8.c (39482 views) */
/*1* Copyright (c) 2017 Thomas Pornin <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining4* a copy of this software and associated documentation files (the5* "Software"), to deal in the Software without restriction, including6* without limitation the rights to use, copy, modify, merge, publish,7* distribute, sublicense, and/or sell copies of the Software, and to8* permit persons to whom the Software is furnished to do so, subject to9* the following conditions:10*11* The above copyright notice and this permission notice shall be12* included in all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,15* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF16* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND17* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS18* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN19* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN20* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE21* SOFTWARE.22*/2324#define BR_POWER_ASM_MACROS 125#include "inner.h"2627/*28* This code contains the AES key schedule implementation using the29* POWER8 opcodes.30*/3132#if BR_POWER83334static void35key_schedule_128(unsigned char *sk, const unsigned char *key)36{37long cc;3839static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B };40#if BR_POWER8_LE41static const uint32_t idx2be[] = {420x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C43};44#endif4546cc = 0;4748/*49* We use the VSX instructions for loading and storing the50* key/subkeys, since they support unaligned accesses. The rest51* of the computation is VMX only. 
VMX register 0 is VSX52* register 32.53*/54asm volatile (5556/*57* v0 = all-zero word58* v1 = constant -8 / +8, copied into four words59* v2 = current subkey60* v3 = Rcon (x4 words)61* v6 = constant 8, copied into four words62* v7 = constant 0x11B, copied into four words63* v8 = constant for byteswapping words64*/65vspltisw(0, 0)66#if BR_POWER8_LE67vspltisw(1, -8)68#else69vspltisw(1, 8)70#endif71lxvw4x(34, 0, %[key])72vspltisw(3, 1)73vspltisw(6, 8)74lxvw4x(39, 0, %[fmod])75#if BR_POWER8_LE76lxvw4x(40, 0, %[idx2be])77#endif7879/*80* First subkey is a copy of the key itself.81*/82#if BR_POWER8_LE83vperm(4, 2, 2, 8)84stxvw4x(36, 0, %[sk])85#else86stxvw4x(34, 0, %[sk])87#endif8889/*90* Loop must run 10 times.91*/92li(%[cc], 10)93mtctr(%[cc])94label(loop)95/* Increment subkey address */96addi(%[sk], %[sk], 16)9798/* Compute SubWord(RotWord(temp)) xor Rcon (into v4, splat) */99vrlw(4, 2, 1)100vsbox(4, 4)101#if BR_POWER8_LE102vxor(4, 4, 3)103#else104vsldoi(5, 3, 0, 3)105vxor(4, 4, 5)106#endif107vspltw(4, 4, 3)108109/* XOR words for next subkey */110vsldoi(5, 0, 2, 12)111vxor(2, 2, 5)112vsldoi(5, 0, 2, 12)113vxor(2, 2, 5)114vsldoi(5, 0, 2, 12)115vxor(2, 2, 5)116vxor(2, 2, 4)117118/* Store next subkey */119#if BR_POWER8_LE120vperm(4, 2, 2, 8)121stxvw4x(36, 0, %[sk])122#else123stxvw4x(34, 0, %[sk])124#endif125126/* Update Rcon */127vadduwm(3, 3, 3)128vsrw(4, 3, 6)129vsubuwm(4, 0, 4)130vand(4, 4, 7)131vxor(3, 3, 4)132133bdnz(loop)134135: [sk] "+b" (sk), [cc] "+b" (cc)136: [key] "b" (key), [fmod] "b" (fmod)137#if BR_POWER8_LE138, [idx2be] "b" (idx2be)139#endif140: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ctr", "memory"141);142}143144static void145key_schedule_192(unsigned char *sk, const unsigned char *key)146{147long cc;148149#if BR_POWER8_LE150static const uint32_t idx2be[] = {1510x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C152};153#endif154155cc = 0;156157/*158* We use the VSX instructions for loading and storing the159* key/subkeys, since they support unaligned 
accesses. The rest160* of the computation is VMX only. VMX register 0 is VSX161* register 32.162*/163asm volatile (164165/*166* v0 = all-zero word167* v1 = constant -8 / +8, copied into four words168* v2, v3 = current subkey169* v5 = Rcon (x4 words) (already shifted on big-endian)170* v6 = constant 8, copied into four words171* v8 = constant for byteswapping words172*173* The left two words of v3 are ignored.174*/175vspltisw(0, 0)176#if BR_POWER8_LE177vspltisw(1, -8)178#else179vspltisw(1, 8)180#endif181li(%[cc], 8)182lxvw4x(34, 0, %[key])183lxvw4x(35, %[cc], %[key])184vsldoi(3, 3, 0, 8)185vspltisw(5, 1)186#if !BR_POWER8_LE187vsldoi(5, 5, 0, 3)188#endif189vspltisw(6, 8)190#if BR_POWER8_LE191lxvw4x(40, 0, %[idx2be])192#endif193194/*195* Loop must run 8 times. Each iteration produces 256196* bits of subkeys, with a 64-bit overlap.197*/198li(%[cc], 8)199mtctr(%[cc])200li(%[cc], 16)201label(loop)202203/*204* Last 6 words in v2:v3l. Compute next 6 words into205* v3r:v4.206*/207vrlw(10, 3, 1)208vsbox(10, 10)209vxor(10, 10, 5)210vspltw(10, 10, 1)211vsldoi(11, 0, 10, 8)212213vsldoi(12, 0, 2, 12)214vxor(12, 2, 12)215vsldoi(13, 0, 12, 12)216vxor(12, 12, 13)217vsldoi(13, 0, 12, 12)218vxor(12, 12, 13)219220vspltw(13, 12, 3)221vxor(13, 13, 3)222vsldoi(14, 0, 3, 12)223vxor(13, 13, 14)224225vsldoi(4, 12, 13, 8)226vsldoi(14, 0, 3, 8)227vsldoi(3, 14, 12, 8)228229vxor(3, 3, 11)230vxor(4, 4, 10)231232/*233* Update Rcon. 
Since for a 192-bit key, we use only 8234* such constants, we will not hit the field modulus,235* so a simple shift (addition) works well.236*/237vadduwm(5, 5, 5)238239/*240* Write out the two left 128-bit words241*/242#if BR_POWER8_LE243vperm(10, 2, 2, 8)244vperm(11, 3, 3, 8)245stxvw4x(42, 0, %[sk])246stxvw4x(43, %[cc], %[sk])247#else248stxvw4x(34, 0, %[sk])249stxvw4x(35, %[cc], %[sk])250#endif251addi(%[sk], %[sk], 24)252253/*254* Shift words for next iteration.255*/256vsldoi(2, 3, 4, 8)257vsldoi(3, 4, 0, 8)258259bdnz(loop)260261/*262* The loop wrote the first 50 subkey words, but we need263* to produce 52, so we must do one last write.264*/265#if BR_POWER8_LE266vperm(10, 2, 2, 8)267stxvw4x(42, 0, %[sk])268#else269stxvw4x(34, 0, %[sk])270#endif271272: [sk] "+b" (sk), [cc] "+b" (cc)273: [key] "b" (key)274#if BR_POWER8_LE275, [idx2be] "b" (idx2be)276#endif277: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",278"v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"279);280}281282static void283key_schedule_256(unsigned char *sk, const unsigned char *key)284{285long cc;286287#if BR_POWER8_LE288static const uint32_t idx2be[] = {2890x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C290};291#endif292293cc = 0;294295/*296* We use the VSX instructions for loading and storing the297* key/subkeys, since they support unaligned accesses. The rest298* of the computation is VMX only. 
VMX register 0 is VSX299* register 32.300*/301asm volatile (302303/*304* v0 = all-zero word305* v1 = constant -8 / +8, copied into four words306* v2, v3 = current subkey307* v6 = Rcon (x4 words) (already shifted on big-endian)308* v7 = constant 8, copied into four words309* v8 = constant for byteswapping words310*311* The left two words of v3 are ignored.312*/313vspltisw(0, 0)314#if BR_POWER8_LE315vspltisw(1, -8)316#else317vspltisw(1, 8)318#endif319li(%[cc], 16)320lxvw4x(34, 0, %[key])321lxvw4x(35, %[cc], %[key])322vspltisw(6, 1)323#if !BR_POWER8_LE324vsldoi(6, 6, 0, 3)325#endif326vspltisw(7, 8)327#if BR_POWER8_LE328lxvw4x(40, 0, %[idx2be])329#endif330331/*332* Loop must run 7 times. Each iteration produces two333* subkeys.334*/335li(%[cc], 7)336mtctr(%[cc])337li(%[cc], 16)338label(loop)339340/*341* Current words are in v2:v3. Compute next word in v4.342*/343vrlw(10, 3, 1)344vsbox(10, 10)345vxor(10, 10, 6)346vspltw(10, 10, 3)347348vsldoi(4, 0, 2, 12)349vxor(4, 2, 4)350vsldoi(5, 0, 4, 12)351vxor(4, 4, 5)352vsldoi(5, 0, 4, 12)353vxor(4, 4, 5)354vxor(4, 4, 10)355356/*357* Then other word in v5.358*/359vsbox(10, 4)360vspltw(10, 10, 3)361362vsldoi(5, 0, 3, 12)363vxor(5, 3, 5)364vsldoi(11, 0, 5, 12)365vxor(5, 5, 11)366vsldoi(11, 0, 5, 12)367vxor(5, 5, 11)368vxor(5, 5, 10)369370/*371* Update Rcon. 
Since for a 256-bit key, we use only 7372* such constants, we will not hit the field modulus,373* so a simple shift (addition) works well.374*/375vadduwm(6, 6, 6)376377/*378* Write out the two left 128-bit words379*/380#if BR_POWER8_LE381vperm(10, 2, 2, 8)382vperm(11, 3, 3, 8)383stxvw4x(42, 0, %[sk])384stxvw4x(43, %[cc], %[sk])385#else386stxvw4x(34, 0, %[sk])387stxvw4x(35, %[cc], %[sk])388#endif389addi(%[sk], %[sk], 32)390391/*392* Replace v2:v3 with v4:v5.393*/394vxor(2, 0, 4)395vxor(3, 0, 5)396397bdnz(loop)398399/*400* The loop wrote the first 14 subkeys, but we need 15,401* so we must do an extra write.402*/403#if BR_POWER8_LE404vperm(10, 2, 2, 8)405stxvw4x(42, 0, %[sk])406#else407stxvw4x(34, 0, %[sk])408#endif409410: [sk] "+b" (sk), [cc] "+b" (cc)411: [key] "b" (key)412#if BR_POWER8_LE413, [idx2be] "b" (idx2be)414#endif415: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",416"v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"417);418}419420/* see inner.h */421int422br_aes_pwr8_supported(void)423{424return 1;425}426427/* see inner.h */428unsigned429br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len)430{431switch (len) {432case 16:433key_schedule_128(sk, key);434return 10;435case 24:436key_schedule_192(sk, key);437return 12;438default:439key_schedule_256(sk, key);440return 14;441}442}443444#endif445446447