Path: blob/main/contrib/bearssl/src/symcipher/des_ct.c
39482 views
/*1* Copyright (c) 2016 Thomas Pornin <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining4* a copy of this software and associated documentation files (the5* "Software"), to deal in the Software without restriction, including6* without limitation the rights to use, copy, modify, merge, publish,7* distribute, sublicense, and/or sell copies of the Software, and to8* permit persons to whom the Software is furnished to do so, subject to9* the following conditions:10*11* The above copyright notice and this permission notice shall be12* included in all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,15* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF16* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND17* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS18* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN19* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN20* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE21* SOFTWARE.22*/2324#include "inner.h"2526/*27* During key schedule, we need to apply bit extraction PC-2 then permute28* things into our bitslice representation. PC-2 extracts 48 bits out29* of two 28-bit words (kl and kr), and we store these bits into two30* 32-bit words sk0 and sk1.31*32* -- bit 16+x of sk0 comes from bit QL0[x] of kl33* -- bit x of sk0 comes from bit QR0[x] of kr34* -- bit 16+x of sk1 comes from bit QL1[x] of kl35* -- bit x of sk1 comes from bit QR1[x] of kr36*/3738static const unsigned char QL0[] = {3917, 4, 27, 23, 13, 22, 7, 18,4016, 24, 2, 20, 1, 8, 15, 2641};4243static const unsigned char QR0[] = {4425, 19, 9, 1, 5, 11, 23, 8,4517, 0, 22, 3, 6, 20, 27, 2446};4748static const unsigned char QL1[] = {4928, 28, 14, 11, 28, 28, 25, 0,5028, 28, 5, 9, 28, 28, 12, 2151};5253static const unsigned char QR1[] = {5428, 28, 15, 4, 28, 28, 26, 16,5528, 28, 12, 7, 28, 28, 10, 1456};5758/*59* 32-bit rotation. The C compiler is supposed to recognize it as a60* rotation and use the local architecture rotation opcode (if available).61*/62static inline uint32_t63rotl(uint32_t x, int n)64{65return (x << n) | (x >> (32 - n));66}6768/*69* Compute key schedule for 8 key bytes (produces 32 subkey words).70*/71static void72keysched_unit(uint32_t *skey, const void *key)73{74int i;7576br_des_keysched_unit(skey, key);7778/*79* Apply PC-2 + bitslicing.80*/81for (i = 0; i < 16; i ++) {82uint32_t kl, kr, sk0, sk1;83int j;8485kl = skey[(i << 1) + 0];86kr = skey[(i << 1) + 1];87sk0 = 0;88sk1 = 0;89for (j = 0; j < 16; j ++) {90sk0 <<= 1;91sk1 <<= 1;92sk0 |= ((kl >> QL0[j]) & (uint32_t)1) << 16;93sk0 |= (kr >> QR0[j]) & (uint32_t)1;94sk1 |= ((kl >> QL1[j]) & (uint32_t)1) << 16;95sk1 |= (kr >> QR1[j]) & (uint32_t)1;96}9798skey[(i << 1) + 0] = sk0;99skey[(i << 1) + 1] = sk1;100}101102#if 0103/*104* Speed-optimized version for PC-2 + bitslicing.105* (Unused. Kept for reference only.)106*/107sk0 = kl & (uint32_t)0x00100000;108sk0 |= (kl & (uint32_t)0x08008000) << 2;109sk0 |= (kl & (uint32_t)0x00400000) << 4;110sk0 |= (kl & (uint32_t)0x00800000) << 5;111sk0 |= (kl & (uint32_t)0x00040000) << 6;112sk0 |= (kl & (uint32_t)0x00010000) << 7;113sk0 |= (kl & (uint32_t)0x00000100) << 10;114sk0 |= (kl & (uint32_t)0x00022000) << 14;115sk0 |= (kl & (uint32_t)0x00000082) << 18;116sk0 |= (kl & (uint32_t)0x00000004) << 19;117sk0 |= (kl & (uint32_t)0x04000000) >> 10;118sk0 |= (kl & (uint32_t)0x00000010) << 26;119sk0 |= (kl & (uint32_t)0x01000000) >> 2;120121sk0 |= kr & (uint32_t)0x00000100;122sk0 |= (kr & (uint32_t)0x00000008) << 1;123sk0 |= (kr & (uint32_t)0x00000200) << 4;124sk0 |= rotl(kr & (uint32_t)0x08000021, 6);125sk0 |= (kr & (uint32_t)0x01000000) >> 24;126sk0 |= (kr & (uint32_t)0x00000002) << 11;127sk0 |= (kr & (uint32_t)0x00100000) >> 18;128sk0 |= (kr & (uint32_t)0x00400000) >> 17;129sk0 |= (kr & (uint32_t)0x00800000) >> 14;130sk0 |= (kr & (uint32_t)0x02020000) >> 10;131sk0 |= (kr & (uint32_t)0x00080000) >> 5;132sk0 |= (kr & (uint32_t)0x00000040) >> 3;133sk0 |= (kr & (uint32_t)0x00000800) >> 1;134135sk1 = kl & (uint32_t)0x02000000;136sk1 |= (kl & (uint32_t)0x00001000) << 5;137sk1 |= (kl & (uint32_t)0x00000200) << 11;138sk1 |= (kl & (uint32_t)0x00004000) << 15;139sk1 |= (kl & (uint32_t)0x00000020) << 16;140sk1 |= (kl & (uint32_t)0x00000800) << 17;141sk1 |= (kl & (uint32_t)0x00000001) << 24;142sk1 |= (kl & (uint32_t)0x00200000) >> 5;143144sk1 |= (kr & (uint32_t)0x00000010) << 8;145sk1 |= (kr & (uint32_t)0x04000000) >> 17;146sk1 |= (kr & (uint32_t)0x00004000) >> 14;147sk1 |= (kr & (uint32_t)0x00000400) >> 9;148sk1 |= (kr & (uint32_t)0x00010000) >> 8;149sk1 |= (kr & (uint32_t)0x00001000) >> 7;150sk1 |= (kr & (uint32_t)0x00000080) >> 3;151sk1 |= (kr & (uint32_t)0x00008000) >> 2;152#endif153}154155/* see inner.h */156unsigned157br_des_ct_keysched(uint32_t *skey, const void *key, size_t key_len)158{159switch (key_len) {160case 8:161keysched_unit(skey, key);162return 1;163case 16:164keysched_unit(skey, key);165keysched_unit(skey + 32, (const unsigned char *)key + 8);166br_des_rev_skey(skey + 32);167memcpy(skey + 64, skey, 32 * sizeof *skey);168return 3;169default:170keysched_unit(skey, key);171keysched_unit(skey + 32, (const unsigned char *)key + 8);172br_des_rev_skey(skey + 32);173keysched_unit(skey + 64, (const unsigned char *)key + 16);174return 3;175}176}177178/*179* DES confusion function. This function performs expansion E (32 to180* 48 bits), XOR with subkey, S-boxes, and permutation P.181*/182static inline uint32_t183Fconf(uint32_t r0, const uint32_t *sk)184{185/*186* Each 6->4 S-box is virtually turned into four 6->1 boxes; we187* thus end up with 32 boxes that we call "T-boxes" here. We will188* evaluate them with bitslice code.189*190* Each T-box is a circuit of multiplexers (sort of) and thus191* takes 70 inputs: the 6 actual T-box inputs, and 64 constants192* that describe the T-box output for all combinations of the193* 6 inputs. With this model, all T-boxes are identical (with194* distinct inputs) and thus can be executed in parallel with195* bitslice code.196*197* T-boxes are numbered from 0 to 31, in least-to-most198* significant order. Thus, S-box S1 corresponds to T-boxes 31,199* 30, 29 and 28, in that order. T-box 'n' is computed with the200* bits at rank 'n' in the 32-bit words.201*202* Words x0 to x5 contain the T-box inputs 0 to 5.203*/204uint32_t x0, x1, x2, x3, x4, x5, z0;205uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;206uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;207uint32_t y20, y21, y22, y23, y24, y25, y26, y27, y28, y29;208uint32_t y30;209210/*211* Spread input bits over the 6 input words x*.212*/213x1 = r0 & (uint32_t)0x11111111;214x2 = (r0 >> 1) & (uint32_t)0x11111111;215x3 = (r0 >> 2) & (uint32_t)0x11111111;216x4 = (r0 >> 3) & (uint32_t)0x11111111;217x1 = (x1 << 4) - x1;218x2 = (x2 << 4) - x2;219x3 = (x3 << 4) - x3;220x4 = (x4 << 4) - x4;221x0 = (x4 << 4) | (x4 >> 28);222x5 = (x1 >> 4) | (x1 << 28);223224/*225* XOR with the subkey for this round.226*/227x0 ^= sk[0];228x1 ^= sk[1];229x2 ^= sk[2];230x3 ^= sk[3];231x4 ^= sk[4];232x5 ^= sk[5];233234/*235* The T-boxes are done in parallel, since they all use a236* "tree of multiplexer". We use "fake multiplexers":237*238* y = a ^ (x & b)239*240* computes y as either 'a' (if x == 0) or 'a ^ b' (if x == 1).241*/242y0 = (uint32_t)0xEFA72C4D ^ (x0 & (uint32_t)0xEC7AC69C);243y1 = (uint32_t)0xAEAAEDFF ^ (x0 & (uint32_t)0x500FB821);244y2 = (uint32_t)0x37396665 ^ (x0 & (uint32_t)0x40EFA809);245y3 = (uint32_t)0x68D7B833 ^ (x0 & (uint32_t)0xA5EC0B28);246y4 = (uint32_t)0xC9C755BB ^ (x0 & (uint32_t)0x252CF820);247y5 = (uint32_t)0x73FC3606 ^ (x0 & (uint32_t)0x40205801);248y6 = (uint32_t)0xA2A0A918 ^ (x0 & (uint32_t)0xE220F929);249y7 = (uint32_t)0x8222BD90 ^ (x0 & (uint32_t)0x44A3F9E1);250y8 = (uint32_t)0xD6B6AC77 ^ (x0 & (uint32_t)0x794F104A);251y9 = (uint32_t)0x3069300C ^ (x0 & (uint32_t)0x026F320B);252y10 = (uint32_t)0x6CE0D5CC ^ (x0 & (uint32_t)0x7640B01A);253y11 = (uint32_t)0x59A9A22D ^ (x0 & (uint32_t)0x238F1572);254y12 = (uint32_t)0xAC6D0BD4 ^ (x0 & (uint32_t)0x7A63C083);255y13 = (uint32_t)0x21C83200 ^ (x0 & (uint32_t)0x11CCA000);256y14 = (uint32_t)0xA0E62188 ^ (x0 & (uint32_t)0x202F69AA);257/* y15 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */258y16 = (uint32_t)0xAF7D655A ^ (x0 & (uint32_t)0x51B33BE9);259y17 = (uint32_t)0xF0168AA3 ^ (x0 & (uint32_t)0x3B0FE8AE);260y18 = (uint32_t)0x90AA30C6 ^ (x0 & (uint32_t)0x90BF8816);261y19 = (uint32_t)0x5AB2750A ^ (x0 & (uint32_t)0x09E34F9B);262y20 = (uint32_t)0x5391BE65 ^ (x0 & (uint32_t)0x0103BE88);263y21 = (uint32_t)0x93372BAF ^ (x0 & (uint32_t)0x49AC8E25);264y22 = (uint32_t)0xF288210C ^ (x0 & (uint32_t)0x922C313D);265y23 = (uint32_t)0x920AF5C0 ^ (x0 & (uint32_t)0x70EF31B0);266y24 = (uint32_t)0x63D312C0 ^ (x0 & (uint32_t)0x6A707100);267y25 = (uint32_t)0x537B3006 ^ (x0 & (uint32_t)0xB97C9011);268y26 = (uint32_t)0xA2EFB0A5 ^ (x0 & (uint32_t)0xA320C959);269y27 = (uint32_t)0xBC8F96A5 ^ (x0 & (uint32_t)0x6EA0AB4A);270y28 = (uint32_t)0xFAD176A5 ^ (x0 & (uint32_t)0x6953DDF8);271y29 = (uint32_t)0x665A14A3 ^ (x0 & (uint32_t)0xF74F3E2B);272y30 = (uint32_t)0xF2EFF0CC ^ (x0 & (uint32_t)0xF0306CAD);273/* y31 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */274275y0 = y0 ^ (x1 & y1);276y1 = y2 ^ (x1 & y3);277y2 = y4 ^ (x1 & y5);278y3 = y6 ^ (x1 & y7);279y4 = y8 ^ (x1 & y9);280y5 = y10 ^ (x1 & y11);281y6 = y12 ^ (x1 & y13);282y7 = y14; /* was: y14 ^ (x1 & y15) */283y8 = y16 ^ (x1 & y17);284y9 = y18 ^ (x1 & y19);285y10 = y20 ^ (x1 & y21);286y11 = y22 ^ (x1 & y23);287y12 = y24 ^ (x1 & y25);288y13 = y26 ^ (x1 & y27);289y14 = y28 ^ (x1 & y29);290y15 = y30; /* was: y30 ^ (x1 & y31) */291292y0 = y0 ^ (x2 & y1);293y1 = y2 ^ (x2 & y3);294y2 = y4 ^ (x2 & y5);295y3 = y6 ^ (x2 & y7);296y4 = y8 ^ (x2 & y9);297y5 = y10 ^ (x2 & y11);298y6 = y12 ^ (x2 & y13);299y7 = y14 ^ (x2 & y15);300301y0 = y0 ^ (x3 & y1);302y1 = y2 ^ (x3 & y3);303y2 = y4 ^ (x3 & y5);304y3 = y6 ^ (x3 & y7);305306y0 = y0 ^ (x4 & y1);307y1 = y2 ^ (x4 & y3);308309y0 = y0 ^ (x5 & y1);310311/*312* The P permutation:313* -- Each bit move is converted into a mask + left rotation.314* -- Rotations that use the same movement are coalesced together.315* -- Left and right shifts are used as alternatives to a rotation316* where appropriate (this will help architectures that do not have317* a rotation opcode).318*/319z0 = (y0 & (uint32_t)0x00000004) << 3;320z0 |= (y0 & (uint32_t)0x00004000) << 4;321z0 |= rotl(y0 & 0x12020120, 5);322z0 |= (y0 & (uint32_t)0x00100000) << 6;323z0 |= (y0 & (uint32_t)0x00008000) << 9;324z0 |= (y0 & (uint32_t)0x04000000) >> 22;325z0 |= (y0 & (uint32_t)0x00000001) << 11;326z0 |= rotl(y0 & 0x20000200, 12);327z0 |= (y0 & (uint32_t)0x00200000) >> 19;328z0 |= (y0 & (uint32_t)0x00000040) << 14;329z0 |= (y0 & (uint32_t)0x00010000) << 15;330z0 |= (y0 & (uint32_t)0x00000002) << 16;331z0 |= rotl(y0 & 0x40801800, 17);332z0 |= (y0 & (uint32_t)0x00080000) >> 13;333z0 |= (y0 & (uint32_t)0x00000010) << 21;334z0 |= (y0 & (uint32_t)0x01000000) >> 10;335z0 |= rotl(y0 & 0x88000008, 24);336z0 |= (y0 & (uint32_t)0x00000480) >> 7;337z0 |= (y0 & (uint32_t)0x00442000) >> 6;338return z0;339}340341/*342* Process one block through 16 successive rounds, omitting the swap343* in the final round.344*/345static void346process_block_unit(uint32_t *pl, uint32_t *pr, const uint32_t *sk_exp)347{348int i;349uint32_t l, r;350351l = *pl;352r = *pr;353for (i = 0; i < 16; i ++) {354uint32_t t;355356t = l ^ Fconf(r, sk_exp);357l = r;358r = t;359sk_exp += 6;360}361*pl = r;362*pr = l;363}364365/* see inner.h */366void367br_des_ct_process_block(unsigned num_rounds,368const uint32_t *sk_exp, void *block)369{370unsigned char *buf;371uint32_t l, r;372373buf = block;374l = br_dec32be(buf);375r = br_dec32be(buf + 4);376br_des_do_IP(&l, &r);377while (num_rounds -- > 0) {378process_block_unit(&l, &r, sk_exp);379sk_exp += 96;380}381br_des_do_invIP(&l, &r);382br_enc32be(buf, l);383br_enc32be(buf + 4, r);384}385386/* see inner.h */387void388br_des_ct_skey_expand(uint32_t *sk_exp,389unsigned num_rounds, const uint32_t *skey)390{391num_rounds <<= 4;392while (num_rounds -- > 0) {393uint32_t v, w0, w1, w2, w3;394395v = *skey ++;396w0 = v & 0x11111111;397w1 = (v >> 1) & 0x11111111;398w2 = (v >> 2) & 0x11111111;399w3 = (v >> 3) & 0x11111111;400*sk_exp ++ = (w0 << 4) - w0;401*sk_exp ++ = (w1 << 4) - w1;402*sk_exp ++ = (w2 << 4) - w2;403*sk_exp ++ = (w3 << 4) - w3;404v = *skey ++;405w0 = v & 0x11111111;406w1 = (v >> 1) & 0x11111111;407*sk_exp ++ = (w0 << 4) - w0;408*sk_exp ++ = (w1 << 4) - w1;409}410}411412413