Path: blob/main/contrib/bearssl/src/symcipher/poly1305_ctmul32.c
39482 views
/*1* Copyright (c) 2017 Thomas Pornin <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining4* a copy of this software and associated documentation files (the5* "Software"), to deal in the Software without restriction, including6* without limitation the rights to use, copy, modify, merge, publish,7* distribute, sublicense, and/or sell copies of the Software, and to8* permit persons to whom the Software is furnished to do so, subject to9* the following conditions:10*11* The above copyright notice and this permission notice shall be12* included in all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,15* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF16* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND17* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS18* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN19* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN20* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE21* SOFTWARE.22*/2324#include "inner.h"2526/*27* Perform the inner processing of blocks for Poly1305.28*/29static void30poly1305_inner(uint32_t *a, const uint32_t *r, const void *data, size_t len)31{32/*33* Implementation notes: we split the 130-bit values into ten34* 13-bit words. This gives us some space for carries and allows35* using only 32x32->32 multiplications, which are way faster than36* 32x32->64 multiplications on the ARM Cortex-M0/M0+, and also37* help in making constant-time code on the Cortex-M3.38*39* Since we compute modulo 2^130-5, the "upper words" become40* low words with a factor of 5; that is, x*2^130 = x*5 mod p.41* This has already been integrated in the r[] array, which42* is extended to the 0..18 range.43*44* In each loop iteration, a[] and r[] words are 13-bit each,45* except a[1] which may use 14 bits.46*/47const unsigned char *buf;4849buf = data;50while (len > 0) {51unsigned char tmp[16];52uint32_t b[10];53unsigned u, v;54uint32_t z, cc1, cc2;5556/*57* If there is a partial block, right-pad it with zeros.58*/59if (len < 16) {60memset(tmp, 0, sizeof tmp);61memcpy(tmp, buf, len);62buf = tmp;63len = 16;64}6566/*67* Decode next block and apply the "high bit"; that value68* is added to the accumulator.69*/70v = br_dec16le(buf);71a[0] += v & 0x01FFF;72v >>= 13;73v |= buf[2] << 3;74v |= buf[3] << 11;75a[1] += v & 0x01FFF;76v >>= 13;77v |= buf[4] << 6;78a[2] += v & 0x01FFF;79v >>= 13;80v |= buf[5] << 1;81v |= buf[6] << 9;82a[3] += v & 0x01FFF;83v >>= 13;84v |= buf[7] << 4;85v |= buf[8] << 12;86a[4] += v & 0x01FFF;87v >>= 13;88v |= buf[9] << 7;89a[5] += v & 0x01FFF;90v >>= 13;91v |= buf[10] << 2;92v |= buf[11] << 10;93a[6] += v & 0x01FFF;94v >>= 13;95v |= buf[12] << 5;96a[7] += v & 0x01FFF;97v = br_dec16le(buf + 13);98a[8] += v & 0x01FFF;99v >>= 13;100v |= buf[15] << 3;101a[9] += v | 0x00800;102103/*104* At that point, all a[] values fit on 14 bits, while105* all r[] values fit on 13 bits. Thus products fit on106* 27 bits, and we can accumulate up to 31 of them in107* a 32-bit word and still have some room for carries.108*/109110/*111* Now a[] contains words with values up to 14 bits each.112* We perform the multiplication with r[].113*114* The extended words of r[] may be larger than 13 bits115* (they are 5 times a 13-bit word) so the full summation116* may yield values up to 46 times a 27-bit word, which117* does not fit on a 32-bit word. To avoid that issue, we118* must split the loop below in two, with a carry119* propagation operation in the middle.120*/121cc1 = 0;122for (u = 0; u < 10; u ++) {123uint32_t s;124125s = cc1126+ MUL15(a[0], r[u + 9 - 0])127+ MUL15(a[1], r[u + 9 - 1])128+ MUL15(a[2], r[u + 9 - 2])129+ MUL15(a[3], r[u + 9 - 3])130+ MUL15(a[4], r[u + 9 - 4]);131b[u] = s & 0x1FFF;132cc1 = s >> 13;133}134cc2 = 0;135for (u = 0; u < 10; u ++) {136uint32_t s;137138s = b[u] + cc2139+ MUL15(a[5], r[u + 9 - 5])140+ MUL15(a[6], r[u + 9 - 6])141+ MUL15(a[7], r[u + 9 - 7])142+ MUL15(a[8], r[u + 9 - 8])143+ MUL15(a[9], r[u + 9 - 9]);144b[u] = s & 0x1FFF;145cc2 = s >> 13;146}147memcpy(a, b, sizeof b);148149/*150* The two carries "loop back" with a factor of 5. We151* propagate them into a[0] and a[1].152*/153z = cc1 + cc2;154z += (z << 2) + a[0];155a[0] = z & 0x1FFF;156a[1] += z >> 13;157158buf += 16;159len -= 16;160}161}162163/* see bearssl_block.h */164void165br_poly1305_ctmul32_run(const void *key, const void *iv,166void *data, size_t len, const void *aad, size_t aad_len,167void *tag, br_chacha20_run ichacha, int encrypt)168{169unsigned char pkey[32], foot[16];170uint32_t z, r[19], acc[10], cc, ctl;171int i;172173/*174* Compute the MAC key. The 'r' value is the first 16 bytes of175* pkey[].176*/177memset(pkey, 0, sizeof pkey);178ichacha(key, iv, 0, pkey, sizeof pkey);179180/*181* If encrypting, ChaCha20 must run first, followed by Poly1305.182* When decrypting, the operations are reversed.183*/184if (encrypt) {185ichacha(key, iv, 1, data, len);186}187188/*189* Run Poly1305. We must process the AAD, then ciphertext, then190* the footer (with the lengths). Note that the AAD and ciphertext191* are meant to be padded with zeros up to the next multiple of 16,192* and the length of the footer is 16 bytes as well.193*/194195/*196* Decode the 'r' value into 13-bit words, with the "clamping"197* operation applied.198*/199z = br_dec32le(pkey) & 0x03FFFFFF;200r[9] = z & 0x1FFF;201r[10] = z >> 13;202z = (br_dec32le(pkey + 3) >> 2) & 0x03FFFF03;203r[11] = z & 0x1FFF;204r[12] = z >> 13;205z = (br_dec32le(pkey + 6) >> 4) & 0x03FFC0FF;206r[13] = z & 0x1FFF;207r[14] = z >> 13;208z = (br_dec32le(pkey + 9) >> 6) & 0x03F03FFF;209r[15] = z & 0x1FFF;210r[16] = z >> 13;211z = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF;212r[17] = z & 0x1FFF;213r[18] = z >> 13;214215/*216* Extend r[] with the 5x factor pre-applied.217*/218for (i = 0; i < 9; i ++) {219r[i] = MUL15(5, r[i + 10]);220}221222/*223* Accumulator is 0.224*/225memset(acc, 0, sizeof acc);226227/*228* Process the additional authenticated data, ciphertext, and229* footer in due order.230*/231br_enc64le(foot, (uint64_t)aad_len);232br_enc64le(foot + 8, (uint64_t)len);233poly1305_inner(acc, r, aad, aad_len);234poly1305_inner(acc, r, data, len);235poly1305_inner(acc, r, foot, sizeof foot);236237/*238* Finalise modular reduction. This is done with carry propagation239* and applying the '2^130 = -5 mod p' rule. Note that the output240* of poly1035_inner() is already mostly reduced, since only241* acc[1] may be (very slightly) above 2^13. A single loop back242* to acc[1] will be enough to make the value fit in 130 bits.243*/244cc = 0;245for (i = 1; i < 10; i ++) {246z = acc[i] + cc;247acc[i] = z & 0x1FFF;248cc = z >> 13;249}250z = acc[0] + cc + (cc << 2);251acc[0] = z & 0x1FFF;252acc[1] += z >> 13;253254/*255* We may still have a value in the 2^130-5..2^130-1 range, in256* which case we must reduce it again. The code below selects,257* in constant-time, between 'acc' and 'acc-p',258*/259ctl = GT(acc[0], 0x1FFA);260for (i = 1; i < 10; i ++) {261ctl &= EQ(acc[i], 0x1FFF);262}263acc[0] = MUX(ctl, acc[0] - 0x1FFB, acc[0]);264for (i = 1; i < 10; i ++) {265acc[i] &= ~(-ctl);266}267268/*269* Convert back the accumulator to 32-bit words, and add the270* 's' value (second half of pkey[]). That addition is done271* modulo 2^128.272*/273z = acc[0] + (acc[1] << 13) + br_dec16le(pkey + 16);274br_enc16le((unsigned char *)tag, z & 0xFFFF);275z = (z >> 16) + (acc[2] << 10) + br_dec16le(pkey + 18);276br_enc16le((unsigned char *)tag + 2, z & 0xFFFF);277z = (z >> 16) + (acc[3] << 7) + br_dec16le(pkey + 20);278br_enc16le((unsigned char *)tag + 4, z & 0xFFFF);279z = (z >> 16) + (acc[4] << 4) + br_dec16le(pkey + 22);280br_enc16le((unsigned char *)tag + 6, z & 0xFFFF);281z = (z >> 16) + (acc[5] << 1) + (acc[6] << 14) + br_dec16le(pkey + 24);282br_enc16le((unsigned char *)tag + 8, z & 0xFFFF);283z = (z >> 16) + (acc[7] << 11) + br_dec16le(pkey + 26);284br_enc16le((unsigned char *)tag + 10, z & 0xFFFF);285z = (z >> 16) + (acc[8] << 8) + br_dec16le(pkey + 28);286br_enc16le((unsigned char *)tag + 12, z & 0xFFFF);287z = (z >> 16) + (acc[9] << 5) + br_dec16le(pkey + 30);288br_enc16le((unsigned char *)tag + 14, z & 0xFFFF);289290/*291* If decrypting, then ChaCha20 runs _after_ Poly1305.292*/293if (!encrypt) {294ichacha(key, iv, 1, data, len);295}296}297298299