Path: blob/main/contrib/bearssl/src/symcipher/poly1305_ctmul.c
39482 views
/*1* Copyright (c) 2016 Thomas Pornin <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining4* a copy of this software and associated documentation files (the5* "Software"), to deal in the Software without restriction, including6* without limitation the rights to use, copy, modify, merge, publish,7* distribute, sublicense, and/or sell copies of the Software, and to8* permit persons to whom the Software is furnished to do so, subject to9* the following conditions:10*11* The above copyright notice and this permission notice shall be12* included in all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,15* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF16* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND17* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS18* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN19* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN20* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE21* SOFTWARE.22*/2324#include "inner.h"2526/*27* Perform the inner processing of blocks for Poly1305. The accumulator28* and the r key are provided as arrays of 26-bit words (these words29* are allowed to have an extra bit, i.e. use 27 bits).30*31* On output, all accumulator words fit on 26 bits, except acc[1], which32* may be slightly larger (but by a very small amount only).33*/34static void35poly1305_inner(uint32_t *acc, const uint32_t *r, const void *data, size_t len)36{37/*38* Implementation notes: we split the 130-bit values into five39* 26-bit words. This gives us some space for carries.40*41* This code is inspired from the public-domain code available42* on:43* https://github.com/floodyberry/poly1305-donna44*45* Since we compute modulo 2^130-5, the "upper words" become46* low words with a factor of 5; that is, x*2^130 = x*5 mod p.47*/48const unsigned char *buf;49uint32_t a0, a1, a2, a3, a4;50uint32_t r0, r1, r2, r3, r4;51uint32_t u1, u2, u3, u4;5253r0 = r[0];54r1 = r[1];55r2 = r[2];56r3 = r[3];57r4 = r[4];5859u1 = r1 * 5;60u2 = r2 * 5;61u3 = r3 * 5;62u4 = r4 * 5;6364a0 = acc[0];65a1 = acc[1];66a2 = acc[2];67a3 = acc[3];68a4 = acc[4];6970buf = data;71while (len > 0) {72uint64_t w0, w1, w2, w3, w4;73uint64_t c;74unsigned char tmp[16];7576/*77* If there is a partial block, right-pad it with zeros.78*/79if (len < 16) {80memset(tmp, 0, sizeof tmp);81memcpy(tmp, buf, len);82buf = tmp;83len = 16;84}8586/*87* Decode next block and apply the "high bit"; that value88* is added to the accumulator.89*/90a0 += br_dec32le(buf) & 0x03FFFFFF;91a1 += (br_dec32le(buf + 3) >> 2) & 0x03FFFFFF;92a2 += (br_dec32le(buf + 6) >> 4) & 0x03FFFFFF;93a3 += (br_dec32le(buf + 9) >> 6) & 0x03FFFFFF;94a4 += (br_dec32le(buf + 12) >> 8) | 0x01000000;9596/*97* Compute multiplication.98*/99#define M(x, y) ((uint64_t)(x) * (uint64_t)(y))100101w0 = M(a0, r0) + M(a1, u4) + M(a2, u3) + M(a3, u2) + M(a4, u1);102w1 = M(a0, r1) + M(a1, r0) + M(a2, u4) + M(a3, u3) + M(a4, u2);103w2 = M(a0, r2) + M(a1, r1) + M(a2, r0) + M(a3, u4) + M(a4, u3);104w3 = M(a0, r3) + M(a1, r2) + M(a2, r1) + M(a3, r0) + M(a4, u4);105w4 = M(a0, r4) + M(a1, r3) + M(a2, r2) + M(a3, r1) + M(a4, r0);106107#undef M108/*109* Perform some (partial) modular reduction. This step is110* enough to keep values in ranges such that there won't111* be carry overflows. Most of the reduction was done in112* the multiplication step (by using the 'u*' values, and113* using the fact that 2^130 = -5 mod p); here we perform114* some carry propagation.115*/116c = w0 >> 26;117a0 = (uint32_t)w0 & 0x3FFFFFF;118w1 += c;119c = w1 >> 26;120a1 = (uint32_t)w1 & 0x3FFFFFF;121w2 += c;122c = w2 >> 26;123a2 = (uint32_t)w2 & 0x3FFFFFF;124w3 += c;125c = w3 >> 26;126a3 = (uint32_t)w3 & 0x3FFFFFF;127w4 += c;128c = w4 >> 26;129a4 = (uint32_t)w4 & 0x3FFFFFF;130a0 += (uint32_t)c * 5;131a1 += a0 >> 26;132a0 &= 0x3FFFFFF;133134buf += 16;135len -= 16;136}137138acc[0] = a0;139acc[1] = a1;140acc[2] = a2;141acc[3] = a3;142acc[4] = a4;143}144145/* see bearssl_block.h */146void147br_poly1305_ctmul_run(const void *key, const void *iv,148void *data, size_t len, const void *aad, size_t aad_len,149void *tag, br_chacha20_run ichacha, int encrypt)150{151unsigned char pkey[32], foot[16];152uint32_t r[5], acc[5], cc, ctl, hi;153uint64_t w;154int i;155156/*157* Compute the MAC key. The 'r' value is the first 16 bytes of158* pkey[].159*/160memset(pkey, 0, sizeof pkey);161ichacha(key, iv, 0, pkey, sizeof pkey);162163/*164* If encrypting, ChaCha20 must run first, followed by Poly1305.165* When decrypting, the operations are reversed.166*/167if (encrypt) {168ichacha(key, iv, 1, data, len);169}170171/*172* Run Poly1305. We must process the AAD, then ciphertext, then173* the footer (with the lengths). Note that the AAD and ciphertext174* are meant to be padded with zeros up to the next multiple of 16,175* and the length of the footer is 16 bytes as well.176*/177178/*179* Decode the 'r' value into 26-bit words, with the "clamping"180* operation applied.181*/182r[0] = br_dec32le(pkey) & 0x03FFFFFF;183r[1] = (br_dec32le(pkey + 3) >> 2) & 0x03FFFF03;184r[2] = (br_dec32le(pkey + 6) >> 4) & 0x03FFC0FF;185r[3] = (br_dec32le(pkey + 9) >> 6) & 0x03F03FFF;186r[4] = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF;187188/*189* Accumulator is 0.190*/191memset(acc, 0, sizeof acc);192193/*194* Process the additional authenticated data, ciphertext, and195* footer in due order.196*/197br_enc64le(foot, (uint64_t)aad_len);198br_enc64le(foot + 8, (uint64_t)len);199poly1305_inner(acc, r, aad, aad_len);200poly1305_inner(acc, r, data, len);201poly1305_inner(acc, r, foot, sizeof foot);202203/*204* Finalise modular reduction. This is done with carry propagation205* and applying the '2^130 = -5 mod p' rule. Note that the output206* of poly1035_inner() is already mostly reduced, since only207* acc[1] may be (very slightly) above 2^26. A single loop back208* to acc[1] will be enough to make the value fit in 130 bits.209*/210cc = 0;211for (i = 1; i <= 6; i ++) {212int j;213214j = (i >= 5) ? i - 5 : i;215acc[j] += cc;216cc = acc[j] >> 26;217acc[j] &= 0x03FFFFFF;218}219220/*221* We may still have a value in the 2^130-5..2^130-1 range, in222* which case we must reduce it again. The code below selects,223* in constant-time, between 'acc' and 'acc-p',224*/225ctl = GT(acc[0], 0x03FFFFFA);226for (i = 1; i < 5; i ++) {227ctl &= EQ(acc[i], 0x03FFFFFF);228}229cc = 5;230for (i = 0; i < 5; i ++) {231uint32_t t;232233t = (acc[i] + cc);234cc = t >> 26;235t &= 0x03FFFFFF;236acc[i] = MUX(ctl, t, acc[i]);237}238239/*240* Convert back the accumulator to 32-bit words, and add the241* 's' value (second half of pkey[]). That addition is done242* modulo 2^128.243*/244w = (uint64_t)acc[0] + ((uint64_t)acc[1] << 26) + br_dec32le(pkey + 16);245br_enc32le((unsigned char *)tag, (uint32_t)w);246w = (w >> 32) + ((uint64_t)acc[2] << 20) + br_dec32le(pkey + 20);247br_enc32le((unsigned char *)tag + 4, (uint32_t)w);248w = (w >> 32) + ((uint64_t)acc[3] << 14) + br_dec32le(pkey + 24);249br_enc32le((unsigned char *)tag + 8, (uint32_t)w);250hi = (uint32_t)(w >> 32) + (acc[4] << 8) + br_dec32le(pkey + 28);251br_enc32le((unsigned char *)tag + 12, hi);252253/*254* If decrypting, then ChaCha20 runs _after_ Poly1305.255*/256if (!encrypt) {257ichacha(key, iv, 1, data, len);258}259}260261262