Path: blob/main/contrib/bearssl/src/symcipher/poly1305_ctmulq.c
39482 views
/*1* Copyright (c) 2017 Thomas Pornin <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining4* a copy of this software and associated documentation files (the5* "Software"), to deal in the Software without restriction, including6* without limitation the rights to use, copy, modify, merge, publish,7* distribute, sublicense, and/or sell copies of the Software, and to8* permit persons to whom the Software is furnished to do so, subject to9* the following conditions:10*11* The above copyright notice and this permission notice shall be12* included in all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,15* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF16* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND17* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS18* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN19* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN20* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE21* SOFTWARE.22*/2324#include "inner.h"2526#if BR_INT128 || BR_UMUL1282728#if BR_INT1282930#define MUL128(hi, lo, x, y) do { \31unsigned __int128 mul128tmp; \32mul128tmp = (unsigned __int128)(x) * (unsigned __int128)(y); \33(hi) = (uint64_t)(mul128tmp >> 64); \34(lo) = (uint64_t)mul128tmp; \35} while (0)3637#elif BR_UMUL1283839#include <intrin.h>4041#define MUL128(hi, lo, x, y) do { \42(lo) = _umul128((x), (y), &(hi)); \43} while (0)4445#endif4647#define MASK42 ((uint64_t)0x000003FFFFFFFFFF)48#define MASK44 ((uint64_t)0x00000FFFFFFFFFFF)4950/*51* The "accumulator" word is nominally a 130-bit value. We split it into52* words of 44 bits, each held in a 64-bit variable.53*54* If the current accumulator is a = a0 + a1*W + a2*W^2 (where W = 2^44)55* and r = r0 + r1*W + r2*W^2, then:56*57* a*r = (a0*r0)58* + (a0*r1 + a1*r0) * W59* + (a0*r2 + a1*r1 + a2*r0) * W^260* + (a1*r2 + a2*r1) * W^361* + (a2*r2) * W^462*63* We want to reduce that value modulo p = 2^130-5, so W^3 = 20 mod p,64* and W^4 = 20*W mod p. Thus, if we define u1 = 20*r1 and u2 = 20*r2,65* then the equations above become:66*67* b0 = a0*r0 + a1*u2 + a2*u168* b1 = a0*r1 + a1*r0 + a2*u269* b2 = a0*r2 + a1*r1 + a2*r070*71* In order to make u1 fit in 44 bits, we can change these equations72* into:73*74* b0 = a0*r0 + a1*u2 + a2*t175* b1 = a0*r1 + a1*r0 + a2*t276* b2 = a0*r2 + a1*r1 + a2*r077*78* Where t1 is u1 truncated to 44 bits, and t2 is u2 added to the extra79* bits of u1. Note that since r is clamped down to a 124-bit value, the80* values u2 and t2 fit on 44 bits too.81*82* The bx values are larger than 44 bits, so we may split them into a83* lower half (cx, 44 bits) and an upper half (dx). The new values for84* the accumulator are then:85*86* e0 = c0 + 20*d287* e1 = c1 + d088* e2 = c2 + d189*90* The equations allow for some room, i.e. the ax values may be larger91* than 44 bits. Similarly, the ex values will usually be larger than92* the ax. Thus, some sort of carry propagation must be done regularly,93* though not necessarily at each iteration. In particular, we do not94* need to compute the additions (for the bx values) over 128-bit95* quantities; we can stick to 64-bit computations.96*97*98* Since the 128-bit result of a 64x64 multiplication is actually99* represented over two 64-bit registers, it is cheaper to arrange for100* any split that happens between the "high" and "low" halves to be on101* that 64-bit boundary. This is done by left shifting the rx, ux and tx102* by 20 bits (since they all fit on 44 bits each, this shift is103* always possible).104*/105106static void107poly1305_inner_big(uint64_t *acc, uint64_t *r, const void *data, size_t len)108{109110#define MX(hi, lo, m0, m1, m2) do { \111uint64_t mxhi, mxlo; \112MUL128(mxhi, mxlo, a0, m0); \113(hi) = mxhi; \114(lo) = mxlo >> 20; \115MUL128(mxhi, mxlo, a1, m1); \116(hi) += mxhi; \117(lo) += mxlo >> 20; \118MUL128(mxhi, mxlo, a2, m2); \119(hi) += mxhi; \120(lo) += mxlo >> 20; \121} while (0)122123const unsigned char *buf;124uint64_t a0, a1, a2;125uint64_t r0, r1, r2, t1, t2, u2;126127r0 = r[0];128r1 = r[1];129r2 = r[2];130t1 = r[3];131t2 = r[4];132u2 = r[5];133a0 = acc[0];134a1 = acc[1];135a2 = acc[2];136buf = data;137138while (len > 0) {139uint64_t v0, v1, v2;140uint64_t c0, c1, c2, d0, d1, d2;141142v0 = br_dec64le(buf + 0);143v1 = br_dec64le(buf + 8);144v2 = v1 >> 24;145v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;146v0 &= MASK44;147a0 += v0;148a1 += v1;149a2 += v2 + ((uint64_t)1 << 40);150MX(d0, c0, r0, u2, t1);151MX(d1, c1, r1, r0, t2);152MX(d2, c2, r2, r1, r0);153a0 = c0 + 20 * d2;154a1 = c1 + d0;155a2 = c2 + d1;156157v0 = br_dec64le(buf + 16);158v1 = br_dec64le(buf + 24);159v2 = v1 >> 24;160v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;161v0 &= MASK44;162a0 += v0;163a1 += v1;164a2 += v2 + ((uint64_t)1 << 40);165MX(d0, c0, r0, u2, t1);166MX(d1, c1, r1, r0, t2);167MX(d2, c2, r2, r1, r0);168a0 = c0 + 20 * d2;169a1 = c1 + d0;170a2 = c2 + d1;171172v0 = br_dec64le(buf + 32);173v1 = br_dec64le(buf + 40);174v2 = v1 >> 24;175v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;176v0 &= MASK44;177a0 += v0;178a1 += v1;179a2 += v2 + ((uint64_t)1 << 40);180MX(d0, c0, r0, u2, t1);181MX(d1, c1, r1, r0, t2);182MX(d2, c2, r2, r1, r0);183a0 = c0 + 20 * d2;184a1 = c1 + d0;185a2 = c2 + d1;186187v0 = br_dec64le(buf + 48);188v1 = br_dec64le(buf + 56);189v2 = v1 >> 24;190v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;191v0 &= MASK44;192a0 += v0;193a1 += v1;194a2 += v2 + ((uint64_t)1 << 40);195MX(d0, c0, r0, u2, t1);196MX(d1, c1, r1, r0, t2);197MX(d2, c2, r2, r1, r0);198a0 = c0 + 20 * d2;199a1 = c1 + d0;200a2 = c2 + d1;201202a1 += a0 >> 44;203a0 &= MASK44;204a2 += a1 >> 44;205a1 &= MASK44;206a0 += 20 * (a2 >> 44);207a2 &= MASK44;208209buf += 64;210len -= 64;211}212acc[0] = a0;213acc[1] = a1;214acc[2] = a2;215216#undef MX217}218219static void220poly1305_inner_small(uint64_t *acc, uint64_t *r, const void *data, size_t len)221{222const unsigned char *buf;223uint64_t a0, a1, a2;224uint64_t r0, r1, r2, t1, t2, u2;225226r0 = r[0];227r1 = r[1];228r2 = r[2];229t1 = r[3];230t2 = r[4];231u2 = r[5];232a0 = acc[0];233a1 = acc[1];234a2 = acc[2];235buf = data;236237while (len > 0) {238uint64_t v0, v1, v2;239uint64_t c0, c1, c2, d0, d1, d2;240unsigned char tmp[16];241242if (len < 16) {243memcpy(tmp, buf, len);244memset(tmp + len, 0, (sizeof tmp) - len);245buf = tmp;246len = 16;247}248v0 = br_dec64le(buf + 0);249v1 = br_dec64le(buf + 8);250251v2 = v1 >> 24;252v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;253v0 &= MASK44;254255a0 += v0;256a1 += v1;257a2 += v2 + ((uint64_t)1 << 40);258259#define MX(hi, lo, m0, m1, m2) do { \260uint64_t mxhi, mxlo; \261MUL128(mxhi, mxlo, a0, m0); \262(hi) = mxhi; \263(lo) = mxlo >> 20; \264MUL128(mxhi, mxlo, a1, m1); \265(hi) += mxhi; \266(lo) += mxlo >> 20; \267MUL128(mxhi, mxlo, a2, m2); \268(hi) += mxhi; \269(lo) += mxlo >> 20; \270} while (0)271272MX(d0, c0, r0, u2, t1);273MX(d1, c1, r1, r0, t2);274MX(d2, c2, r2, r1, r0);275276#undef MX277278a0 = c0 + 20 * d2;279a1 = c1 + d0;280a2 = c2 + d1;281282a1 += a0 >> 44;283a0 &= MASK44;284a2 += a1 >> 44;285a1 &= MASK44;286a0 += 20 * (a2 >> 44);287a2 &= MASK44;288289buf += 16;290len -= 16;291}292acc[0] = a0;293acc[1] = a1;294acc[2] = a2;295}296297static inline void298poly1305_inner(uint64_t *acc, uint64_t *r, const void *data, size_t len)299{300if (len >= 64) {301size_t len2;302303len2 = len & ~(size_t)63;304poly1305_inner_big(acc, r, data, len2);305data = (const unsigned char *)data + len2;306len -= len2;307}308if (len > 0) {309poly1305_inner_small(acc, r, data, len);310}311}312313/* see bearssl_block.h */314void315br_poly1305_ctmulq_run(const void *key, const void *iv,316void *data, size_t len, const void *aad, size_t aad_len,317void *tag, br_chacha20_run ichacha, int encrypt)318{319unsigned char pkey[32], foot[16];320uint64_t r[6], acc[3], r0, r1;321uint32_t v0, v1, v2, v3, v4;322uint64_t w0, w1, w2, w3;323uint32_t ctl;324325/*326* Compute the MAC key. The 'r' value is the first 16 bytes of327* pkey[].328*/329memset(pkey, 0, sizeof pkey);330ichacha(key, iv, 0, pkey, sizeof pkey);331332/*333* If encrypting, ChaCha20 must run first, followed by Poly1305.334* When decrypting, the operations are reversed.335*/336if (encrypt) {337ichacha(key, iv, 1, data, len);338}339340/*341* Run Poly1305. We must process the AAD, then ciphertext, then342* the footer (with the lengths). Note that the AAD and ciphertext343* are meant to be padded with zeros up to the next multiple of 16,344* and the length of the footer is 16 bytes as well.345*/346347/*348* Apply the "clamping" on r.349*/350pkey[ 3] &= 0x0F;351pkey[ 4] &= 0xFC;352pkey[ 7] &= 0x0F;353pkey[ 8] &= 0xFC;354pkey[11] &= 0x0F;355pkey[12] &= 0xFC;356pkey[15] &= 0x0F;357358/*359* Decode the 'r' value into 44-bit words, left-shifted by 20 bits.360* Also compute the u1 and u2 values.361*/362r0 = br_dec64le(pkey + 0);363r1 = br_dec64le(pkey + 8);364r[0] = r0 << 20;365r[1] = ((r0 >> 24) | (r1 << 40)) & ~(uint64_t)0xFFFFF;366r[2] = (r1 >> 4) & ~(uint64_t)0xFFFFF;367r1 = 20 * (r[1] >> 20);368r[3] = r1 << 20;369r[5] = 20 * r[2];370r[4] = (r[5] + (r1 >> 24)) & ~(uint64_t)0xFFFFF;371372/*373* Accumulator is 0.374*/375acc[0] = 0;376acc[1] = 0;377acc[2] = 0;378379/*380* Process the additional authenticated data, ciphertext, and381* footer in due order.382*/383br_enc64le(foot, (uint64_t)aad_len);384br_enc64le(foot + 8, (uint64_t)len);385poly1305_inner(acc, r, aad, aad_len);386poly1305_inner(acc, r, data, len);387poly1305_inner_small(acc, r, foot, sizeof foot);388389/*390* Finalise modular reduction. At that point, the value consists391* in three 44-bit values (the lowest one might be slightly above392* 2^44). Two loops shall be sufficient.393*/394acc[1] += (acc[0] >> 44);395acc[0] &= MASK44;396acc[2] += (acc[1] >> 44);397acc[1] &= MASK44;398acc[0] += 5 * (acc[2] >> 42);399acc[2] &= MASK42;400acc[1] += (acc[0] >> 44);401acc[0] &= MASK44;402acc[2] += (acc[1] >> 44);403acc[1] &= MASK44;404acc[0] += 5 * (acc[2] >> 42);405acc[2] &= MASK42;406407/*408* The value may still fall in the 2^130-5..2^130-1 range, in409* which case we must reduce it again. The code below selects,410* in constant-time, between 'acc' and 'acc-p'. We encode the411* value over four 32-bit integers to finish the operation.412*/413v0 = (uint32_t)acc[0];414v1 = (uint32_t)(acc[0] >> 32) | ((uint32_t)acc[1] << 12);415v2 = (uint32_t)(acc[1] >> 20) | ((uint32_t)acc[2] << 24);416v3 = (uint32_t)(acc[2] >> 8);417v4 = (uint32_t)(acc[2] >> 40);418419ctl = GT(v0, 0xFFFFFFFA);420ctl &= EQ(v1, 0xFFFFFFFF);421ctl &= EQ(v2, 0xFFFFFFFF);422ctl &= EQ(v3, 0xFFFFFFFF);423ctl &= EQ(v4, 0x00000003);424v0 = MUX(ctl, v0 + 5, v0);425v1 = MUX(ctl, 0, v1);426v2 = MUX(ctl, 0, v2);427v3 = MUX(ctl, 0, v3);428429/*430* Add the "s" value. This is done modulo 2^128. Don't forget431* carry propagation...432*/433w0 = (uint64_t)v0 + (uint64_t)br_dec32le(pkey + 16);434w1 = (uint64_t)v1 + (uint64_t)br_dec32le(pkey + 20) + (w0 >> 32);435w2 = (uint64_t)v2 + (uint64_t)br_dec32le(pkey + 24) + (w1 >> 32);436w3 = (uint64_t)v3 + (uint64_t)br_dec32le(pkey + 28) + (w2 >> 32);437v0 = (uint32_t)w0;438v1 = (uint32_t)w1;439v2 = (uint32_t)w2;440v3 = (uint32_t)w3;441442/*443* Encode the tag.444*/445br_enc32le((unsigned char *)tag + 0, v0);446br_enc32le((unsigned char *)tag + 4, v1);447br_enc32le((unsigned char *)tag + 8, v2);448br_enc32le((unsigned char *)tag + 12, v3);449450/*451* If decrypting, then ChaCha20 runs _after_ Poly1305.452*/453if (!encrypt) {454ichacha(key, iv, 1, data, len);455}456}457458/* see bearssl_block.h */459br_poly1305_run460br_poly1305_ctmulq_get(void)461{462return &br_poly1305_ctmulq_run;463}464465#else466467/* see bearssl_block.h */468br_poly1305_run469br_poly1305_ctmulq_get(void)470{471return 0;472}473474#endif475476477