Path: blob/main/contrib/bearssl/src/ec/ec_c25519_m31.c
39536 views
/*1* Copyright (c) 2017 Thomas Pornin <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining4* a copy of this software and associated documentation files (the5* "Software"), to deal in the Software without restriction, including6* without limitation the rights to use, copy, modify, merge, publish,7* distribute, sublicense, and/or sell copies of the Software, and to8* permit persons to whom the Software is furnished to do so, subject to9* the following conditions:10*11* The above copyright notice and this permission notice shall be12* included in all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,15* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF16* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND17* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS18* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN19* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN20* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE21* SOFTWARE.22*/2324#include "inner.h"2526/* obsolete27#include <stdio.h>28#include <stdlib.h>29static void30print_int(const char *name, const uint32_t *x)31{32size_t u;33unsigned char tmp[40];3435printf("%s = ", name);36for (u = 0; u < 9; u ++) {37if (x[u] > 0x3FFFFFFF) {38printf("INVALID:");39for (u = 0; u < 9; u ++) {40printf(" %08X", x[u]);41}42printf("\n");43return;44}45}46memset(tmp, 0, sizeof tmp);47for (u = 0; u < 9; u ++) {48uint64_t w;49int j, k;5051w = x[u];52j = 30 * (int)u;53k = j & 7;54if (k != 0) {55w <<= k;56j -= k;57}58k = j >> 3;59for (j = 0; j < 8; j ++) {60tmp[39 - k - j] |= (unsigned char)w;61w >>= 8;62}63}64for (u = 8; u < 40; u ++) {65printf("%02X", tmp[u]);66}67printf("\n");68}69*/7071/*72* If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_73* that right-shifting a signed negative integer copies the sign bit74* (arithmetic right-shift). This is "implementation-defined behaviour",75* i.e. it is not undefined, but it may differ between compilers. Each76* compiler is supposed to document its behaviour in that respect. GCC77* explicitly defines that an arithmetic right shift is used. We expect78* all other compilers to do the same, because underlying CPU offer an79* arithmetic right shift opcode that could not be used otherwise.80*/81#if BR_NO_ARITH_SHIFT82#define ARSH(x, n) (((uint32_t)(x) >> (n)) \83| ((-((uint32_t)(x) >> 31)) << (32 - (n))))84#else85#define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))86#endif8788/*89* Convert an integer from unsigned little-endian encoding to a sequence of90* 30-bit words in little-endian order. The final "partial" word is91* returned.92*/93static uint32_t94le8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)95{96uint32_t acc;97int acc_len;9899acc = 0;100acc_len = 0;101while (len -- > 0) {102uint32_t b;103104b = *src ++;105if (acc_len < 22) {106acc |= b << acc_len;107acc_len += 8;108} else {109*dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;110acc = b >> (30 - acc_len);111acc_len -= 22;112}113}114return acc;115}116117/*118* Convert an integer (30-bit words, little-endian) to unsigned119* little-endian encoding. The total encoding length is provided; all120* the destination bytes will be filled.121*/122static void123le30_to_le8(unsigned char *dst, size_t len, const uint32_t *src)124{125uint32_t acc;126int acc_len;127128acc = 0;129acc_len = 0;130while (len -- > 0) {131if (acc_len < 8) {132uint32_t w;133134w = *src ++;135*dst ++ = (unsigned char)(acc | (w << acc_len));136acc = w >> (8 - acc_len);137acc_len += 22;138} else {139*dst ++ = (unsigned char)acc;140acc >>= 8;141acc_len -= 8;142}143}144}145146/*147* Multiply two integers. Source integers are represented as arrays of148* nine 30-bit words, for values up to 2^270-1. Result is encoded over149* 18 words of 30 bits each.150*/151static void152mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)153{154/*155* Maximum intermediate result is no more than156* 10376293531797946367, which fits in 64 bits. Reason:157*158* 10376293531797946367 = 9 * (2^30-1)^2 + 9663676406159* 10376293531797946367 < 9663676407 * 2^30160*161* Thus, adding together 9 products of 30-bit integers, with162* a carry of at most 9663676406, yields an integer that fits163* on 64 bits and generates a carry of at most 9663676406.164*/165uint64_t t[17];166uint64_t cc;167int i;168169t[ 0] = MUL31(a[0], b[0]);170t[ 1] = MUL31(a[0], b[1])171+ MUL31(a[1], b[0]);172t[ 2] = MUL31(a[0], b[2])173+ MUL31(a[1], b[1])174+ MUL31(a[2], b[0]);175t[ 3] = MUL31(a[0], b[3])176+ MUL31(a[1], b[2])177+ MUL31(a[2], b[1])178+ MUL31(a[3], b[0]);179t[ 4] = MUL31(a[0], b[4])180+ MUL31(a[1], b[3])181+ MUL31(a[2], b[2])182+ MUL31(a[3], b[1])183+ MUL31(a[4], b[0]);184t[ 5] = MUL31(a[0], b[5])185+ MUL31(a[1], b[4])186+ MUL31(a[2], b[3])187+ MUL31(a[3], b[2])188+ MUL31(a[4], b[1])189+ MUL31(a[5], b[0]);190t[ 6] = MUL31(a[0], b[6])191+ MUL31(a[1], b[5])192+ MUL31(a[2], b[4])193+ MUL31(a[3], b[3])194+ MUL31(a[4], b[2])195+ MUL31(a[5], b[1])196+ MUL31(a[6], b[0]);197t[ 7] = MUL31(a[0], b[7])198+ MUL31(a[1], b[6])199+ MUL31(a[2], b[5])200+ MUL31(a[3], b[4])201+ MUL31(a[4], b[3])202+ MUL31(a[5], b[2])203+ MUL31(a[6], b[1])204+ MUL31(a[7], b[0]);205t[ 8] = MUL31(a[0], b[8])206+ MUL31(a[1], b[7])207+ MUL31(a[2], b[6])208+ MUL31(a[3], b[5])209+ MUL31(a[4], b[4])210+ MUL31(a[5], b[3])211+ MUL31(a[6], b[2])212+ MUL31(a[7], b[1])213+ MUL31(a[8], b[0]);214t[ 9] = MUL31(a[1], b[8])215+ MUL31(a[2], b[7])216+ MUL31(a[3], b[6])217+ MUL31(a[4], b[5])218+ MUL31(a[5], b[4])219+ MUL31(a[6], b[3])220+ MUL31(a[7], b[2])221+ MUL31(a[8], b[1]);222t[10] = MUL31(a[2], b[8])223+ MUL31(a[3], b[7])224+ MUL31(a[4], b[6])225+ MUL31(a[5], b[5])226+ MUL31(a[6], b[4])227+ MUL31(a[7], b[3])228+ MUL31(a[8], b[2]);229t[11] = MUL31(a[3], b[8])230+ MUL31(a[4], b[7])231+ MUL31(a[5], b[6])232+ MUL31(a[6], b[5])233+ MUL31(a[7], b[4])234+ MUL31(a[8], b[3]);235t[12] = MUL31(a[4], b[8])236+ MUL31(a[5], b[7])237+ MUL31(a[6], b[6])238+ MUL31(a[7], b[5])239+ MUL31(a[8], b[4]);240t[13] = MUL31(a[5], b[8])241+ MUL31(a[6], b[7])242+ MUL31(a[7], b[6])243+ MUL31(a[8], b[5]);244t[14] = MUL31(a[6], b[8])245+ MUL31(a[7], b[7])246+ MUL31(a[8], b[6]);247t[15] = MUL31(a[7], b[8])248+ MUL31(a[8], b[7]);249t[16] = MUL31(a[8], b[8]);250251/*252* Propagate carries.253*/254cc = 0;255for (i = 0; i < 17; i ++) {256uint64_t w;257258w = t[i] + cc;259d[i] = (uint32_t)w & 0x3FFFFFFF;260cc = w >> 30;261}262d[17] = (uint32_t)cc;263}264265/*266* Square a 270-bit integer, represented as an array of nine 30-bit words.267* Result uses 18 words of 30 bits each.268*/269static void270square9(uint32_t *d, const uint32_t *a)271{272uint64_t t[17];273uint64_t cc;274int i;275276t[ 0] = MUL31(a[0], a[0]);277t[ 1] = ((MUL31(a[0], a[1])) << 1);278t[ 2] = MUL31(a[1], a[1])279+ ((MUL31(a[0], a[2])) << 1);280t[ 3] = ((MUL31(a[0], a[3])281+ MUL31(a[1], a[2])) << 1);282t[ 4] = MUL31(a[2], a[2])283+ ((MUL31(a[0], a[4])284+ MUL31(a[1], a[3])) << 1);285t[ 5] = ((MUL31(a[0], a[5])286+ MUL31(a[1], a[4])287+ MUL31(a[2], a[3])) << 1);288t[ 6] = MUL31(a[3], a[3])289+ ((MUL31(a[0], a[6])290+ MUL31(a[1], a[5])291+ MUL31(a[2], a[4])) << 1);292t[ 7] = ((MUL31(a[0], a[7])293+ MUL31(a[1], a[6])294+ MUL31(a[2], a[5])295+ MUL31(a[3], a[4])) << 1);296t[ 8] = MUL31(a[4], a[4])297+ ((MUL31(a[0], a[8])298+ MUL31(a[1], a[7])299+ MUL31(a[2], a[6])300+ MUL31(a[3], a[5])) << 1);301t[ 9] = ((MUL31(a[1], a[8])302+ MUL31(a[2], a[7])303+ MUL31(a[3], a[6])304+ MUL31(a[4], a[5])) << 1);305t[10] = MUL31(a[5], a[5])306+ ((MUL31(a[2], a[8])307+ MUL31(a[3], a[7])308+ MUL31(a[4], a[6])) << 1);309t[11] = ((MUL31(a[3], a[8])310+ MUL31(a[4], a[7])311+ MUL31(a[5], a[6])) << 1);312t[12] = MUL31(a[6], a[6])313+ ((MUL31(a[4], a[8])314+ MUL31(a[5], a[7])) << 1);315t[13] = ((MUL31(a[5], a[8])316+ MUL31(a[6], a[7])) << 1);317t[14] = MUL31(a[7], a[7])318+ ((MUL31(a[6], a[8])) << 1);319t[15] = ((MUL31(a[7], a[8])) << 1);320t[16] = MUL31(a[8], a[8]);321322/*323* Propagate carries.324*/325cc = 0;326for (i = 0; i < 17; i ++) {327uint64_t w;328329w = t[i] + cc;330d[i] = (uint32_t)w & 0x3FFFFFFF;331cc = w >> 30;332}333d[17] = (uint32_t)cc;334}335336/*337* Perform a "final reduction" in field F255 (field for Curve25519)338* The source value must be less than twice the modulus. If the value339* is not lower than the modulus, then the modulus is subtracted and340* this function returns 1; otherwise, it leaves it untouched and it341* returns 0.342*/343static uint32_t344reduce_final_f255(uint32_t *d)345{346uint32_t t[9];347uint32_t cc;348int i;349350memcpy(t, d, sizeof t);351cc = 19;352for (i = 0; i < 9; i ++) {353uint32_t w;354355w = t[i] + cc;356cc = w >> 30;357t[i] = w & 0x3FFFFFFF;358}359cc = t[8] >> 15;360t[8] &= 0x7FFF;361CCOPY(cc, d, t, sizeof t);362return cc;363}364365/*366* Perform a multiplication of two integers modulo 2^255-19.367* Operands are arrays of 9 words, each containing 30 bits of data, in368* little-endian order. Input value may be up to 2^256-1; on output, value369* fits on 256 bits and is lower than twice the modulus.370*/371static void372f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)373{374uint32_t t[18], cc;375int i;376377/*378* Compute raw multiplication. All result words fit in 30 bits379* each; upper word (t[17]) must fit on 2 bits, since the product380* of two 256-bit integers must fit on 512 bits.381*/382mul9(t, a, b);383384/*385* Modular reduction: each high word is added where necessary.386* Since the modulus is 2^255-19 and word 9 corresponds to387* offset 9*30 = 270, word 9+k must be added to word k with388* a factor of 19*2^15 = 622592. The extra bits in word 8 are also389* added that way.390*391* Keeping the carry on 32 bits helps with 32-bit architectures,392* and does not noticeably impact performance on 64-bit systems.393*/394cc = MUL15(t[8] >> 15, 19); /* at most 19*(2^15-1) = 622573 */395t[8] &= 0x7FFF;396for (i = 0; i < 9; i ++) {397uint64_t w;398399w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);400t[i] = (uint32_t)w & 0x3FFFFFFF;401cc = (uint32_t)(w >> 30); /* at most 622592 */402}403404/*405* Original product was up to (2^256-1)^2, i.e. a 512-bit integer.406* This was split into two parts (upper of 257 bits, lower of 255407* bits), and the upper was added to the lower with a factor 19,408* which means that the intermediate value is less than 77*2^255409* (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are410* less than 77, and the initial carry cc is at most 76*19 = 1444.411*/412cc = MUL15(t[8] >> 15, 19);413t[8] &= 0x7FFF;414for (i = 0; i < 9; i ++) {415uint32_t z;416417z = t[i] + cc;418d[i] = z & 0x3FFFFFFF;419cc = z >> 30;420}421422/*423* Final result is at most 2^255 + 1443. In particular, the last424* carry is necessarily 0, since t[8] was truncated to 15 bits.425*/426}427428/*429* Perform a squaring of an integer modulo 2^255-19.430* Operands are arrays of 9 words, each containing 30 bits of data, in431* little-endian order. Input value may be up to 2^256-1; on output, value432* fits on 256 bits and is lower than twice the modulus.433*/434static void435f255_square(uint32_t *d, const uint32_t *a)436{437uint32_t t[18], cc;438int i;439440/*441* Compute raw squaring. All result words fit in 30 bits442* each; upper word (t[17]) must fit on 2 bits, since the square443* of a 256-bit integers must fit on 512 bits.444*/445square9(t, a);446447/*448* Modular reduction: each high word is added where necessary.449* See f255_mul() for details on the reduction and carry limits.450*/451cc = MUL15(t[8] >> 15, 19);452t[8] &= 0x7FFF;453for (i = 0; i < 9; i ++) {454uint64_t w;455456w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);457t[i] = (uint32_t)w & 0x3FFFFFFF;458cc = (uint32_t)(w >> 30);459}460cc = MUL15(t[8] >> 15, 19);461t[8] &= 0x7FFF;462for (i = 0; i < 9; i ++) {463uint32_t z;464465z = t[i] + cc;466d[i] = z & 0x3FFFFFFF;467cc = z >> 30;468}469}470471/*472* Add two values in F255. Partial reduction is performed (down to less473* than twice the modulus).474*/475static void476f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)477{478/*479* Since operand words fit on 30 bits, we can use 32-bit480* variables throughout.481*/482int i;483uint32_t cc, w;484485cc = 0;486for (i = 0; i < 9; i ++) {487w = a[i] + b[i] + cc;488d[i] = w & 0x3FFFFFFF;489cc = w >> 30;490}491cc = MUL15(w >> 15, 19);492d[8] &= 0x7FFF;493for (i = 0; i < 9; i ++) {494w = d[i] + cc;495d[i] = w & 0x3FFFFFFF;496cc = w >> 30;497}498}499500/*501* Subtract one value from another in F255. Partial reduction is502* performed (down to less than twice the modulus).503*/504static void505f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)506{507/*508* We actually compute a - b + 2*p, so that the final value is509* necessarily positive.510*/511int i;512uint32_t cc, w;513514cc = (uint32_t)-38;515for (i = 0; i < 9; i ++) {516w = a[i] - b[i] + cc;517d[i] = w & 0x3FFFFFFF;518cc = ARSH(w, 30);519}520cc = MUL15((w + 0x10000) >> 15, 19);521d[8] &= 0x7FFF;522for (i = 0; i < 9; i ++) {523w = d[i] + cc;524d[i] = w & 0x3FFFFFFF;525cc = w >> 30;526}527}528529/*530* Multiply an integer by the 'A24' constant (121665). Partial reduction531* is performed (down to less than twice the modulus).532*/533static void534f255_mul_a24(uint32_t *d, const uint32_t *a)535{536int i;537uint64_t w;538uint32_t cc;539540/*541* a[] is over 256 bits, thus a[8] has length at most 16 bits.542* We single out the processing of the last word: intermediate543* value w is up to 121665*2^16, yielding a carry for the next544* loop of at most 19*(121665*2^16/2^15) = 4623289.545*/546cc = 0;547for (i = 0; i < 8; i ++) {548w = MUL31(a[i], 121665) + (uint64_t)cc;549d[i] = (uint32_t)w & 0x3FFFFFFF;550cc = (uint32_t)(w >> 30);551}552w = MUL31(a[8], 121665) + (uint64_t)cc;553d[8] = (uint32_t)w & 0x7FFF;554cc = MUL15((uint32_t)(w >> 15), 19);555556for (i = 0; i < 9; i ++) {557uint32_t z;558559z = d[i] + cc;560d[i] = z & 0x3FFFFFFF;561cc = z >> 30;562}563}564565static const unsigned char GEN[] = {5660x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,5670x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,5680x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,5690x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00570};571572static const unsigned char ORDER[] = {5730x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,5740xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,5750xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,5760xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF577};578579static const unsigned char *580api_generator(int curve, size_t *len)581{582(void)curve;583*len = 32;584return GEN;585}586587static const unsigned char *588api_order(int curve, size_t *len)589{590(void)curve;591*len = 32;592return ORDER;593}594595static size_t596api_xoff(int curve, size_t *len)597{598(void)curve;599*len = 32;600return 0;601}602603static void604cswap(uint32_t *a, uint32_t *b, uint32_t ctl)605{606int i;607608ctl = -ctl;609for (i = 0; i < 9; i ++) {610uint32_t aw, bw, tw;611612aw = a[i];613bw = b[i];614tw = ctl & (aw ^ bw);615a[i] = aw ^ tw;616b[i] = bw ^ tw;617}618}619620static uint32_t621api_mul(unsigned char *G, size_t Glen,622const unsigned char *kb, size_t kblen, int curve)623{624uint32_t x1[9], x2[9], x3[9], z2[9], z3[9];625uint32_t a[9], aa[9], b[9], bb[9];626uint32_t c[9], d[9], e[9], da[9], cb[9];627unsigned char k[32];628uint32_t swap;629int i;630631(void)curve;632633/*634* Points are encoded over exactly 32 bytes. Multipliers must fit635* in 32 bytes as well.636* RFC 7748 mandates that the high bit of the last point byte must637* be ignored/cleared.638*/639if (Glen != 32 || kblen > 32) {640return 0;641}642G[31] &= 0x7F;643644/*645* Initialise variables x1, x2, z2, x3 and z3. We set all of them646* into Montgomery representation.647*/648x1[8] = le8_to_le30(x1, G, 32);649memcpy(x3, x1, sizeof x1);650memset(z2, 0, sizeof z2);651memset(x2, 0, sizeof x2);652x2[0] = 1;653memset(z3, 0, sizeof z3);654z3[0] = 1;655656memset(k, 0, (sizeof k) - kblen);657memcpy(k + (sizeof k) - kblen, kb, kblen);658k[31] &= 0xF8;659k[0] &= 0x7F;660k[0] |= 0x40;661662/* obsolete663print_int("x1", x1);664*/665666swap = 0;667for (i = 254; i >= 0; i --) {668uint32_t kt;669670kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;671swap ^= kt;672cswap(x2, x3, swap);673cswap(z2, z3, swap);674swap = kt;675676/* obsolete677print_int("x2", x2);678print_int("z2", z2);679print_int("x3", x3);680print_int("z3", z3);681*/682683f255_add(a, x2, z2);684f255_square(aa, a);685f255_sub(b, x2, z2);686f255_square(bb, b);687f255_sub(e, aa, bb);688f255_add(c, x3, z3);689f255_sub(d, x3, z3);690f255_mul(da, d, a);691f255_mul(cb, c, b);692693/* obsolete694print_int("a ", a);695print_int("aa", aa);696print_int("b ", b);697print_int("bb", bb);698print_int("e ", e);699print_int("c ", c);700print_int("d ", d);701print_int("da", da);702print_int("cb", cb);703*/704705f255_add(x3, da, cb);706f255_square(x3, x3);707f255_sub(z3, da, cb);708f255_square(z3, z3);709f255_mul(z3, z3, x1);710f255_mul(x2, aa, bb);711f255_mul_a24(z2, e);712f255_add(z2, z2, aa);713f255_mul(z2, e, z2);714715/* obsolete716print_int("x2", x2);717print_int("z2", z2);718print_int("x3", x3);719print_int("z3", z3);720*/721}722cswap(x2, x3, swap);723cswap(z2, z3, swap);724725/*726* Inverse z2 with a modular exponentiation. This is a simple727* square-and-multiply algorithm; we mutualise most non-squarings728* since the exponent contains almost only ones.729*/730memcpy(a, z2, sizeof z2);731for (i = 0; i < 15; i ++) {732f255_square(a, a);733f255_mul(a, a, z2);734}735memcpy(b, a, sizeof a);736for (i = 0; i < 14; i ++) {737int j;738739for (j = 0; j < 16; j ++) {740f255_square(b, b);741}742f255_mul(b, b, a);743}744for (i = 14; i >= 0; i --) {745f255_square(b, b);746if ((0xFFEB >> i) & 1) {747f255_mul(b, z2, b);748}749}750f255_mul(x2, x2, b);751reduce_final_f255(x2);752le30_to_le8(G, 32, x2);753return 1;754}755756static size_t757api_mulgen(unsigned char *R,758const unsigned char *x, size_t xlen, int curve)759{760const unsigned char *G;761size_t Glen;762763G = api_generator(curve, &Glen);764memcpy(R, G, Glen);765api_mul(R, Glen, x, xlen, curve);766return Glen;767}768769static uint32_t770api_muladd(unsigned char *A, const unsigned char *B, size_t len,771const unsigned char *x, size_t xlen,772const unsigned char *y, size_t ylen, int curve)773{774/*775* We don't implement this method, since it is used for ECDSA776* only, and there is no ECDSA over Curve25519 (which instead777* uses EdDSA).778*/779(void)A;780(void)B;781(void)len;782(void)x;783(void)xlen;784(void)y;785(void)ylen;786(void)curve;787return 0;788}789790/* see bearssl_ec.h */791const br_ec_impl br_ec_c25519_m31 = {792(uint32_t)0x20000000,793&api_generator,794&api_order,795&api_xoff,796&api_mul,797&api_mulgen,798&api_muladd799};800801802