/*
 * Path: blob/main/contrib/bearssl/src/ec/ec_c25519_m64.c
 * 39536 views
 */
/*1* Copyright (c) 2018 Thomas Pornin <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining4* a copy of this software and associated documentation files (the5* "Software"), to deal in the Software without restriction, including6* without limitation the rights to use, copy, modify, merge, publish,7* distribute, sublicense, and/or sell copies of the Software, and to8* permit persons to whom the Software is furnished to do so, subject to9* the following conditions:10*11* The above copyright notice and this permission notice shall be12* included in all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,15* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF16* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND17* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS18* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN19* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN20* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE21* SOFTWARE.22*/2324#include "inner.h"2526#if BR_INT128 || BR_UMUL1282728#if BR_UMUL12829#include <intrin.h>30#endif3132static const unsigned char GEN[] = {330x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,340x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,350x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,360x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0037};3839static const unsigned char ORDER[] = {400x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,410xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,420xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,430xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF44};4546static const unsigned char *47api_generator(int curve, size_t *len)48{49(void)curve;50*len = 32;51return GEN;52}5354static const unsigned char *55api_order(int curve, size_t *len)56{57(void)curve;58*len = 32;59return ORDER;60}6162static size_t63api_xoff(int curve, size_t 
*len)64{65(void)curve;66*len = 32;67return 0;68}6970/*71* A field element is encoded as four 64-bit integers, in basis 2^63.72* Operations return partially reduced values, which may range up to73* 2^255+37.74*/7576#define MASK63 (((uint64_t)1 << 63) - (uint64_t)1)7778/*79* Swap two field elements, conditionally on a flag.80*/81static inline void82f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)83{84uint64_t m, w;8586m = -(uint64_t)ctl;87w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;88w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;89w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;90w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;91}9293/*94* Addition in the field.95*/96static inline void97f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)98{99#if BR_INT128100101uint64_t t0, t1, t2, t3, cc;102unsigned __int128 z;103104z = (unsigned __int128)a[0] + (unsigned __int128)b[0];105t0 = (uint64_t)z;106z = (unsigned __int128)a[1] + (unsigned __int128)b[1] + (z >> 64);107t1 = (uint64_t)z;108z = (unsigned __int128)a[2] + (unsigned __int128)b[2] + (z >> 64);109t2 = (uint64_t)z;110z = (unsigned __int128)a[3] + (unsigned __int128)b[3] + (z >> 64);111t3 = (uint64_t)z & MASK63;112cc = (uint64_t)(z >> 63);113114/*115* Since operands are at most 2^255+37, the sum is at most116* 2^256+74; thus, the carry cc is equal to 0, 1 or 2.117*118* We use: 2^255 = 19 mod p.119* Since we add 0, 19 or 38 to a value that fits on 255 bits,120* the result is at most 2^255+37.121*/122z = (unsigned __int128)t0 + (unsigned __int128)(19 * cc);123d[0] = (uint64_t)z;124z = (unsigned __int128)t1 + (z >> 64);125d[1] = (uint64_t)z;126z = (unsigned __int128)t2 + (z >> 64);127d[2] = (uint64_t)z;128d[3] = t3 + (uint64_t)(z >> 64);129130#elif BR_UMUL128131132uint64_t t0, t1, t2, t3, cc;133unsigned char k;134135k = _addcarry_u64(0, a[0], b[0], &t0);136k = _addcarry_u64(k, a[1], b[1], &t1);137k = _addcarry_u64(k, a[2], b[2], &t2);138k = _addcarry_u64(k, a[3], b[3], &t3);139cc = (k << 1) + (t3 >> 63);140t3 &= 
MASK63;141142/*143* Since operands are at most 2^255+37, the sum is at most144* 2^256+74; thus, the carry cc is equal to 0, 1 or 2.145*146* We use: 2^255 = 19 mod p.147* Since we add 0, 19 or 38 to a value that fits on 255 bits,148* the result is at most 2^255+37.149*/150k = _addcarry_u64(0, t0, 19 * cc, &d[0]);151k = _addcarry_u64(k, t1, 0, &d[1]);152k = _addcarry_u64(k, t2, 0, &d[2]);153(void)_addcarry_u64(k, t3, 0, &d[3]);154155#endif156}157158/*159* Subtraction.160*/161static inline void162f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)163{164#if BR_INT128165166/*167* We compute t = 2^256 - 38 + a - b, which is necessarily168* positive but lower than 2^256 + 2^255, since a <= 2^255 + 37169* and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending170* on the two upper bits of t (bits 255 and 256).171*/172173uint64_t t0, t1, t2, t3, t4, cc;174unsigned __int128 z;175176z = (unsigned __int128)a[0] - (unsigned __int128)b[0] - 38;177t0 = (uint64_t)z;178cc = -(uint64_t)(z >> 64);179z = (unsigned __int128)a[1] - (unsigned __int128)b[1]180- (unsigned __int128)cc;181t1 = (uint64_t)z;182cc = -(uint64_t)(z >> 64);183z = (unsigned __int128)a[2] - (unsigned __int128)b[2]184- (unsigned __int128)cc;185t2 = (uint64_t)z;186cc = -(uint64_t)(z >> 64);187z = (unsigned __int128)a[3] - (unsigned __int128)b[3]188- (unsigned __int128)cc;189t3 = (uint64_t)z;190t4 = 1 + (uint64_t)(z >> 64);191192/*193* We have a 257-bit result. 
The two top bits can be 00, 01 or 10,194* but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).195* Therefore, we can truncate to 255 bits, and add 0, 19 or 38.196* This guarantees that the result is at most 2^255+37.197*/198cc = (38 & -t4) + (19 & -(t3 >> 63));199t3 &= MASK63;200z = (unsigned __int128)t0 + (unsigned __int128)cc;201d[0] = (uint64_t)z;202z = (unsigned __int128)t1 + (z >> 64);203d[1] = (uint64_t)z;204z = (unsigned __int128)t2 + (z >> 64);205d[2] = (uint64_t)z;206d[3] = t3 + (uint64_t)(z >> 64);207208#elif BR_UMUL128209210/*211* We compute t = 2^256 - 38 + a - b, which is necessarily212* positive but lower than 2^256 + 2^255, since a <= 2^255 + 37213* and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending214* on the two upper bits of t (bits 255 and 256).215*/216217uint64_t t0, t1, t2, t3, t4;218unsigned char k;219220k = _subborrow_u64(0, a[0], b[0], &t0);221k = _subborrow_u64(k, a[1], b[1], &t1);222k = _subborrow_u64(k, a[2], b[2], &t2);223k = _subborrow_u64(k, a[3], b[3], &t3);224(void)_subborrow_u64(k, 1, 0, &t4);225226k = _subborrow_u64(0, t0, 38, &t0);227k = _subborrow_u64(k, t1, 0, &t1);228k = _subborrow_u64(k, t2, 0, &t2);229k = _subborrow_u64(k, t3, 0, &t3);230(void)_subborrow_u64(k, t4, 0, &t4);231232/*233* We have a 257-bit result. 
The two top bits can be 00, 01 or 10,234* but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).235* Therefore, we can truncate to 255 bits, and add 0, 19 or 38.236* This guarantees that the result is at most 2^255+37.237*/238t4 = (38 & -t4) + (19 & -(t3 >> 63));239t3 &= MASK63;240k = _addcarry_u64(0, t0, t4, &d[0]);241k = _addcarry_u64(k, t1, 0, &d[1]);242k = _addcarry_u64(k, t2, 0, &d[2]);243(void)_addcarry_u64(k, t3, 0, &d[3]);244245#endif246}247248/*249* Multiplication.250*/251static inline void252f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)253{254#if BR_INT128255256unsigned __int128 z;257uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;258259/*260* Compute the product a*b over plain integers.261*/262z = (unsigned __int128)a[0] * (unsigned __int128)b[0];263t0 = (uint64_t)z;264z = (unsigned __int128)a[0] * (unsigned __int128)b[1] + (z >> 64);265t1 = (uint64_t)z;266z = (unsigned __int128)a[0] * (unsigned __int128)b[2] + (z >> 64);267t2 = (uint64_t)z;268z = (unsigned __int128)a[0] * (unsigned __int128)b[3] + (z >> 64);269t3 = (uint64_t)z;270t4 = (uint64_t)(z >> 64);271272z = (unsigned __int128)a[1] * (unsigned __int128)b[0]273+ (unsigned __int128)t1;274t1 = (uint64_t)z;275z = (unsigned __int128)a[1] * (unsigned __int128)b[1]276+ (unsigned __int128)t2 + (z >> 64);277t2 = (uint64_t)z;278z = (unsigned __int128)a[1] * (unsigned __int128)b[2]279+ (unsigned __int128)t3 + (z >> 64);280t3 = (uint64_t)z;281z = (unsigned __int128)a[1] * (unsigned __int128)b[3]282+ (unsigned __int128)t4 + (z >> 64);283t4 = (uint64_t)z;284t5 = (uint64_t)(z >> 64);285286z = (unsigned __int128)a[2] * (unsigned __int128)b[0]287+ (unsigned __int128)t2;288t2 = (uint64_t)z;289z = (unsigned __int128)a[2] * (unsigned __int128)b[1]290+ (unsigned __int128)t3 + (z >> 64);291t3 = (uint64_t)z;292z = (unsigned __int128)a[2] * (unsigned __int128)b[2]293+ (unsigned __int128)t4 + (z >> 64);294t4 = (uint64_t)z;295z = (unsigned __int128)a[2] * (unsigned __int128)b[3]296+ (unsigned __int128)t5 + (z 
>> 64);297t5 = (uint64_t)z;298t6 = (uint64_t)(z >> 64);299300z = (unsigned __int128)a[3] * (unsigned __int128)b[0]301+ (unsigned __int128)t3;302t3 = (uint64_t)z;303z = (unsigned __int128)a[3] * (unsigned __int128)b[1]304+ (unsigned __int128)t4 + (z >> 64);305t4 = (uint64_t)z;306z = (unsigned __int128)a[3] * (unsigned __int128)b[2]307+ (unsigned __int128)t5 + (z >> 64);308t5 = (uint64_t)z;309z = (unsigned __int128)a[3] * (unsigned __int128)b[3]310+ (unsigned __int128)t6 + (z >> 64);311t6 = (uint64_t)z;312t7 = (uint64_t)(z >> 64);313314/*315* Modulo p, we have:316*317* 2^255 = 19318* 2^510 = 19*19 = 361319*320* We split the intermediate t into three parts, in basis321* 2^255. The low one will be in t0..t3; the middle one in t4..t7.322* The upper one can only be a single bit (th), since the323* multiplication operands are at most 2^255+37 each.324*/325th = t7 >> 62;326t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;327t6 = (t6 << 1) | (t5 >> 63);328t5 = (t5 << 1) | (t4 >> 63);329t4 = (t4 << 1) | (t3 >> 63);330t3 &= MASK63;331332/*333* Multiply the middle part (t4..t7) by 19. 
We truncate it to334* 255 bits; the extra bits will go along with th.335*/336z = (unsigned __int128)t4 * 19;337t4 = (uint64_t)z;338z = (unsigned __int128)t5 * 19 + (z >> 64);339t5 = (uint64_t)z;340z = (unsigned __int128)t6 * 19 + (z >> 64);341t6 = (uint64_t)z;342z = (unsigned __int128)t7 * 19 + (z >> 64);343t7 = (uint64_t)z & MASK63;344345th = (361 & -th) + (19 * (uint64_t)(z >> 63));346347/*348* Add elements together.349* At this point:350* t0..t3 fits on 255 bits.351* t4..t7 fits on 255 bits.352* th <= 361 + 342 = 703.353*/354z = (unsigned __int128)t0 + (unsigned __int128)t4355+ (unsigned __int128)th;356t0 = (uint64_t)z;357z = (unsigned __int128)t1 + (unsigned __int128)t5 + (z >> 64);358t1 = (uint64_t)z;359z = (unsigned __int128)t2 + (unsigned __int128)t6 + (z >> 64);360t2 = (uint64_t)z;361z = (unsigned __int128)t3 + (unsigned __int128)t7 + (z >> 64);362t3 = (uint64_t)z & MASK63;363th = (uint64_t)(z >> 63);364365/*366* Since the sum is at most 2^256 + 703, the two upper bits, in th,367* can only have value 0, 1 or 2. 
We just add th*19, which368* guarantees a result of at most 2^255+37.369*/370z = (unsigned __int128)t0 + (19 * th);371d[0] = (uint64_t)z;372z = (unsigned __int128)t1 + (z >> 64);373d[1] = (uint64_t)z;374z = (unsigned __int128)t2 + (z >> 64);375d[2] = (uint64_t)z;376d[3] = t3 + (uint64_t)(z >> 64);377378#elif BR_UMUL128379380uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;381uint64_t h0, h1, h2, h3;382unsigned char k;383384/*385* Compute the product a*b over plain integers.386*/387t0 = _umul128(a[0], b[0], &h0);388t1 = _umul128(a[0], b[1], &h1);389k = _addcarry_u64(0, t1, h0, &t1);390t2 = _umul128(a[0], b[2], &h2);391k = _addcarry_u64(k, t2, h1, &t2);392t3 = _umul128(a[0], b[3], &h3);393k = _addcarry_u64(k, t3, h2, &t3);394(void)_addcarry_u64(k, h3, 0, &t4);395396k = _addcarry_u64(0, _umul128(a[1], b[0], &h0), t1, &t1);397k = _addcarry_u64(k, _umul128(a[1], b[1], &h1), t2, &t2);398k = _addcarry_u64(k, _umul128(a[1], b[2], &h2), t3, &t3);399k = _addcarry_u64(k, _umul128(a[1], b[3], &h3), t4, &t4);400t5 = k;401k = _addcarry_u64(0, t2, h0, &t2);402k = _addcarry_u64(k, t3, h1, &t3);403k = _addcarry_u64(k, t4, h2, &t4);404(void)_addcarry_u64(k, t5, h3, &t5);405406k = _addcarry_u64(0, _umul128(a[2], b[0], &h0), t2, &t2);407k = _addcarry_u64(k, _umul128(a[2], b[1], &h1), t3, &t3);408k = _addcarry_u64(k, _umul128(a[2], b[2], &h2), t4, &t4);409k = _addcarry_u64(k, _umul128(a[2], b[3], &h3), t5, &t5);410t6 = k;411k = _addcarry_u64(0, t3, h0, &t3);412k = _addcarry_u64(k, t4, h1, &t4);413k = _addcarry_u64(k, t5, h2, &t5);414(void)_addcarry_u64(k, t6, h3, &t6);415416k = _addcarry_u64(0, _umul128(a[3], b[0], &h0), t3, &t3);417k = _addcarry_u64(k, _umul128(a[3], b[1], &h1), t4, &t4);418k = _addcarry_u64(k, _umul128(a[3], b[2], &h2), t5, &t5);419k = _addcarry_u64(k, _umul128(a[3], b[3], &h3), t6, &t6);420t7 = k;421k = _addcarry_u64(0, t4, h0, &t4);422k = _addcarry_u64(k, t5, h1, &t5);423k = _addcarry_u64(k, t6, h2, &t6);424(void)_addcarry_u64(k, t7, h3, &t7);425426/*427* Modulo p, we 
have:428*429* 2^255 = 19430* 2^510 = 19*19 = 361431*432* We split the intermediate t into three parts, in basis433* 2^255. The low one will be in t0..t3; the middle one in t4..t7.434* The upper one can only be a single bit (th), since the435* multiplication operands are at most 2^255+37 each.436*/437th = t7 >> 62;438t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;439t6 = (t6 << 1) | (t5 >> 63);440t5 = (t5 << 1) | (t4 >> 63);441t4 = (t4 << 1) | (t3 >> 63);442t3 &= MASK63;443444/*445* Multiply the middle part (t4..t7) by 19. We truncate it to446* 255 bits; the extra bits will go along with th.447*/448t4 = _umul128(t4, 19, &h0);449t5 = _umul128(t5, 19, &h1);450t6 = _umul128(t6, 19, &h2);451t7 = _umul128(t7, 19, &h3);452k = _addcarry_u64(0, t5, h0, &t5);453k = _addcarry_u64(k, t6, h1, &t6);454k = _addcarry_u64(k, t7, h2, &t7);455(void)_addcarry_u64(k, h3, 0, &h3);456th = (361 & -th) + (19 * ((h3 << 1) + (t7 >> 63)));457t7 &= MASK63;458459/*460* Add elements together.461* At this point:462* t0..t3 fits on 255 bits.463* t4..t7 fits on 255 bits.464* th <= 361 + 342 = 703.465*/466k = _addcarry_u64(0, t0, t4, &t0);467k = _addcarry_u64(k, t1, t5, &t1);468k = _addcarry_u64(k, t2, t6, &t2);469k = _addcarry_u64(k, t3, t7, &t3);470t4 = k;471k = _addcarry_u64(0, t0, th, &t0);472k = _addcarry_u64(k, t1, 0, &t1);473k = _addcarry_u64(k, t2, 0, &t2);474k = _addcarry_u64(k, t3, 0, &t3);475(void)_addcarry_u64(k, t4, 0, &t4);476477th = (t4 << 1) + (t3 >> 63);478t3 &= MASK63;479480/*481* Since the sum is at most 2^256 + 703, the two upper bits, in th,482* can only have value 0, 1 or 2. 
We just add th*19, which483* guarantees a result of at most 2^255+37.484*/485k = _addcarry_u64(0, t0, 19 * th, &d[0]);486k = _addcarry_u64(k, t1, 0, &d[1]);487k = _addcarry_u64(k, t2, 0, &d[2]);488(void)_addcarry_u64(k, t3, 0, &d[3]);489490#endif491}492493/*494* Multiplication by A24 = 121665.495*/496static inline void497f255_mul_a24(uint64_t *d, const uint64_t *a)498{499#if BR_INT128500501uint64_t t0, t1, t2, t3;502unsigned __int128 z;503504z = (unsigned __int128)a[0] * 121665;505t0 = (uint64_t)z;506z = (unsigned __int128)a[1] * 121665 + (z >> 64);507t1 = (uint64_t)z;508z = (unsigned __int128)a[2] * 121665 + (z >> 64);509t2 = (uint64_t)z;510z = (unsigned __int128)a[3] * 121665 + (z >> 64);511t3 = (uint64_t)z & MASK63;512513z = (unsigned __int128)t0 + (19 * (uint64_t)(z >> 63));514t0 = (uint64_t)z;515z = (unsigned __int128)t1 + (z >> 64);516t1 = (uint64_t)z;517z = (unsigned __int128)t2 + (z >> 64);518t2 = (uint64_t)z;519t3 = t3 + (uint64_t)(z >> 64);520521z = (unsigned __int128)t0 + (19 & -(t3 >> 63));522d[0] = (uint64_t)z;523z = (unsigned __int128)t1 + (z >> 64);524d[1] = (uint64_t)z;525z = (unsigned __int128)t2 + (z >> 64);526d[2] = (uint64_t)z;527d[3] = (t3 & MASK63) + (uint64_t)(z >> 64);528529#elif BR_UMUL128530531uint64_t t0, t1, t2, t3, t4, h0, h1, h2, h3;532unsigned char k;533534t0 = _umul128(a[0], 121665, &h0);535t1 = _umul128(a[1], 121665, &h1);536k = _addcarry_u64(0, t1, h0, &t1);537t2 = _umul128(a[2], 121665, &h2);538k = _addcarry_u64(k, t2, h1, &t2);539t3 = _umul128(a[3], 121665, &h3);540k = _addcarry_u64(k, t3, h2, &t3);541(void)_addcarry_u64(k, h3, 0, &t4);542543t4 = (t4 << 1) + (t3 >> 63);544t3 &= MASK63;545k = _addcarry_u64(0, t0, 19 * t4, &t0);546k = _addcarry_u64(k, t1, 0, &t1);547k = _addcarry_u64(k, t2, 0, &t2);548(void)_addcarry_u64(k, t3, 0, &t3);549550t4 = 19 & -(t3 >> 63);551t3 &= MASK63;552k = _addcarry_u64(0, t0, t4, &d[0]);553k = _addcarry_u64(k, t1, 0, &d[1]);554k = _addcarry_u64(k, t2, 0, &d[2]);555(void)_addcarry_u64(k, t3, 0, 
&d[3]);556557#endif558}559560/*561* Finalize reduction.562*/563static inline void564f255_final_reduce(uint64_t *a)565{566#if BR_INT128567568uint64_t t0, t1, t2, t3, m;569unsigned __int128 z;570571/*572* We add 19. If the result (in t) is below 2^255, then a[]573* is already less than 2^255-19, thus already reduced.574* Otherwise, we subtract 2^255 from t[], in which case we575* have t = a - (2^255-19), and that's our result.576*/577z = (unsigned __int128)a[0] + 19;578t0 = (uint64_t)z;579z = (unsigned __int128)a[1] + (z >> 64);580t1 = (uint64_t)z;581z = (unsigned __int128)a[2] + (z >> 64);582t2 = (uint64_t)z;583t3 = a[3] + (uint64_t)(z >> 64);584585m = -(t3 >> 63);586t3 &= MASK63;587a[0] ^= m & (a[0] ^ t0);588a[1] ^= m & (a[1] ^ t1);589a[2] ^= m & (a[2] ^ t2);590a[3] ^= m & (a[3] ^ t3);591592#elif BR_UMUL128593594uint64_t t0, t1, t2, t3, m;595unsigned char k;596597/*598* We add 19. If the result (in t) is below 2^255, then a[]599* is already less than 2^255-19, thus already reduced.600* Otherwise, we subtract 2^255 from t[], in which case we601* have t = a - (2^255-19), and that's our result.602*/603k = _addcarry_u64(0, a[0], 19, &t0);604k = _addcarry_u64(k, a[1], 0, &t1);605k = _addcarry_u64(k, a[2], 0, &t2);606(void)_addcarry_u64(k, a[3], 0, &t3);607608m = -(t3 >> 63);609t3 &= MASK63;610a[0] ^= m & (a[0] ^ t0);611a[1] ^= m & (a[1] ^ t1);612a[2] ^= m & (a[2] ^ t2);613a[3] ^= m & (a[3] ^ t3);614615#endif616}617618static uint32_t619api_mul(unsigned char *G, size_t Glen,620const unsigned char *kb, size_t kblen, int curve)621{622unsigned char k[32];623uint64_t x1[4], x2[4], z2[4], x3[4], z3[4];624uint32_t swap;625int i;626627(void)curve;628629/*630* Points are encoded over exactly 32 bytes. 
Multipliers must fit631* in 32 bytes as well.632*/633if (Glen != 32 || kblen > 32) {634return 0;635}636637/*638* RFC 7748 mandates that the high bit of the last point byte must639* be ignored/cleared.640*/641x1[0] = br_dec64le(&G[ 0]);642x1[1] = br_dec64le(&G[ 8]);643x1[2] = br_dec64le(&G[16]);644x1[3] = br_dec64le(&G[24]) & MASK63;645646/*647* We can use memset() to clear values, because exact-width types648* like uint64_t are guaranteed to have no padding bits or649* trap representations.650*/651memset(x2, 0, sizeof x2);652x2[0] = 1;653memset(z2, 0, sizeof z2);654memcpy(x3, x1, sizeof x1);655memcpy(z3, x2, sizeof x2);656657/*658* The multiplier is provided in big-endian notation, and659* possibly shorter than 32 bytes.660*/661memset(k, 0, (sizeof k) - kblen);662memcpy(k + (sizeof k) - kblen, kb, kblen);663k[31] &= 0xF8;664k[0] &= 0x7F;665k[0] |= 0x40;666667swap = 0;668669for (i = 254; i >= 0; i --) {670uint64_t a[4], aa[4], b[4], bb[4], e[4];671uint64_t c[4], d[4], da[4], cb[4];672uint32_t kt;673674kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;675swap ^= kt;676f255_cswap(x2, x3, swap);677f255_cswap(z2, z3, swap);678swap = kt;679680/* A = x_2 + z_2 */681f255_add(a, x2, z2);682683/* AA = A^2 */684f255_mul(aa, a, a);685686/* B = x_2 - z_2 */687f255_sub(b, x2, z2);688689/* BB = B^2 */690f255_mul(bb, b, b);691692/* E = AA - BB */693f255_sub(e, aa, bb);694695/* C = x_3 + z_3 */696f255_add(c, x3, z3);697698/* D = x_3 - z_3 */699f255_sub(d, x3, z3);700701/* DA = D * A */702f255_mul(da, d, a);703704/* CB = C * B */705f255_mul(cb, c, b);706707/* x_3 = (DA + CB)^2 */708f255_add(x3, da, cb);709f255_mul(x3, x3, x3);710711/* z_3 = x_1 * (DA - CB)^2 */712f255_sub(z3, da, cb);713f255_mul(z3, z3, z3);714f255_mul(z3, x1, z3);715716/* x_2 = AA * BB */717f255_mul(x2, aa, bb);718719/* z_2 = E * (AA + a24 * E) */720f255_mul_a24(z2, e);721f255_add(z2, aa, z2);722f255_mul(z2, e, z2);723}724725f255_cswap(x2, x3, swap);726f255_cswap(z2, z3, swap);727728/*729* Compute 1/z2 = z2^(p-2). 
Since p = 2^255-19, we can mutualize730* most non-squarings. We use x1 and x3, now useless, as temporaries.731*/732memcpy(x1, z2, sizeof z2);733for (i = 0; i < 15; i ++) {734f255_mul(x1, x1, x1);735f255_mul(x1, x1, z2);736}737memcpy(x3, x1, sizeof x1);738for (i = 0; i < 14; i ++) {739int j;740741for (j = 0; j < 16; j ++) {742f255_mul(x3, x3, x3);743}744f255_mul(x3, x3, x1);745}746for (i = 14; i >= 0; i --) {747f255_mul(x3, x3, x3);748if ((0xFFEB >> i) & 1) {749f255_mul(x3, z2, x3);750}751}752753/*754* Compute x2/z2. We have 1/z2 in x3.755*/756f255_mul(x2, x2, x3);757f255_final_reduce(x2);758759/*760* Encode the final x2 value in little-endian.761*/762br_enc64le(G, x2[0]);763br_enc64le(G + 8, x2[1]);764br_enc64le(G + 16, x2[2]);765br_enc64le(G + 24, x2[3]);766return 1;767}768769static size_t770api_mulgen(unsigned char *R,771const unsigned char *x, size_t xlen, int curve)772{773const unsigned char *G;774size_t Glen;775776G = api_generator(curve, &Glen);777memcpy(R, G, Glen);778api_mul(R, Glen, x, xlen, curve);779return Glen;780}781782static uint32_t783api_muladd(unsigned char *A, const unsigned char *B, size_t len,784const unsigned char *x, size_t xlen,785const unsigned char *y, size_t ylen, int curve)786{787/*788* We don't implement this method, since it is used for ECDSA789* only, and there is no ECDSA over Curve25519 (which instead790* uses EdDSA).791*/792(void)A;793(void)B;794(void)len;795(void)x;796(void)xlen;797(void)y;798(void)ylen;799(void)curve;800return 0;801}802803/* see bearssl_ec.h */804const br_ec_impl br_ec_c25519_m64 = {805(uint32_t)0x20000000,806&api_generator,807&api_order,808&api_xoff,809&api_mul,810&api_mulgen,811&api_muladd812};813814/* see bearssl_ec.h */815const br_ec_impl *816br_ec_c25519_m64_get(void)817{818return &br_ec_c25519_m64;819}820821#else822823/* see bearssl_ec.h */824const br_ec_impl *825br_ec_c25519_m64_get(void)826{827return 0;828}829830#endif831832833