Path: blob/main/contrib/bearssl/src/ec/ec_p256_m31.c
39507 views
/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

/*
 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
 * that right-shifting a signed negative integer copies the sign bit
 * (arithmetic right-shift). This is "implementation-defined behaviour",
 * i.e. it is not undefined, but it may differ between compilers. Each
 * compiler is supposed to document its behaviour in that respect. GCC
 * explicitly defines that an arithmetic right shift is used.
We expect33* all other compilers to do the same, because underlying CPU offer an34* arithmetic right shift opcode that could not be used otherwise.35*/36#if BR_NO_ARITH_SHIFT37#define ARSH(x, n) (((uint32_t)(x) >> (n)) \38| ((-((uint32_t)(x) >> 31)) << (32 - (n))))39#define ARSHW(x, n) (((uint64_t)(x) >> (n)) \40| ((-((uint64_t)(x) >> 63)) << (64 - (n))))41#else42#define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))43#define ARSHW(x, n) ((*(int64_t *)&(x)) >> (n))44#endif4546/*47* Convert an integer from unsigned big-endian encoding to a sequence of48* 30-bit words in little-endian order. The final "partial" word is49* returned.50*/51static uint32_t52be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)53{54uint32_t acc;55int acc_len;5657acc = 0;58acc_len = 0;59while (len -- > 0) {60uint32_t b;6162b = src[len];63if (acc_len < 22) {64acc |= b << acc_len;65acc_len += 8;66} else {67*dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;68acc = b >> (30 - acc_len);69acc_len -= 22;70}71}72return acc;73}7475/*76* Convert an integer (30-bit words, little-endian) to unsigned77* big-endian encoding. The total encoding length is provided; all78* the destination bytes will be filled.79*/80static void81le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src)82{83uint32_t acc;84int acc_len;8586acc = 0;87acc_len = 0;88while (len -- > 0) {89if (acc_len < 8) {90uint32_t w;9192w = *src ++;93dst[len] = (unsigned char)(acc | (w << acc_len));94acc = w >> (8 - acc_len);95acc_len += 22;96} else {97dst[len] = (unsigned char)acc;98acc >>= 8;99acc_len -= 8;100}101}102}103104/*105* Multiply two integers. Source integers are represented as arrays of106* nine 30-bit words, for values up to 2^270-1. Result is encoded over107* 18 words of 30 bits each.108*/109static void110mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)111{112/*113* Maximum intermediate result is no more than114* 10376293531797946367, which fits in 64 bits. 
Reason:115*116* 10376293531797946367 = 9 * (2^30-1)^2 + 9663676406117* 10376293531797946367 < 9663676407 * 2^30118*119* Thus, adding together 9 products of 30-bit integers, with120* a carry of at most 9663676406, yields an integer that fits121* on 64 bits and generates a carry of at most 9663676406.122*/123uint64_t t[17];124uint64_t cc;125int i;126127t[ 0] = MUL31(a[0], b[0]);128t[ 1] = MUL31(a[0], b[1])129+ MUL31(a[1], b[0]);130t[ 2] = MUL31(a[0], b[2])131+ MUL31(a[1], b[1])132+ MUL31(a[2], b[0]);133t[ 3] = MUL31(a[0], b[3])134+ MUL31(a[1], b[2])135+ MUL31(a[2], b[1])136+ MUL31(a[3], b[0]);137t[ 4] = MUL31(a[0], b[4])138+ MUL31(a[1], b[3])139+ MUL31(a[2], b[2])140+ MUL31(a[3], b[1])141+ MUL31(a[4], b[0]);142t[ 5] = MUL31(a[0], b[5])143+ MUL31(a[1], b[4])144+ MUL31(a[2], b[3])145+ MUL31(a[3], b[2])146+ MUL31(a[4], b[1])147+ MUL31(a[5], b[0]);148t[ 6] = MUL31(a[0], b[6])149+ MUL31(a[1], b[5])150+ MUL31(a[2], b[4])151+ MUL31(a[3], b[3])152+ MUL31(a[4], b[2])153+ MUL31(a[5], b[1])154+ MUL31(a[6], b[0]);155t[ 7] = MUL31(a[0], b[7])156+ MUL31(a[1], b[6])157+ MUL31(a[2], b[5])158+ MUL31(a[3], b[4])159+ MUL31(a[4], b[3])160+ MUL31(a[5], b[2])161+ MUL31(a[6], b[1])162+ MUL31(a[7], b[0]);163t[ 8] = MUL31(a[0], b[8])164+ MUL31(a[1], b[7])165+ MUL31(a[2], b[6])166+ MUL31(a[3], b[5])167+ MUL31(a[4], b[4])168+ MUL31(a[5], b[3])169+ MUL31(a[6], b[2])170+ MUL31(a[7], b[1])171+ MUL31(a[8], b[0]);172t[ 9] = MUL31(a[1], b[8])173+ MUL31(a[2], b[7])174+ MUL31(a[3], b[6])175+ MUL31(a[4], b[5])176+ MUL31(a[5], b[4])177+ MUL31(a[6], b[3])178+ MUL31(a[7], b[2])179+ MUL31(a[8], b[1]);180t[10] = MUL31(a[2], b[8])181+ MUL31(a[3], b[7])182+ MUL31(a[4], b[6])183+ MUL31(a[5], b[5])184+ MUL31(a[6], b[4])185+ MUL31(a[7], b[3])186+ MUL31(a[8], b[2]);187t[11] = MUL31(a[3], b[8])188+ MUL31(a[4], b[7])189+ MUL31(a[5], b[6])190+ MUL31(a[6], b[5])191+ MUL31(a[7], b[4])192+ MUL31(a[8], b[3]);193t[12] = MUL31(a[4], b[8])194+ MUL31(a[5], b[7])195+ MUL31(a[6], b[6])196+ MUL31(a[7], b[5])197+ MUL31(a[8], 
b[4]);198t[13] = MUL31(a[5], b[8])199+ MUL31(a[6], b[7])200+ MUL31(a[7], b[6])201+ MUL31(a[8], b[5]);202t[14] = MUL31(a[6], b[8])203+ MUL31(a[7], b[7])204+ MUL31(a[8], b[6]);205t[15] = MUL31(a[7], b[8])206+ MUL31(a[8], b[7]);207t[16] = MUL31(a[8], b[8]);208209/*210* Propagate carries.211*/212cc = 0;213for (i = 0; i < 17; i ++) {214uint64_t w;215216w = t[i] + cc;217d[i] = (uint32_t)w & 0x3FFFFFFF;218cc = w >> 30;219}220d[17] = (uint32_t)cc;221}222223/*224* Square a 270-bit integer, represented as an array of nine 30-bit words.225* Result uses 18 words of 30 bits each.226*/227static void228square9(uint32_t *d, const uint32_t *a)229{230uint64_t t[17];231uint64_t cc;232int i;233234t[ 0] = MUL31(a[0], a[0]);235t[ 1] = ((MUL31(a[0], a[1])) << 1);236t[ 2] = MUL31(a[1], a[1])237+ ((MUL31(a[0], a[2])) << 1);238t[ 3] = ((MUL31(a[0], a[3])239+ MUL31(a[1], a[2])) << 1);240t[ 4] = MUL31(a[2], a[2])241+ ((MUL31(a[0], a[4])242+ MUL31(a[1], a[3])) << 1);243t[ 5] = ((MUL31(a[0], a[5])244+ MUL31(a[1], a[4])245+ MUL31(a[2], a[3])) << 1);246t[ 6] = MUL31(a[3], a[3])247+ ((MUL31(a[0], a[6])248+ MUL31(a[1], a[5])249+ MUL31(a[2], a[4])) << 1);250t[ 7] = ((MUL31(a[0], a[7])251+ MUL31(a[1], a[6])252+ MUL31(a[2], a[5])253+ MUL31(a[3], a[4])) << 1);254t[ 8] = MUL31(a[4], a[4])255+ ((MUL31(a[0], a[8])256+ MUL31(a[1], a[7])257+ MUL31(a[2], a[6])258+ MUL31(a[3], a[5])) << 1);259t[ 9] = ((MUL31(a[1], a[8])260+ MUL31(a[2], a[7])261+ MUL31(a[3], a[6])262+ MUL31(a[4], a[5])) << 1);263t[10] = MUL31(a[5], a[5])264+ ((MUL31(a[2], a[8])265+ MUL31(a[3], a[7])266+ MUL31(a[4], a[6])) << 1);267t[11] = ((MUL31(a[3], a[8])268+ MUL31(a[4], a[7])269+ MUL31(a[5], a[6])) << 1);270t[12] = MUL31(a[6], a[6])271+ ((MUL31(a[4], a[8])272+ MUL31(a[5], a[7])) << 1);273t[13] = ((MUL31(a[5], a[8])274+ MUL31(a[6], a[7])) << 1);275t[14] = MUL31(a[7], a[7])276+ ((MUL31(a[6], a[8])) << 1);277t[15] = ((MUL31(a[7], a[8])) << 1);278t[16] = MUL31(a[8], a[8]);279280/*281* Propagate carries.282*/283cc = 0;284for (i = 0; i < 17; i 
++) {285uint64_t w;286287w = t[i] + cc;288d[i] = (uint32_t)w & 0x3FFFFFFF;289cc = w >> 30;290}291d[17] = (uint32_t)cc;292}293294/*295* Base field modulus for P-256.296*/297static const uint32_t F256[] = {2982990x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000,3000x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF301};302303/*304* The 'b' curve equation coefficient for P-256.305*/306static const uint32_t P256_B[] = {3073080x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65,3090x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6310};311312/*313* Addition in the field. Source operands shall fit on 257 bits; output314* will be lower than twice the modulus.315*/316static void317add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)318{319uint32_t w, cc;320int i;321322cc = 0;323for (i = 0; i < 9; i ++) {324w = a[i] + b[i] + cc;325d[i] = w & 0x3FFFFFFF;326cc = w >> 30;327}328w >>= 16;329d[8] &= 0xFFFF;330d[3] -= w << 6;331d[6] -= w << 12;332d[7] += w << 14;333cc = w;334for (i = 0; i < 9; i ++) {335w = d[i] + cc;336d[i] = w & 0x3FFFFFFF;337cc = ARSH(w, 30);338}339}340341/*342* Subtraction in the field. 
Source operands shall be smaller than twice343* the modulus; the result will fulfil the same property.344*/345static void346sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)347{348uint32_t w, cc;349int i;350351/*352* We really compute a - b + 2*p to make sure that the result is353* positive.354*/355w = a[0] - b[0] - 0x00002;356d[0] = w & 0x3FFFFFFF;357w = a[1] - b[1] + ARSH(w, 30);358d[1] = w & 0x3FFFFFFF;359w = a[2] - b[2] + ARSH(w, 30);360d[2] = w & 0x3FFFFFFF;361w = a[3] - b[3] + ARSH(w, 30) + 0x00080;362d[3] = w & 0x3FFFFFFF;363w = a[4] - b[4] + ARSH(w, 30);364d[4] = w & 0x3FFFFFFF;365w = a[5] - b[5] + ARSH(w, 30);366d[5] = w & 0x3FFFFFFF;367w = a[6] - b[6] + ARSH(w, 30) + 0x02000;368d[6] = w & 0x3FFFFFFF;369w = a[7] - b[7] + ARSH(w, 30) - 0x08000;370d[7] = w & 0x3FFFFFFF;371w = a[8] - b[8] + ARSH(w, 30) + 0x20000;372d[8] = w & 0xFFFF;373w >>= 16;374d[8] &= 0xFFFF;375d[3] -= w << 6;376d[6] -= w << 12;377d[7] += w << 14;378cc = w;379for (i = 0; i < 9; i ++) {380w = d[i] + cc;381d[i] = w & 0x3FFFFFFF;382cc = ARSH(w, 30);383}384}385386/*387* Compute a multiplication in F256. 
Source operands shall be less than388* twice the modulus.389*/390static void391mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)392{393uint32_t t[18];394uint64_t s[18];395uint64_t cc, x;396uint32_t z, c;397int i;398399mul9(t, a, b);400401/*402* Modular reduction: each high word in added/subtracted where403* necessary.404*405* The modulus is:406* p = 2^256 - 2^224 + 2^192 + 2^96 - 1407* Therefore:408* 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p409*410* For a word x at bit offset n (n >= 256), we have:411* x*2^n = x*2^(n-32) - x*2^(n-64)412* - x*2^(n - 160) + x*2^(n-256) mod p413*414* Thus, we can nullify the high word if we reinject it at some415* proper emplacements.416*417* We use 64-bit intermediate words to allow for carries to418* accumulate easily, before performing the final propagation.419*/420for (i = 0; i < 18; i ++) {421s[i] = t[i];422}423424for (i = 17; i >= 9; i --) {425uint64_t y;426427y = s[i];428s[i - 1] += ARSHW(y, 2);429s[i - 2] += (y << 28) & 0x3FFFFFFF;430s[i - 2] -= ARSHW(y, 4);431s[i - 3] -= (y << 26) & 0x3FFFFFFF;432s[i - 5] -= ARSHW(y, 10);433s[i - 6] -= (y << 20) & 0x3FFFFFFF;434s[i - 8] += ARSHW(y, 16);435s[i - 9] += (y << 14) & 0x3FFFFFFF;436}437438/*439* Carry propagation must be signed. Moreover, we may have overdone440* it a bit, and obtain a negative result.441*442* The loop above ran 9 times; each time, each word was augmented443* by at most one extra word (in absolute value). Thus, the top444* word must in fine fit in 39 bits, so the carry below will fit445* on 9 bits.446*/447cc = 0;448for (i = 0; i < 9; i ++) {449x = s[i] + cc;450d[i] = (uint32_t)x & 0x3FFFFFFF;451cc = ARSHW(x, 30);452}453454/*455* All nine words fit on 30 bits, but there may be an extra456* carry for a few bits (at most 9), and that carry may be457* negative. Moreover, we want the result to fit on 257 bits.458* The two lines below ensure that the word in d[] has length459* 256 bits, and the (signed) carry (beyond 2^256) is in cc. 
The460* significant length of cc is less than 24 bits, so we will be461* able to switch to 32-bit operations.462*/463cc = ARSHW(x, 16);464d[8] &= 0xFFFF;465466/*467* One extra round of reduction, for cc*2^256, which means468* adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)469* value. If cc is negative, then it may happen (rarely, but470* not neglectibly so) that the result would be negative. In471* order to avoid that, if cc is negative, then we add the472* modulus once. Note that if cc is negative, then propagating473* that carry must yield a value lower than the modulus, so474* adding the modulus once will keep the final result under475* twice the modulus.476*/477z = (uint32_t)cc;478d[3] -= z << 6;479d[6] -= (z << 12) & 0x3FFFFFFF;480d[7] -= ARSH(z, 18);481d[7] += (z << 14) & 0x3FFFFFFF;482d[8] += ARSH(z, 16);483c = z >> 31;484d[0] -= c;485d[3] += c << 6;486d[6] += c << 12;487d[7] -= c << 14;488d[8] += c << 16;489for (i = 0; i < 9; i ++) {490uint32_t w;491492w = d[i] + z;493d[i] = w & 0x3FFFFFFF;494z = ARSH(w, 30);495}496}497498/*499* Compute a square in F256. 
Source operand shall be less than500* twice the modulus.501*/502static void503square_f256(uint32_t *d, const uint32_t *a)504{505uint32_t t[18];506uint64_t s[18];507uint64_t cc, x;508uint32_t z, c;509int i;510511square9(t, a);512513/*514* Modular reduction: each high word in added/subtracted where515* necessary.516*517* The modulus is:518* p = 2^256 - 2^224 + 2^192 + 2^96 - 1519* Therefore:520* 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p521*522* For a word x at bit offset n (n >= 256), we have:523* x*2^n = x*2^(n-32) - x*2^(n-64)524* - x*2^(n - 160) + x*2^(n-256) mod p525*526* Thus, we can nullify the high word if we reinject it at some527* proper emplacements.528*529* We use 64-bit intermediate words to allow for carries to530* accumulate easily, before performing the final propagation.531*/532for (i = 0; i < 18; i ++) {533s[i] = t[i];534}535536for (i = 17; i >= 9; i --) {537uint64_t y;538539y = s[i];540s[i - 1] += ARSHW(y, 2);541s[i - 2] += (y << 28) & 0x3FFFFFFF;542s[i - 2] -= ARSHW(y, 4);543s[i - 3] -= (y << 26) & 0x3FFFFFFF;544s[i - 5] -= ARSHW(y, 10);545s[i - 6] -= (y << 20) & 0x3FFFFFFF;546s[i - 8] += ARSHW(y, 16);547s[i - 9] += (y << 14) & 0x3FFFFFFF;548}549550/*551* Carry propagation must be signed. Moreover, we may have overdone552* it a bit, and obtain a negative result.553*554* The loop above ran 9 times; each time, each word was augmented555* by at most one extra word (in absolute value). Thus, the top556* word must in fine fit in 39 bits, so the carry below will fit557* on 9 bits.558*/559cc = 0;560for (i = 0; i < 9; i ++) {561x = s[i] + cc;562d[i] = (uint32_t)x & 0x3FFFFFFF;563cc = ARSHW(x, 30);564}565566/*567* All nine words fit on 30 bits, but there may be an extra568* carry for a few bits (at most 9), and that carry may be569* negative. Moreover, we want the result to fit on 257 bits.570* The two lines below ensure that the word in d[] has length571* 256 bits, and the (signed) carry (beyond 2^256) is in cc. 
The572* significant length of cc is less than 24 bits, so we will be573* able to switch to 32-bit operations.574*/575cc = ARSHW(x, 16);576d[8] &= 0xFFFF;577578/*579* One extra round of reduction, for cc*2^256, which means580* adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)581* value. If cc is negative, then it may happen (rarely, but582* not neglectibly so) that the result would be negative. In583* order to avoid that, if cc is negative, then we add the584* modulus once. Note that if cc is negative, then propagating585* that carry must yield a value lower than the modulus, so586* adding the modulus once will keep the final result under587* twice the modulus.588*/589z = (uint32_t)cc;590d[3] -= z << 6;591d[6] -= (z << 12) & 0x3FFFFFFF;592d[7] -= ARSH(z, 18);593d[7] += (z << 14) & 0x3FFFFFFF;594d[8] += ARSH(z, 16);595c = z >> 31;596d[0] -= c;597d[3] += c << 6;598d[6] += c << 12;599d[7] -= c << 14;600d[8] += c << 16;601for (i = 0; i < 9; i ++) {602uint32_t w;603604w = d[i] + z;605d[i] = w & 0x3FFFFFFF;606z = ARSH(w, 30);607}608}609610/*611* Perform a "final reduction" in field F256 (field for curve P-256).612* The source value must be less than twice the modulus. If the value613* is not lower than the modulus, then the modulus is subtracted and614* this function returns 1; otherwise, it leaves it untouched and it615* returns 0.616*/617static uint32_t618reduce_final_f256(uint32_t *d)619{620uint32_t t[9];621uint32_t cc;622int i;623624cc = 0;625for (i = 0; i < 9; i ++) {626uint32_t w;627628w = d[i] - F256[i] - cc;629cc = w >> 31;630t[i] = w & 0x3FFFFFFF;631}632cc ^= 1;633CCOPY(cc, d, t, sizeof t);634return cc;635}636637/*638* Jacobian coordinates for a point in P-256: affine coordinates (X,Y)639* are such that:640* X = x / z^2641* Y = y / z^3642* For the point at infinity, z = 0.643* Each point thus admits many possible representations.644*645* Coordinates are represented in arrays of 32-bit integers, each holding646* 30 bits of data. 
Values may also be slightly greater than the modulus,647* but they will always be lower than twice the modulus.648*/649typedef struct {650uint32_t x[9];651uint32_t y[9];652uint32_t z[9];653} p256_jacobian;654655/*656* Convert a point to affine coordinates:657* - If the point is the point at infinity, then all three coordinates658* are set to 0.659* - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'660* coordinates are the 'X' and 'Y' affine coordinates.661* The coordinates are guaranteed to be lower than the modulus.662*/663static void664p256_to_affine(p256_jacobian *P)665{666uint32_t t1[9], t2[9];667int i;668669/*670* Invert z with a modular exponentiation: the modulus is671* p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is672* p-2. Exponent bit pattern (from high to low) is:673* - 32 bits of value 1674* - 31 bits of value 0675* - 1 bit of value 1676* - 96 bits of value 0677* - 94 bits of value 1678* - 1 bit of value 0679* - 1 bit of value 1680* Thus, we precompute z^(2^31-1) to speed things up.681*682* If z = 0 (point at infinity) then the modular exponentiation683* will yield 0, which leads to the expected result (all three684* coordinates set to 0).685*/686687/*688* A simple square-and-multiply for z^(2^31-1). We could save about689* two dozen multiplications here with an addition chain, but690* this would require a bit more code, and extra stack buffers.691*/692memcpy(t1, P->z, sizeof P->z);693for (i = 0; i < 30; i ++) {694square_f256(t1, t1);695mul_f256(t1, t1, P->z);696}697698/*699* Square-and-multiply. 
Apart from the squarings, we have a few700* multiplications to set bits to 1; we multiply by the original z701* for setting 1 bit, and by t1 for setting 31 bits.702*/703memcpy(t2, P->z, sizeof P->z);704for (i = 1; i < 256; i ++) {705square_f256(t2, t2);706switch (i) {707case 31:708case 190:709case 221:710case 252:711mul_f256(t2, t2, t1);712break;713case 63:714case 253:715case 255:716mul_f256(t2, t2, P->z);717break;718}719}720721/*722* Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.723*/724mul_f256(t1, t2, t2);725mul_f256(P->x, t1, P->x);726mul_f256(t1, t1, t2);727mul_f256(P->y, t1, P->y);728reduce_final_f256(P->x);729reduce_final_f256(P->y);730731/*732* Multiply z by 1/z. If z = 0, then this will yield 0, otherwise733* this will set z to 1.734*/735mul_f256(P->z, P->z, t2);736reduce_final_f256(P->z);737}738739/*740* Double a point in P-256. This function works for all valid points,741* including the point at infinity.742*/743static void744p256_double(p256_jacobian *Q)745{746/*747* Doubling formulas are:748*749* s = 4*x*y^2750* m = 3*(x + z^2)*(x - z^2)751* x' = m^2 - 2*s752* y' = m*(s - x') - 8*y^4753* z' = 2*y*z754*755* These formulas work for all points, including points of order 2756* and points at infinity:757* - If y = 0 then z' = 0. 
But there is no such point in P-256758* anyway.759* - If z = 0 then z' = 0.760*/761uint32_t t1[9], t2[9], t3[9], t4[9];762763/*764* Compute z^2 in t1.765*/766square_f256(t1, Q->z);767768/*769* Compute x-z^2 in t2 and x+z^2 in t1.770*/771add_f256(t2, Q->x, t1);772sub_f256(t1, Q->x, t1);773774/*775* Compute 3*(x+z^2)*(x-z^2) in t1.776*/777mul_f256(t3, t1, t2);778add_f256(t1, t3, t3);779add_f256(t1, t3, t1);780781/*782* Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).783*/784square_f256(t3, Q->y);785add_f256(t3, t3, t3);786mul_f256(t2, Q->x, t3);787add_f256(t2, t2, t2);788789/*790* Compute x' = m^2 - 2*s.791*/792square_f256(Q->x, t1);793sub_f256(Q->x, Q->x, t2);794sub_f256(Q->x, Q->x, t2);795796/*797* Compute z' = 2*y*z.798*/799mul_f256(t4, Q->y, Q->z);800add_f256(Q->z, t4, t4);801802/*803* Compute y' = m*(s - x') - 8*y^4. Note that we already have804* 2*y^2 in t3.805*/806sub_f256(t2, t2, Q->x);807mul_f256(Q->y, t1, t2);808square_f256(t4, t3);809add_f256(t4, t4, t4);810sub_f256(Q->y, Q->y, t4);811}812813/*814* Add point P2 to point P1.815*816* This function computes the wrong result in the following cases:817*818* - If P1 == 0 but P2 != 0819* - If P1 != 0 but P2 == 0820* - If P1 == P2821*822* In all three cases, P1 is set to the point at infinity.823*824* Returned value is 0 if one of the following occurs:825*826* - P1 and P2 have the same Y coordinate827* - P1 == 0 and P2 == 0828* - The Y coordinate of one of the points is 0 and the other point is829* the point at infinity.830*831* The third case cannot actually happen with valid points, since a point832* with Y == 0 is a point of order 2, and there is no point of order 2 on833* curve P-256.834*835* Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller836* can apply the following:837*838* - If the result is not the point at infinity, then it is correct.839* - Otherwise, if the returned value is 1, then this is a case of840* P1+P2 == 0, so the result is indeed the point at infinity.841* - Otherwise, P1 == 
P2, so a "double" operation should have been842* performed.843*/844static uint32_t845p256_add(p256_jacobian *P1, const p256_jacobian *P2)846{847/*848* Addtions formulas are:849*850* u1 = x1 * z2^2851* u2 = x2 * z1^2852* s1 = y1 * z2^3853* s2 = y2 * z1^3854* h = u2 - u1855* r = s2 - s1856* x3 = r^2 - h^3 - 2 * u1 * h^2857* y3 = r * (u1 * h^2 - x3) - s1 * h^3858* z3 = h * z1 * z2859*/860uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];861uint32_t ret;862int i;863864/*865* Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).866*/867square_f256(t3, P2->z);868mul_f256(t1, P1->x, t3);869mul_f256(t4, P2->z, t3);870mul_f256(t3, P1->y, t4);871872/*873* Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).874*/875square_f256(t4, P1->z);876mul_f256(t2, P2->x, t4);877mul_f256(t5, P1->z, t4);878mul_f256(t4, P2->y, t5);879880/*881* Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).882* We need to test whether r is zero, so we will do some extra883* reduce.884*/885sub_f256(t2, t2, t1);886sub_f256(t4, t4, t3);887reduce_final_f256(t4);888ret = 0;889for (i = 0; i < 9; i ++) {890ret |= t4[i];891}892ret = (ret | -ret) >> 31;893894/*895* Compute u1*h^2 (in t6) and h^3 (in t5);896*/897square_f256(t7, t2);898mul_f256(t6, t1, t7);899mul_f256(t5, t7, t2);900901/*902* Compute x3 = r^2 - h^3 - 2*u1*h^2.903*/904square_f256(P1->x, t4);905sub_f256(P1->x, P1->x, t5);906sub_f256(P1->x, P1->x, t6);907sub_f256(P1->x, P1->x, t6);908909/*910* Compute y3 = r*(u1*h^2 - x3) - s1*h^3.911*/912sub_f256(t6, t6, P1->x);913mul_f256(P1->y, t4, t6);914mul_f256(t1, t5, t3);915sub_f256(P1->y, P1->y, t1);916917/*918* Compute z3 = h*z1*z2.919*/920mul_f256(t1, P1->z, P2->z);921mul_f256(P1->z, t1, t2);922923return ret;924}925926/*927* Add point P2 to point P1. 
This is a specialised function for the928* case when P2 is a non-zero point in affine coordinate.929*930* This function computes the wrong result in the following cases:931*932* - If P1 == 0933* - If P1 == P2934*935* In both cases, P1 is set to the point at infinity.936*937* Returned value is 0 if one of the following occurs:938*939* - P1 and P2 have the same Y coordinate940* - The Y coordinate of P2 is 0 and P1 is the point at infinity.941*942* The second case cannot actually happen with valid points, since a point943* with Y == 0 is a point of order 2, and there is no point of order 2 on944* curve P-256.945*946* Therefore, assuming that P1 != 0 on input, then the caller947* can apply the following:948*949* - If the result is not the point at infinity, then it is correct.950* - Otherwise, if the returned value is 1, then this is a case of951* P1+P2 == 0, so the result is indeed the point at infinity.952* - Otherwise, P1 == P2, so a "double" operation should have been953* performed.954*/955static uint32_t956p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)957{958/*959* Addtions formulas are:960*961* u1 = x1962* u2 = x2 * z1^2963* s1 = y1964* s2 = y2 * z1^3965* h = u2 - u1966* r = s2 - s1967* x3 = r^2 - h^3 - 2 * u1 * h^2968* y3 = r * (u1 * h^2 - x3) - s1 * h^3969* z3 = h * z1970*/971uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];972uint32_t ret;973int i;974975/*976* Compute u1 = x1 (in t1) and s1 = y1 (in t3).977*/978memcpy(t1, P1->x, sizeof t1);979memcpy(t3, P1->y, sizeof t3);980981/*982* Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).983*/984square_f256(t4, P1->z);985mul_f256(t2, P2->x, t4);986mul_f256(t5, P1->z, t4);987mul_f256(t4, P2->y, t5);988989/*990* Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).991* We need to test whether r is zero, so we will do some extra992* reduce.993*/994sub_f256(t2, t2, t1);995sub_f256(t4, t4, t3);996reduce_final_f256(t4);997ret = 0;998for (i = 0; i < 9; i ++) {999ret |= t4[i];1000}1001ret = (ret | 
-ret) >> 31;10021003/*1004* Compute u1*h^2 (in t6) and h^3 (in t5);1005*/1006square_f256(t7, t2);1007mul_f256(t6, t1, t7);1008mul_f256(t5, t7, t2);10091010/*1011* Compute x3 = r^2 - h^3 - 2*u1*h^2.1012*/1013square_f256(P1->x, t4);1014sub_f256(P1->x, P1->x, t5);1015sub_f256(P1->x, P1->x, t6);1016sub_f256(P1->x, P1->x, t6);10171018/*1019* Compute y3 = r*(u1*h^2 - x3) - s1*h^3.1020*/1021sub_f256(t6, t6, P1->x);1022mul_f256(P1->y, t4, t6);1023mul_f256(t1, t5, t3);1024sub_f256(P1->y, P1->y, t1);10251026/*1027* Compute z3 = h*z1*z2.1028*/1029mul_f256(P1->z, P1->z, t2);10301031return ret;1032}10331034/*1035* Decode a P-256 point. This function does not support the point at1036* infinity. Returned value is 0 if the point is invalid, 1 otherwise.1037*/1038static uint32_t1039p256_decode(p256_jacobian *P, const void *src, size_t len)1040{1041const unsigned char *buf;1042uint32_t tx[9], ty[9], t1[9], t2[9];1043uint32_t bad;1044int i;10451046if (len != 65) {1047return 0;1048}1049buf = src;10501051/*1052* First byte must be 0x04 (uncompressed format). 
We could support1053* "hybrid format" (first byte is 0x06 or 0x07, and encodes the1054* least significant bit of the Y coordinate), but it is explicitly1055* forbidden by RFC 5480 (section 2.2).1056*/1057bad = NEQ(buf[0], 0x04);10581059/*1060* Decode the coordinates, and check that they are both lower1061* than the modulus.1062*/1063tx[8] = be8_to_le30(tx, buf + 1, 32);1064ty[8] = be8_to_le30(ty, buf + 33, 32);1065bad |= reduce_final_f256(tx);1066bad |= reduce_final_f256(ty);10671068/*1069* Check curve equation.1070*/1071square_f256(t1, tx);1072mul_f256(t1, tx, t1);1073square_f256(t2, ty);1074sub_f256(t1, t1, tx);1075sub_f256(t1, t1, tx);1076sub_f256(t1, t1, tx);1077add_f256(t1, t1, P256_B);1078sub_f256(t1, t1, t2);1079reduce_final_f256(t1);1080for (i = 0; i < 9; i ++) {1081bad |= t1[i];1082}10831084/*1085* Copy coordinates to the point structure.1086*/1087memcpy(P->x, tx, sizeof tx);1088memcpy(P->y, ty, sizeof ty);1089memset(P->z, 0, sizeof P->z);1090P->z[0] = 1;1091return EQ(bad, 0);1092}10931094/*1095* Encode a point into a buffer. This function assumes that the point is1096* valid, in affine coordinates, and not the point at infinity.1097*/1098static void1099p256_encode(void *dst, const p256_jacobian *P)1100{1101unsigned char *buf;11021103buf = dst;1104buf[0] = 0x04;1105le30_to_be8(buf + 1, 32, P->x);1106le30_to_be8(buf + 33, 32, P->y);1107}11081109/*1110* Multiply a curve point by an integer. 
The integer is assumed to be1111* lower than the curve order, and the base point must not be the point1112* at infinity.1113*/1114static void1115p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)1116{1117/*1118* qz is a flag that is initially 1, and remains equal to 11119* as long as the point is the point at infinity.1120*1121* We use a 2-bit window to handle multiplier bits by pairs.1122* The precomputed window really is the points P2 and P3.1123*/1124uint32_t qz;1125p256_jacobian P2, P3, Q, T, U;11261127/*1128* Compute window values.1129*/1130P2 = *P;1131p256_double(&P2);1132P3 = *P;1133p256_add(&P3, &P2);11341135/*1136* We start with Q = 0. We process multiplier bits 2 by 2.1137*/1138memset(&Q, 0, sizeof Q);1139qz = 1;1140while (xlen -- > 0) {1141int k;11421143for (k = 6; k >= 0; k -= 2) {1144uint32_t bits;1145uint32_t bnz;11461147p256_double(&Q);1148p256_double(&Q);1149T = *P;1150U = Q;1151bits = (*x >> k) & (uint32_t)3;1152bnz = NEQ(bits, 0);1153CCOPY(EQ(bits, 2), &T, &P2, sizeof T);1154CCOPY(EQ(bits, 3), &T, &P3, sizeof T);1155p256_add(&U, &T);1156CCOPY(bnz & qz, &Q, &T, sizeof Q);1157CCOPY(bnz & ~qz, &Q, &U, sizeof Q);1158qz &= ~bnz;1159}1160x ++;1161}1162*P = Q;1163}11641165/*1166* Precomputed window: k*G points, where G is the curve generator, and k1167* is an integer from 1 to 15 (inclusive). 
The X and Y coordinates of1168* the point are encoded as 9 words of 30 bits each (little-endian1169* order).1170*/1171static const uint32_t Gwin[15][18] = {11721173{ 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B,11740x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B,11750x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC,11760x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE,11770x10B8BF86, 0x00004FE3 },11781179{ 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D,11800x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340,11810x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299,11820x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293,11830x154436E3, 0x00000777 },11841185{ 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B,11860x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C,11870x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9,11880x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374,11890x19031266, 0x00008734 },11901191{ 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE,11920x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4,11930x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055,11940x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D,11950x15D69318, 0x0000E0F1 },11961197{ 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47,11980x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454,11990x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D,12000x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE,12010x1F6A2412, 0x0000E0C1 },12021203{ 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A,12040x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9,12050x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF,12060x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE,12070x041D0C8D, 0x0000E85C },12081209{ 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A,12100x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F,12110x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C,12120x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0,12130x076F780C, 0x000073EB },12141215{ 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603,12160x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA,12170x000062D9, 0x0F09957E, 0x295029A8, 
0x3E76A78D,12180x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF,12190x332F647A, 0x0000AD5A },12201221{ 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B,12220x31F8729E, 0x24E35475, 0x30B71878, 0x35EDBFB7,12230x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE,12240x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870,12250x11325CB2, 0x00002A27 },12261227{ 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7,12280x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E,12290x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC,12300x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1,12310x18A88A6A, 0x00008786 },12321233{ 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409,12340x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E,12350x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE,12360x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C,12370x0826B331, 0x00009099 },12381239{ 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C,12400x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05,12410x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71,12420x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567,12430x2D1AA70E, 0x00000770 },12441245{ 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9,12460x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B,12470x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39,12480x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24,12490x163353AF, 0x000063BB },12501251{ 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E,12520x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E,12530x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081,12540x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421,12550x3C6ECA7D, 0x0000F599 },12561257{ 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7,12580x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6,12590x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4,12600x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6,12610x0FB8D64B, 0x0000B5B9 }1262};12631264/*1265* Lookup one of the Gwin[] values, by index. 
This is constant-time.1266*/1267static void1268lookup_Gwin(p256_jacobian *T, uint32_t idx)1269{1270uint32_t xy[18];1271uint32_t k;1272size_t u;12731274memset(xy, 0, sizeof xy);1275for (k = 0; k < 15; k ++) {1276uint32_t m;12771278m = -EQ(idx, k + 1);1279for (u = 0; u < 18; u ++) {1280xy[u] |= m & Gwin[k][u];1281}1282}1283memcpy(T->x, &xy[0], sizeof T->x);1284memcpy(T->y, &xy[9], sizeof T->y);1285memset(T->z, 0, sizeof T->z);1286T->z[0] = 1;1287}12881289/*1290* Multiply the generator by an integer. The integer is assumed non-zero1291* and lower than the curve order.1292*/1293static void1294p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)1295{1296/*1297* qz is a flag that is initially 1, and remains equal to 11298* as long as the point is the point at infinity.1299*1300* We use a 4-bit window to handle multiplier bits by groups1301* of 4. The precomputed window is constant static data, with1302* points in affine coordinates; we use a constant-time lookup.1303*/1304p256_jacobian Q;1305uint32_t qz;13061307memset(&Q, 0, sizeof Q);1308qz = 1;1309while (xlen -- > 0) {1310int k;1311unsigned bx;13121313bx = *x ++;1314for (k = 0; k < 2; k ++) {1315uint32_t bits;1316uint32_t bnz;1317p256_jacobian T, U;13181319p256_double(&Q);1320p256_double(&Q);1321p256_double(&Q);1322p256_double(&Q);1323bits = (bx >> 4) & 0x0F;1324bnz = NEQ(bits, 0);1325lookup_Gwin(&T, bits);1326U = Q;1327p256_add_mixed(&U, &T);1328CCOPY(bnz & qz, &Q, &T, sizeof Q);1329CCOPY(bnz & ~qz, &Q, &U, sizeof Q);1330qz &= ~bnz;1331bx <<= 4;1332}1333}1334*P = Q;1335}13361337static const unsigned char P256_G[] = {13380x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,13390xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,13400x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,13410x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,13420x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,13430xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,13440x68, 0x37, 
0xBF, 0x51, 0xF51345};13461347static const unsigned char P256_N[] = {13480xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,13490xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,13500xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,13510x25, 0x511352};13531354static const unsigned char *1355api_generator(int curve, size_t *len)1356{1357(void)curve;1358*len = sizeof P256_G;1359return P256_G;1360}13611362static const unsigned char *1363api_order(int curve, size_t *len)1364{1365(void)curve;1366*len = sizeof P256_N;1367return P256_N;1368}13691370static size_t1371api_xoff(int curve, size_t *len)1372{1373(void)curve;1374*len = 32;1375return 1;1376}13771378static uint32_t1379api_mul(unsigned char *G, size_t Glen,1380const unsigned char *x, size_t xlen, int curve)1381{1382uint32_t r;1383p256_jacobian P;13841385(void)curve;1386if (Glen != 65) {1387return 0;1388}1389r = p256_decode(&P, G, Glen);1390p256_mul(&P, x, xlen);1391p256_to_affine(&P);1392p256_encode(G, &P);1393return r;1394}13951396static size_t1397api_mulgen(unsigned char *R,1398const unsigned char *x, size_t xlen, int curve)1399{1400p256_jacobian P;14011402(void)curve;1403p256_mulgen(&P, x, xlen);1404p256_to_affine(&P);1405p256_encode(R, &P);1406return 65;1407}14081409static uint32_t1410api_muladd(unsigned char *A, const unsigned char *B, size_t len,1411const unsigned char *x, size_t xlen,1412const unsigned char *y, size_t ylen, int curve)1413{1414p256_jacobian P, Q;1415uint32_t r, t, z;1416int i;14171418(void)curve;1419if (len != 65) {1420return 0;1421}1422r = p256_decode(&P, A, len);1423p256_mul(&P, x, xlen);1424if (B == NULL) {1425p256_mulgen(&Q, y, ylen);1426} else {1427r &= p256_decode(&Q, B, len);1428p256_mul(&Q, y, ylen);1429}14301431/*1432* The final addition may fail in case both points are equal.1433*/1434t = p256_add(&P, &Q);1435reduce_final_f256(P.z);1436z = 0;1437for (i = 0; i < 9; i ++) {1438z |= P.z[i];1439}1440z = EQ(z, 0);1441p256_double(&Q);14421443/*1444* If z is 1 then 
either P+Q = 0 (t = 1) or P = Q (t = 0). So we1445* have the following:1446*1447* z = 0, t = 0 return P (normal addition)1448* z = 0, t = 1 return P (normal addition)1449* z = 1, t = 0 return Q (a 'double' case)1450* z = 1, t = 1 report an error (P+Q = 0)1451*/1452CCOPY(z & ~t, &P, &Q, sizeof Q);1453p256_to_affine(&P);1454p256_encode(A, &P);1455r &= ~(z & t);1456return r;1457}14581459/* see bearssl_ec.h */1460const br_ec_impl br_ec_p256_m31 = {1461(uint32_t)0x00800000,1462&api_generator,1463&api_order,1464&api_xoff,1465&api_mul,1466&api_mulgen,1467&api_muladd1468};146914701471