/* Path: blob/main/crypto/openssl/crypto/bn/bn_asm.c (web viewer residue: 106751 views) */
/*1* Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved.2*3* Licensed under the Apache License 2.0 (the "License"). You may not use4* this file except in compliance with the License. You can obtain a copy5* in the file LICENSE in the source distribution or at6* https://www.openssl.org/source/license.html7*/89#include <assert.h>10#include <openssl/crypto.h>11#include "internal/cryptlib.h"12#include "bn_local.h"1314#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)1516BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,17BN_ULONG w)18{19BN_ULONG c1 = 0;2021assert(num >= 0);22if (num <= 0)23return c1;2425#ifndef OPENSSL_SMALL_FOOTPRINT26while (num & ~3) {27mul_add(rp[0], ap[0], w, c1);28mul_add(rp[1], ap[1], w, c1);29mul_add(rp[2], ap[2], w, c1);30mul_add(rp[3], ap[3], w, c1);31ap += 4;32rp += 4;33num -= 4;34}35#endif36while (num) {37mul_add(rp[0], ap[0], w, c1);38ap++;39rp++;40num--;41}4243return c1;44}4546BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)47{48BN_ULONG c1 = 0;4950assert(num >= 0);51if (num <= 0)52return c1;5354#ifndef OPENSSL_SMALL_FOOTPRINT55while (num & ~3) {56mul(rp[0], ap[0], w, c1);57mul(rp[1], ap[1], w, c1);58mul(rp[2], ap[2], w, c1);59mul(rp[3], ap[3], w, c1);60ap += 4;61rp += 4;62num -= 4;63}64#endif65while (num) {66mul(rp[0], ap[0], w, c1);67ap++;68rp++;69num--;70}71return c1;72}7374void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)75{76assert(n >= 0);77if (n <= 0)78return;7980#ifndef OPENSSL_SMALL_FOOTPRINT81while (n & ~3) {82sqr(r[0], r[1], a[0]);83sqr(r[2], r[3], a[1]);84sqr(r[4], r[5], a[2]);85sqr(r[6], r[7], a[3]);86a += 4;87r += 8;88n -= 4;89}90#endif91while (n) {92sqr(r[0], r[1], a[0]);93a++;94r += 2;95n--;96}97}9899#else /* !(defined(BN_LLONG) || \100* defined(BN_UMULT_HIGH)) */101102BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,103BN_ULONG w)104{105BN_ULONG c = 0;106BN_ULONG bl, bh;107108assert(num >= 0);109if (num <= 0)110return 
(BN_ULONG)0;111112bl = LBITS(w);113bh = HBITS(w);114115#ifndef OPENSSL_SMALL_FOOTPRINT116while (num & ~3) {117mul_add(rp[0], ap[0], bl, bh, c);118mul_add(rp[1], ap[1], bl, bh, c);119mul_add(rp[2], ap[2], bl, bh, c);120mul_add(rp[3], ap[3], bl, bh, c);121ap += 4;122rp += 4;123num -= 4;124}125#endif126while (num) {127mul_add(rp[0], ap[0], bl, bh, c);128ap++;129rp++;130num--;131}132return c;133}134135BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)136{137BN_ULONG carry = 0;138BN_ULONG bl, bh;139140assert(num >= 0);141if (num <= 0)142return (BN_ULONG)0;143144bl = LBITS(w);145bh = HBITS(w);146147#ifndef OPENSSL_SMALL_FOOTPRINT148while (num & ~3) {149mul(rp[0], ap[0], bl, bh, carry);150mul(rp[1], ap[1], bl, bh, carry);151mul(rp[2], ap[2], bl, bh, carry);152mul(rp[3], ap[3], bl, bh, carry);153ap += 4;154rp += 4;155num -= 4;156}157#endif158while (num) {159mul(rp[0], ap[0], bl, bh, carry);160ap++;161rp++;162num--;163}164return carry;165}166167void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)168{169assert(n >= 0);170if (n <= 0)171return;172173#ifndef OPENSSL_SMALL_FOOTPRINT174while (n & ~3) {175sqr64(r[0], r[1], a[0]);176sqr64(r[2], r[3], a[1]);177sqr64(r[4], r[5], a[2]);178sqr64(r[6], r[7], a[3]);179a += 4;180r += 8;181n -= 4;182}183#endif184while (n) {185sqr64(r[0], r[1], a[0]);186a++;187r += 2;188n--;189}190}191192#endif /* !(defined(BN_LLONG) || \193* defined(BN_UMULT_HIGH)) */194195#if defined(BN_LLONG) && defined(BN_DIV2W)196197BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)198{199return ((BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d));200}201202#else203204/* Divide h,l by d and return the result. 
*/205/* I need to test this some more :-( */206BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)207{208BN_ULONG dh, dl, q, ret = 0, th, tl, t;209int i, count = 2;210211if (d == 0)212return BN_MASK2;213214i = BN_num_bits_word(d);215assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));216217i = BN_BITS2 - i;218if (h >= d)219h -= d;220221if (i) {222d <<= i;223h = (h << i) | (l >> (BN_BITS2 - i));224l <<= i;225}226dh = (d & BN_MASK2h) >> BN_BITS4;227dl = (d & BN_MASK2l);228for (;;) {229if ((h >> BN_BITS4) == dh)230q = BN_MASK2l;231else232q = h / dh;233234th = q * dh;235tl = dl * q;236for (;;) {237t = h - th;238if ((t & BN_MASK2h) || ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))239break;240q--;241th -= dh;242tl -= dl;243}244t = (tl >> BN_BITS4);245tl = (tl << BN_BITS4) & BN_MASK2h;246th += t;247248if (l < tl)249th++;250l -= tl;251if (h < th) {252h += d;253q--;254}255h -= th;256257if (--count == 0)258break;259260ret = q << BN_BITS4;261h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;262l = (l & BN_MASK2l) << BN_BITS4;263}264ret |= q;265return ret;266}267#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */268269#ifdef BN_LLONG270BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,271int n)272{273BN_ULLONG ll = 0;274275assert(n >= 0);276if (n <= 0)277return (BN_ULONG)0;278279#ifndef OPENSSL_SMALL_FOOTPRINT280while (n & ~3) {281ll += (BN_ULLONG)a[0] + b[0];282r[0] = (BN_ULONG)ll & BN_MASK2;283ll >>= BN_BITS2;284ll += (BN_ULLONG)a[1] + b[1];285r[1] = (BN_ULONG)ll & BN_MASK2;286ll >>= BN_BITS2;287ll += (BN_ULLONG)a[2] + b[2];288r[2] = (BN_ULONG)ll & BN_MASK2;289ll >>= BN_BITS2;290ll += (BN_ULLONG)a[3] + b[3];291r[3] = (BN_ULONG)ll & BN_MASK2;292ll >>= BN_BITS2;293a += 4;294b += 4;295r += 4;296n -= 4;297}298#endif299while (n) {300ll += (BN_ULLONG)a[0] + b[0];301r[0] = (BN_ULONG)ll & BN_MASK2;302ll >>= BN_BITS2;303a++;304b++;305r++;306n--;307}308return (BN_ULONG)ll;309}310#else /* !BN_LLONG */311BN_ULONG bn_add_words(BN_ULONG *r, const 
BN_ULONG *a, const BN_ULONG *b,312int n)313{314BN_ULONG c, l, t;315316assert(n >= 0);317if (n <= 0)318return (BN_ULONG)0;319320c = 0;321#ifndef OPENSSL_SMALL_FOOTPRINT322while (n & ~3) {323t = a[0];324t = (t + c) & BN_MASK2;325c = (t < c);326l = (t + b[0]) & BN_MASK2;327c += (l < t);328r[0] = l;329t = a[1];330t = (t + c) & BN_MASK2;331c = (t < c);332l = (t + b[1]) & BN_MASK2;333c += (l < t);334r[1] = l;335t = a[2];336t = (t + c) & BN_MASK2;337c = (t < c);338l = (t + b[2]) & BN_MASK2;339c += (l < t);340r[2] = l;341t = a[3];342t = (t + c) & BN_MASK2;343c = (t < c);344l = (t + b[3]) & BN_MASK2;345c += (l < t);346r[3] = l;347a += 4;348b += 4;349r += 4;350n -= 4;351}352#endif353while (n) {354t = a[0];355t = (t + c) & BN_MASK2;356c = (t < c);357l = (t + b[0]) & BN_MASK2;358c += (l < t);359r[0] = l;360a++;361b++;362r++;363n--;364}365return (BN_ULONG)c;366}367#endif /* !BN_LLONG */368369BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,370int n)371{372BN_ULONG t1, t2;373int c = 0;374375assert(n >= 0);376if (n <= 0)377return (BN_ULONG)0;378379#ifndef OPENSSL_SMALL_FOOTPRINT380while (n & ~3) {381t1 = a[0];382t2 = (t1 - c) & BN_MASK2;383c = (t2 > t1);384t1 = b[0];385t1 = (t2 - t1) & BN_MASK2;386r[0] = t1;387c += (t1 > t2);388t1 = a[1];389t2 = (t1 - c) & BN_MASK2;390c = (t2 > t1);391t1 = b[1];392t1 = (t2 - t1) & BN_MASK2;393r[1] = t1;394c += (t1 > t2);395t1 = a[2];396t2 = (t1 - c) & BN_MASK2;397c = (t2 > t1);398t1 = b[2];399t1 = (t2 - t1) & BN_MASK2;400r[2] = t1;401c += (t1 > t2);402t1 = a[3];403t2 = (t1 - c) & BN_MASK2;404c = (t2 > t1);405t1 = b[3];406t1 = (t2 - t1) & BN_MASK2;407r[3] = t1;408c += (t1 > t2);409a += 4;410b += 4;411r += 4;412n -= 4;413}414#endif415while (n) {416t1 = a[0];417t2 = (t1 - c) & BN_MASK2;418c = (t2 > t1);419t1 = b[0];420t1 = (t2 - t1) & BN_MASK2;421r[0] = t1;422c += (t1 > t2);423a++;424b++;425r++;426n--;427}428return c;429}430431#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)432433/* mul_add_c(a,b,c0,c1,c2) -- 
c+=a*b for three word number c=(c2,c1,c0) */434/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */435/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */436/*437* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number438* c=(c2,c1,c0)439*/440441#ifdef BN_LLONG442/*443* Keep in mind that additions to multiplication result can not444* overflow, because its high half cannot be all-ones.445*/446#define mul_add_c(a, b, c0, c1, c2) \447do { \448BN_ULONG hi; \449BN_ULLONG t = (BN_ULLONG)(a) * (b); \450t += c0; /* no carry */ \451c0 = (BN_ULONG)Lw(t); \452hi = (BN_ULONG)Hw(t); \453c1 = (c1 + hi) & BN_MASK2; \454c2 += (c1 < hi); \455} while (0)456457#define mul_add_c2(a, b, c0, c1, c2) \458do { \459BN_ULONG hi; \460BN_ULLONG t = (BN_ULLONG)(a) * (b); \461BN_ULLONG tt = t + c0; /* no carry */ \462c0 = (BN_ULONG)Lw(tt); \463hi = (BN_ULONG)Hw(tt); \464c1 = (c1 + hi) & BN_MASK2; \465c2 += (c1 < hi); \466t += c0; /* no carry */ \467c0 = (BN_ULONG)Lw(t); \468hi = (BN_ULONG)Hw(t); \469c1 = (c1 + hi) & BN_MASK2; \470c2 += (c1 < hi); \471} while (0)472473#define sqr_add_c(a, i, c0, c1, c2) \474do { \475BN_ULONG hi; \476BN_ULLONG t = (BN_ULLONG)a[i] * a[i]; \477t += c0; /* no carry */ \478c0 = (BN_ULONG)Lw(t); \479hi = (BN_ULONG)Hw(t); \480c1 = (c1 + hi) & BN_MASK2; \481c2 += (c1 < hi); \482} while (0)483484#define sqr_add_c2(a, i, j, c0, c1, c2) \485mul_add_c2((a)[i], (a)[j], c0, c1, c2)486487#elif defined(BN_UMULT_LOHI)488/*489* Keep in mind that additions to hi can not overflow, because490* the high word of a multiplication result cannot be all-ones.491*/492#define mul_add_c(a, b, c0, c1, c2) \493do { \494BN_ULONG ta = (a), tb = (b); \495BN_ULONG lo, hi; \496BN_UMULT_LOHI(lo, hi, ta, tb); \497c0 += lo; \498hi += (c0 < lo); \499c1 += hi; \500c2 += (c1 < hi); \501} while (0)502503#define mul_add_c2(a, b, c0, c1, c2) \504do { \505BN_ULONG ta = (a), tb = (b); \506BN_ULONG lo, hi, tt; \507BN_UMULT_LOHI(lo, hi, ta, tb); \508c0 
+= lo; \509tt = hi + (c0 < lo); \510c1 += tt; \511c2 += (c1 < tt); \512c0 += lo; \513hi += (c0 < lo); \514c1 += hi; \515c2 += (c1 < hi); \516} while (0)517518#define sqr_add_c(a, i, c0, c1, c2) \519do { \520BN_ULONG ta = (a)[i]; \521BN_ULONG lo, hi; \522BN_UMULT_LOHI(lo, hi, ta, ta); \523c0 += lo; \524hi += (c0 < lo); \525c1 += hi; \526c2 += (c1 < hi); \527} while (0)528529#define sqr_add_c2(a, i, j, c0, c1, c2) \530mul_add_c2((a)[i], (a)[j], c0, c1, c2)531532#elif defined(BN_UMULT_HIGH)533/*534* Keep in mind that additions to hi can not overflow, because535* the high word of a multiplication result cannot be all-ones.536*/537#define mul_add_c(a, b, c0, c1, c2) \538do { \539BN_ULONG ta = (a), tb = (b); \540BN_ULONG lo = ta * tb; \541BN_ULONG hi = BN_UMULT_HIGH(ta, tb); \542c0 += lo; \543hi += (c0 < lo); \544c1 += hi; \545c2 += (c1 < hi); \546} while (0)547548#define mul_add_c2(a, b, c0, c1, c2) \549do { \550BN_ULONG ta = (a), tb = (b), tt; \551BN_ULONG lo = ta * tb; \552BN_ULONG hi = BN_UMULT_HIGH(ta, tb); \553c0 += lo; \554tt = hi + (c0 < lo); \555c1 += tt; \556c2 += (c1 < tt); \557c0 += lo; \558hi += (c0 < lo); \559c1 += hi; \560c2 += (c1 < hi); \561} while (0)562563#define sqr_add_c(a, i, c0, c1, c2) \564do { \565BN_ULONG ta = (a)[i]; \566BN_ULONG lo = ta * ta; \567BN_ULONG hi = BN_UMULT_HIGH(ta, ta); \568c0 += lo; \569hi += (c0 < lo); \570c1 += hi; \571c2 += (c1 < hi); \572} while (0)573574#define sqr_add_c2(a, i, j, c0, c1, c2) \575mul_add_c2((a)[i], (a)[j], c0, c1, c2)576577#else /* !BN_LLONG */578/*579* Keep in mind that additions to hi can not overflow, because580* the high word of a multiplication result cannot be all-ones.581*/582#define mul_add_c(a, b, c0, c1, c2) \583do { \584BN_ULONG lo = LBITS(a), hi = HBITS(a); \585BN_ULONG bl = LBITS(b), bh = HBITS(b); \586mul64(lo, hi, bl, bh); \587c0 = (c0 + lo) & BN_MASK2; \588hi += (c0 < lo); \589c1 = (c1 + hi) & BN_MASK2; \590c2 += (c1 < hi); \591} while (0)592593#define mul_add_c2(a, b, c0, c1, c2) \594do { 
\595BN_ULONG tt; \596BN_ULONG lo = LBITS(a), hi = HBITS(a); \597BN_ULONG bl = LBITS(b), bh = HBITS(b); \598mul64(lo, hi, bl, bh); \599tt = hi; \600c0 = (c0 + lo) & BN_MASK2; \601tt += (c0 < lo); \602c1 = (c1 + tt) & BN_MASK2; \603c2 += (c1 < tt); \604c0 = (c0 + lo) & BN_MASK2; \605hi += (c0 < lo); \606c1 = (c1 + hi) & BN_MASK2; \607c2 += (c1 < hi); \608} while (0)609610#define sqr_add_c(a, i, c0, c1, c2) \611do { \612BN_ULONG lo, hi; \613sqr64(lo, hi, (a)[i]); \614c0 = (c0 + lo) & BN_MASK2; \615hi += (c0 < lo); \616c1 = (c1 + hi) & BN_MASK2; \617c2 += (c1 < hi); \618} while (0)619620#define sqr_add_c2(a, i, j, c0, c1, c2) \621mul_add_c2((a)[i], (a)[j], c0, c1, c2)622#endif /* !BN_LLONG */623624void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)625{626BN_ULONG c1, c2, c3;627628c1 = 0;629c2 = 0;630c3 = 0;631mul_add_c(a[0], b[0], c1, c2, c3);632r[0] = c1;633c1 = 0;634mul_add_c(a[0], b[1], c2, c3, c1);635mul_add_c(a[1], b[0], c2, c3, c1);636r[1] = c2;637c2 = 0;638mul_add_c(a[2], b[0], c3, c1, c2);639mul_add_c(a[1], b[1], c3, c1, c2);640mul_add_c(a[0], b[2], c3, c1, c2);641r[2] = c3;642c3 = 0;643mul_add_c(a[0], b[3], c1, c2, c3);644mul_add_c(a[1], b[2], c1, c2, c3);645mul_add_c(a[2], b[1], c1, c2, c3);646mul_add_c(a[3], b[0], c1, c2, c3);647r[3] = c1;648c1 = 0;649mul_add_c(a[4], b[0], c2, c3, c1);650mul_add_c(a[3], b[1], c2, c3, c1);651mul_add_c(a[2], b[2], c2, c3, c1);652mul_add_c(a[1], b[3], c2, c3, c1);653mul_add_c(a[0], b[4], c2, c3, c1);654r[4] = c2;655c2 = 0;656mul_add_c(a[0], b[5], c3, c1, c2);657mul_add_c(a[1], b[4], c3, c1, c2);658mul_add_c(a[2], b[3], c3, c1, c2);659mul_add_c(a[3], b[2], c3, c1, c2);660mul_add_c(a[4], b[1], c3, c1, c2);661mul_add_c(a[5], b[0], c3, c1, c2);662r[5] = c3;663c3 = 0;664mul_add_c(a[6], b[0], c1, c2, c3);665mul_add_c(a[5], b[1], c1, c2, c3);666mul_add_c(a[4], b[2], c1, c2, c3);667mul_add_c(a[3], b[3], c1, c2, c3);668mul_add_c(a[2], b[4], c1, c2, c3);669mul_add_c(a[1], b[5], c1, c2, c3);670mul_add_c(a[0], b[6], c1, c2, 
c3);671r[6] = c1;672c1 = 0;673mul_add_c(a[0], b[7], c2, c3, c1);674mul_add_c(a[1], b[6], c2, c3, c1);675mul_add_c(a[2], b[5], c2, c3, c1);676mul_add_c(a[3], b[4], c2, c3, c1);677mul_add_c(a[4], b[3], c2, c3, c1);678mul_add_c(a[5], b[2], c2, c3, c1);679mul_add_c(a[6], b[1], c2, c3, c1);680mul_add_c(a[7], b[0], c2, c3, c1);681r[7] = c2;682c2 = 0;683mul_add_c(a[7], b[1], c3, c1, c2);684mul_add_c(a[6], b[2], c3, c1, c2);685mul_add_c(a[5], b[3], c3, c1, c2);686mul_add_c(a[4], b[4], c3, c1, c2);687mul_add_c(a[3], b[5], c3, c1, c2);688mul_add_c(a[2], b[6], c3, c1, c2);689mul_add_c(a[1], b[7], c3, c1, c2);690r[8] = c3;691c3 = 0;692mul_add_c(a[2], b[7], c1, c2, c3);693mul_add_c(a[3], b[6], c1, c2, c3);694mul_add_c(a[4], b[5], c1, c2, c3);695mul_add_c(a[5], b[4], c1, c2, c3);696mul_add_c(a[6], b[3], c1, c2, c3);697mul_add_c(a[7], b[2], c1, c2, c3);698r[9] = c1;699c1 = 0;700mul_add_c(a[7], b[3], c2, c3, c1);701mul_add_c(a[6], b[4], c2, c3, c1);702mul_add_c(a[5], b[5], c2, c3, c1);703mul_add_c(a[4], b[6], c2, c3, c1);704mul_add_c(a[3], b[7], c2, c3, c1);705r[10] = c2;706c2 = 0;707mul_add_c(a[4], b[7], c3, c1, c2);708mul_add_c(a[5], b[6], c3, c1, c2);709mul_add_c(a[6], b[5], c3, c1, c2);710mul_add_c(a[7], b[4], c3, c1, c2);711r[11] = c3;712c3 = 0;713mul_add_c(a[7], b[5], c1, c2, c3);714mul_add_c(a[6], b[6], c1, c2, c3);715mul_add_c(a[5], b[7], c1, c2, c3);716r[12] = c1;717c1 = 0;718mul_add_c(a[6], b[7], c2, c3, c1);719mul_add_c(a[7], b[6], c2, c3, c1);720r[13] = c2;721c2 = 0;722mul_add_c(a[7], b[7], c3, c1, c2);723r[14] = c3;724r[15] = c1;725}726727void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)728{729BN_ULONG c1, c2, c3;730731c1 = 0;732c2 = 0;733c3 = 0;734mul_add_c(a[0], b[0], c1, c2, c3);735r[0] = c1;736c1 = 0;737mul_add_c(a[0], b[1], c2, c3, c1);738mul_add_c(a[1], b[0], c2, c3, c1);739r[1] = c2;740c2 = 0;741mul_add_c(a[2], b[0], c3, c1, c2);742mul_add_c(a[1], b[1], c3, c1, c2);743mul_add_c(a[0], b[2], c3, c1, c2);744r[2] = c3;745c3 = 0;746mul_add_c(a[0], b[3], c1, 
c2, c3);747mul_add_c(a[1], b[2], c1, c2, c3);748mul_add_c(a[2], b[1], c1, c2, c3);749mul_add_c(a[3], b[0], c1, c2, c3);750r[3] = c1;751c1 = 0;752mul_add_c(a[3], b[1], c2, c3, c1);753mul_add_c(a[2], b[2], c2, c3, c1);754mul_add_c(a[1], b[3], c2, c3, c1);755r[4] = c2;756c2 = 0;757mul_add_c(a[2], b[3], c3, c1, c2);758mul_add_c(a[3], b[2], c3, c1, c2);759r[5] = c3;760c3 = 0;761mul_add_c(a[3], b[3], c1, c2, c3);762r[6] = c1;763r[7] = c2;764}765766void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)767{768BN_ULONG c1, c2, c3;769770c1 = 0;771c2 = 0;772c3 = 0;773sqr_add_c(a, 0, c1, c2, c3);774r[0] = c1;775c1 = 0;776sqr_add_c2(a, 1, 0, c2, c3, c1);777r[1] = c2;778c2 = 0;779sqr_add_c(a, 1, c3, c1, c2);780sqr_add_c2(a, 2, 0, c3, c1, c2);781r[2] = c3;782c3 = 0;783sqr_add_c2(a, 3, 0, c1, c2, c3);784sqr_add_c2(a, 2, 1, c1, c2, c3);785r[3] = c1;786c1 = 0;787sqr_add_c(a, 2, c2, c3, c1);788sqr_add_c2(a, 3, 1, c2, c3, c1);789sqr_add_c2(a, 4, 0, c2, c3, c1);790r[4] = c2;791c2 = 0;792sqr_add_c2(a, 5, 0, c3, c1, c2);793sqr_add_c2(a, 4, 1, c3, c1, c2);794sqr_add_c2(a, 3, 2, c3, c1, c2);795r[5] = c3;796c3 = 0;797sqr_add_c(a, 3, c1, c2, c3);798sqr_add_c2(a, 4, 2, c1, c2, c3);799sqr_add_c2(a, 5, 1, c1, c2, c3);800sqr_add_c2(a, 6, 0, c1, c2, c3);801r[6] = c1;802c1 = 0;803sqr_add_c2(a, 7, 0, c2, c3, c1);804sqr_add_c2(a, 6, 1, c2, c3, c1);805sqr_add_c2(a, 5, 2, c2, c3, c1);806sqr_add_c2(a, 4, 3, c2, c3, c1);807r[7] = c2;808c2 = 0;809sqr_add_c(a, 4, c3, c1, c2);810sqr_add_c2(a, 5, 3, c3, c1, c2);811sqr_add_c2(a, 6, 2, c3, c1, c2);812sqr_add_c2(a, 7, 1, c3, c1, c2);813r[8] = c3;814c3 = 0;815sqr_add_c2(a, 7, 2, c1, c2, c3);816sqr_add_c2(a, 6, 3, c1, c2, c3);817sqr_add_c2(a, 5, 4, c1, c2, c3);818r[9] = c1;819c1 = 0;820sqr_add_c(a, 5, c2, c3, c1);821sqr_add_c2(a, 6, 4, c2, c3, c1);822sqr_add_c2(a, 7, 3, c2, c3, c1);823r[10] = c2;824c2 = 0;825sqr_add_c2(a, 7, 4, c3, c1, c2);826sqr_add_c2(a, 6, 5, c3, c1, c2);827r[11] = c3;828c3 = 0;829sqr_add_c(a, 6, c1, c2, c3);830sqr_add_c2(a, 7, 5, c1, c2, 
c3);831r[12] = c1;832c1 = 0;833sqr_add_c2(a, 7, 6, c2, c3, c1);834r[13] = c2;835c2 = 0;836sqr_add_c(a, 7, c3, c1, c2);837r[14] = c3;838r[15] = c1;839}840841void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)842{843BN_ULONG c1, c2, c3;844845c1 = 0;846c2 = 0;847c3 = 0;848sqr_add_c(a, 0, c1, c2, c3);849r[0] = c1;850c1 = 0;851sqr_add_c2(a, 1, 0, c2, c3, c1);852r[1] = c2;853c2 = 0;854sqr_add_c(a, 1, c3, c1, c2);855sqr_add_c2(a, 2, 0, c3, c1, c2);856r[2] = c3;857c3 = 0;858sqr_add_c2(a, 3, 0, c1, c2, c3);859sqr_add_c2(a, 2, 1, c1, c2, c3);860r[3] = c1;861c1 = 0;862sqr_add_c(a, 2, c2, c3, c1);863sqr_add_c2(a, 3, 1, c2, c3, c1);864r[4] = c2;865c2 = 0;866sqr_add_c2(a, 3, 2, c3, c1, c2);867r[5] = c3;868c3 = 0;869sqr_add_c(a, 3, c1, c2, c3);870r[6] = c1;871r[7] = c2;872}873874#ifdef OPENSSL_NO_ASM875#ifdef OPENSSL_BN_ASM_MONT876#include <alloca.h>877/*878* This is essentially reference implementation, which may or may not879* result in performance improvement. E.g. on IA-32 this routine was880* observed to give 40% faster rsa1024 private key operations and 10%881* faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only882* by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a883* reference implementation, one to be used as starting point for884* platform-specific assembler. Mentioned numbers apply to compiler885* generated code compiled with and without -DOPENSSL_BN_ASM_MONT and886* can vary not only from platform to platform, but even for compiler887* versions. Assembler vs. 
assembler improvement coefficients can888* [and are known to] differ and are to be documented elsewhere.889*/890int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,891const BN_ULONG *np, const BN_ULONG *n0p, int num)892{893BN_ULONG c0, c1, ml, *tp, n0;894#ifdef mul64895BN_ULONG mh;896#endif897volatile BN_ULONG *vp;898int i = 0, j;899900#if 0 /* template for platform-specific \901* implementation */902if (ap == bp)903return bn_sqr_mont(rp, ap, np, n0p, num);904#endif905vp = tp = alloca((num + 2) * sizeof(BN_ULONG));906907n0 = *n0p;908909c0 = 0;910ml = bp[0];911#ifdef mul64912mh = HBITS(ml);913ml = LBITS(ml);914for (j = 0; j < num; ++j)915mul(tp[j], ap[j], ml, mh, c0);916#else917for (j = 0; j < num; ++j)918mul(tp[j], ap[j], ml, c0);919#endif920921tp[num] = c0;922tp[num + 1] = 0;923goto enter;924925for (i = 0; i < num; i++) {926c0 = 0;927ml = bp[i];928#ifdef mul64929mh = HBITS(ml);930ml = LBITS(ml);931for (j = 0; j < num; ++j)932mul_add(tp[j], ap[j], ml, mh, c0);933#else934for (j = 0; j < num; ++j)935mul_add(tp[j], ap[j], ml, c0);936#endif937c1 = (tp[num] + c0) & BN_MASK2;938tp[num] = c1;939tp[num + 1] = (c1 < c0 ? 1 : 0);940enter:941c1 = tp[0];942ml = (c1 * n0) & BN_MASK2;943c0 = 0;944#ifdef mul64945mh = HBITS(ml);946ml = LBITS(ml);947mul_add(c1, np[0], ml, mh, c0);948#else949mul_add(c1, ml, np[0], c0);950#endif951for (j = 1; j < num; j++) {952c1 = tp[j];953#ifdef mul64954mul_add(c1, np[j], ml, mh, c0);955#else956mul_add(c1, ml, np[j], c0);957#endif958tp[j - 1] = c1 & BN_MASK2;959}960c1 = (tp[num] + c0) & BN_MASK2;961tp[num - 1] = c1;962tp[num] = tp[num + 1] + (c1 < c0 ? 
1 : 0);963}964965if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {966c0 = bn_sub_words(rp, tp, np, num);967if (tp[num] != 0 || c0 == 0) {968for (i = 0; i < num + 2; i++)969vp[i] = 0;970return 1;971}972}973for (i = 0; i < num; i++)974rp[i] = tp[i], vp[i] = 0;975vp[num] = 0;976vp[num + 1] = 0;977return 1;978}979#else980/*981* Return value of 0 indicates that multiplication/convolution was not982* performed to signal the caller to fall down to alternative/original983* code-path.984*/985int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,986const BN_ULONG *np, const BN_ULONG *n0, int num)987{988return 0;989}990#endif /* OPENSSL_BN_ASM_MONT */991#endif992993#else /* !BN_MUL_COMBA */994995/* hmm... is it faster just to do a multiply? */996void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)997{998BN_ULONG t[8];999bn_sqr_normal(r, a, 4, t);1000}10011002void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)1003{1004BN_ULONG t[16];1005bn_sqr_normal(r, a, 8, t);1006}10071008void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)1009{1010r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);1011r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);1012r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);1013r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);1014}10151016void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)1017{1018r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);1019r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);1020r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);1021r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);1022r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);1023r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);1024r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);1025r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);1026}10271028#ifdef OPENSSL_NO_ASM1029#ifdef OPENSSL_BN_ASM_MONT1030#include <alloca.h>1031int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,1032const BN_ULONG *np, const BN_ULONG *n0p, int num)1033{1034BN_ULONG c0, c1, *tp, n0 = *n0p;1035volatile BN_ULONG 
*vp;1036int i = 0, j;10371038vp = tp = alloca((num + 2) * sizeof(BN_ULONG));10391040for (i = 0; i <= num; i++)1041tp[i] = 0;10421043for (i = 0; i < num; i++) {1044c0 = bn_mul_add_words(tp, ap, num, bp[i]);1045c1 = (tp[num] + c0) & BN_MASK2;1046tp[num] = c1;1047tp[num + 1] = (c1 < c0 ? 1 : 0);10481049c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);1050c1 = (tp[num] + c0) & BN_MASK2;1051tp[num] = c1;1052tp[num + 1] += (c1 < c0 ? 1 : 0);1053for (j = 0; j <= num; j++)1054tp[j] = tp[j + 1];1055}10561057if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {1058c0 = bn_sub_words(rp, tp, np, num);1059if (tp[num] != 0 || c0 == 0) {1060for (i = 0; i < num + 2; i++)1061vp[i] = 0;1062return 1;1063}1064}1065for (i = 0; i < num; i++)1066rp[i] = tp[i], vp[i] = 0;1067vp[num] = 0;1068vp[num + 1] = 0;1069return 1;1070}1071#else1072int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,1073const BN_ULONG *np, const BN_ULONG *n0, int num)1074{1075return 0;1076}1077#endif /* OPENSSL_BN_ASM_MONT */1078#endif10791080#endif /* !BN_MUL_COMBA */108110821083