GitHub Repository: freebsd/freebsd-src
Path: blob/main/crypto/openssl/crypto/bn/bn_asm.c
/*
 * Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include <assert.h>
#include <openssl/crypto.h>
#include "internal/cryptlib.h"
#include "bn_local.h"

#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)

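/*
 * rp[i] = (rp[i] + ap[i] * w + carry) mod 2^BN_BITS2 for i = 0..num-1;
 * the final carry word is returned.  mul_add() (from bn_local.h) performs
 * the word-by-word multiply-accumulate with carry propagation; the 4-way
 * unrolled loop is purely a speed optimisation.
 */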
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c1 = 0;

    assert(num >= 0);
    if (num <= 0)
        return c1;

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (num & ~3) {
        mul_add(rp[0], ap[0], w, c1);
        mul_add(rp[1], ap[1], w, c1);
        mul_add(rp[2], ap[2], w, c1);
        mul_add(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul_add(rp[0], ap[0], w, c1);
        ap++;
        rp++;
        num--;
    }

    return c1;
}

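/*
 * rp[i] = low word of (ap[i] * w + carry) for i = 0..num-1; the carry is
 * the high word of each product and the final carry word is returned.
 */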
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    assert(num >= 0);
    if (num <= 0)
        return c1;

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (num & ~3) {
        mul(rp[0], ap[0], w, c1);
        mul(rp[1], ap[1], w, c1);
        mul(rp[2], ap[2], w, c1);
        mul(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul(rp[0], ap[0], w, c1);
        ap++;
        rp++;
        num--;
    }
    return c1;
}

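/*
 * r[2*i] and r[2*i+1] receive the low and high words of a[i]^2 for
 * i = 0..n-1, so the result occupies 2*n words.  Only the word-wise
 * squares are produced here; the caller adds the cross terms.
 */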
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    assert(n >= 0);
    if (n <= 0)
        return;

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
# endif
    while (n) {
        sqr(r[0], r[1], a[0]);
        a++;
        r += 2;
        n--;
    }
}

#else                           /* !(defined(BN_LLONG) ||
                                 * defined(BN_UMULT_HIGH)) */

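/*
 * The implementations below cover builds without a double-word integer
 * type or a "high word of the product" primitive: each BN_ULONG is split
 * into half words with LBITS()/HBITS() and products are formed with the
 * half-word mul()/mul_add()/sqr64() macros from bn_local.h.
 */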
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c = 0;
    BN_ULONG bl, bh;

    assert(num >= 0);
    if (num <= 0)
        return (BN_ULONG)0;

    bl = LBITS(w);
    bh = HBITS(w);

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (num & ~3) {
        mul_add(rp[0], ap[0], bl, bh, c);
        mul_add(rp[1], ap[1], bl, bh, c);
        mul_add(rp[2], ap[2], bl, bh, c);
        mul_add(rp[3], ap[3], bl, bh, c);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul_add(rp[0], ap[0], bl, bh, c);
        ap++;
        rp++;
        num--;
    }
    return c;
}

BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG carry = 0;
    BN_ULONG bl, bh;

    assert(num >= 0);
    if (num <= 0)
        return (BN_ULONG)0;

    bl = LBITS(w);
    bh = HBITS(w);

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (num & ~3) {
        mul(rp[0], ap[0], bl, bh, carry);
        mul(rp[1], ap[1], bl, bh, carry);
        mul(rp[2], ap[2], bl, bh, carry);
        mul(rp[3], ap[3], bl, bh, carry);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul(rp[0], ap[0], bl, bh, carry);
        ap++;
        rp++;
        num--;
    }
    return carry;
}

void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    assert(n >= 0);
    if (n <= 0)
        return;

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        sqr64(r[0], r[1], a[0]);
        sqr64(r[2], r[3], a[1]);
        sqr64(r[4], r[5], a[2]);
        sqr64(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
# endif
    while (n) {
        sqr64(r[0], r[1], a[0]);
        a++;
        r += 2;
        n--;
    }
}

#endif                          /* !(defined(BN_LLONG) ||
                                 * defined(BN_UMULT_HIGH)) */

#if defined(BN_LLONG) && defined(BN_DIV2W)

BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
}

#else

/* Divide h,l by d and return the result. */
/* I need to test this some more :-( */
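/*
 * Classic schoolbook division in base 2^BN_BITS4: d is shifted left until
 * its top bit is set (the same shift is applied to h:l), then two quotient
 * "digits" of BN_BITS4 bits each are estimated from the top half word of d
 * and corrected downwards until the partial remainder is non-negative.
 */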
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG dh, dl, q, ret = 0, th, tl, t;
    int i, count = 2;

    if (d == 0)
        return BN_MASK2;

    i = BN_num_bits_word(d);
    assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

    i = BN_BITS2 - i;
    if (h >= d)
        h -= d;

    if (i) {
        d <<= i;
        h = (h << i) | (l >> (BN_BITS2 - i));
        l <<= i;
    }
    dh = (d & BN_MASK2h) >> BN_BITS4;
    dl = (d & BN_MASK2l);
    for (;;) {
        if ((h >> BN_BITS4) == dh)
            q = BN_MASK2l;
        else
            q = h / dh;

        th = q * dh;
        tl = dl * q;
        for (;;) {
            t = h - th;
            if ((t & BN_MASK2h) ||
                ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
                break;
            q--;
            th -= dh;
            tl -= dl;
        }
        t = (tl >> BN_BITS4);
        tl = (tl << BN_BITS4) & BN_MASK2h;
        th += t;

        if (l < tl)
            th++;
        l -= tl;
        if (h < th) {
            h += d;
            q--;
        }
        h -= th;

        if (--count == 0)
            break;

        ret = q << BN_BITS4;
        h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
        l = (l & BN_MASK2l) << BN_BITS4;
    }
    ret |= q;
    return ret;
}
#endif                          /* !defined(BN_LLONG) && defined(BN_DIV2W) */

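/*
 * bn_add_words(): r[i] = (a[i] + b[i] + carry) mod 2^BN_BITS2 for
 * i = 0..n-1; the final carry (0 or 1) is returned.
 */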
#ifdef BN_LLONG
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULLONG ll = 0;

    assert(n >= 0);
    if (n <= 0)
        return (BN_ULONG)0;

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        ll += (BN_ULLONG) a[0] + b[0];
        r[0] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        ll += (BN_ULLONG) a[1] + b[1];
        r[1] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        ll += (BN_ULLONG) a[2] + b[2];
        r[2] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        ll += (BN_ULLONG) a[3] + b[3];
        r[3] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
# endif
    while (n) {
        ll += (BN_ULLONG) a[0] + b[0];
        r[0] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        a++;
        b++;
        r++;
        n--;
    }
    return (BN_ULONG)ll;
}
#else                           /* !BN_LLONG */
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULONG c, l, t;

    assert(n >= 0);
    if (n <= 0)
        return (BN_ULONG)0;

    c = 0;
# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        t = a[0];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[0]) & BN_MASK2;
        c += (l < t);
        r[0] = l;
        t = a[1];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[1]) & BN_MASK2;
        c += (l < t);
        r[1] = l;
        t = a[2];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[2]) & BN_MASK2;
        c += (l < t);
        r[2] = l;
        t = a[3];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[3]) & BN_MASK2;
        c += (l < t);
        r[3] = l;
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
# endif
    while (n) {
        t = a[0];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[0]) & BN_MASK2;
        c += (l < t);
        r[0] = l;
        a++;
        b++;
        r++;
        n--;
    }
    return (BN_ULONG)c;
}
#endif                          /* !BN_LLONG */

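/*
 * bn_sub_words(): r[i] = (a[i] - b[i] - borrow) mod 2^BN_BITS2 for
 * i = 0..n-1; the final borrow (0 or 1) is returned.
 */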
BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULONG t1, t2;
    int c = 0;

    assert(n >= 0);
    if (n <= 0)
        return (BN_ULONG)0;

#ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        t1 = a[0];
        t2 = (t1 - c) & BN_MASK2;
        c = (t2 > t1);
        t1 = b[0];
        t1 = (t2 - t1) & BN_MASK2;
        r[0] = t1;
        c += (t1 > t2);
        t1 = a[1];
        t2 = (t1 - c) & BN_MASK2;
        c = (t2 > t1);
        t1 = b[1];
        t1 = (t2 - t1) & BN_MASK2;
        r[1] = t1;
        c += (t1 > t2);
        t1 = a[2];
        t2 = (t1 - c) & BN_MASK2;
        c = (t2 > t1);
        t1 = b[2];
        t1 = (t2 - t1) & BN_MASK2;
        r[2] = t1;
        c += (t1 > t2);
        t1 = a[3];
        t2 = (t1 - c) & BN_MASK2;
        c = (t2 > t1);
        t1 = b[3];
        t1 = (t2 - t1) & BN_MASK2;
        r[3] = t1;
        c += (t1 > t2);
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
#endif
    while (n) {
        t1 = a[0];
        t2 = (t1 - c) & BN_MASK2;
        c = (t2 > t1);
        t1 = b[0];
        t1 = (t2 - t1) & BN_MASK2;
        r[0] = t1;
        c += (t1 > t2);
        a++;
        b++;
        r++;
        n--;
    }
    return c;
}

#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */

# ifdef BN_LLONG
/*
 * Keep in mind that additions to multiplication result can not
 * overflow, because its high half cannot be all-ones.
 */
# define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG hi; \
        BN_ULLONG t = (BN_ULLONG)(a)*(b); \
        t += c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(t); \
        hi = (BN_ULONG)Hw(t); \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \
        } while(0)

# define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG hi; \
        BN_ULLONG t = (BN_ULLONG)(a)*(b); \
        BN_ULLONG tt = t+c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(tt); \
        hi = (BN_ULONG)Hw(tt); \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \
        t += c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(t); \
        hi = (BN_ULONG)Hw(t); \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \
        } while(0)

# define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG hi; \
        BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \
        t += c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(t); \
        hi = (BN_ULONG)Hw(t); \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \
        } while(0)

# define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_LOHI)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
# define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b); \
        BN_ULONG lo, hi; \
        BN_UMULT_LOHI(lo,hi,ta,tb); \
        c0 += lo; hi += (c0<lo); \
        c1 += hi; c2 += (c1<hi); \
        } while(0)

# define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b); \
        BN_ULONG lo, hi, tt; \
        BN_UMULT_LOHI(lo,hi,ta,tb); \
        c0 += lo; tt = hi + (c0<lo); \
        c1 += tt; c2 += (c1<tt); \
        c0 += lo; hi += (c0<lo); \
        c1 += hi; c2 += (c1<hi); \
        } while(0)

# define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG ta = (a)[i]; \
        BN_ULONG lo, hi; \
        BN_UMULT_LOHI(lo,hi,ta,ta); \
        c0 += lo; hi += (c0<lo); \
        c1 += hi; c2 += (c1<hi); \
        } while(0)

# define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_HIGH)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
# define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b); \
        BN_ULONG lo = ta * tb; \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
        c0 += lo; hi += (c0<lo); \
        c1 += hi; c2 += (c1<hi); \
        } while(0)

# define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b), tt; \
        BN_ULONG lo = ta * tb; \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
        c0 += lo; tt = hi + (c0<lo); \
        c1 += tt; c2 += (c1<tt); \
        c0 += lo; hi += (c0<lo); \
        c1 += hi; c2 += (c1<hi); \
        } while(0)

# define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG ta = (a)[i]; \
        BN_ULONG lo = ta * ta; \
        BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \
        c0 += lo; hi += (c0<lo); \
        c1 += hi; c2 += (c1<hi); \
        } while(0)

# define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# else                          /* !BN_LLONG */
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
# define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG lo = LBITS(a), hi = HBITS(a); \
        BN_ULONG bl = LBITS(b), bh = HBITS(b); \
        mul64(lo,hi,bl,bh); \
        c0 = (c0+lo)&BN_MASK2; hi += (c0<lo); \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \
        } while(0)

# define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG tt; \
        BN_ULONG lo = LBITS(a), hi = HBITS(a); \
        BN_ULONG bl = LBITS(b), bh = HBITS(b); \
        mul64(lo,hi,bl,bh); \
        tt = hi; \
        c0 = (c0+lo)&BN_MASK2; tt += (c0<lo); \
        c1 = (c1+tt)&BN_MASK2; c2 += (c1<tt); \
        c0 = (c0+lo)&BN_MASK2; hi += (c0<lo); \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \
        } while(0)

# define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG lo, hi; \
        sqr64(lo,hi,(a)[i]); \
        c0 = (c0+lo)&BN_MASK2; hi += (c0<lo); \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \
        } while(0)

# define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# endif                         /* !BN_LLONG */

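/*
 * The comba (product-scanning) routines below compute one output word at
 * a time: all partial products a[i]*b[j] with i+j == k are accumulated
 * into the three-word accumulator (c2,c1,c0) before r[k] is stored, and
 * the accumulator registers are then rotated for the next column.
 */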
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

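/*
 * Squaring needs only the upper triangle of partial products: a[i]*a[j]
 * with i > j is counted twice via sqr_add_c2(), while the diagonal terms
 * a[i]^2 are added once via sqr_add_c().
 */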
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

# ifdef OPENSSL_NO_ASM
# ifdef OPENSSL_BN_ASM_MONT
# include <alloca.h>
/*
 * This is essentially reference implementation, which may or may not
 * result in performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 * reference implementation, one to be used as starting point for
 * platform-specific assembler. Mentioned numbers apply to compiler
 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 * can vary not only from platform to platform, but even for compiler
 * versions. Assembler vs. assembler improvement coefficients can
 * [and are known to] differ and are to be documented elsewhere.
 */
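/*
 * Word-serial Montgomery multiplication: for R = 2^(num*BN_BITS2) this
 * computes rp = ap * bp * R^(-1) mod np, interleaving one word of the
 * multiplication with one reduction step per iteration.  *n0p is expected
 * to hold -np^(-1) mod 2^BN_BITS2, as prepared by BN_MONT_CTX_set().
 */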
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG c0, c1, ml, *tp, n0;
# ifdef mul64
    BN_ULONG mh;
# endif
    volatile BN_ULONG *vp;
    int i = 0, j;

# if 0                          /* template for platform-specific
                                 * implementation */
    if (ap == bp)
        return bn_sqr_mont(rp, ap, np, n0p, num);
# endif
    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

    n0 = *n0p;

    c0 = 0;
    ml = bp[0];
# ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], ml, mh, c0);
# else
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], ml, c0);
# endif

    tp[num] = c0;
    tp[num + 1] = 0;
    goto enter;

    for (i = 0; i < num; i++) {
        c0 = 0;
        ml = bp[i];
# ifdef mul64
        mh = HBITS(ml);
        ml = LBITS(ml);
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], ml, mh, c0);
# else
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], ml, c0);
# endif
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] = (c1 < c0 ? 1 : 0);
 enter:
        c1 = tp[0];
        ml = (c1 * n0) & BN_MASK2;
        c0 = 0;
# ifdef mul64
        mh = HBITS(ml);
        ml = LBITS(ml);
        mul_add(c1, np[0], ml, mh, c0);
# else
        mul_add(c1, ml, np[0], c0);
# endif
        for (j = 1; j < num; j++) {
            c1 = tp[j];
# ifdef mul64
            mul_add(c1, np[j], ml, mh, c0);
# else
            mul_add(c1, ml, np[j], c0);
# endif
            tp[j - 1] = c1 & BN_MASK2;
        }
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num - 1] = c1;
        tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
    }

    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        c0 = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || c0 == 0) {
            for (i = 0; i < num + 2; i++)
                vp[i] = 0;
            return 1;
        }
    }
    for (i = 0; i < num; i++)
        rp[i] = tp[i], vp[i] = 0;
    vp[num] = 0;
    vp[num + 1] = 0;
    return 1;
}
# else
/*
 * Return value of 0 indicates that multiplication/convolution was not
 * performed to signal the caller to fall down to alternative/original
 * code-path.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    return 0;
}
# endif                         /* OPENSSL_BN_ASM_MONT */
# endif

#else                           /* !BN_MUL_COMBA */

/* hmm... is it faster just to do a multiply? */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t[8];
    bn_sqr_normal(r, a, 4, t);
}

void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t[16];
    bn_sqr_normal(r, a, 8, t);
}

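/*
 * Without the comba macros, the "comba" entry points fall back to plain
 * schoolbook multiplication: one bn_mul_words() pass for the first word
 * of b and one bn_mul_add_words() pass per remaining word.
 */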
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
    r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
    r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
    r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
}

void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
    r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
    r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
    r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
    r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
    r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
    r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
    r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
}

# ifdef OPENSSL_NO_ASM
# ifdef OPENSSL_BN_ASM_MONT
# include <alloca.h>
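/*
 * Montgomery multiplication built from bn_mul_add_words(): each iteration
 * adds ap * bp[i] into the accumulator tp, then adds np * (tp[0] * n0) to
 * cancel the low word, and finally shifts tp down by one word.
 */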
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG c0, c1, *tp, n0 = *n0p;
    volatile BN_ULONG *vp;
    int i = 0, j;

    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

    for (i = 0; i <= num; i++)
        tp[i] = 0;

    for (i = 0; i < num; i++) {
        c0 = bn_mul_add_words(tp, ap, num, bp[i]);
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] = (c1 < c0 ? 1 : 0);

        c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] += (c1 < c0 ? 1 : 0);
        for (j = 0; j <= num; j++)
            tp[j] = tp[j + 1];
    }

    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        c0 = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || c0 == 0) {
            for (i = 0; i < num + 2; i++)
                vp[i] = 0;
            return 1;
        }
    }
    for (i = 0; i < num; i++)
        rp[i] = tp[i], vp[i] = 0;
    vp[num] = 0;
    vp[num + 1] = 0;
    return 1;
}
# else
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    return 0;
}
# endif                         /* OPENSSL_BN_ASM_MONT */
# endif

#endif                          /* !BN_MUL_COMBA */