Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
samr7
GitHub Repository: samr7/vanitygen
Path: blob/master/calc_addrs.cl
239 views
1
/*
2
* Vanitygen, vanity bitcoin address generator
3
* Copyright (C) 2011 <samr7@cs.washington.edu>
4
*
5
* Vanitygen is free software: you can redistribute it and/or modify
6
* it under the terms of the GNU Affero General Public License as published by
7
* the Free Software Foundation, either version 3 of the License, or
8
* any later version.
9
*
10
* Vanitygen is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU Affero General Public License for more details.
14
*
15
* You should have received a copy of the GNU Affero General Public License
16
* along with Vanitygen. If not, see <http://www.gnu.org/licenses/>.
17
*/
18
19
/*
20
* This file contains an OpenCL kernel for performing certain parts of
21
* the bitcoin address calculation process.
22
*
23
* Kernel: ec_add_grid
24
*
25
* Inputs:
26
* - Row: Array of (sequential) EC points
27
* - Column: Array of column increment EC points (= rowsize * Pgenerator)
28
*
29
* Steps:
30
* - Compute P = Row[x] + Column[y]
31
* P is computed as numerator/denominator components Pxj, Pyj, Pz
32
* Final values are: Px = Pxj / (Pz^2), Py = Pyj / (Pz^3)
33
*
34
* The modular inverse of Pz is required to compute Px and Py, and
35
* can be computed more efficiently in large batches. This is done in
36
* the next kernel heap_invert.
37
*
38
* - Store Pxj, Pyj to intermediate point buffer
39
* - Store Pz to z_heap
40
*
41
* Outputs:
42
* - Intermediate point buffer
43
* - Denominator buffer (z_heap)
44
*
45
* -------------------------------
46
* Kernel: heap_invert
47
*
48
* Inputs:
49
* - Denominator buffer (z_heap)
50
* - N = Batch size (power of 2)
51
*
52
* Steps:
53
* - Compute the product tree for N values in the denominator buffer
54
* - Compute the modular inverse of the root of the product tree
55
* - Multiply down the tree to compute the modular inverse of each leaf
56
*
57
* Outputs:
58
* - Modular inverse denominator buffer (z_heap)
59
*
60
* -------------------------------
61
* Kernel: hash_ec_point_get
62
*
63
* Inputs:
64
* - Intermediate point buffer
65
* - Modular inverse denominator buffer (z_heap)
66
*
67
* Steps:
68
* - Compute Px = Pxj * (1/Pz)^2
69
* - Compute Py = Pyj * (1/Pz)^3
70
* - Compute H = RIPEMD160(SHA256(0x04 | Px | Py))
71
*
72
* Output:
73
* - Array of 20-byte address hash values
74
*
75
* -------------------------------
76
* Kernel: hash_ec_point_search_prefix
77
*
78
* Like hash_ec_point_get, but instead of storing the complete hash
79
* value to an output buffer, it searches a sorted list of ranges,
80
* and if a match is found, writes a flag to an output buffer.
81
*/
82
83
84
/* Byte-swapping and endianness */
85
#define bswap32(v) \
86
(((v) >> 24) | (((v) >> 8) & 0xff00) | \
87
(((v) << 8) & 0xff0000) | ((v) << 24))
88
89
#if __ENDIAN_LITTLE__ != 1
90
#define load_le32(v) bswap32(v)
91
#define load_be32(v) (v)
92
#else
93
#define load_le32(v) (v)
94
#define load_be32(v) bswap32(v)
95
#endif
96
97
/*
98
* Loop unrolling macros
99
*
100
* In most cases, preprocessor unrolling works best.
101
* The exception is NVIDIA's compiler, which seems to take unreasonably
102
* long to compile a loop with a larger iteration count, or a loop with
103
* a body of >50 PTX instructions, with preprocessor unrolling.
104
* However, it does not seem to take as long with pragma unroll, and
105
* produces good output.
106
*/
107
108
/* Explicit loop unrolling */
109
#define unroll_5(a) do { a(0) a(1) a(2) a(3) a(4) } while (0)
110
#define unroll_8(a) do { a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) } while (0)
111
#define unroll_1_7(a) do { a(1) a(2) a(3) a(4) a(5) a(6) a(7) } while (0)
112
#define unroll_7(a) do { a(0) a(1) a(2) a(3) a(4) a(5) a(6) } while (0)
113
#define unroll_7_0(a) do { a(7) a(6) a(5) a(4) a(3) a(2) a(1) a(0) } while (0)
114
#define unroll_7_1(a) do { a(7) a(6) a(5) a(4) a(3) a(2) a(1) } while (0)
115
#define unroll_16(a) do { \
116
a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) \
117
a(8) a(9) a(10) a(11) a(12) a(13) a(14) a(15) \
118
} while (0)
119
#define unroll_64(a) do { \
120
a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) \
121
a(8) a(9) a(10) a(11) a(12) a(13) a(14) a(15) \
122
a(16) a(17) a(18) a(19) a(20) a(21) a(22) a(23) \
123
a(24) a(25) a(26) a(27) a(28) a(29) a(30) a(31) \
124
a(32) a(33) a(34) a(35) a(36) a(37) a(38) a(39) \
125
a(40) a(41) a(42) a(43) a(44) a(45) a(46) a(47) \
126
a(48) a(49) a(50) a(51) a(52) a(53) a(54) a(55) \
127
a(56) a(57) a(58) a(59) a(60) a(61) a(62) a(63) \
128
} while (0)
129
130
/* Conditional loop unrolling */
131
#if defined(DEEP_PREPROC_UNROLL)
132
#define iter_5(a) unroll_5(a)
133
#define iter_8(a) unroll_8(a)
134
#define iter_16(a) unroll_16(a)
135
#define iter_64(a) unroll_64(a)
136
#else
137
#define iter_5(a) do {int _i; for (_i = 0; _i < 5; _i++) { a(_i) }} while (0)
138
#define iter_8(a) do {int _i; for (_i = 0; _i < 8; _i++) { a(_i) }} while (0)
139
#define iter_16(a) do {int _i; for (_i = 0; _i < 16; _i++) { a(_i) }} while (0)
140
#define iter_64(a) do {int _i; for (_i = 0; _i < 64; _i++) { a(_i) }} while (0)
141
#endif
142
143
/*
144
* BIGNUM mini-library
145
* This module deals with fixed-size 256-bit bignums.
146
* Where modular arithmetic is performed, the SECP256k1 prime
147
* modulus (below) is assumed.
148
*
149
* Methods include:
150
* - bn_is_zero/bn_is_one/bn_is_odd/bn_is_even/bn_is_bit_set
151
* - bn_rshift[1]/bn_lshift[1]
152
* - bn_neg
153
* - bn_uadd/bn_uadd_p
154
* - bn_usub/bn_usub_p
155
*/
156
157
typedef uint bn_word;
158
#define BN_NBITS 256
159
#define BN_WSHIFT 5
160
#define BN_WBITS (1 << BN_WSHIFT)
161
#define BN_NWORDS ((BN_NBITS/8) / sizeof(bn_word))
162
#define BN_WORDMAX 0xffffffff
163
164
#define MODULUS_BYTES \
165
0xfffffc2f, 0xfffffffe, 0xffffffff, 0xffffffff, \
166
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
167
168
typedef struct {
169
bn_word d[BN_NWORDS];
170
} bignum;
171
172
__constant bn_word modulus[] = { MODULUS_BYTES };
173
__constant bignum bn_zero;
174
175
__constant bn_word mont_rr[BN_NWORDS] = { 0xe90a1, 0x7a2, 0x1, 0, };
176
__constant bn_word mont_n0[2] = { 0xd2253531, 0xd838091d };
177
178
179
#define bn_is_odd(bn) (bn.d[0] & 1)
180
#define bn_is_even(bn) (!bn_is_odd(bn))
181
#define bn_is_zero(bn) (!bn.d[0] && !bn.d[1] && !bn.d[2] && \
182
!bn.d[3] && !bn.d[4] && !bn.d[5] && \
183
!bn.d[6] && !bn.d[7])
184
#define bn_is_one(bn) ((bn.d[0] == 1) && !bn.d[1] && !bn.d[2] && \
185
!bn.d[3] && !bn.d[4] && !bn.d[5] && \
186
!bn.d[6] && !bn.d[7])
187
#define bn_is_bit_set(bn, n) \
188
((((bn_word*)&bn)[n >> BN_WSHIFT]) & (1 << (n & (BN_WBITS-1))))
189
190
#define bn_unroll(e) unroll_8(e)
191
#define bn_unroll_sf(e) unroll_1_7(e)
192
#define bn_unroll_sl(e) unroll_7(e)
193
#define bn_unroll_reverse(e) unroll_7_0(e)
194
#define bn_unroll_reverse_sl(e) unroll_7_1(e)
195
196
#define bn_unroll_arg(e, arg) \
197
e(arg, 0) e(arg, 1) e(arg, 2) e(arg, 3) \
198
e(arg, 4) e(arg, 5) e(arg, 6) e(arg, 7)
199
#define bn_unroll_arg_sf(e, arg) \
200
e(arg, 1) e(arg, 2) e(arg, 3) \
201
e(arg, 4) e(arg, 5) e(arg, 6) e(arg, 7)
202
203
#define bn_iter(e) iter_8(e)
204
205
206
/*
207
* Bitwise shift
208
*/
209
210
/*
 * Shift the 256-bit value left by one bit, in place.
 * The bit shifted out of the top word is discarded; callers that
 * need it (e.g. bn_mod_lshift1) must sample d[BN_NWORDS-1] first.
 */
void
bn_lshift1(bignum *bn)
{
	/* Walk from the most significant word down so each word can
	 * still read its new low bit from the unshifted word below. */
#define bn_lshift1_inner1(i) \
	bn->d[i] = (bn->d[i] << 1) | (bn->d[i-1] >> 31);
	bn_unroll_reverse_sl(bn_lshift1_inner1);
	bn->d[0] <<= 1;
}
218
219
void
220
bn_rshift(bignum *bn, int shift)
221
{
222
int wd, iws, iwr;
223
bn_word ihw, ilw;
224
iws = (shift & (BN_WBITS-1));
225
iwr = BN_WBITS - iws;
226
wd = (shift >> BN_WSHIFT);
227
ihw = (wd < BN_WBITS) ? bn->d[wd] : 0;
228
229
#define bn_rshift_inner1(i) \
230
wd++; \
231
ilw = ihw; \
232
ihw = (wd < BN_WBITS) ? bn->d[wd] : 0; \
233
bn->d[i] = (ilw >> iws) | (ihw << iwr);
234
bn_unroll_sl(bn_rshift_inner1);
235
bn->d[BN_NWORDS-1] = (ihw >> iws);
236
}
237
238
/* Shift right by one bit, in place; the low bit is discarded. */
void
bn_rshift1(bignum *bn)
{
	/* Each word takes its new top bit from the word above. */
#define bn_rshift1_inner1(i) \
	bn->d[i] = (bn->d[i+1] << 31) | (bn->d[i] >> 1);
	bn_unroll_sl(bn_rshift1_inner1);
	bn->d[BN_NWORDS-1] >>= 1;
}
246
247
/*
 * Shift two bignums right by one bit each, in lockstep.
 * Interleaving the two independent shifts presumably exposes more
 * instruction-level parallelism than two bn_rshift1 calls — used by
 * the even-halving steps of bn_mod_inverse.
 */
void
bn_rshift1_2(bignum *bna, bignum *bnb)
{
#define bn_rshift1_2_inner1(i) \
	bna->d[i] = (bna->d[i+1] << 31) | (bna->d[i] >> 1); \
	bnb->d[i] = (bnb->d[i+1] << 31) | (bnb->d[i] >> 1);
	bn_unroll_sl(bn_rshift1_2_inner1);
	bna->d[BN_NWORDS-1] >>= 1;
	bnb->d[BN_NWORDS-1] >>= 1;
}
257
258
259
/*
260
* Unsigned comparison
261
*/
262
263
/*
 * Unsigned compare: returns 1 if a >= b, 0 otherwise.
 * Branch-light form for GPU: a per-word "less" / "greater" flag is
 * recorded at the bit position of the word index, so the most
 * significant differing word dominates the comparison; a < b
 * exactly when l > g.
 */
int
bn_ucmp_ge(bignum *a, bignum *b)
{
	int l = 0, g = 0;

#define bn_ucmp_ge_inner1(i) \
	if (a->d[i] < b->d[i]) l |= (1 << i); \
	if (a->d[i] > b->d[i]) g |= (1 << i);
	bn_unroll_reverse(bn_ucmp_ge_inner1);
	return (l > g) ? 0 : 1;
}
274
275
/*
 * Same as bn_ucmp_ge, but the right-hand operand is a bare word
 * array in __constant memory (typically the modulus).
 */
int
bn_ucmp_ge_c(bignum *a, __constant bn_word *b)
{
	int l = 0, g = 0;

#define bn_ucmp_ge_c_inner1(i) \
	if (a->d[i] < b[i]) l |= (1 << i); \
	if (a->d[i] > b[i]) g |= (1 << i);
	bn_unroll_reverse(bn_ucmp_ge_c_inner1);
	return (l > g) ? 0 : 1;
}
286
287
/*
288
* Negate
289
*/
290
291
/*
 * Two's-complement negate in place: n = ~n + 1 (mod 2^256).
 * The +1 carry keeps rippling only while the resulting word is
 * zero (i.e. the complemented word was 0xffffffff).
 */
void
bn_neg(bignum *n)
{
	int c = 1;

#define bn_neg_inner1(i) \
	c = (n->d[i] = (~n->d[i]) + c) ? 0 : c;
	bn_unroll(bn_neg_inner1);
}
300
301
/*
302
* Add/subtract
303
*/
304
305
/* r = a + b with no carry in; c receives the carry out (sum wrapped
 * iff t < a).  t is scratch so a or b may alias r. */
#define bn_add_word(r, a, b, t, c) do { \
		t = a + b; \
		c = (t < a) ? 1 : 0; \
		r = t; \
	} while (0)

/* r = a + b + c (c in {0,1}); c receives the carry out.  Carry
 * occurs when the sum wraps below a, or when b + c itself wrapped
 * to zero (t == a while c was set). */
#define bn_addc_word(r, a, b, t, c) do { \
		t = a + b + c; \
		c = (t < a) ? 1 : ((c & (t == a)) ? 1 : 0); \
		r = t; \
	} while (0)
316
317
/*
 * Sequential ripple-carry add of two BN_NWORDS-word arrays:
 * r = a + b.  Returns the carry out of the top word.
 */
bn_word
bn_uadd_words_seq(bn_word *r, bn_word *a, bn_word *b)
{
	bn_word t, c = 0;

#define bn_uadd_words_seq_inner1(i) \
	bn_addc_word(r[i], a[i], b[i], t, c);

	/* Word 0 has no carry in; words 1..7 propagate it. */
	bn_add_word(r[0], a[0], b[0], t, c);
	bn_unroll_sf(bn_uadd_words_seq_inner1);
	return c;
}
328
329
/*
 * Same as bn_uadd_words_seq with b in __constant memory.  It
 * re-expands bn_uadd_words_seq_inner1, which only works because the
 * parameter names (r, a, b, t, c) match — keep them in sync.
 */
bn_word
bn_uadd_words_c_seq(bn_word *r, bn_word *a, __constant bn_word *b)
{
	bn_word t, c = 0;

	bn_add_word(r[0], a[0], b[0], t, c);
	bn_unroll_sf(bn_uadd_words_seq_inner1);
	return c;
}
338
339
/* r = a - b with no borrow in; c receives the borrow out (1 iff
 * a < b).  t is scratch so a or b may alias r. */
#define bn_sub_word(r, a, b, t, c) do { \
		t = a - b; \
		c = (a < b) ? 1 : 0; \
		r = t; \
	} while (0)

/*
 * r = a - b - c (c in {0,1}); c receives the borrow out.
 * A borrow occurs exactly when a < b + c as unbounded integers,
 * i.e. a < b, or a <= b while a borrow was pending.  (The latter
 * also covers b == 0xffffffff with c == 1, where b + c wraps.)
 *
 * Fix vs. the original: the old predicate ((!a && c) || (a < b))
 * missed the a == b (a != 0) with c == 1 case, silently dropping
 * the borrow and corrupting multi-word subtraction chains.
 */
#define bn_subb_word(r, a, b, t, c) do { \
		t = a - (b + c); \
		c = (((a) < (b)) || ((c) && ((a) <= (b)))) ? 1 : 0; \
		r = t; \
	} while (0)
351
352
/*
 * Sequential ripple-borrow subtract: r = a - b (mod 2^256).
 * Returns the borrow out of the top word (1 iff a < b).
 */
bn_word
bn_usub_words_seq(bn_word *r, bn_word *a, bn_word *b)
{
	bn_word t, c = 0;

#define bn_usub_words_seq_inner1(i) \
	bn_subb_word(r[i], a[i], b[i], t, c);

	/* Word 0 has no borrow in; words 1..7 propagate it. */
	bn_sub_word(r[0], a[0], b[0], t, c);
	bn_unroll_sf(bn_usub_words_seq_inner1);
	return c;
}
364
365
/*
 * Same as bn_usub_words_seq with b in __constant memory.  Re-expands
 * bn_usub_words_seq_inner1, relying on matching parameter names.
 */
bn_word
bn_usub_words_c_seq(bn_word *r, bn_word *a, __constant bn_word *b)
{
	bn_word t, c = 0;

	bn_sub_word(r[0], a[0], b[0], t, c);
	bn_unroll_sf(bn_usub_words_seq_inner1);
	return c;
}
374
375
/*
376
* Add/subtract better suited for AMD's VLIW architecture
377
*/
378
/*
 * Carry-lookahead add tuned for VLIW GPUs: all word sums and all
 * carry generate/propagate bits are computed independently (good
 * ILP), then the whole carry vector is resolved with one scalar
 * add.  Returns the carry out of the top word.
 */
bn_word
bn_uadd_words_vliw(bn_word *r, bn_word *a, bn_word *b)
{
	bignum x;
	bn_word c = 0, cp = 0;

	/* Raw per-word sums, ignoring carries. */
#define bn_uadd_words_vliw_inner1(i) \
	x.d[i] = a[i] + b[i];

	/* c: bit i set iff word i generates a carry (sum wrapped).
	 * cp: bit i set iff word i propagates one (sum == 0xffffffff). */
#define bn_uadd_words_vliw_inner2(i) \
	c |= (a[i] > x.d[i]) ? (1 << i) : 0; \
	cp |= (!~x.d[i]) ? (1 << i) : 0;

	/* Add the resolved carry-in bit to each word. */
#define bn_uadd_words_vliw_inner3(i) \
	r[i] = x.d[i] + ((c >> i) & 1);

	bn_unroll(bn_uadd_words_vliw_inner1);
	bn_unroll(bn_uadd_words_vliw_inner2);
	/* Adding the shifted generate mask to the propagate mask lets
	 * a carry ripple across each run of propagate bits in one
	 * integer add; the XOR recovers which positions got a carry. */
	c = ((cp + (c << 1)) ^ cp);
	r[0] = x.d[0];
	bn_unroll_sf(bn_uadd_words_vliw_inner3);
	return c >> BN_NWORDS;
}
401
402
/*
 * VLIW add with b in __constant memory.  Re-expands the
 * bn_uadd_words_vliw_inner* macros, relying on matching parameter
 * and local names — keep them in sync with bn_uadd_words_vliw.
 */
bn_word
bn_uadd_words_c_vliw(bn_word *r, bn_word *a, __constant bn_word *b)
{
	bignum x;
	bn_word c = 0, cp = 0;

	bn_unroll(bn_uadd_words_vliw_inner1);
	bn_unroll(bn_uadd_words_vliw_inner2);
	c = ((cp + (c << 1)) ^ cp);
	r[0] = x.d[0];
	bn_unroll_sf(bn_uadd_words_vliw_inner3);
	return c >> BN_NWORDS;
}
415
416
/*
 * Borrow-lookahead subtract, mirror of bn_uadd_words_vliw:
 * independent per-word differences plus borrow generate/propagate
 * bits, resolved in one scalar add.  Returns the borrow out of the
 * top word (1 iff a < b).
 */
bn_word
bn_usub_words_vliw(bn_word *r, bn_word *a, bn_word *b)
{
	bignum x;
	bn_word c = 0, cp = 0;

	/* Raw per-word differences, ignoring borrows. */
#define bn_usub_words_vliw_inner1(i) \
	x.d[i] = a[i] - b[i];

	/* c: bit i set iff word i generates a borrow (a[i] < b[i]).
	 * cp: bit i set iff it propagates one (difference == 0). */
#define bn_usub_words_vliw_inner2(i) \
	c |= (a[i] < b[i]) ? (1 << i) : 0; \
	cp |= (!x.d[i]) ? (1 << i) : 0;

	/* Subtract the resolved borrow-in bit from each word. */
#define bn_usub_words_vliw_inner3(i) \
	r[i] = x.d[i] - ((c >> i) & 1);

	bn_unroll(bn_usub_words_vliw_inner1);
	bn_unroll(bn_usub_words_vliw_inner2);
	/* Same lookahead trick as the add: ripple borrows across runs
	 * of propagate bits with one add, recover them with XOR. */
	c = ((cp + (c << 1)) ^ cp);
	r[0] = x.d[0];
	bn_unroll_sf(bn_usub_words_vliw_inner3);
	return c >> BN_NWORDS;
}
439
440
/*
 * VLIW subtract with b in __constant memory.  Re-expands the
 * bn_usub_words_vliw_inner* macros, relying on matching parameter
 * and local names — keep them in sync with bn_usub_words_vliw.
 */
bn_word
bn_usub_words_c_vliw(bn_word *r, bn_word *a, __constant bn_word *b)
{
	bignum x;
	bn_word c = 0, cp = 0;

	bn_unroll(bn_usub_words_vliw_inner1);
	bn_unroll(bn_usub_words_vliw_inner2);
	c = ((cp + (c << 1)) ^ cp);
	r[0] = x.d[0];
	bn_unroll_sf(bn_usub_words_vliw_inner3);
	return c >> BN_NWORDS;
}
453
454
455
#if defined(DEEP_VLIW)
456
#define bn_uadd_words bn_uadd_words_vliw
457
#define bn_uadd_words_c bn_uadd_words_c_vliw
458
#define bn_usub_words bn_usub_words_vliw
459
#define bn_usub_words_c bn_usub_words_c_vliw
460
#else
461
#define bn_uadd_words bn_uadd_words_seq
462
#define bn_uadd_words_c bn_uadd_words_c_seq
463
#define bn_usub_words bn_usub_words_seq
464
#define bn_usub_words_c bn_usub_words_c_seq
465
#endif
466
467
#define bn_uadd(r, a, b) bn_uadd_words((r)->d, (a)->d, (b)->d)
468
#define bn_uadd_c(r, a, b) bn_uadd_words_c((r)->d, (a)->d, b)
469
#define bn_usub(r, a, b) bn_usub_words((r)->d, (a)->d, (b)->d)
470
#define bn_usub_c(r, a, b) bn_usub_words_c((r)->d, (a)->d, b)
471
472
/*
473
* Modular add/sub
474
*/
475
476
/*
 * r = (a + b) mod p.  Assumes a, b < p, so a single conditional
 * subtraction (on carry out or r >= p) renormalizes the result.
 */
void
bn_mod_add(bignum *r, bignum *a, bignum *b)
{
	if (bn_uadd(r, a, b) ||
	    (bn_ucmp_ge_c(r, modulus)))
		bn_usub_c(r, r, modulus);
}
483
484
/* r = (a - b) mod p: on borrow, wrap back into range by adding p. */
void
bn_mod_sub(bignum *r, bignum *a, bignum *b)
{
	if (bn_usub(r, a, b))
		bn_uadd_c(r, r, modulus);
}
490
491
/*
 * bn = (2 * bn) mod p.  The top bit is sampled before the shift so
 * the overflow case is renormalized along with the bn >= p case.
 */
void
bn_mod_lshift1(bignum *bn)
{
	bn_word c = (bn->d[BN_NWORDS-1] & 0x80000000);
	bn_lshift1(bn);
	if (c || (bn_ucmp_ge_c(bn, modulus)))
		bn_usub_c(bn, bn, modulus);
}
499
500
/*
501
* Montgomery multiplication
502
*
503
* This includes normal multiplication of two "Montgomeryized"
504
* bignums, and bn_from_mont for de-Montgomeryizing a bignum.
505
*/
506
507
/* r = (a * w) + c, 32x32->64 via mul_hi; c receives the high word
 * of the product plus the carry from adding the old c. */
#define bn_mul_word(r, a, w, c, p, s) do { \
		r = (a * w) + c; \
		p = mul_hi(a, w); \
		c = (r < c) ? p + 1 : p; \
	} while (0)

/* r = r + (a * w) + c; c receives the new two-part carry (high
 * product word, plus up to two wraparound carries).  p and s are
 * scratch. */
#define bn_mul_add_word(r, a, w, c, p, s) do { \
		s = r + c; \
		p = mul_hi(a, w); \
		r = (a * w) + s; \
		c = (s < c) ? p + 1 : p; \
		if (r < s) c++; \
	} while (0)
520
/*
 * Montgomery multiplication: r = (a * b * R^-1) mod p, R = 2^256.
 * Interleaved (CIOS-style) operand scanning: each outer step adds
 * a * b[i], then adds m * p with m chosen from mont_n0 so the low
 * word cancels, and shifts the accumulator down one word (the
 * t.d[j-1] = t.d[j] copies).  tea/teb track the carries that spill
 * past the top word.  Inputs are assumed to be in Montgomery form.
 */
void
bn_mul_mont(bignum *r, bignum *a, bignum *b)
{
	bignum t;
	bn_word tea, teb, c, p, s, m;

#if !defined(VERY_EXPENSIVE_BRANCHES)
	int q;
#endif

	/* First partial product: t = a * b[0]. */
	c = 0;
#define bn_mul_mont_inner1(j) \
	bn_mul_word(t.d[j], a->d[j], b->d[0], c, p, s);
	bn_unroll(bn_mul_mont_inner1);
	tea = c;
	teb = 0;

	/* First reduction: add m * p so t.d[0] becomes zero, then
	 * shift the accumulator down one word. */
	c = 0;
	m = t.d[0] * mont_n0[0];
	bn_mul_add_word(t.d[0], modulus[0], m, c, p, s);
#define bn_mul_mont_inner2(j) \
	bn_mul_add_word(t.d[j], modulus[j], m, c, p, s); \
	t.d[j-1] = t.d[j];
	bn_unroll_sf(bn_mul_mont_inner2);
	t.d[BN_NWORDS-1] = tea + c;
	tea = teb + ((t.d[BN_NWORDS-1] < c) ? 1 : 0);

	/* Outer step i: accumulate a * b[i], then reduce and shift. */
#define bn_mul_mont_inner3_1(i, j) \
	bn_mul_add_word(t.d[j], a->d[j], b->d[i], c, p, s);
#define bn_mul_mont_inner3_2(i, j) \
	bn_mul_add_word(t.d[j], modulus[j], m, c, p, s); \
	t.d[j-1] = t.d[j];
#define bn_mul_mont_inner3(i) \
	c = 0; \
	bn_unroll_arg(bn_mul_mont_inner3_1, i); \
	tea += c; \
	teb = ((tea < c) ? 1 : 0); \
	c = 0; \
	m = t.d[0] * mont_n0[0]; \
	bn_mul_add_word(t.d[0], modulus[0], m, c, p, s); \
	bn_unroll_arg_sf(bn_mul_mont_inner3_2, i); \
	t.d[BN_NWORDS-1] = tea + c; \
	tea = teb + ((t.d[BN_NWORDS-1] < c) ? 1 : 0);

	/*
	 * The outer loop here is quite long, and we won't unroll it
	 * unless VERY_EXPENSIVE_BRANCHES is set.
	 */
#if defined(VERY_EXPENSIVE_BRANCHES)
	bn_unroll_sf(bn_mul_mont_inner3);
	/* Final conditional subtraction: keep t - p if a carry spilled
	 * or the subtraction did not borrow; otherwise keep t. */
	c = tea | !bn_usub_c(r, &t, modulus);
	if (!c)
		*r = t;

#else
	for (q = 1; q < BN_NWORDS; q++) {
		bn_mul_mont_inner3(q);
	}
	/* Cheap top-word screen: a subtraction can only be needed when
	 * a carry spilled or the top word reaches that of p. */
	c = tea || (t.d[BN_NWORDS-1] >= modulus[BN_NWORDS-1]);
	if (c) {
		c = tea | !bn_usub_c(r, &t, modulus);
		if (c)
			return;
	}
	*r = t;
#endif
}
587
588
/*
 * Convert out of Montgomery form: rb = b * R^-1 mod p (R = 2^256),
 * i.e. a full Montgomery reduction of b.  Works in a double-width
 * scratch array: each pass zeroes one low word by adding m * p,
 * leaving the result in the upper BN_NWORDS words.
 */
void
bn_from_mont(bignum *rb, bignum *b)
{
#define WORKSIZE ((2*BN_NWORDS) + 1)
	bn_word r[WORKSIZE];
	bn_word m, c, p, s;
#if defined(PRAGMA_UNROLL)
	int i;
#endif

	/* Copy the input to the working area */
	/* Zero the upper words */
#define bn_from_mont_inner1(i) \
	r[i] = b->d[i];
#define bn_from_mont_inner2(i) \
	r[BN_NWORDS+i] = 0;

	bn_unroll(bn_from_mont_inner1);
	bn_unroll(bn_from_mont_inner2);
	r[WORKSIZE-1] = 0;

	/* Multiply (long) by modulus */
#define bn_from_mont_inner3_1(i, j) \
	bn_mul_add_word(r[i+j], modulus[j], m, c, p, s);

	/* Carry out of word (BN_NWORDS + i) into the word above;
	 * branchy and branchless variants of the same step. */
#if !defined(VERY_EXPENSIVE_BRANCHES)
#define bn_from_mont_inner3_2(i) \
	if (r[BN_NWORDS + i] < c) \
		r[BN_NWORDS + i + 1] += 1;
#else
#define bn_from_mont_inner3_2(i) \
	r[BN_NWORDS + i + 1] += (r[BN_NWORDS + i] < c) ? 1 : 0;
#endif

	/* One reduction pass: m = r[i] * n0 makes r[i] + m*p = 0
	 * (mod 2^32), so word i is retired. */
#define bn_from_mont_inner3(i) \
	m = r[i] * mont_n0[0]; \
	c = 0; \
	bn_unroll_arg(bn_from_mont_inner3_1, i); \
	r[BN_NWORDS + i] += c; \
	bn_from_mont_inner3_2(i)

	/*
	 * The outer loop here is not very long, so we will unroll
	 * it by default.  However, it's just complicated enough to
	 * cause NVIDIA's compiler to take unreasonably long to compile
	 * it, unless we use pragma unroll.
	 */
#if !defined(PRAGMA_UNROLL)
	bn_iter(bn_from_mont_inner3);
#else
#pragma unroll 8
	for (i = 0; i < BN_NWORDS; i++) { bn_from_mont_inner3(i) }
#endif

	/*
	 * Make sure the result is less than the modulus.
	 * Subtracting is not much more expensive than compare, so
	 * subtract always and assign based on the carry out value.
	 */
	c = bn_usub_words_c(rb->d, &r[BN_NWORDS], modulus);
	if (c) {
		/* Subtraction borrowed: the unreduced value was already
		 * below p, so keep it. */
#define bn_from_mont_inner4(i) \
		rb->d[i] = r[BN_NWORDS + i];
		bn_unroll(bn_from_mont_inner4);
	}
}
654
655
/*
656
* Modular inversion
657
*/
658
659
/*
 * Modular inverse via the binary (right-shift) extended Euclidean
 * algorithm: r = n^-1 mod p, or r = 0 if no inverse exists.
 * Invariants maintained (up to the carry counters): x*n = b (mod p)
 * and y*n = -a (mod p).  xc/yc count how many multiples of p the
 * coefficients x/y have overflowed by, so full-width intermediate
 * values are avoided; each halving step shifts one carry bit back
 * into the top word.
 */
void
bn_mod_inverse(bignum *r, bignum *n)
{
	bignum a, b, x, y;
	int shift;
	bn_word xc, yc;
	for (shift = 0; shift < BN_NWORDS; shift++) {
		a.d[shift] = modulus[shift];
		x.d[shift] = 0;
		y.d[shift] = 0;
	}
	b = *n;
	x.d[0] = 1;
	xc = 0;
	yc = 0;
	while (!bn_is_zero(b)) {
		shift = 0;	/* dead store, retained as-is */
		/* Halve b while even; keep x even first by adding p if
		 * needed, then shift x and b together. */
		while (!bn_is_odd(b)) {
			if (bn_is_odd(x))
				xc += bn_uadd_c(&x, &x, modulus);
			bn_rshift1_2(&x, &b);
			x.d[7] |= (xc << 31);
			xc >>= 1;
		}

		/* Same halving step for a and its coefficient y. */
		while (!bn_is_odd(a)) {
			if (bn_is_odd(y))
				yc += bn_uadd_c(&y, &y, modulus);
			bn_rshift1_2(&y, &a);
			y.d[7] |= (yc << 31);
			yc >>= 1;
		}

		/* Subtract the smaller of a, b from the larger, and
		 * combine the corresponding coefficients. */
		if (bn_ucmp_ge(&b, &a)) {
			xc += yc + bn_uadd(&x, &x, &y);
			bn_usub(&b, &b, &a);
		} else {
			yc += xc + bn_uadd(&y, &y, &x);
			bn_usub(&a, &a, &b);
		}
	}

	if (!bn_is_one(a)) {
		/* no modular inverse */
		*r = bn_zero;
	} else {
		/* Compute y % m as cheaply as possible */
		/* NOTE(review): yc is unsigned; the loop subtracts p
		 * until yc wraps past zero (top bit set), then bn_neg
		 * folds -y back into [0, p).  Relies on unsigned
		 * wraparound — confirm against the host-side variant. */
		while (yc < 0x80000000)
			yc -= bn_usub_c(&y, &y, modulus);
		bn_neg(&y);
		*r = y;
	}
}
712
713
/*
714
* HASH FUNCTIONS
715
*
716
* BYTE ORDER NOTE: None of the hash functions below deal with byte
717
* order. The caller is expected to be aware of this when it stuffs
718
* data into in the native integer.
719
*
720
* NOTE #2: Endianness of the OpenCL device makes no difference here.
721
*/
722
723
#define hash256_unroll(a) unroll_8(a)
724
#define hash160_unroll(a) unroll_5(a)
725
#define hash256_iter(a) iter_8(a)
726
#define hash160_iter(a) iter_5(a)
727
728
729
/*
730
* SHA-2 256
731
*
732
* CAUTION: Input buffer will be overwritten/mangled.
733
* Data expected in big-endian format.
734
* This implementation is designed for space efficiency more than
735
* raw speed.
736
*/
737
738
__constant uint sha2_init[8] = {
739
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
740
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
741
};
742
743
__constant uint sha2_k[64] = {
744
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
745
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
746
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
747
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
748
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
749
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
750
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
751
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
752
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
753
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
754
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
755
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
756
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
757
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
758
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
759
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
760
};
761
762
/* Load the standard SHA-256 initial chaining values into out[0..7]. */
void
sha2_256_init(uint *out)
{
#define sha2_256_init_inner_1(i) \
	out[i] = sha2_init[i];

	hash256_unroll(sha2_256_init_inner_1);
}
770
771
/* The state variable remapping is really contorted */
772
#define sha2_stvar(vals, i, v) vals[(64+v-i) % 8]
773
#define sha2_s0(a) (rotate(a, 30U) ^ rotate(a, 19U) ^ rotate(a, 10U))
774
#define sha2_s1(a) (rotate(a, 26U) ^ rotate(a, 21U) ^ rotate(a, 7U))
775
#if defined(AMD_BFI_INT)
776
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
777
#define sha2_ch(a, b, c) amd_bytealign(a, b, c)
778
#define sha2_ma(a, b, c) amd_bytealign((a^c), b, a)
779
#else
780
#define sha2_ch(a, b, c) (c ^ (a & (b ^ c)))
781
#define sha2_ma(a, b, c) ((a & c) | (b & (a | c)))
782
#endif
783
784
/*
 * One SHA-256 compression over a 64-byte block.
 * out: 8-word chaining state, updated in place.
 * in: 16 words of message, big-endian words expected; OVERWRITTEN —
 * the message schedule W is expanded in place, indexed mod 16.
 * sha2_stvar renames the 8 working variables per round instead of
 * rotating them, so each round body is pure arithmetic.
 */
void
sha2_256_block(uint *out, uint *in)
{
	uint state[8], t1, t2;
#if defined(PRAGMA_UNROLL)
	int i;
#endif

	/* Working copy of the chaining state. */
#define sha2_256_block_inner_1(i) \
	state[i] = out[i];
	hash256_unroll(sha2_256_block_inner_1);

	/* Round i: expand W[i] in place (rounds 16..63), then the
	 * standard t1/t2 update with rotating variable names. */
#define sha2_256_block_inner_2(i) \
	if (i >= 16) { \
		t1 = in[(i + 1) % 16]; \
		t2 = in[(i + 14) % 16]; \
		in[i % 16] += (in[(i + 9) % 16] + \
			       (rotate(t1, 25U) ^ rotate(t1, 14U) ^ (t1 >> 3)) + \
			       (rotate(t2, 15U) ^ rotate(t2, 13U) ^ (t2 >> 10))); \
	} \
	t1 = (sha2_stvar(state, i, 7) + \
	      sha2_s1(sha2_stvar(state, i, 4)) + \
	      sha2_ch(sha2_stvar(state, i, 4), \
		      sha2_stvar(state, i, 5), \
		      sha2_stvar(state, i, 6)) + \
	      sha2_k[i] + \
	      in[i % 16]); \
	t2 = (sha2_s0(sha2_stvar(state, i, 0)) + \
	      sha2_ma(sha2_stvar(state, i, 0), \
		      sha2_stvar(state, i, 1), \
		      sha2_stvar(state, i, 2))); \
	sha2_stvar(state, i, 3) += t1; \
	sha2_stvar(state, i, 7) = t1 + t2; \

#if !defined(PRAGMA_UNROLL)
	iter_64(sha2_256_block_inner_2);
#else
#pragma unroll 64
	for (i = 0; i < 64; i++) { sha2_256_block_inner_2(i) }
#endif

	/* Feed-forward: add the saved state into the new state. */
#define sha2_256_block_inner_3(i) \
	out[i] += state[i];

	hash256_unroll(sha2_256_block_inner_3);
}
830
831
832
/*
833
* RIPEMD160
834
*
835
* Data expected in little-endian format.
836
*/
837
838
__constant uint ripemd160_iv[] = {
839
0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
840
__constant uint ripemd160_k[] = {
841
0x00000000, 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xA953FD4E };
842
__constant uint ripemd160_kp[] = {
843
0x50A28BE6, 0x5C4DD124, 0x6D703EF3, 0x7A6D76E9, 0x00000000 };
844
__constant uchar ripemd160_ws[] = {
845
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
846
7, 4, 13, 1, 10, 6, 15, 3, 12, 0, 9, 5, 2, 14, 11, 8,
847
3, 10, 14, 4, 9, 15, 8, 1, 2, 7, 0, 6, 13, 11, 5, 12,
848
1, 9, 11, 10, 0, 8, 12, 4, 13, 3, 7, 15, 14, 5, 6, 2,
849
4, 0, 5, 9, 7, 12, 2, 10, 14, 1, 3, 8, 11, 6, 15, 13,
850
};
851
__constant uchar ripemd160_wsp[] = {
852
5, 14, 7, 0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12,
853
6, 11, 3, 7, 0, 13, 5, 10, 14, 15, 8, 12, 4, 9, 1, 2,
854
15, 5, 1, 3, 7, 14, 6, 9, 11, 8, 12, 2, 10, 0, 4, 13,
855
8, 6, 4, 1, 3, 11, 15, 0, 5, 12, 2, 13, 9, 7, 10, 14,
856
12, 15, 10, 4, 1, 5, 8, 7, 6, 2, 13, 14, 0, 3, 9, 11
857
};
858
__constant uchar ripemd160_rl[] = {
859
11, 14, 15, 12, 5, 8, 7, 9, 11, 13, 14, 15, 6, 7, 9, 8,
860
7, 6, 8, 13, 11, 9, 7, 15, 7, 12, 15, 9, 11, 7, 13, 12,
861
11, 13, 6, 7, 14, 9, 13, 15, 14, 8, 13, 6, 5, 12, 7, 5,
862
11, 12, 14, 15, 14, 15, 9, 8, 9, 14, 5, 6, 8, 6, 5, 12,
863
9, 15, 5, 11, 6, 8, 13, 12, 5, 12, 13, 14, 11, 8, 5, 6,
864
};
865
__constant uchar ripemd160_rlp[] = {
866
8, 9, 9, 11, 13, 15, 15, 5, 7, 7, 8, 11, 14, 14, 12, 6,
867
9, 13, 15, 7, 12, 8, 9, 11, 7, 7, 12, 7, 6, 15, 13, 11,
868
9, 7, 15, 11, 8, 6, 6, 14, 12, 13, 5, 14, 13, 13, 7, 5,
869
15, 5, 8, 11, 14, 14, 6, 14, 6, 9, 12, 9, 12, 5, 15, 8,
870
8, 5, 12, 9, 12, 5, 14, 6, 8, 13, 6, 5, 15, 13, 11, 11
871
};
872
873
#define ripemd160_val(v, i, n) (v)[(80+(n)-(i)) % 5]
874
#define ripemd160_valp(v, i, n) (v)[5 + ((80+(n)-(i)) % 5)]
875
#if defined(AMD_BFI_INT)
876
#define ripemd160_f0(x, y, z) (x ^ y ^ z)
877
#define ripemd160_f1(x, y, z) amd_bytealign(x, y, z)
878
#define ripemd160_f2(x, y, z) (z ^ (x | ~y))
879
#define ripemd160_f3(x, y, z) amd_bytealign(z, x, y)
880
#define ripemd160_f4(x, y, z) (x ^ (y | ~z))
881
#else
882
#define ripemd160_f0(x, y, z) (x ^ y ^ z)
883
#define ripemd160_f1(x, y, z) ((x & y) | (~x & z))
884
#define ripemd160_f2(x, y, z) (z ^ (x | ~y))
885
#define ripemd160_f3(x, y, z) ((x & z) | (y & ~z))
886
#define ripemd160_f4(x, y, z) (x ^ (y | ~z))
887
#endif
888
#define ripemd160_round(i, in, vals, f, fp, t) do { \
889
ripemd160_val(vals, i, 0) = \
890
rotate(ripemd160_val(vals, i, 0) + \
891
f(ripemd160_val(vals, i, 1), \
892
ripemd160_val(vals, i, 2), \
893
ripemd160_val(vals, i, 3)) + \
894
in[ripemd160_ws[i]] + \
895
ripemd160_k[i / 16], \
896
(uint)ripemd160_rl[i]) + \
897
ripemd160_val(vals, i, 4); \
898
ripemd160_val(vals, i, 2) = \
899
rotate(ripemd160_val(vals, i, 2), 10U); \
900
ripemd160_valp(vals, i, 0) = \
901
rotate(ripemd160_valp(vals, i, 0) + \
902
fp(ripemd160_valp(vals, i, 1), \
903
ripemd160_valp(vals, i, 2), \
904
ripemd160_valp(vals, i, 3)) + \
905
in[ripemd160_wsp[i]] + \
906
ripemd160_kp[i / 16], \
907
(uint)ripemd160_rlp[i]) + \
908
ripemd160_valp(vals, i, 4); \
909
ripemd160_valp(vals, i, 2) = \
910
rotate(ripemd160_valp(vals, i, 2), 10U); \
911
} while (0)
912
913
/* Load the standard RIPEMD-160 initial chaining values into out[0..4]. */
void
ripemd160_init(uint *out)
{
#define ripemd160_init_inner_1(i) \
	out[i] = ripemd160_iv[i];

	hash160_unroll(ripemd160_init_inner_1);
}
921
922
/*
 * One RIPEMD-160 compression over a 64-byte block.
 * out: 5-word chaining state, updated in place.
 * in: 16 little-endian message words (not modified).
 * vals[0..4] is the left line, vals[5..9] the right line; each
 * ripemd160_round advances both lines with the round functions
 * given (which run in opposite orders on the two lines).
 */
void
ripemd160_block(uint *out, uint *in)
{
	uint vals[10], t;
#if defined(PRAGMA_UNROLL)
	int i;
#endif

	/* Both lines start from the current chaining state. */
#define ripemd160_block_inner_1(i) \
	vals[i] = vals[i + 5] = out[i];

	hash160_unroll(ripemd160_block_inner_1);

	/* Five groups of 16 rounds; left line uses f0..f4, right
	 * line f4..f0. */
#define ripemd160_block_inner_p0(i) \
	ripemd160_round(i, in, vals, \
			ripemd160_f0, ripemd160_f4, t);
#define ripemd160_block_inner_p1(i) \
	ripemd160_round((16 + i), in, vals, \
			ripemd160_f1, ripemd160_f3, t);
#define ripemd160_block_inner_p2(i) \
	ripemd160_round((32 + i), in, vals, \
			ripemd160_f2, ripemd160_f2, t);
#define ripemd160_block_inner_p3(i) \
	ripemd160_round((48 + i), in, vals, \
			ripemd160_f3, ripemd160_f1, t);
#define ripemd160_block_inner_p4(i) \
	ripemd160_round((64 + i), in, vals, \
			ripemd160_f4, ripemd160_f0, t);

#if !defined(PRAGMA_UNROLL)
	iter_16(ripemd160_block_inner_p0);
	iter_16(ripemd160_block_inner_p1);
	iter_16(ripemd160_block_inner_p2);
	iter_16(ripemd160_block_inner_p3);
	iter_16(ripemd160_block_inner_p4);
#else
#pragma unroll 16
	for (i = 0; i < 16; i++) { ripemd160_block_inner_p0(i); }
#pragma unroll 16
	for (i = 0; i < 16; i++) { ripemd160_block_inner_p1(i); }
#pragma unroll 16
	for (i = 0; i < 16; i++) { ripemd160_block_inner_p2(i); }
#pragma unroll 16
	for (i = 0; i < 16; i++) { ripemd160_block_inner_p3(i); }
#pragma unroll 16
	for (i = 0; i < 16; i++) { ripemd160_block_inner_p4(i); }
#endif

	/* Final cross-line combination (indices account for the
	 * rotated variable naming after 80 rounds). */
	t = out[1] + vals[2] + vals[8];
	out[1] = out[2] + vals[3] + vals[9];
	out[2] = out[3] + vals[4] + vals[5];
	out[3] = out[4] + vals[0] + vals[6];
	out[4] = out[0] + vals[1] + vals[7];
	out[0] = t;
}
977
978
979
#ifdef TEST_KERNELS
980
/*
981
* Test kernels
982
*/
983
984
/* Montgomery multiplication test kernel */
985
/*
 * Montgomery multiplication test kernel: each work item multiplies
 * one pair of consecutive bignums from nums_in and writes the
 * product to products_out[id].
 */
__kernel void
test_mul_mont(__global bignum *products_out, __global bignum *nums_in)
{
	bignum a, b, c;
	int o;
	o = get_global_id(0);
	nums_in += (2*o);

	a = nums_in[0];
	b = nums_in[1];
	bn_mul_mont(&c, &a, &b);
	products_out[o] = c;
}
998
999
/* modular inversion test kernel */
1000
/*
 * Modular inversion test kernel: each work item inverts `count`
 * consecutive bignums from nums_in, writing results to inv_out at
 * the same offsets.
 */
__kernel void
test_mod_inverse(__global bignum *inv_out, __global bignum *nums_in,
		 int count)
{
	bignum x, xp;
	int i, o;
	o = get_global_id(0) * count;
	for (i = 0; i < count; i++) {
		x = nums_in[o];
		bn_mod_inverse(&xp, &x);
		inv_out[o++] = xp;
	}
}
1013
#endif /* TEST_KERNELS */
1014
1015
1016
#define ACCESS_BUNDLE 1024
1017
#define ACCESS_STRIDE (ACCESS_BUNDLE/BN_NWORDS)
1018
1019
/*
 * Compute P = Row[gid0] + Column[gid1] for every grid cell (see the
 * file header): the affine row point (x1, y1) is added to the
 * column point (rx, ry), producing the numerator components of the
 * result plus the shared denominator Pz = x1 - rx, which is written
 * to z_heap for batched inversion by heap_invert.
 * Buffers are laid out in strided bundles (ACCESS_BUNDLE words per
 * ACCESS_STRIDE cells) so each wavefront writes contiguous memory.
 */
__kernel void
ec_add_grid(__global bn_word *points_out, __global bn_word *z_heap,
	    __global bn_word *row_in, __global bignum *col_in)
{
	bignum rx, ry;
	bignum x1, y1, a, b, c, d, e, z;
	bn_word cy;
	int i, cell, start;

	/* Load the row increment point */
	i = 2 * get_global_id(1);
	rx = col_in[i];
	ry = col_in[i+1];

	/* Strided load of the cell's x coordinate, then y. */
	cell = get_global_id(0);
	start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (cell % (ACCESS_STRIDE/2)));

#define ec_add_grid_inner_1(i) \
	x1.d[i] = row_in[start + (i*ACCESS_STRIDE)];

	bn_unroll(ec_add_grid_inner_1);
	start += (ACCESS_STRIDE/2);

#define ec_add_grid_inner_2(i) \
	y1.d[i] = row_in[start + (i*ACCESS_STRIDE)];

	bn_unroll(ec_add_grid_inner_2);

	/* Denominator Pz = x1 - rx, stored for batch inversion. */
	bn_mod_sub(&z, &x1, &rx);

	cell += (get_global_id(1) * get_global_size(0));
	start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (cell % ACCESS_STRIDE));

#define ec_add_grid_inner_3(i) \
	z_heap[start + (i*ACCESS_STRIDE)] = z.d[i];

	bn_unroll(ec_add_grid_inner_3);

	/* Point-addition intermediates (Montgomery arithmetic):
	 * b = y1 - ry, c = x1 + rx, d = y1 + ry, e = c * z^2;
	 * the x numerator is b^2 - e. */
	bn_mod_sub(&b, &y1, &ry);
	bn_mod_add(&c, &x1, &rx);
	bn_mod_add(&d, &y1, &ry);
	bn_mul_mont(&y1, &b, &b);
	bn_mul_mont(&x1, &z, &z);
	bn_mul_mont(&e, &c, &x1);
	bn_mod_sub(&y1, &y1, &e);

	/*
	 * This disgusting code caters to the global memory unit on
	 * various GPUs, by giving it a nice contiguous patch to write
	 * per warp/wavefront.
	 */
	start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (cell % (ACCESS_STRIDE/2)));

	/* Store the x numerator (Pxj). */
#define ec_add_grid_inner_4(i) \
	points_out[start + (i*ACCESS_STRIDE)] = y1.d[i];

	bn_unroll(ec_add_grid_inner_4);

	/* y numerator: (e - 2*Pxj) * b - d * z^3, then halved mod p.
	 * NOTE(review): the final halving matches the formula variant
	 * the host side expects — confirm against the CPU code. */
	bn_mod_lshift1(&y1);
	bn_mod_sub(&y1, &e, &y1);
	bn_mul_mont(&y1, &y1, &b);
	bn_mul_mont(&a, &x1, &z);
	bn_mul_mont(&c, &d, &a);
	bn_mod_sub(&y1, &y1, &c);
	/* Divide by 2 mod p: make even by adding p if odd, shift
	 * right, and restore the carried-out bit at the top. */
	cy = 0;
	if (bn_is_odd(y1))
		cy = bn_uadd_c(&y1, &y1, modulus);
	bn_rshift1(&y1);
	y1.d[BN_NWORDS-1] |= (cy ? 0x80000000 : 0);

	start += (ACCESS_STRIDE/2);

	/* Store the y numerator (Pyj) in the second half-stride. */
	bn_unroll(ec_add_grid_inner_4);
}
1096
1097
/*
 * heap_invert: batch modular inversion using a product tree
 * (Montgomery's simultaneous-inversion trick).
 *
 * Each work item owns one column of the heap; cells belonging to this
 * work item are spaced get_global_size(0) apart, and each cell's bignum
 * words are laid out with the ACCESS_STRIDE/ACCESS_BUNDLE interleaving
 * used throughout this file.
 *
 * Upward pass: pairwise-multiply 'batch' leaf denominators, appending
 * each product to the heap, until a single root product remains.
 * The root is inverted once with bn_mod_inverse, then the tree is
 * walked back down, recovering the inverse of every leaf with two
 * multiplications per node.  Result: z_heap's leaf cells are replaced
 * by their modular inverses.
 *
 * 'batch' is the number of leaves (per HEAD: a power of 2).
 */
__kernel void
heap_invert(__global bn_word *z_heap, int batch)
{
	bignum a, b, c, z;
	int i, off, lcell, hcell, start;

	/*
	 * Strided load/store helpers; each reads/writes word j of a
	 * bignum whose cell base offset is 'start'.
	 */
#define heap_invert_inner_load_a(j) \
	a.d[j] = z_heap[start + j*ACCESS_STRIDE];
#define heap_invert_inner_load_b(j) \
	b.d[j] = z_heap[start + j*ACCESS_STRIDE];
#define heap_invert_inner_load_z(j) \
	z.d[j] = z_heap[start + j*ACCESS_STRIDE];
#define heap_invert_inner_store_z(j) \
	z_heap[start + j*ACCESS_STRIDE] = z.d[j];
#define heap_invert_inner_store_c(j) \
	z_heap[start + j*ACCESS_STRIDE] = c.d[j];

	/* Cell stride between this work item's consecutive heap cells */
	off = get_global_size(0);
	/* lcell scans the pair being multiplied; hcell receives products */
	lcell = get_global_id(0);
	hcell = (off * batch) + lcell;

	/*
	 * Upward pass: batch-1 pairwise products.  Consumes cells in
	 * order (including earlier products) and appends each product
	 * at hcell; the final product (the root) stays in register z.
	 */
	for (i = 0; i < (batch-1); i++) {

		start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
			 (lcell % ACCESS_STRIDE));

		bn_unroll(heap_invert_inner_load_a);

		lcell += off;
		start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
			 (lcell % ACCESS_STRIDE));

		bn_unroll(heap_invert_inner_load_b);

		bn_mul_mont(&z, &a, &b);

		start = (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
			 (hcell % ACCESS_STRIDE));

		bn_unroll(heap_invert_inner_store_z);

		lcell += off;
		hcell += off;
	}

	/* Invert the root, fix up 1/ZR -> R/Z */
	bn_mod_inverse(&z, &z);

	/*
	 * Two Montgomery multiplications by R^2 (mont_rr) restore the
	 * Montgomery factor lost by inverting a Montgomery-form value.
	 */
#define heap_invert_inner_1(i) \
	a.d[i] = mont_rr[i];

	bn_unroll(heap_invert_inner_1);

	bn_mul_mont(&z, &z, &a);
	bn_mul_mont(&z, &z, &a);

	/* Unroll the first iteration to avoid a load/store on the root */
	lcell -= (off << 1);
	hcell -= (off << 1);

	/*
	 * Downward step: given inverse z of a product a*b, the child
	 * inverses are inv(b) = a*z and inv(a) = b*z; write them back
	 * over the children's cells.
	 */
	start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (lcell % ACCESS_STRIDE));
	bn_unroll(heap_invert_inner_load_a);

	lcell += off;
	start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (lcell % ACCESS_STRIDE));
	bn_unroll(heap_invert_inner_load_b);

	bn_mul_mont(&c, &a, &z);

	bn_unroll(heap_invert_inner_store_c);

	bn_mul_mont(&c, &b, &z);

	lcell -= off;
	start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (lcell % ACCESS_STRIDE));
	bn_unroll(heap_invert_inner_store_c);

	lcell -= (off << 1);

	/*
	 * Remaining batch-2 internal nodes: reload each node's (now
	 * inverted) value from hcell and push the inverses down to its
	 * two children at lcell / lcell+off.
	 */
	for (i = 0; i < (batch-2); i++) {
		start = (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
			 (hcell % ACCESS_STRIDE));
		bn_unroll(heap_invert_inner_load_z);

		start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
			 (lcell % ACCESS_STRIDE));
		bn_unroll(heap_invert_inner_load_a);

		lcell += off;
		start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
			 (lcell % ACCESS_STRIDE));
		bn_unroll(heap_invert_inner_load_b);

		bn_mul_mont(&c, &a, &z);

		bn_unroll(heap_invert_inner_store_c);

		bn_mul_mont(&c, &b, &z);

		lcell -= off;
		start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
			 (lcell % ACCESS_STRIDE));
		bn_unroll(heap_invert_inner_store_c);

		lcell -= (off << 1);
		hcell -= off;
	}
}
1207
1208
/*
 * hash_ec_point: finish an EC point and compute its bitcoin hash160.
 *
 * xy points at the strided (X*Z^2, Y*Z^3) numerators produced by
 * ec_add_grid; zip points at the strided 1/Z value produced by
 * heap_invert (both in Montgomery form, both pre-offset by the caller).
 * The affine coordinates are recovered as X/Z^2 and Y/Z^3, serialized
 * as a 65-byte uncompressed public key (0x04 || X || Y), and hashed
 * with SHA-256 then RIPEMD-160.  hash_out receives the five 32-bit
 * words of the RIPEMD-160 result.
 */
void
hash_ec_point(uint *hash_out, __global bn_word *xy, __global bn_word *zip)
{
	uint hash1[16], hash2[16];
	bignum c, zi, zzi;
	bn_word wh, wl;

	/*
	 * Multiply the coordinates by the inverted Z values.
	 * Stash the coordinates in the hash buffer.
	 * SHA-2 requires big endian, and our intended hash input
	 * is big-endian, so swapping is unnecessary, but
	 * inserting the format byte in front causes a headache.
	 */
	/* Load 1/Z (strided) */
#define hash_ec_point_inner_1(i) \
	zi.d[i] = zip[i*ACCESS_STRIDE];

	bn_unroll(hash_ec_point_inner_1);

	bn_mul_mont(&zzi, &zi, &zi);	/* 1 / Z^2 */

	/* Load the X numerator (strided) */
#define hash_ec_point_inner_2(i) \
	c.d[i] = xy[i*ACCESS_STRIDE];

	bn_unroll(hash_ec_point_inner_2);

	bn_mul_mont(&c, &c, &zzi);	/* X / Z^2 */
	bn_from_mont(&c, &c);		/* leave Montgomery form */

	wh = 0x00000004;	/* POINT_CONVERSION_UNCOMPRESSED */

	/*
	 * Pack the 0x04 prefix byte followed by big-endian X into
	 * hash1[0..BN_NWORDS-1]; wl/wh carry the byte spill between
	 * adjacent words (each output word takes the low byte of the
	 * previous word and the high three bytes of the current one).
	 */
#define hash_ec_point_inner_3(i) \
	wl = wh; \
	wh = c.d[(BN_NWORDS - 1) - i]; \
	hash1[i] = (wl << 24) | (wh >> 8);

	bn_unroll(hash_ec_point_inner_3);

	bn_mul_mont(&zzi, &zzi, &zi);	/* 1 / Z^3 */

	/* Load the Y numerator (second half of the strided bundle) */
#define hash_ec_point_inner_4(i) \
	c.d[i] = xy[(ACCESS_STRIDE/2) + i*ACCESS_STRIDE];

	bn_unroll(hash_ec_point_inner_4);

	bn_mul_mont(&c, &c, &zzi);	/* Y / Z^3 */
	bn_from_mont(&c, &c);

	/* Pack big-endian Y into hash1[BN_NWORDS..2*BN_NWORDS-1] */
#define hash_ec_point_inner_5(i) \
	wl = wh; \
	wh = c.d[(BN_NWORDS - 1) - i]; \
	hash1[BN_NWORDS + i] = (wl << 24) | (wh >> 8);

	bn_unroll(hash_ec_point_inner_5);

	/*
	 * Hash the first 64 bytes of the buffer
	 */
	sha2_256_init(hash2);
	sha2_256_block(hash2, hash1);

	/*
	 * Hash the last byte of the buffer + SHA-2 padding
	 * (wh still holds the final Y word; 0x80 pad byte follows,
	 * and the bit length 65*8 goes in the last word).
	 */
	hash1[0] = wh << 24 | 0x800000;
	hash1[1] = 0;
	hash1[2] = 0;
	hash1[3] = 0;
	hash1[4] = 0;
	hash1[5] = 0;
	hash1[6] = 0;
	hash1[7] = 0;
	hash1[8] = 0;
	hash1[9] = 0;
	hash1[10] = 0;
	hash1[11] = 0;
	hash1[12] = 0;
	hash1[13] = 0;
	hash1[14] = 0;
	hash1[15] = 65 * 8;
	sha2_256_block(hash2, hash1);

	/*
	 * Hash the SHA-2 result with RIPEMD160
	 * Unfortunately, SHA-2 outputs big-endian, but
	 * RIPEMD160 expects little-endian. Need to swap!
	 */

#define hash_ec_point_inner_6(i) \
	hash2[i] = bswap32(hash2[i]);

	hash256_unroll(hash_ec_point_inner_6);

	/* RIPEMD-160 padding for a 32-byte message (length in bits at [14]) */
	hash2[8] = bswap32(0x80000000);
	hash2[9] = 0;
	hash2[10] = 0;
	hash2[11] = 0;
	hash2[12] = 0;
	hash2[13] = 0;
	hash2[14] = 32 * 8;
	hash2[15] = 0;
	ripemd160_init(hash_out);
	ripemd160_block(hash_out, hash2);
}
1312
1313
1314
/*
 * hash_ec_point_get: finish one grid point, hash it, and write the
 * resulting hash160 to hashes_out (five words per work item).
 */
__kernel void
hash_ec_point_get(__global uint *hashes_out,
		  __global bn_word *points_in, __global bn_word *z_heap)
{
	uint h160[5];
	int idx, zoff, poff;

	/* Flat index of this work item within the 2D grid */
	idx = ((get_global_id(1) * get_global_size(0)) + get_global_id(0));

	/* Strided offset of this cell's inverted denominator in z_heap */
	zoff = (((idx / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		(idx % ACCESS_STRIDE));

	/* Strided offset of this cell's (x, y) numerators in points_in */
	poff = ((((2 * idx) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		(idx % (ACCESS_STRIDE/2)));

	/* Complete the coordinates and hash */
	hash_ec_point(h160, points_in + poff, z_heap + zoff);

	/* Each work item owns five consecutive output words */
	hashes_out += (5 * idx);

	/* Output the hash in proper byte-order */
#define hash_ec_point_get_out(j) \
	hashes_out[j] = load_le32(h160[j]);

	hash160_unroll(hash_ec_point_get_out);
}
1343
1344
/*
 * Normally this would be one function that compared two hash160s.
 * This one compares a hash160 with an upper and lower bound in one
 * function to work around a problem with AMD's OpenCL compiler.
 *
 * 'bound' holds ten big-endian words: a lower bound in bound[0..4]
 * and an upper bound in bound[5..9].  Returns:
 *   -1 if a < lower bound
 *    0 if lower bound <= a <= upper bound
 *    1 if a > upper bound
 */
int
hash160_ucmp_g(uint *a, __global uint *bound)
{
	uint gv;

	/*
	 * Compare against the lower bound word-by-word: below it,
	 * return -1; strictly above it, stop and check the upper
	 * bound; equal so far, continue with the next word.
	 */
#define hash160_ucmp_g_inner_1(i) \
	gv = load_be32(bound[i]); \
	if (a[i] < gv) return -1; \
	if (a[i] > gv) break;

	hash160_iter(hash160_ucmp_g_inner_1);

	/*
	 * Compare against the upper bound: below it means in range
	 * (return 0); above it means out of range (return 1).
	 */
#define hash160_ucmp_g_inner_2(i) \
	gv = load_be32(bound[5+i]); \
	if (a[i] < gv) return 0; \
	if (a[i] > gv) return 1;

	hash160_iter(hash160_ucmp_g_inner_2);
	/* Equal to the upper bound: still in range */
	return 0;
}
1369
1370
/*
 * hash_ec_point_search_prefix: finish one grid point, compute its
 * hash160, and binary-search the sorted target table of
 * [lower, upper] bound pairs for a match.  On a hit, the work-item
 * index, table index, and hash value are written to 'found'.
 */
__kernel void
hash_ec_point_search_prefix(__global uint *found,
			    __global bn_word *points_in,
			    __global bn_word *z_heap,
			    __global uint *target_table, int ntargets)
{
	uint h160[5];
	int idx, off, lo, hi, mid, cmp;

	/* Flat index of this work item within the 2D grid */
	idx = ((get_global_id(1) * get_global_size(0)) + get_global_id(0));

	/* Locate this cell's inverted denominator in z_heap */
	off = (((idx / ACCESS_STRIDE) * ACCESS_BUNDLE) +
	       (idx % ACCESS_STRIDE));
	z_heap += off;

	/* Locate this cell's (x, y) numerators in points_in */
	off = ((((2 * idx) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
	       (idx % (ACCESS_STRIDE/2)));
	points_in += off;

	/* Complete the coordinates and hash */
	hash_ec_point(h160, points_in, z_heap);

	/*
	 * Unconditionally byteswap the hash result, because:
	 * - The byte-level convention of RIPEMD160 is little-endian
	 * - We are comparing it in big-endian order
	 */
#define hash_ec_point_search_prefix_swap(j) \
	h160[j] = bswap32(h160[j]);

	hash160_unroll(hash_ec_point_search_prefix_swap);

	/* Binary-search the target table for the hash we just computed */
	lo = 0;
	hi = ntargets - 1;
	while (lo <= hi) {
		mid = lo + ((hi - lo) >> 1);
		cmp = hash160_ucmp_g(h160, &target_table[10*mid]);
		if (cmp > 0) {
			lo = mid + 1;
		} else if (cmp < 0) {
			hi = mid - 1;
		} else {
			/* Match: record work item, table slot, and
			 * (for debugging purposes) the hash value */
			found[0] = idx;
			found[1] = mid;

#define hash_ec_point_search_prefix_rec(j) \
			found[j+2] = load_be32(h160[j]);

			hash160_unroll(hash_ec_point_search_prefix_rec);
			break;
		}
	}
}
1422
1423