Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
samr7
GitHub Repository: samr7/vanitygen
Path: blob/master/calc_addrs.cl
239 views
1
/*
2
* Vanitygen, vanity bitcoin address generator
3
* Copyright (C) 2011 <samr7@cs.washington.edu>
4
*
5
* Vanitygen is free software: you can redistribute it and/or modify
6
* it under the terms of the GNU Affero General Public License as published by
7
* the Free Software Foundation, either version 3 of the License, or
8
* any later version.
9
*
10
* Vanitygen is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU Affero General Public License for more details.
14
*
15
* You should have received a copy of the GNU Affero General Public License
16
* along with Vanitygen. If not, see <http://www.gnu.org/licenses/>.
17
*/
18
19
/*
20
* This file contains an OpenCL kernel for performing certain parts of
21
* the bitcoin address calculation process.
22
*
23
* Kernel: ec_add_grid
24
*
25
* Inputs:
26
* - Row: Array of (sequential) EC points
27
* - Column: Array of column increment EC points (= rowsize * Pgenerator)
28
*
29
* Steps:
30
* - Compute P = Row[x] + Column[y]
31
* P is computed as numerator/denominator components Pxj, Pyj, Pz
32
* Final values are: Px = Pxj / (Pz^2), Py = Pyj / (Pz^3)
33
*
34
* The modular inverse of Pz is required to compute Px and Py, and
35
* can be computed more efficiently in large batches. This is done in
36
* the next kernel heap_invert.
37
*
38
* - Store Pxj, Pyj to intermediate point buffer
39
* - Store Pz to z_heap
40
*
41
* Outputs:
42
* - Intermediate point buffer
43
* - Denominator buffer (z_heap)
44
*
45
* -------------------------------
46
* Kernel: heap_invert
47
*
48
* Inputs:
49
* - Denominator buffer (z_heap)
50
* - N = Batch size (power of 2)
51
*
52
* Steps:
53
* - Compute the product tree for N values in the denominator buffer
54
* - Compute the modular inverse of the root of the product tree
55
* - Multiply down the tree to compute the modular inverse of each leaf
56
*
57
* Outputs:
58
* - Modular inverse denominator buffer (z_heap)
59
*
60
* -------------------------------
61
* Kernel: hash_ec_point_get
62
*
63
* Inputs:
64
* - Intermediate point buffer
65
* - Modular inverse denominator buffer (z_heap)
66
*
67
* Steps:
68
* - Compute Px = Pxj * (1/Pz)^2
69
* - Compute Py = Pyj * (1/Pz)^3
70
* - Compute H = RIPEMD160(SHA256(0x04 | Px | Py))
71
*
72
* Output:
73
* - Array of 20-byte address hash values
74
*
75
* -------------------------------
76
* Kernel: hash_ec_point_search_prefix
77
*
78
* Like hash_ec_point_get, but instead of storing the complete hash
79
* value to an output buffer, it searches a sorted list of ranges,
80
* and if a match is found, writes a flag to an output buffer.
81
*/
82
83
84
/* Byte-swapping and endianness */
85
#define bswap32(v) \
86
(((v) >> 24) | (((v) >> 8) & 0xff00) | \
87
(((v) << 8) & 0xff0000) | ((v) << 24))
88
89
#if __ENDIAN_LITTLE__ != 1
90
#define load_le32(v) bswap32(v)
91
#define load_be32(v) (v)
92
#else
93
#define load_le32(v) (v)
94
#define load_be32(v) bswap32(v)
95
#endif
96
97
/*
98
* Loop unrolling macros
99
*
100
* In most cases, preprocessor unrolling works best.
101
* The exception is NVIDIA's compiler, which seems to take unreasonably
102
* long to compile a loop with a larger iteration count, or a loop with
103
* a body of >50 PTX instructions, with preprocessor unrolling.
104
* However, it does not seem to take as long with pragma unroll, and
105
* produces good output.
106
*/
107
108
/* Explicit loop unrolling */
109
#define unroll_5(a) do { a(0) a(1) a(2) a(3) a(4) } while (0)
110
#define unroll_8(a) do { a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) } while (0)
111
#define unroll_1_7(a) do { a(1) a(2) a(3) a(4) a(5) a(6) a(7) } while (0)
112
#define unroll_7(a) do { a(0) a(1) a(2) a(3) a(4) a(5) a(6) } while (0)
113
#define unroll_7_0(a) do { a(7) a(6) a(5) a(4) a(3) a(2) a(1) a(0) } while (0)
114
#define unroll_7_1(a) do { a(7) a(6) a(5) a(4) a(3) a(2) a(1) } while (0)
115
#define unroll_16(a) do { \
116
a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) \
117
a(8) a(9) a(10) a(11) a(12) a(13) a(14) a(15) \
118
} while (0)
119
#define unroll_64(a) do { \
120
a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) \
121
a(8) a(9) a(10) a(11) a(12) a(13) a(14) a(15) \
122
a(16) a(17) a(18) a(19) a(20) a(21) a(22) a(23) \
123
a(24) a(25) a(26) a(27) a(28) a(29) a(30) a(31) \
124
a(32) a(33) a(34) a(35) a(36) a(37) a(38) a(39) \
125
a(40) a(41) a(42) a(43) a(44) a(45) a(46) a(47) \
126
a(48) a(49) a(50) a(51) a(52) a(53) a(54) a(55) \
127
a(56) a(57) a(58) a(59) a(60) a(61) a(62) a(63) \
128
} while (0)
129
130
/* Conditional loop unrolling */
131
#if defined(DEEP_PREPROC_UNROLL)
132
#define iter_5(a) unroll_5(a)
133
#define iter_8(a) unroll_8(a)
134
#define iter_16(a) unroll_16(a)
135
#define iter_64(a) unroll_64(a)
136
#else
137
#define iter_5(a) do {int _i; for (_i = 0; _i < 5; _i++) { a(_i) }} while (0)
138
#define iter_8(a) do {int _i; for (_i = 0; _i < 8; _i++) { a(_i) }} while (0)
139
#define iter_16(a) do {int _i; for (_i = 0; _i < 16; _i++) { a(_i) }} while (0)
140
#define iter_64(a) do {int _i; for (_i = 0; _i < 64; _i++) { a(_i) }} while (0)
141
#endif
142
143
/*
144
* BIGNUM mini-library
145
* This module deals with fixed-size 256-bit bignums.
146
* Where modular arithmetic is performed, the SECP256k1 prime
147
* modulus (below) is assumed.
148
*
149
* Methods include:
150
* - bn_is_zero/bn_is_one/bn_is_odd/bn_is_even/bn_is_bit_set
151
* - bn_rshift[1]/bn_lshift[1]
152
* - bn_neg
153
* - bn_uadd/bn_uadd_p
154
* - bn_usub/bn_usub_p
155
*/
156
157
typedef uint bn_word;
158
#define BN_NBITS 256
159
#define BN_WSHIFT 5
160
#define BN_WBITS (1 << BN_WSHIFT)
161
#define BN_NWORDS ((BN_NBITS/8) / sizeof(bn_word))
162
#define BN_WORDMAX 0xffffffff
163
164
#define MODULUS_BYTES \
165
0xfffffc2f, 0xfffffffe, 0xffffffff, 0xffffffff, \
166
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
167
168
typedef struct {
169
bn_word d[BN_NWORDS];
170
} bignum;
171
172
__constant bn_word modulus[] = { MODULUS_BYTES };
173
__constant bignum bn_zero;
174
175
__constant bn_word mont_rr[BN_NWORDS] = { 0xe90a1, 0x7a2, 0x1, 0, };
176
__constant bn_word mont_n0[2] = { 0xd2253531, 0xd838091d };
177
178
179
#define bn_is_odd(bn) (bn.d[0] & 1)
180
#define bn_is_even(bn) (!bn_is_odd(bn))
181
#define bn_is_zero(bn) (!bn.d[0] && !bn.d[1] && !bn.d[2] && \
182
!bn.d[3] && !bn.d[4] && !bn.d[5] && \
183
!bn.d[6] && !bn.d[7])
184
#define bn_is_one(bn) ((bn.d[0] == 1) && !bn.d[1] && !bn.d[2] && \
185
!bn.d[3] && !bn.d[4] && !bn.d[5] && \
186
!bn.d[6] && !bn.d[7])
187
#define bn_is_bit_set(bn, n) \
188
((((bn_word*)&bn)[n >> BN_WSHIFT]) & (1 << (n & (BN_WBITS-1))))
189
190
#define bn_unroll(e) unroll_8(e)
191
#define bn_unroll_sf(e) unroll_1_7(e)
192
#define bn_unroll_sl(e) unroll_7(e)
193
#define bn_unroll_reverse(e) unroll_7_0(e)
194
#define bn_unroll_reverse_sl(e) unroll_7_1(e)
195
196
#define bn_unroll_arg(e, arg) \
197
e(arg, 0) e(arg, 1) e(arg, 2) e(arg, 3) \
198
e(arg, 4) e(arg, 5) e(arg, 6) e(arg, 7)
199
#define bn_unroll_arg_sf(e, arg) \
200
e(arg, 1) e(arg, 2) e(arg, 3) \
201
e(arg, 4) e(arg, 5) e(arg, 6) e(arg, 7)
202
203
#define bn_iter(e) iter_8(e)
204
205
206
/*
207
* Bitwise shift
208
*/
209
210
/*
 * Shift the 256-bit value left by one bit, in place.
 * The bit shifted out of the top word is discarded; callers that
 * need it (e.g. bn_mod_lshift1) must sample d[BN_NWORDS-1] first.
 */
void
bn_lshift1(bignum *bn)
{
	/* Walk from the most significant word down so each word can
	 * still read its new low bit from the unshifted word below. */
#define bn_lshift1_inner1(i) \
	bn->d[i] = (bn->d[i] << 1) | (bn->d[i-1] >> 31);
	bn_unroll_reverse_sl(bn_lshift1_inner1);
	bn->d[0] <<= 1;
}
218
219
void
220
bn_rshift(bignum *bn, int shift)
221
{
222
int wd, iws, iwr;
223
bn_word ihw, ilw;
224
iws = (shift & (BN_WBITS-1));
225
iwr = BN_WBITS - iws;
226
wd = (shift >> BN_WSHIFT);
227
ihw = (wd < BN_WBITS) ? bn->d[wd] : 0;
228
229
#define bn_rshift_inner1(i) \
230
wd++; \
231
ilw = ihw; \
232
ihw = (wd < BN_WBITS) ? bn->d[wd] : 0; \
233
bn->d[i] = (ilw >> iws) | (ihw << iwr);
234
bn_unroll_sl(bn_rshift_inner1);
235
bn->d[BN_NWORDS-1] = (ihw >> iws);
236
}
237
238
/* Shift right by one bit, in place; the low bit is discarded. */
void
bn_rshift1(bignum *bn)
{
	/* Each word takes its new top bit from the word above. */
#define bn_rshift1_inner1(i) \
	bn->d[i] = (bn->d[i+1] << 31) | (bn->d[i] >> 1);
	bn_unroll_sl(bn_rshift1_inner1);
	bn->d[BN_NWORDS-1] >>= 1;
}
246
247
/*
 * Shift two bignums right by one bit each, in lockstep.
 * Interleaving the two independent shifts presumably exposes more
 * instruction-level parallelism than two bn_rshift1 calls — used by
 * the even-halving steps of bn_mod_inverse.
 */
void
bn_rshift1_2(bignum *bna, bignum *bnb)
{
#define bn_rshift1_2_inner1(i) \
	bna->d[i] = (bna->d[i+1] << 31) | (bna->d[i] >> 1); \
	bnb->d[i] = (bnb->d[i+1] << 31) | (bnb->d[i] >> 1);
	bn_unroll_sl(bn_rshift1_2_inner1);
	bna->d[BN_NWORDS-1] >>= 1;
	bnb->d[BN_NWORDS-1] >>= 1;
}
257
258
259
/*
260
* Unsigned comparison
261
*/
262
263
/*
 * Unsigned compare: returns 1 if a >= b, 0 otherwise.
 * Branch-light form for GPU: a per-word "less" / "greater" flag is
 * recorded at the bit position of the word index, so the most
 * significant differing word dominates the comparison; a < b
 * exactly when l > g.
 */
int
bn_ucmp_ge(bignum *a, bignum *b)
{
	int l = 0, g = 0;

#define bn_ucmp_ge_inner1(i) \
	if (a->d[i] < b->d[i]) l |= (1 << i); \
	if (a->d[i] > b->d[i]) g |= (1 << i);
	bn_unroll_reverse(bn_ucmp_ge_inner1);
	return (l > g) ? 0 : 1;
}
274
275
/*
 * Same as bn_ucmp_ge, but the right-hand operand is a bare word
 * array in __constant memory (typically the modulus).
 */
int
bn_ucmp_ge_c(bignum *a, __constant bn_word *b)
{
	int l = 0, g = 0;

#define bn_ucmp_ge_c_inner1(i) \
	if (a->d[i] < b[i]) l |= (1 << i); \
	if (a->d[i] > b[i]) g |= (1 << i);
	bn_unroll_reverse(bn_ucmp_ge_c_inner1);
	return (l > g) ? 0 : 1;
}
286
287
/*
288
* Negate
289
*/
290
291
/*
 * Two's-complement negate in place: n = ~n + 1 (mod 2^256).
 * The +1 carry keeps rippling only while the resulting word is
 * zero (i.e. the complemented word was 0xffffffff).
 */
void
bn_neg(bignum *n)
{
	int c = 1;

#define bn_neg_inner1(i) \
	c = (n->d[i] = (~n->d[i]) + c) ? 0 : c;
	bn_unroll(bn_neg_inner1);
}
300
301
/*
302
* Add/subtract
303
*/
304
305
/* r = a + b with no carry in; c receives the carry out (sum wrapped
 * iff t < a).  t is scratch so a or b may alias r. */
#define bn_add_word(r, a, b, t, c) do { \
		t = a + b; \
		c = (t < a) ? 1 : 0; \
		r = t; \
	} while (0)

/* r = a + b + c (c in {0,1}); c receives the carry out.  Carry
 * occurs when the sum wraps below a, or when b + c itself wrapped
 * to zero (t == a while c was set). */
#define bn_addc_word(r, a, b, t, c) do { \
		t = a + b + c; \
		c = (t < a) ? 1 : ((c & (t == a)) ? 1 : 0); \
		r = t; \
	} while (0)
316
317
/*
 * Sequential ripple-carry add of two BN_NWORDS-word arrays:
 * r = a + b.  Returns the carry out of the top word.
 */
bn_word
bn_uadd_words_seq(bn_word *r, bn_word *a, bn_word *b)
{
	bn_word t, c = 0;

#define bn_uadd_words_seq_inner1(i) \
	bn_addc_word(r[i], a[i], b[i], t, c);

	/* Word 0 has no carry in; words 1..7 propagate it. */
	bn_add_word(r[0], a[0], b[0], t, c);
	bn_unroll_sf(bn_uadd_words_seq_inner1);
	return c;
}
328
329
/*
 * Same as bn_uadd_words_seq with b in __constant memory.  It
 * re-expands bn_uadd_words_seq_inner1, which only works because the
 * parameter names (r, a, b, t, c) match — keep them in sync.
 */
bn_word
bn_uadd_words_c_seq(bn_word *r, bn_word *a, __constant bn_word *b)
{
	bn_word t, c = 0;

	bn_add_word(r[0], a[0], b[0], t, c);
	bn_unroll_sf(bn_uadd_words_seq_inner1);
	return c;
}
338
339
/* r = a - b with no borrow in; c receives the borrow out (1 iff
 * a < b).  t is scratch so a or b may alias r. */
#define bn_sub_word(r, a, b, t, c) do { \
		t = a - b; \
		c = (a < b) ? 1 : 0; \
		r = t; \
	} while (0)

/*
 * r = a - b - c (c in {0,1}); c receives the borrow out.
 * A borrow occurs exactly when a < b + c as unbounded integers,
 * i.e. a < b, or a <= b while a borrow was pending.  (The latter
 * also covers b == 0xffffffff with c == 1, where b + c wraps.)
 *
 * Fix vs. the original: the old predicate ((!a && c) || (a < b))
 * missed the a == b (a != 0) with c == 1 case, silently dropping
 * the borrow and corrupting multi-word subtraction chains.
 */
#define bn_subb_word(r, a, b, t, c) do { \
		t = a - (b + c); \
		c = (((a) < (b)) || ((c) && ((a) <= (b)))) ? 1 : 0; \
		r = t; \
	} while (0)
351
352
/*
 * Sequential ripple-borrow subtract: r = a - b (mod 2^256).
 * Returns the borrow out of the top word (1 iff a < b).
 */
bn_word
bn_usub_words_seq(bn_word *r, bn_word *a, bn_word *b)
{
	bn_word t, c = 0;

#define bn_usub_words_seq_inner1(i) \
	bn_subb_word(r[i], a[i], b[i], t, c);

	/* Word 0 has no borrow in; words 1..7 propagate it. */
	bn_sub_word(r[0], a[0], b[0], t, c);
	bn_unroll_sf(bn_usub_words_seq_inner1);
	return c;
}
364
365
/*
 * Same as bn_usub_words_seq with b in __constant memory.  Re-expands
 * bn_usub_words_seq_inner1, relying on matching parameter names.
 */
bn_word
bn_usub_words_c_seq(bn_word *r, bn_word *a, __constant bn_word *b)
{
	bn_word t, c = 0;

	bn_sub_word(r[0], a[0], b[0], t, c);
	bn_unroll_sf(bn_usub_words_seq_inner1);
	return c;
}
374
375
/*
376
* Add/subtract better suited for AMD's VLIW architecture
377
*/
378
/*
 * Carry-lookahead add tuned for VLIW GPUs: all word sums and all
 * carry generate/propagate bits are computed independently (good
 * ILP), then the whole carry vector is resolved with one scalar
 * add.  Returns the carry out of the top word.
 */
bn_word
bn_uadd_words_vliw(bn_word *r, bn_word *a, bn_word *b)
{
	bignum x;
	bn_word c = 0, cp = 0;

	/* Raw per-word sums, ignoring carries. */
#define bn_uadd_words_vliw_inner1(i) \
	x.d[i] = a[i] + b[i];

	/* c: bit i set iff word i generates a carry (sum wrapped).
	 * cp: bit i set iff word i propagates one (sum == 0xffffffff). */
#define bn_uadd_words_vliw_inner2(i) \
	c |= (a[i] > x.d[i]) ? (1 << i) : 0; \
	cp |= (!~x.d[i]) ? (1 << i) : 0;

	/* Add the resolved carry-in bit to each word. */
#define bn_uadd_words_vliw_inner3(i) \
	r[i] = x.d[i] + ((c >> i) & 1);

	bn_unroll(bn_uadd_words_vliw_inner1);
	bn_unroll(bn_uadd_words_vliw_inner2);
	/* Adding the shifted generate mask to the propagate mask lets
	 * a carry ripple across each run of propagate bits in one
	 * integer add; the XOR recovers which positions got a carry. */
	c = ((cp + (c << 1)) ^ cp);
	r[0] = x.d[0];
	bn_unroll_sf(bn_uadd_words_vliw_inner3);
	return c >> BN_NWORDS;
}
401
402
/*
 * VLIW add with b in __constant memory.  Re-expands the
 * bn_uadd_words_vliw_inner* macros, relying on matching parameter
 * and local names — keep them in sync with bn_uadd_words_vliw.
 */
bn_word
bn_uadd_words_c_vliw(bn_word *r, bn_word *a, __constant bn_word *b)
{
	bignum x;
	bn_word c = 0, cp = 0;

	bn_unroll(bn_uadd_words_vliw_inner1);
	bn_unroll(bn_uadd_words_vliw_inner2);
	c = ((cp + (c << 1)) ^ cp);
	r[0] = x.d[0];
	bn_unroll_sf(bn_uadd_words_vliw_inner3);
	return c >> BN_NWORDS;
}
415
416
/*
 * Borrow-lookahead subtract, mirror of bn_uadd_words_vliw:
 * independent per-word differences plus borrow generate/propagate
 * bits, resolved in one scalar add.  Returns the borrow out of the
 * top word (1 iff a < b).
 */
bn_word
bn_usub_words_vliw(bn_word *r, bn_word *a, bn_word *b)
{
	bignum x;
	bn_word c = 0, cp = 0;

	/* Raw per-word differences, ignoring borrows. */
#define bn_usub_words_vliw_inner1(i) \
	x.d[i] = a[i] - b[i];

	/* c: bit i set iff word i generates a borrow (a[i] < b[i]).
	 * cp: bit i set iff it propagates one (difference == 0). */
#define bn_usub_words_vliw_inner2(i) \
	c |= (a[i] < b[i]) ? (1 << i) : 0; \
	cp |= (!x.d[i]) ? (1 << i) : 0;

	/* Subtract the resolved borrow-in bit from each word. */
#define bn_usub_words_vliw_inner3(i) \
	r[i] = x.d[i] - ((c >> i) & 1);

	bn_unroll(bn_usub_words_vliw_inner1);
	bn_unroll(bn_usub_words_vliw_inner2);
	/* Same lookahead trick as the add: ripple borrows across runs
	 * of propagate bits with one add, recover them with XOR. */
	c = ((cp + (c << 1)) ^ cp);
	r[0] = x.d[0];
	bn_unroll_sf(bn_usub_words_vliw_inner3);
	return c >> BN_NWORDS;
}
439
440
/*
 * VLIW subtract with b in __constant memory.  Re-expands the
 * bn_usub_words_vliw_inner* macros, relying on matching parameter
 * and local names — keep them in sync with bn_usub_words_vliw.
 */
bn_word
bn_usub_words_c_vliw(bn_word *r, bn_word *a, __constant bn_word *b)
{
	bignum x;
	bn_word c = 0, cp = 0;

	bn_unroll(bn_usub_words_vliw_inner1);
	bn_unroll(bn_usub_words_vliw_inner2);
	c = ((cp + (c << 1)) ^ cp);
	r[0] = x.d[0];
	bn_unroll_sf(bn_usub_words_vliw_inner3);
	return c >> BN_NWORDS;
}
453
454
455
#if defined(DEEP_VLIW)
456
#define bn_uadd_words bn_uadd_words_vliw
457
#define bn_uadd_words_c bn_uadd_words_c_vliw
458
#define bn_usub_words bn_usub_words_vliw
459
#define bn_usub_words_c bn_usub_words_c_vliw
460
#else
461
#define bn_uadd_words bn_uadd_words_seq
462
#define bn_uadd_words_c bn_uadd_words_c_seq
463
#define bn_usub_words bn_usub_words_seq
464
#define bn_usub_words_c bn_usub_words_c_seq
465
#endif
466
467
#define bn_uadd(r, a, b) bn_uadd_words((r)->d, (a)->d, (b)->d)
468
#define bn_uadd_c(r, a, b) bn_uadd_words_c((r)->d, (a)->d, b)
469
#define bn_usub(r, a, b) bn_usub_words((r)->d, (a)->d, (b)->d)
470
#define bn_usub_c(r, a, b) bn_usub_words_c((r)->d, (a)->d, b)
471
472
/*
473
* Modular add/sub
474
*/
475
476
/*
 * r = (a + b) mod p.  Assumes a, b < p, so a single conditional
 * subtraction (on carry out or r >= p) renormalizes the result.
 */
void
bn_mod_add(bignum *r, bignum *a, bignum *b)
{
	if (bn_uadd(r, a, b) ||
	    (bn_ucmp_ge_c(r, modulus)))
		bn_usub_c(r, r, modulus);
}
483
484
/* r = (a - b) mod p: on borrow, wrap back into range by adding p. */
void
bn_mod_sub(bignum *r, bignum *a, bignum *b)
{
	if (bn_usub(r, a, b))
		bn_uadd_c(r, r, modulus);
}
490
491
/*
 * bn = (2 * bn) mod p.  The top bit is sampled before the shift so
 * the overflow case is renormalized along with the bn >= p case.
 */
void
bn_mod_lshift1(bignum *bn)
{
	bn_word c = (bn->d[BN_NWORDS-1] & 0x80000000);
	bn_lshift1(bn);
	if (c || (bn_ucmp_ge_c(bn, modulus)))
		bn_usub_c(bn, bn, modulus);
}
499
500
/*
501
* Montgomery multiplication
502
*
503
* This includes normal multiplication of two "Montgomeryized"
504
* bignums, and bn_from_mont for de-Montgomeryizing a bignum.
505
*/
506
507
/* r = (a * w) + c, 32x32->64 via mul_hi; c receives the high word
 * of the product plus the carry from adding the old c. */
#define bn_mul_word(r, a, w, c, p, s) do { \
		r = (a * w) + c; \
		p = mul_hi(a, w); \
		c = (r < c) ? p + 1 : p; \
	} while (0)

/* r = r + (a * w) + c; c receives the new two-part carry (high
 * product word, plus up to two wraparound carries).  p and s are
 * scratch. */
#define bn_mul_add_word(r, a, w, c, p, s) do { \
		s = r + c; \
		p = mul_hi(a, w); \
		r = (a * w) + s; \
		c = (s < c) ? p + 1 : p; \
		if (r < s) c++; \
	} while (0)
520
/*
 * Montgomery multiplication: r = (a * b * R^-1) mod p, R = 2^256.
 * Interleaved (CIOS-style) operand scanning: each outer step adds
 * a * b[i], then adds m * p with m chosen from mont_n0 so the low
 * word cancels, and shifts the accumulator down one word (the
 * t.d[j-1] = t.d[j] copies).  tea/teb track the carries that spill
 * past the top word.  Inputs are assumed to be in Montgomery form.
 */
void
bn_mul_mont(bignum *r, bignum *a, bignum *b)
{
	bignum t;
	bn_word tea, teb, c, p, s, m;

#if !defined(VERY_EXPENSIVE_BRANCHES)
	int q;
#endif

	/* First partial product: t = a * b[0]. */
	c = 0;
#define bn_mul_mont_inner1(j) \
	bn_mul_word(t.d[j], a->d[j], b->d[0], c, p, s);
	bn_unroll(bn_mul_mont_inner1);
	tea = c;
	teb = 0;

	/* First reduction: add m * p so t.d[0] becomes zero, then
	 * shift the accumulator down one word. */
	c = 0;
	m = t.d[0] * mont_n0[0];
	bn_mul_add_word(t.d[0], modulus[0], m, c, p, s);
#define bn_mul_mont_inner2(j) \
	bn_mul_add_word(t.d[j], modulus[j], m, c, p, s); \
	t.d[j-1] = t.d[j];
	bn_unroll_sf(bn_mul_mont_inner2);
	t.d[BN_NWORDS-1] = tea + c;
	tea = teb + ((t.d[BN_NWORDS-1] < c) ? 1 : 0);

	/* Outer step i: accumulate a * b[i], then reduce and shift. */
#define bn_mul_mont_inner3_1(i, j) \
	bn_mul_add_word(t.d[j], a->d[j], b->d[i], c, p, s);
#define bn_mul_mont_inner3_2(i, j) \
	bn_mul_add_word(t.d[j], modulus[j], m, c, p, s); \
	t.d[j-1] = t.d[j];
#define bn_mul_mont_inner3(i) \
	c = 0; \
	bn_unroll_arg(bn_mul_mont_inner3_1, i); \
	tea += c; \
	teb = ((tea < c) ? 1 : 0); \
	c = 0; \
	m = t.d[0] * mont_n0[0]; \
	bn_mul_add_word(t.d[0], modulus[0], m, c, p, s); \
	bn_unroll_arg_sf(bn_mul_mont_inner3_2, i); \
	t.d[BN_NWORDS-1] = tea + c; \
	tea = teb + ((t.d[BN_NWORDS-1] < c) ? 1 : 0);

	/*
	 * The outer loop here is quite long, and we won't unroll it
	 * unless VERY_EXPENSIVE_BRANCHES is set.
	 */
#if defined(VERY_EXPENSIVE_BRANCHES)
	bn_unroll_sf(bn_mul_mont_inner3);
	/* Final conditional subtraction: keep t - p if a carry spilled
	 * or the subtraction did not borrow; otherwise keep t. */
	c = tea | !bn_usub_c(r, &t, modulus);
	if (!c)
		*r = t;

#else
	for (q = 1; q < BN_NWORDS; q++) {
		bn_mul_mont_inner3(q);
	}
	/* Cheap top-word screen: a subtraction can only be needed when
	 * a carry spilled or the top word reaches that of p. */
	c = tea || (t.d[BN_NWORDS-1] >= modulus[BN_NWORDS-1]);
	if (c) {
		c = tea | !bn_usub_c(r, &t, modulus);
		if (c)
			return;
	}
	*r = t;
#endif
}
587
588
/*
 * Convert out of Montgomery form: rb = b * R^-1 mod p (R = 2^256),
 * i.e. a full Montgomery reduction of b.  Works in a double-width
 * scratch array: each pass zeroes one low word by adding m * p,
 * leaving the result in the upper BN_NWORDS words.
 */
void
bn_from_mont(bignum *rb, bignum *b)
{
#define WORKSIZE ((2*BN_NWORDS) + 1)
	bn_word r[WORKSIZE];
	bn_word m, c, p, s;
#if defined(PRAGMA_UNROLL)
	int i;
#endif

	/* Copy the input to the working area */
	/* Zero the upper words */
#define bn_from_mont_inner1(i) \
	r[i] = b->d[i];
#define bn_from_mont_inner2(i) \
	r[BN_NWORDS+i] = 0;

	bn_unroll(bn_from_mont_inner1);
	bn_unroll(bn_from_mont_inner2);
	r[WORKSIZE-1] = 0;

	/* Multiply (long) by modulus */
#define bn_from_mont_inner3_1(i, j) \
	bn_mul_add_word(r[i+j], modulus[j], m, c, p, s);

	/* Carry out of word (BN_NWORDS + i) into the word above;
	 * branchy and branchless variants of the same step. */
#if !defined(VERY_EXPENSIVE_BRANCHES)
#define bn_from_mont_inner3_2(i) \
	if (r[BN_NWORDS + i] < c) \
		r[BN_NWORDS + i + 1] += 1;
#else
#define bn_from_mont_inner3_2(i) \
	r[BN_NWORDS + i + 1] += (r[BN_NWORDS + i] < c) ? 1 : 0;
#endif

	/* One reduction pass: m = r[i] * n0 makes r[i] + m*p = 0
	 * (mod 2^32), so word i is retired. */
#define bn_from_mont_inner3(i) \
	m = r[i] * mont_n0[0]; \
	c = 0; \
	bn_unroll_arg(bn_from_mont_inner3_1, i); \
	r[BN_NWORDS + i] += c; \
	bn_from_mont_inner3_2(i)

	/*
	 * The outer loop here is not very long, so we will unroll
	 * it by default.  However, it's just complicated enough to
	 * cause NVIDIA's compiler to take unreasonably long to compile
	 * it, unless we use pragma unroll.
	 */
#if !defined(PRAGMA_UNROLL)
	bn_iter(bn_from_mont_inner3);
#else
#pragma unroll 8
	for (i = 0; i < BN_NWORDS; i++) { bn_from_mont_inner3(i) }
#endif

	/*
	 * Make sure the result is less than the modulus.
	 * Subtracting is not much more expensive than compare, so
	 * subtract always and assign based on the carry out value.
	 */
	c = bn_usub_words_c(rb->d, &r[BN_NWORDS], modulus);
	if (c) {
		/* Subtraction borrowed: the unreduced value was already
		 * below p, so keep it. */
#define bn_from_mont_inner4(i) \
		rb->d[i] = r[BN_NWORDS + i];
		bn_unroll(bn_from_mont_inner4);
	}
}
654
655
/*
656
* Modular inversion
657
*/
658
659
/*
 * Modular inverse via the binary (right-shift) extended Euclidean
 * algorithm: r = n^-1 mod p, or r = 0 if no inverse exists.
 * Invariants maintained (up to the carry counters): x*n = b (mod p)
 * and y*n = -a (mod p).  xc/yc count how many multiples of p the
 * coefficients x/y have overflowed by, so full-width intermediate
 * values are avoided; each halving step shifts one carry bit back
 * into the top word.
 */
void
bn_mod_inverse(bignum *r, bignum *n)
{
	bignum a, b, x, y;
	int shift;
	bn_word xc, yc;
	for (shift = 0; shift < BN_NWORDS; shift++) {
		a.d[shift] = modulus[shift];
		x.d[shift] = 0;
		y.d[shift] = 0;
	}
	b = *n;
	x.d[0] = 1;
	xc = 0;
	yc = 0;
	while (!bn_is_zero(b)) {
		shift = 0;	/* dead store, retained as-is */
		/* Halve b while even; keep x even first by adding p if
		 * needed, then shift x and b together. */
		while (!bn_is_odd(b)) {
			if (bn_is_odd(x))
				xc += bn_uadd_c(&x, &x, modulus);
			bn_rshift1_2(&x, &b);
			x.d[7] |= (xc << 31);
			xc >>= 1;
		}

		/* Same halving step for a and its coefficient y. */
		while (!bn_is_odd(a)) {
			if (bn_is_odd(y))
				yc += bn_uadd_c(&y, &y, modulus);
			bn_rshift1_2(&y, &a);
			y.d[7] |= (yc << 31);
			yc >>= 1;
		}

		/* Subtract the smaller of a, b from the larger, and
		 * combine the corresponding coefficients. */
		if (bn_ucmp_ge(&b, &a)) {
			xc += yc + bn_uadd(&x, &x, &y);
			bn_usub(&b, &b, &a);
		} else {
			yc += xc + bn_uadd(&y, &y, &x);
			bn_usub(&a, &a, &b);
		}
	}

	if (!bn_is_one(a)) {
		/* no modular inverse */
		*r = bn_zero;
	} else {
		/* Compute y % m as cheaply as possible */
		/* NOTE(review): yc is unsigned; the loop subtracts p
		 * until yc wraps past zero (top bit set), then bn_neg
		 * folds -y back into [0, p).  Relies on unsigned
		 * wraparound — confirm against the host-side variant. */
		while (yc < 0x80000000)
			yc -= bn_usub_c(&y, &y, modulus);
		bn_neg(&y);
		*r = y;
	}
}
712
713
/*
714
* HASH FUNCTIONS
715
*
716
* BYTE ORDER NOTE: None of the hash functions below deal with byte
717
* order. The caller is expected to be aware of this when it stuffs
718
* data into in the native integer.
719
*
720
* NOTE #2: Endianness of the OpenCL device makes no difference here.
721
*/
722
723
#define hash256_unroll(a) unroll_8(a)
724
#define hash160_unroll(a) unroll_5(a)
725
#define hash256_iter(a) iter_8(a)
726
#define hash160_iter(a) iter_5(a)
727
728
729
/*
730
* SHA-2 256
731
*
732
* CAUTION: Input buffer will be overwritten/mangled.
733
* Data expected in big-endian format.
734
* This implementation is designed for space efficiency more than
735
* raw speed.
736
*/
737
738
__constant uint sha2_init[8] = {
739
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
740
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
741
};
742
743
__constant uint sha2_k[64] = {
744
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
745
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
746
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
747
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
748
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
749
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
750
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
751
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
752
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
753
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
754
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
755
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
756
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
757
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
758
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
759
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
760
};
761
762
/* Load the standard SHA-256 initial chaining values into out[0..7]. */
void
sha2_256_init(uint *out)
{
#define sha2_256_init_inner_1(i) \
	out[i] = sha2_init[i];

	hash256_unroll(sha2_256_init_inner_1);
}
770
771
/* The state variable remapping is really contorted */
772
#define sha2_stvar(vals, i, v) vals[(64+v-i) % 8]
773
#define sha2_s0(a) (rotate(a, 30U) ^ rotate(a, 19U) ^ rotate(a, 10U))
774
#define sha2_s1(a) (rotate(a, 26U) ^ rotate(a, 21U) ^ rotate(a, 7U))
775
#if defined(AMD_BFI_INT)
776
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
777
#define sha2_ch(a, b, c) amd_bytealign(a, b, c)
778
#define sha2_ma(a, b, c) amd_bytealign((a^c), b, a)
779
#else
780
#define sha2_ch(a, b, c) (c ^ (a & (b ^ c)))
781
#define sha2_ma(a, b, c) ((a & c) | (b & (a | c)))
782
#endif
783
784
/*
 * One SHA-256 compression over a 64-byte block.
 * out: 8-word chaining state, updated in place.
 * in: 16 words of message, big-endian words expected; OVERWRITTEN —
 * the message schedule W is expanded in place, indexed mod 16.
 * sha2_stvar renames the 8 working variables per round instead of
 * rotating them, so each round body is pure arithmetic.
 */
void
sha2_256_block(uint *out, uint *in)
{
	uint state[8], t1, t2;
#if defined(PRAGMA_UNROLL)
	int i;
#endif

	/* Working copy of the chaining state. */
#define sha2_256_block_inner_1(i) \
	state[i] = out[i];
	hash256_unroll(sha2_256_block_inner_1);

	/* Round i: expand W[i] in place (rounds 16..63), then the
	 * standard t1/t2 update with rotating variable names. */
#define sha2_256_block_inner_2(i) \
	if (i >= 16) { \
		t1 = in[(i + 1) % 16]; \
		t2 = in[(i + 14) % 16]; \
		in[i % 16] += (in[(i + 9) % 16] + \
			       (rotate(t1, 25U) ^ rotate(t1, 14U) ^ (t1 >> 3)) + \
			       (rotate(t2, 15U) ^ rotate(t2, 13U) ^ (t2 >> 10))); \
	} \
	t1 = (sha2_stvar(state, i, 7) + \
	      sha2_s1(sha2_stvar(state, i, 4)) + \
	      sha2_ch(sha2_stvar(state, i, 4), \
		      sha2_stvar(state, i, 5), \
		      sha2_stvar(state, i, 6)) + \
	      sha2_k[i] + \
	      in[i % 16]); \
	t2 = (sha2_s0(sha2_stvar(state, i, 0)) + \
	      sha2_ma(sha2_stvar(state, i, 0), \
		      sha2_stvar(state, i, 1), \
		      sha2_stvar(state, i, 2))); \
	sha2_stvar(state, i, 3) += t1; \
	sha2_stvar(state, i, 7) = t1 + t2; \

#if !defined(PRAGMA_UNROLL)
	iter_64(sha2_256_block_inner_2);
#else
#pragma unroll 64
	for (i = 0; i < 64; i++) { sha2_256_block_inner_2(i) }
#endif

	/* Feed-forward: add the saved state into the new state. */
#define sha2_256_block_inner_3(i) \
	out[i] += state[i];

	hash256_unroll(sha2_256_block_inner_3);
}
830
831
832
/*
833
* RIPEMD160
834
*
835
* Data expected in little-endian format.
836
*/
837
838
__constant uint ripemd160_iv[] = {
839
0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
840
__constant uint ripemd160_k[] = {
841
0x00000000, 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xA953FD4E };
842
__constant uint ripemd160_kp[] = {
843
0x50A28BE6, 0x5C4DD124, 0x6D703EF3, 0x7A6D76E9, 0x00000000 };
844
__constant uchar ripemd160_ws[] = {
845
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
846
7, 4, 13, 1, 10, 6, 15, 3, 12, 0, 9, 5, 2, 14, 11, 8,
847
3, 10, 14, 4, 9, 15, 8, 1, 2, 7, 0, 6, 13, 11, 5, 12,
848
1, 9, 11, 10, 0, 8, 12, 4, 13, 3, 7, 15, 14, 5, 6, 2,
849
4, 0, 5, 9, 7, 12, 2, 10, 14, 1, 3, 8, 11, 6, 15, 13,
850
};
851
__constant uchar ripemd160_wsp[] = {
852
5, 14, 7, 0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12,
853
6, 11, 3, 7, 0, 13, 5, 10, 14, 15, 8, 12, 4, 9, 1, 2,
854
15, 5, 1, 3, 7, 14, 6, 9, 11, 8, 12, 2, 10, 0, 4, 13,
855
8, 6, 4, 1, 3, 11, 15, 0, 5, 12, 2, 13, 9, 7, 10, 14,
856
12, 15, 10, 4, 1, 5, 8, 7, 6, 2, 13, 14, 0, 3, 9, 11
857
};
858
__constant uchar ripemd160_rl[] = {
859
11, 14, 15, 12, 5, 8, 7, 9, 11, 13, 14, 15, 6, 7, 9, 8,
860
7, 6, 8, 13, 11, 9, 7, 15, 7, 12, 15, 9, 11, 7, 13, 12,
861
11, 13, 6, 7, 14, 9, 13, 15, 14, 8, 13, 6, 5, 12, 7, 5,
862
11, 12, 14, 15, 14, 15, 9, 8, 9, 14, 5, 6, 8, 6, 5, 12,
863
9, 15, 5, 11, 6, 8, 13, 12, 5, 12, 13, 14, 11, 8, 5, 6,
864
};
865
__constant uchar ripemd160_rlp[] = {
866
8, 9, 9, 11, 13, 15, 15, 5, 7, 7, 8, 11, 14, 14, 12, 6,
867
9, 13, 15, 7, 12, 8, 9, 11, 7, 7, 12, 7, 6, 15, 13, 11,
868
9, 7, 15, 11, 8, 6, 6, 14, 12, 13, 5, 14, 13, 13, 7, 5,
869
15, 5, 8, 11, 14, 14, 6, 14, 6, 9, 12, 9, 12, 5, 15, 8,
870
8, 5, 12, 9, 12, 5, 14, 6, 8, 13, 6, 5, 15, 13, 11, 11
871
};
872
873
#define ripemd160_val(v, i, n) (v)[(80+(n)-(i)) % 5]
874
#define ripemd160_valp(v, i, n) (v)[5 + ((80+(n)-(i)) % 5)]
875
#if defined(AMD_BFI_INT)
876
#define ripemd160_f0(x, y, z) (x ^ y ^ z)
877
#define ripemd160_f1(x, y, z) amd_bytealign(x, y, z)
878
#define ripemd160_f2(x, y, z) (z ^ (x | ~y))
879
#define ripemd160_f3(x, y, z) amd_bytealign(z, x, y)
880
#define ripemd160_f4(x, y, z) (x ^ (y | ~z))
881
#else
882
#define ripemd160_f0(x, y, z) (x ^ y ^ z)
883
#define ripemd160_f1(x, y, z) ((x & y) | (~x & z))
884
#define ripemd160_f2(x, y, z) (z ^ (x | ~y))
885
#define ripemd160_f3(x, y, z) ((x & z) | (y & ~z))
886
#define ripemd160_f4(x, y, z) (x ^ (y | ~z))
887
#endif
888
#define ripemd160_round(i, in, vals, f, fp, t) do { \
889
ripemd160_val(vals, i, 0) = \
890
rotate(ripemd160_val(vals, i, 0) + \
891
f(ripemd160_val(vals, i, 1), \
892
ripemd160_val(vals, i, 2), \
893
ripemd160_val(vals, i, 3)) + \
894
in[ripemd160_ws[i]] + \
895
ripemd160_k[i / 16], \
896
(uint)ripemd160_rl[i]) + \
897
ripemd160_val(vals, i, 4); \
898
ripemd160_val(vals, i, 2) = \
899
rotate(ripemd160_val(vals, i, 2), 10U); \
900
ripemd160_valp(vals, i, 0) = \
901
rotate(ripemd160_valp(vals, i, 0) + \
902
fp(ripemd160_valp(vals, i, 1), \
903
ripemd160_valp(vals, i, 2), \
904
ripemd160_valp(vals, i, 3)) + \
905
in[ripemd160_wsp[i]] + \
906
ripemd160_kp[i / 16], \
907
(uint)ripemd160_rlp[i]) + \
908
ripemd160_valp(vals, i, 4); \
909
ripemd160_valp(vals, i, 2) = \
910
rotate(ripemd160_valp(vals, i, 2), 10U); \
911
} while (0)
912
913
/* Load the standard RIPEMD-160 initial chaining values into out[0..4]. */
void
ripemd160_init(uint *out)
{
#define ripemd160_init_inner_1(i) \
	out[i] = ripemd160_iv[i];

	hash160_unroll(ripemd160_init_inner_1);
}
921
922
/*
 * One RIPEMD-160 compression over a 64-byte block.
 * out: 5-word chaining state, updated in place.
 * in: 16 little-endian message words (not modified).
 * vals[0..4] is the left line, vals[5..9] the right line; each
 * ripemd160_round advances both lines with the round functions
 * given (which run in opposite orders on the two lines).
 */
void
ripemd160_block(uint *out, uint *in)
{
	uint vals[10], t;
#if defined(PRAGMA_UNROLL)
	int i;
#endif

	/* Both lines start from the current chaining state. */
#define ripemd160_block_inner_1(i) \
	vals[i] = vals[i + 5] = out[i];

	hash160_unroll(ripemd160_block_inner_1);

	/* Five groups of 16 rounds; left line uses f0..f4, right
	 * line f4..f0. */
#define ripemd160_block_inner_p0(i) \
	ripemd160_round(i, in, vals, \
			ripemd160_f0, ripemd160_f4, t);
#define ripemd160_block_inner_p1(i) \
	ripemd160_round((16 + i), in, vals, \
			ripemd160_f1, ripemd160_f3, t);
#define ripemd160_block_inner_p2(i) \
	ripemd160_round((32 + i), in, vals, \
			ripemd160_f2, ripemd160_f2, t);
#define ripemd160_block_inner_p3(i) \
	ripemd160_round((48 + i), in, vals, \
			ripemd160_f3, ripemd160_f1, t);
#define ripemd160_block_inner_p4(i) \
	ripemd160_round((64 + i), in, vals, \
			ripemd160_f4, ripemd160_f0, t);

#if !defined(PRAGMA_UNROLL)
	iter_16(ripemd160_block_inner_p0);
	iter_16(ripemd160_block_inner_p1);
	iter_16(ripemd160_block_inner_p2);
	iter_16(ripemd160_block_inner_p3);
	iter_16(ripemd160_block_inner_p4);
#else
#pragma unroll 16
	for (i = 0; i < 16; i++) { ripemd160_block_inner_p0(i); }
#pragma unroll 16
	for (i = 0; i < 16; i++) { ripemd160_block_inner_p1(i); }
#pragma unroll 16
	for (i = 0; i < 16; i++) { ripemd160_block_inner_p2(i); }
#pragma unroll 16
	for (i = 0; i < 16; i++) { ripemd160_block_inner_p3(i); }
#pragma unroll 16
	for (i = 0; i < 16; i++) { ripemd160_block_inner_p4(i); }
#endif

	/* Final cross-line combination (indices account for the
	 * rotated variable naming after 80 rounds). */
	t = out[1] + vals[2] + vals[8];
	out[1] = out[2] + vals[3] + vals[9];
	out[2] = out[3] + vals[4] + vals[5];
	out[3] = out[4] + vals[0] + vals[6];
	out[4] = out[0] + vals[1] + vals[7];
	out[0] = t;
}
977
978
979
#ifdef TEST_KERNELS
980
/*
981
* Test kernels
982
*/
983
984
/* Montgomery multiplication test kernel */
985
/*
 * Montgomery multiplication test kernel: each work item multiplies
 * one pair of consecutive bignums from nums_in and writes the
 * product to products_out[id].
 */
__kernel void
test_mul_mont(__global bignum *products_out, __global bignum *nums_in)
{
	bignum a, b, c;
	int o;
	o = get_global_id(0);
	nums_in += (2*o);

	a = nums_in[0];
	b = nums_in[1];
	bn_mul_mont(&c, &a, &b);
	products_out[o] = c;
}
998
999
/* modular inversion test kernel */
1000
/*
 * Modular inversion test kernel: each work item inverts `count`
 * consecutive bignums from nums_in, writing results to inv_out at
 * the same offsets.
 */
__kernel void
test_mod_inverse(__global bignum *inv_out, __global bignum *nums_in,
		 int count)
{
	bignum x, xp;
	int i, o;
	o = get_global_id(0) * count;
	for (i = 0; i < count; i++) {
		x = nums_in[o];
		bn_mod_inverse(&xp, &x);
		inv_out[o++] = xp;
	}
}
1013
#endif /* TEST_KERNELS */
1014
1015
1016
#define ACCESS_BUNDLE 1024
1017
#define ACCESS_STRIDE (ACCESS_BUNDLE/BN_NWORDS)
1018
1019
/*
 * Compute P = Row[gid0] + Column[gid1] for every grid cell (see the
 * file header): the affine row point (x1, y1) is added to the
 * column point (rx, ry), producing the numerator components of the
 * result plus the shared denominator Pz = x1 - rx, which is written
 * to z_heap for batched inversion by heap_invert.
 * Buffers are laid out in strided bundles (ACCESS_BUNDLE words per
 * ACCESS_STRIDE cells) so each wavefront writes contiguous memory.
 */
__kernel void
ec_add_grid(__global bn_word *points_out, __global bn_word *z_heap,
	    __global bn_word *row_in, __global bignum *col_in)
{
	bignum rx, ry;
	bignum x1, y1, a, b, c, d, e, z;
	bn_word cy;
	int i, cell, start;

	/* Load the row increment point */
	i = 2 * get_global_id(1);
	rx = col_in[i];
	ry = col_in[i+1];

	/* Strided load of the cell's x coordinate, then y. */
	cell = get_global_id(0);
	start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (cell % (ACCESS_STRIDE/2)));

#define ec_add_grid_inner_1(i) \
	x1.d[i] = row_in[start + (i*ACCESS_STRIDE)];

	bn_unroll(ec_add_grid_inner_1);
	start += (ACCESS_STRIDE/2);

#define ec_add_grid_inner_2(i) \
	y1.d[i] = row_in[start + (i*ACCESS_STRIDE)];

	bn_unroll(ec_add_grid_inner_2);

	/* Denominator Pz = x1 - rx, stored for batch inversion. */
	bn_mod_sub(&z, &x1, &rx);

	cell += (get_global_id(1) * get_global_size(0));
	start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (cell % ACCESS_STRIDE));

#define ec_add_grid_inner_3(i) \
	z_heap[start + (i*ACCESS_STRIDE)] = z.d[i];

	bn_unroll(ec_add_grid_inner_3);

	/* Point-addition intermediates (Montgomery arithmetic):
	 * b = y1 - ry, c = x1 + rx, d = y1 + ry, e = c * z^2;
	 * the x numerator is b^2 - e. */
	bn_mod_sub(&b, &y1, &ry);
	bn_mod_add(&c, &x1, &rx);
	bn_mod_add(&d, &y1, &ry);
	bn_mul_mont(&y1, &b, &b);
	bn_mul_mont(&x1, &z, &z);
	bn_mul_mont(&e, &c, &x1);
	bn_mod_sub(&y1, &y1, &e);

	/*
	 * This disgusting code caters to the global memory unit on
	 * various GPUs, by giving it a nice contiguous patch to write
	 * per warp/wavefront.
	 */
	start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (cell % (ACCESS_STRIDE/2)));

	/* Store the x numerator (Pxj). */
#define ec_add_grid_inner_4(i) \
	points_out[start + (i*ACCESS_STRIDE)] = y1.d[i];

	bn_unroll(ec_add_grid_inner_4);

	/* y numerator: (e - 2*Pxj) * b - d * z^3, then halved mod p.
	 * NOTE(review): the final halving matches the formula variant
	 * the host side expects — confirm against the CPU code. */
	bn_mod_lshift1(&y1);
	bn_mod_sub(&y1, &e, &y1);
	bn_mul_mont(&y1, &y1, &b);
	bn_mul_mont(&a, &x1, &z);
	bn_mul_mont(&c, &d, &a);
	bn_mod_sub(&y1, &y1, &c);
	/* Divide by 2 mod p: make even by adding p if odd, shift
	 * right, and restore the carried-out bit at the top. */
	cy = 0;
	if (bn_is_odd(y1))
		cy = bn_uadd_c(&y1, &y1, modulus);
	bn_rshift1(&y1);
	y1.d[BN_NWORDS-1] |= (cy ? 0x80000000 : 0);

	start += (ACCESS_STRIDE/2);

	/* Store the y numerator (Pyj) in the second half-stride. */
	bn_unroll(ec_add_grid_inner_4);
}
1096
1097
/*
 * heap_invert: batch modular inversion using a product tree
 * (Montgomery's simultaneous-inversion trick).
 *
 * Each work item owns one column of the heap; cells belonging to this
 * work item are spaced get_global_size(0) apart, and each cell's bignum
 * words are laid out with the ACCESS_STRIDE/ACCESS_BUNDLE interleaving
 * used throughout this file.
 *
 * Upward pass: pairwise-multiply 'batch' leaf denominators, appending
 * each product to the heap, until a single root product remains.
 * The root is inverted once with bn_mod_inverse, then the tree is
 * walked back down, recovering the inverse of every leaf with two
 * multiplications per node.  Result: z_heap's leaf cells are replaced
 * by their modular inverses.
 *
 * 'batch' is the number of leaves (per HEAD: a power of 2).
 */
__kernel void
heap_invert(__global bn_word *z_heap, int batch)
{
	bignum a, b, c, z;
	int i, off, lcell, hcell, start;

	/*
	 * Strided load/store helpers; each reads/writes word j of a
	 * bignum whose cell base offset is 'start'.
	 */
#define heap_invert_inner_load_a(j) \
	a.d[j] = z_heap[start + j*ACCESS_STRIDE];
#define heap_invert_inner_load_b(j) \
	b.d[j] = z_heap[start + j*ACCESS_STRIDE];
#define heap_invert_inner_load_z(j) \
	z.d[j] = z_heap[start + j*ACCESS_STRIDE];
#define heap_invert_inner_store_z(j) \
	z_heap[start + j*ACCESS_STRIDE] = z.d[j];
#define heap_invert_inner_store_c(j) \
	z_heap[start + j*ACCESS_STRIDE] = c.d[j];

	/* Cell stride between this work item's consecutive heap cells */
	off = get_global_size(0);
	/* lcell scans the pair being multiplied; hcell receives products */
	lcell = get_global_id(0);
	hcell = (off * batch) + lcell;

	/*
	 * Upward pass: batch-1 pairwise products.  Consumes cells in
	 * order (including earlier products) and appends each product
	 * at hcell; the final product (the root) stays in register z.
	 */
	for (i = 0; i < (batch-1); i++) {

		start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
			 (lcell % ACCESS_STRIDE));

		bn_unroll(heap_invert_inner_load_a);

		lcell += off;
		start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
			 (lcell % ACCESS_STRIDE));

		bn_unroll(heap_invert_inner_load_b);

		bn_mul_mont(&z, &a, &b);

		start = (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
			 (hcell % ACCESS_STRIDE));

		bn_unroll(heap_invert_inner_store_z);

		lcell += off;
		hcell += off;
	}

	/* Invert the root, fix up 1/ZR -> R/Z */
	bn_mod_inverse(&z, &z);

	/*
	 * Two Montgomery multiplications by R^2 (mont_rr) restore the
	 * Montgomery factor lost by inverting a Montgomery-form value.
	 */
#define heap_invert_inner_1(i) \
	a.d[i] = mont_rr[i];

	bn_unroll(heap_invert_inner_1);

	bn_mul_mont(&z, &z, &a);
	bn_mul_mont(&z, &z, &a);

	/* Unroll the first iteration to avoid a load/store on the root */
	lcell -= (off << 1);
	hcell -= (off << 1);

	/*
	 * Downward step: given inverse z of a product a*b, the child
	 * inverses are inv(b) = a*z and inv(a) = b*z; write them back
	 * over the children's cells.
	 */
	start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (lcell % ACCESS_STRIDE));
	bn_unroll(heap_invert_inner_load_a);

	lcell += off;
	start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (lcell % ACCESS_STRIDE));
	bn_unroll(heap_invert_inner_load_b);

	bn_mul_mont(&c, &a, &z);

	bn_unroll(heap_invert_inner_store_c);

	bn_mul_mont(&c, &b, &z);

	lcell -= off;
	start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (lcell % ACCESS_STRIDE));
	bn_unroll(heap_invert_inner_store_c);

	lcell -= (off << 1);

	/*
	 * Remaining batch-2 internal nodes: reload each node's (now
	 * inverted) value from hcell and push the inverses down to its
	 * two children at lcell / lcell+off.
	 */
	for (i = 0; i < (batch-2); i++) {
		start = (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
			 (hcell % ACCESS_STRIDE));
		bn_unroll(heap_invert_inner_load_z);

		start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
			 (lcell % ACCESS_STRIDE));
		bn_unroll(heap_invert_inner_load_a);

		lcell += off;
		start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
			 (lcell % ACCESS_STRIDE));
		bn_unroll(heap_invert_inner_load_b);

		bn_mul_mont(&c, &a, &z);

		bn_unroll(heap_invert_inner_store_c);

		bn_mul_mont(&c, &b, &z);

		lcell -= off;
		start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
			 (lcell % ACCESS_STRIDE));
		bn_unroll(heap_invert_inner_store_c);

		lcell -= (off << 1);
		hcell -= off;
	}
}
1207
1208
/*
 * hash_ec_point: finish an EC point and compute its bitcoin hash160.
 *
 * xy points at the strided (X*Z^2, Y*Z^3) numerators produced by
 * ec_add_grid; zip points at the strided 1/Z value produced by
 * heap_invert (both in Montgomery form, both pre-offset by the caller).
 * The affine coordinates are recovered as X/Z^2 and Y/Z^3, serialized
 * as a 65-byte uncompressed public key (0x04 || X || Y), and hashed
 * with SHA-256 then RIPEMD-160.  hash_out receives the five 32-bit
 * words of the RIPEMD-160 result.
 */
void
hash_ec_point(uint *hash_out, __global bn_word *xy, __global bn_word *zip)
{
	uint hash1[16], hash2[16];
	bignum c, zi, zzi;
	bn_word wh, wl;

	/*
	 * Multiply the coordinates by the inverted Z values.
	 * Stash the coordinates in the hash buffer.
	 * SHA-2 requires big endian, and our intended hash input
	 * is big-endian, so swapping is unnecessary, but
	 * inserting the format byte in front causes a headache.
	 */
	/* Load 1/Z (strided) */
#define hash_ec_point_inner_1(i) \
	zi.d[i] = zip[i*ACCESS_STRIDE];

	bn_unroll(hash_ec_point_inner_1);

	bn_mul_mont(&zzi, &zi, &zi);	/* 1 / Z^2 */

	/* Load the X numerator (strided) */
#define hash_ec_point_inner_2(i) \
	c.d[i] = xy[i*ACCESS_STRIDE];

	bn_unroll(hash_ec_point_inner_2);

	bn_mul_mont(&c, &c, &zzi);	/* X / Z^2 */
	bn_from_mont(&c, &c);		/* leave Montgomery form */

	wh = 0x00000004;	/* POINT_CONVERSION_UNCOMPRESSED */

	/*
	 * Pack the 0x04 prefix byte followed by big-endian X into
	 * hash1[0..BN_NWORDS-1]; wl/wh carry the byte spill between
	 * adjacent words (each output word takes the low byte of the
	 * previous word and the high three bytes of the current one).
	 */
#define hash_ec_point_inner_3(i) \
	wl = wh; \
	wh = c.d[(BN_NWORDS - 1) - i]; \
	hash1[i] = (wl << 24) | (wh >> 8);

	bn_unroll(hash_ec_point_inner_3);

	bn_mul_mont(&zzi, &zzi, &zi);	/* 1 / Z^3 */

	/* Load the Y numerator (second half of the strided bundle) */
#define hash_ec_point_inner_4(i) \
	c.d[i] = xy[(ACCESS_STRIDE/2) + i*ACCESS_STRIDE];

	bn_unroll(hash_ec_point_inner_4);

	bn_mul_mont(&c, &c, &zzi);	/* Y / Z^3 */
	bn_from_mont(&c, &c);

	/* Pack big-endian Y into hash1[BN_NWORDS..2*BN_NWORDS-1] */
#define hash_ec_point_inner_5(i) \
	wl = wh; \
	wh = c.d[(BN_NWORDS - 1) - i]; \
	hash1[BN_NWORDS + i] = (wl << 24) | (wh >> 8);

	bn_unroll(hash_ec_point_inner_5);

	/*
	 * Hash the first 64 bytes of the buffer
	 */
	sha2_256_init(hash2);
	sha2_256_block(hash2, hash1);

	/*
	 * Hash the last byte of the buffer + SHA-2 padding
	 * (wh still holds the final Y word; 0x80 pad byte follows,
	 * and the bit length 65*8 goes in the last word).
	 */
	hash1[0] = wh << 24 | 0x800000;
	hash1[1] = 0;
	hash1[2] = 0;
	hash1[3] = 0;
	hash1[4] = 0;
	hash1[5] = 0;
	hash1[6] = 0;
	hash1[7] = 0;
	hash1[8] = 0;
	hash1[9] = 0;
	hash1[10] = 0;
	hash1[11] = 0;
	hash1[12] = 0;
	hash1[13] = 0;
	hash1[14] = 0;
	hash1[15] = 65 * 8;
	sha2_256_block(hash2, hash1);

	/*
	 * Hash the SHA-2 result with RIPEMD160
	 * Unfortunately, SHA-2 outputs big-endian, but
	 * RIPEMD160 expects little-endian. Need to swap!
	 */

#define hash_ec_point_inner_6(i) \
	hash2[i] = bswap32(hash2[i]);

	hash256_unroll(hash_ec_point_inner_6);

	/* RIPEMD-160 padding for a 32-byte message (length in bits at [14]) */
	hash2[8] = bswap32(0x80000000);
	hash2[9] = 0;
	hash2[10] = 0;
	hash2[11] = 0;
	hash2[12] = 0;
	hash2[13] = 0;
	hash2[14] = 32 * 8;
	hash2[15] = 0;
	ripemd160_init(hash_out);
	ripemd160_block(hash_out, hash2);
}
1312
1313
1314
/*
 * hash_ec_point_get: finish one grid point, hash it, and write the
 * resulting hash160 to hashes_out (five words per work item).
 */
__kernel void
hash_ec_point_get(__global uint *hashes_out,
		  __global bn_word *points_in, __global bn_word *z_heap)
{
	uint h160[5];
	int idx, zoff, poff;

	/* Flat index of this work item within the 2D grid */
	idx = ((get_global_id(1) * get_global_size(0)) + get_global_id(0));

	/* Strided offset of this cell's inverted denominator in z_heap */
	zoff = (((idx / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		(idx % ACCESS_STRIDE));

	/* Strided offset of this cell's (x, y) numerators in points_in */
	poff = ((((2 * idx) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		(idx % (ACCESS_STRIDE/2)));

	/* Complete the coordinates and hash */
	hash_ec_point(h160, points_in + poff, z_heap + zoff);

	/* Each work item owns five consecutive output words */
	hashes_out += (5 * idx);

	/* Output the hash in proper byte-order */
#define hash_ec_point_get_out(j) \
	hashes_out[j] = load_le32(h160[j]);

	hash160_unroll(hash_ec_point_get_out);
}
1343
1344
/*
 * Normally this would be one function that compared two hash160s.
 * This one compares a hash160 with an upper and lower bound in one
 * function to work around a problem with AMD's OpenCL compiler.
 *
 * 'bound' holds ten big-endian words: a lower bound in bound[0..4]
 * and an upper bound in bound[5..9].  Returns:
 *   -1 if a < lower bound
 *    0 if lower bound <= a <= upper bound
 *    1 if a > upper bound
 */
int
hash160_ucmp_g(uint *a, __global uint *bound)
{
	uint gv;

	/*
	 * Compare against the lower bound word-by-word: below it,
	 * return -1; strictly above it, stop and check the upper
	 * bound; equal so far, continue with the next word.
	 */
#define hash160_ucmp_g_inner_1(i) \
	gv = load_be32(bound[i]); \
	if (a[i] < gv) return -1; \
	if (a[i] > gv) break;

	hash160_iter(hash160_ucmp_g_inner_1);

	/*
	 * Compare against the upper bound: below it means in range
	 * (return 0); above it means out of range (return 1).
	 */
#define hash160_ucmp_g_inner_2(i) \
	gv = load_be32(bound[5+i]); \
	if (a[i] < gv) return 0; \
	if (a[i] > gv) return 1;

	hash160_iter(hash160_ucmp_g_inner_2);
	/* Equal to the upper bound: still in range */
	return 0;
}
1369
1370
/*
 * hash_ec_point_search_prefix: finish one grid point, compute its
 * hash160, and binary-search the sorted target table of
 * [lower, upper] bound pairs for a match.  On a hit, the work-item
 * index, table index, and hash value are written to 'found'.
 */
__kernel void
hash_ec_point_search_prefix(__global uint *found,
			    __global bn_word *points_in,
			    __global bn_word *z_heap,
			    __global uint *target_table, int ntargets)
{
	uint h160[5];
	int idx, off, lo, hi, mid, cmp;

	/* Flat index of this work item within the 2D grid */
	idx = ((get_global_id(1) * get_global_size(0)) + get_global_id(0));

	/* Locate this cell's inverted denominator in z_heap */
	off = (((idx / ACCESS_STRIDE) * ACCESS_BUNDLE) +
	       (idx % ACCESS_STRIDE));
	z_heap += off;

	/* Locate this cell's (x, y) numerators in points_in */
	off = ((((2 * idx) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
	       (idx % (ACCESS_STRIDE/2)));
	points_in += off;

	/* Complete the coordinates and hash */
	hash_ec_point(h160, points_in, z_heap);

	/*
	 * Unconditionally byteswap the hash result, because:
	 * - The byte-level convention of RIPEMD160 is little-endian
	 * - We are comparing it in big-endian order
	 */
#define hash_ec_point_search_prefix_swap(j) \
	h160[j] = bswap32(h160[j]);

	hash160_unroll(hash_ec_point_search_prefix_swap);

	/* Binary-search the target table for the hash we just computed */
	lo = 0;
	hi = ntargets - 1;
	while (lo <= hi) {
		mid = lo + ((hi - lo) >> 1);
		cmp = hash160_ucmp_g(h160, &target_table[10*mid]);
		if (cmp > 0) {
			lo = mid + 1;
		} else if (cmp < 0) {
			hi = mid - 1;
		} else {
			/* Match: record work item, table slot, and
			 * (for debugging purposes) the hash value */
			found[0] = idx;
			found[1] = mid;

#define hash_ec_point_search_prefix_rec(j) \
			found[j+2] = load_be32(h160[j]);

			hash160_unroll(hash_ec_point_search_prefix_rec);
			break;
		}
	}
}
1422
1423