Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
samr7
GitHub Repository: samr7/vanitygen
Path: blob/master/oclengine.c
239 views
1
/*
2
* Vanitygen, vanity bitcoin address generator
3
* Copyright (C) 2011 <[email protected]>
4
*
5
* Vanitygen is free software: you can redistribute it and/or modify
6
* it under the terms of the GNU Affero General Public License as published by
7
* the Free Software Foundation, either version 3 of the License, or
8
* any later version.
9
*
10
* Vanitygen is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU Affero General Public License for more details.
14
*
15
* You should have received a copy of the GNU Affero General Public License
16
* along with Vanitygen. If not, see <http://www.gnu.org/licenses/>.
17
*/
18
19
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <math.h>
#include <assert.h>

#include <pthread.h>

#include <openssl/ec.h>
#include <openssl/bn.h>
#include <openssl/rand.h>
#include <openssl/evp.h>

#ifdef __APPLE__
#include <OpenCL/cl.h>
#ifndef CL_CALLBACK
#define CL_CALLBACK
#endif
#else
#include <CL/cl.h>
#endif

#include "oclengine.h"
#include "pattern.h"
#include "util.h"
43
44
45
#define MAX_SLOT 2
46
#define MAX_ARG 6
47
#define MAX_KERNEL 3
48
49
#define is_pow2(v) (!((v) & ((v)-1)))
50
#define round_up_pow2(x, a) (((x) + ((a)-1)) & ~((a)-1))
51
52
static void vg_ocl_free_args(vg_ocl_context_t *vocp);
53
static void *vg_opencl_loop(vg_exec_context_t *arg);
54
55
56
/* OpenCL address searching mode */
57
struct _vg_ocl_context_s;
58
typedef int (*vg_ocl_init_t)(struct _vg_ocl_context_s *);
59
typedef int (*vg_ocl_check_t)(struct _vg_ocl_context_s *, int slot);
60
61
struct _vg_ocl_context_s {
62
vg_exec_context_t base;
63
cl_device_id voc_ocldid;
64
cl_context voc_oclctx;
65
cl_command_queue voc_oclcmdq;
66
cl_program voc_oclprog;
67
vg_ocl_init_t voc_init_func;
68
vg_ocl_init_t voc_rekey_func;
69
vg_ocl_check_t voc_check_func;
70
int voc_quirks;
71
int voc_nslots;
72
cl_kernel voc_oclkernel[MAX_SLOT][MAX_KERNEL];
73
cl_event voc_oclkrnwait[MAX_SLOT];
74
cl_mem voc_args[MAX_SLOT][MAX_ARG];
75
size_t voc_arg_size[MAX_SLOT][MAX_ARG];
76
77
int voc_pattern_rewrite;
78
int voc_pattern_alloc;
79
80
vg_ocl_check_t voc_verify_func[MAX_KERNEL];
81
82
pthread_t voc_ocl_thread;
83
pthread_mutex_t voc_lock;
84
pthread_cond_t voc_wait;
85
int voc_ocl_slot;
86
int voc_ocl_rows;
87
int voc_ocl_cols;
88
int voc_ocl_invsize;
89
int voc_halt;
90
int voc_dump_done;
91
};
92
93
94
/* Thread synchronization stubs */
95
void
96
vg_exec_downgrade_lock(vg_exec_context_t *vxcp)
97
{
98
}
99
100
int
101
vg_exec_upgrade_lock(vg_exec_context_t *vxcp)
102
{
103
return 0;
104
}
105
106
107
/*
108
* OpenCL debugging and support
109
*/
110
111
static const char *
112
vg_ocl_strerror(cl_int ret)
113
{
114
#define OCL_STATUS(st) case st: return #st;
115
switch (ret) {
116
OCL_STATUS(CL_SUCCESS);
117
OCL_STATUS(CL_DEVICE_NOT_FOUND);
118
OCL_STATUS(CL_DEVICE_NOT_AVAILABLE);
119
OCL_STATUS(CL_COMPILER_NOT_AVAILABLE);
120
OCL_STATUS(CL_MEM_OBJECT_ALLOCATION_FAILURE);
121
OCL_STATUS(CL_OUT_OF_RESOURCES);
122
OCL_STATUS(CL_OUT_OF_HOST_MEMORY);
123
OCL_STATUS(CL_PROFILING_INFO_NOT_AVAILABLE);
124
OCL_STATUS(CL_MEM_COPY_OVERLAP);
125
OCL_STATUS(CL_IMAGE_FORMAT_MISMATCH);
126
OCL_STATUS(CL_IMAGE_FORMAT_NOT_SUPPORTED);
127
OCL_STATUS(CL_BUILD_PROGRAM_FAILURE);
128
OCL_STATUS(CL_MAP_FAILURE);
129
#if defined(CL_MISALIGNED_SUB_BUFFER_OFFSET)
130
OCL_STATUS(CL_MISALIGNED_SUB_BUFFER_OFFSET);
131
#endif /* defined(CL_MISALIGNED_SUB_BUFFER_OFFSET) */
132
#if defined(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
133
OCL_STATUS(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
134
#endif /* defined(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) */
135
OCL_STATUS(CL_INVALID_VALUE);
136
OCL_STATUS(CL_INVALID_DEVICE_TYPE);
137
OCL_STATUS(CL_INVALID_PLATFORM);
138
OCL_STATUS(CL_INVALID_DEVICE);
139
OCL_STATUS(CL_INVALID_CONTEXT);
140
OCL_STATUS(CL_INVALID_QUEUE_PROPERTIES);
141
OCL_STATUS(CL_INVALID_COMMAND_QUEUE);
142
OCL_STATUS(CL_INVALID_HOST_PTR);
143
OCL_STATUS(CL_INVALID_MEM_OBJECT);
144
OCL_STATUS(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
145
OCL_STATUS(CL_INVALID_IMAGE_SIZE);
146
OCL_STATUS(CL_INVALID_SAMPLER);
147
OCL_STATUS(CL_INVALID_BINARY);
148
OCL_STATUS(CL_INVALID_BUILD_OPTIONS);
149
OCL_STATUS(CL_INVALID_PROGRAM);
150
OCL_STATUS(CL_INVALID_PROGRAM_EXECUTABLE);
151
OCL_STATUS(CL_INVALID_KERNEL_NAME);
152
OCL_STATUS(CL_INVALID_KERNEL_DEFINITION);
153
OCL_STATUS(CL_INVALID_KERNEL);
154
OCL_STATUS(CL_INVALID_ARG_INDEX);
155
OCL_STATUS(CL_INVALID_ARG_VALUE);
156
OCL_STATUS(CL_INVALID_ARG_SIZE);
157
OCL_STATUS(CL_INVALID_KERNEL_ARGS);
158
OCL_STATUS(CL_INVALID_WORK_DIMENSION);
159
OCL_STATUS(CL_INVALID_WORK_GROUP_SIZE);
160
OCL_STATUS(CL_INVALID_WORK_ITEM_SIZE);
161
OCL_STATUS(CL_INVALID_GLOBAL_OFFSET);
162
OCL_STATUS(CL_INVALID_EVENT_WAIT_LIST);
163
OCL_STATUS(CL_INVALID_EVENT);
164
OCL_STATUS(CL_INVALID_OPERATION);
165
OCL_STATUS(CL_INVALID_GL_OBJECT);
166
OCL_STATUS(CL_INVALID_BUFFER_SIZE);
167
OCL_STATUS(CL_INVALID_MIP_LEVEL);
168
OCL_STATUS(CL_INVALID_GLOBAL_WORK_SIZE);
169
#if defined(CL_INVALID_PROPERTY)
170
OCL_STATUS(CL_INVALID_PROPERTY);
171
#endif /* defined(CL_INVALID_PROPERTY) */
172
#undef OCL_STATUS
173
default: {
174
static char tmp[64];
175
snprintf(tmp, sizeof(tmp), "Unknown code %d", ret);
176
return tmp;
177
}
178
}
179
}
180
181
/* Get device strings, using a static buffer -- caveat emptor */
182
static const char *
183
vg_ocl_platform_getstr(cl_platform_id pid, cl_platform_info param)
184
{
185
static char platform_str[1024];
186
cl_int ret;
187
size_t size_ret;
188
ret = clGetPlatformInfo(pid, param,
189
sizeof(platform_str), platform_str,
190
&size_ret);
191
if (ret != CL_SUCCESS) {
192
snprintf(platform_str, sizeof(platform_str),
193
"clGetPlatformInfo(%d): %s",
194
param, vg_ocl_strerror(ret));
195
}
196
return platform_str;
197
}
198
199
static cl_platform_id
200
vg_ocl_device_getplatform(cl_device_id did)
201
{
202
cl_int ret;
203
cl_platform_id val;
204
size_t size_ret;
205
ret = clGetDeviceInfo(did, CL_DEVICE_PLATFORM,
206
sizeof(val), &val, &size_ret);
207
if (ret != CL_SUCCESS) {
208
fprintf(stderr, "clGetDeviceInfo(CL_DEVICE_PLATFORM): %s",
209
vg_ocl_strerror(ret));
210
}
211
return val;
212
}
213
214
static cl_device_type
215
vg_ocl_device_gettype(cl_device_id did)
216
{
217
cl_int ret;
218
cl_device_type val;
219
size_t size_ret;
220
ret = clGetDeviceInfo(did, CL_DEVICE_TYPE,
221
sizeof(val), &val, &size_ret);
222
if (ret != CL_SUCCESS) {
223
fprintf(stderr, "clGetDeviceInfo(CL_DEVICE_TYPE): %s",
224
vg_ocl_strerror(ret));
225
}
226
return val;
227
}
228
229
static const char *
230
vg_ocl_device_getstr(cl_device_id did, cl_device_info param)
231
{
232
static char device_str[1024];
233
cl_int ret;
234
size_t size_ret;
235
ret = clGetDeviceInfo(did, param,
236
sizeof(device_str), device_str,
237
&size_ret);
238
if (ret != CL_SUCCESS) {
239
snprintf(device_str, sizeof(device_str),
240
"clGetDeviceInfo(%d): %s",
241
param, vg_ocl_strerror(ret));
242
}
243
return device_str;
244
}
245
246
static size_t
247
vg_ocl_device_getsizet(cl_device_id did, cl_device_info param)
248
{
249
cl_int ret;
250
size_t val;
251
size_t size_ret;
252
ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret);
253
if (ret != CL_SUCCESS) {
254
fprintf(stderr,
255
"clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret));
256
}
257
return val;
258
}
259
260
static cl_ulong
261
vg_ocl_device_getulong(cl_device_id did, cl_device_info param)
262
{
263
cl_int ret;
264
cl_ulong val;
265
size_t size_ret;
266
ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret);
267
if (ret != CL_SUCCESS) {
268
fprintf(stderr,
269
"clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret));
270
}
271
return val;
272
}
273
274
static cl_uint
275
vg_ocl_device_getuint(cl_device_id did, cl_device_info param)
276
{
277
cl_int ret;
278
cl_uint val;
279
size_t size_ret;
280
ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret);
281
if (ret != CL_SUCCESS) {
282
fprintf(stderr,
283
"clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret));
284
}
285
return val;
286
}
287
288
void
289
vg_ocl_dump_info(vg_ocl_context_t *vocp)
290
{
291
cl_device_id did;
292
if (vocp->base.vxc_vc && (vocp->base.vxc_vc->vc_verbose < 1))
293
return;
294
if (vocp->voc_dump_done)
295
return;
296
did = vocp->voc_ocldid;
297
fprintf(stderr, "Device: %s\n",
298
vg_ocl_device_getstr(did, CL_DEVICE_NAME));
299
fprintf(stderr, "Vendor: %s (%04x)\n",
300
vg_ocl_device_getstr(did, CL_DEVICE_VENDOR),
301
vg_ocl_device_getuint(did, CL_DEVICE_VENDOR_ID));
302
fprintf(stderr, "Driver: %s\n",
303
vg_ocl_device_getstr(did, CL_DRIVER_VERSION));
304
fprintf(stderr, "Profile: %s\n",
305
vg_ocl_device_getstr(did, CL_DEVICE_PROFILE));
306
fprintf(stderr, "Version: %s\n",
307
vg_ocl_device_getstr(did, CL_DEVICE_VERSION));
308
fprintf(stderr, "Max compute units: %"PRSIZET"d\n",
309
vg_ocl_device_getsizet(did, CL_DEVICE_MAX_COMPUTE_UNITS));
310
fprintf(stderr, "Max workgroup size: %"PRSIZET"d\n",
311
vg_ocl_device_getsizet(did, CL_DEVICE_MAX_WORK_GROUP_SIZE));
312
fprintf(stderr, "Global memory: %ld\n",
313
vg_ocl_device_getulong(did, CL_DEVICE_GLOBAL_MEM_SIZE));
314
fprintf(stderr, "Max allocation: %ld\n",
315
vg_ocl_device_getulong(did, CL_DEVICE_MAX_MEM_ALLOC_SIZE));
316
vocp->voc_dump_done = 1;
317
}
318
319
void
320
vg_ocl_error(vg_ocl_context_t *vocp, int code, const char *desc)
321
{
322
const char *err = vg_ocl_strerror(code);
323
if (desc) {
324
fprintf(stderr, "%s: %s\n", desc, err);
325
} else {
326
fprintf(stderr, "%s\n", err);
327
}
328
329
if (vocp && vocp->voc_ocldid)
330
vg_ocl_dump_info(vocp);
331
}
332
333
static void
334
vg_ocl_buildlog(vg_ocl_context_t *vocp, cl_program prog)
335
{
336
size_t logbufsize, logsize;
337
char *log;
338
int off = 0;
339
cl_int ret;
340
341
ret = clGetProgramBuildInfo(prog,
342
vocp->voc_ocldid,
343
CL_PROGRAM_BUILD_LOG,
344
0, NULL,
345
&logbufsize);
346
if (ret != CL_SUCCESS) {
347
vg_ocl_error(NULL, ret, "clGetProgramBuildInfo");
348
return;
349
}
350
351
log = (char *) malloc(logbufsize);
352
if (!log) {
353
fprintf(stderr, "Could not allocate build log buffer\n");
354
return;
355
}
356
357
ret = clGetProgramBuildInfo(prog,
358
vocp->voc_ocldid,
359
CL_PROGRAM_BUILD_LOG,
360
logbufsize,
361
log,
362
&logsize);
363
if (ret != CL_SUCCESS) {
364
vg_ocl_error(NULL, ret, "clGetProgramBuildInfo");
365
366
} else {
367
/* Remove leading newlines and trailing newlines/whitespace */
368
log[logbufsize-1] = '\0';
369
for (off = logsize - 1; off >= 0; off--) {
370
if ((log[off] != '\r') &&
371
(log[off] != '\n') &&
372
(log[off] != ' ') &&
373
(log[off] != '\t') &&
374
(log[off] != '\0'))
375
break;
376
log[off] = '\0';
377
}
378
for (off = 0; off < logbufsize; off++) {
379
if ((log[off] != '\r') &&
380
(log[off] != '\n'))
381
break;
382
}
383
384
fprintf(stderr, "Build log:\n%s\n", &log[off]);
385
}
386
free(log);
387
}
388
389
/*
390
* OpenCL per-exec functions
391
*/
392
393
/* Per-device quirk and tuning flags (stored in voc_quirks) */
enum {
	VG_OCL_DEEP_PREPROC_UNROLL = (1 << 0),
	VG_OCL_PRAGMA_UNROLL       = (1 << 1),
	VG_OCL_EXPENSIVE_BRANCHES  = (1 << 2),
	VG_OCL_DEEP_VLIW           = (1 << 3),
	VG_OCL_AMD_BFI_INT         = (1 << 4),
	VG_OCL_NV_VERBOSE          = (1 << 5),
	VG_OCL_BROKEN              = (1 << 6),
	VG_OCL_NO_BINARIES         = (1 << 7),

	/* The subset that safe mode strips off (see vg_ocl_init) */
	VG_OCL_OPTIMIZATIONS       = (VG_OCL_DEEP_PREPROC_UNROLL |
				      VG_OCL_PRAGMA_UNROLL |
				      VG_OCL_EXPENSIVE_BRANCHES |
				      VG_OCL_DEEP_VLIW |
				      VG_OCL_AMD_BFI_INT),

};
410
411
static int
412
vg_ocl_get_quirks(vg_ocl_context_t *vocp)
413
{
414
uint32_t vend;
415
const char *dvn;
416
unsigned int quirks = 0;
417
418
quirks |= VG_OCL_DEEP_PREPROC_UNROLL;
419
420
vend = vg_ocl_device_getuint(vocp->voc_ocldid, CL_DEVICE_VENDOR_ID);
421
switch (vend) {
422
case 0x10de: /* NVIDIA */
423
/*
424
* NVIDIA's compiler seems to take a really really long
425
* time when using preprocessor unrolling, but works
426
* well with pragma unroll.
427
*/
428
quirks &= ~VG_OCL_DEEP_PREPROC_UNROLL;
429
quirks |= VG_OCL_PRAGMA_UNROLL;
430
quirks |= VG_OCL_NV_VERBOSE;
431
break;
432
case 0x1002: /* AMD/ATI */
433
/*
434
* AMD's compiler works best with preprocesor unrolling.
435
* Pragma unroll is unreliable with AMD's compiler and
436
* seems to crash based on whether the gods were smiling
437
* when Catalyst was last installed/upgraded.
438
*/
439
if (vg_ocl_device_gettype(vocp->voc_ocldid) &
440
CL_DEVICE_TYPE_GPU) {
441
quirks |= VG_OCL_EXPENSIVE_BRANCHES;
442
quirks |= VG_OCL_DEEP_VLIW;
443
dvn = vg_ocl_device_getstr(vocp->voc_ocldid,
444
CL_DEVICE_EXTENSIONS);
445
if (dvn && strstr(dvn, "cl_amd_media_ops"))
446
quirks |= VG_OCL_AMD_BFI_INT;
447
448
dvn = vg_ocl_device_getstr(vocp->voc_ocldid,
449
CL_DEVICE_NAME);
450
if (!strcmp(dvn, "ATI RV710")) {
451
quirks &= ~VG_OCL_OPTIMIZATIONS;
452
quirks |= VG_OCL_NO_BINARIES;
453
}
454
}
455
break;
456
default:
457
break;
458
}
459
return quirks;
460
}
461
462
static int
463
vg_ocl_create_kernel(vg_ocl_context_t *vocp, int knum, const char *func)
464
{
465
int i;
466
cl_kernel krn;
467
cl_int ret;
468
469
for (i = 0; i < MAX_SLOT; i++) {
470
krn = clCreateKernel(vocp->voc_oclprog, func, &ret);
471
if (!krn) {
472
fprintf(stderr, "clCreateKernel(%d): ", i);
473
vg_ocl_error(vocp, ret, NULL);
474
while (--i >= 0) {
475
clReleaseKernel(vocp->voc_oclkernel[i][knum]);
476
vocp->voc_oclkernel[i][knum] = NULL;
477
}
478
return 0;
479
}
480
vocp->voc_oclkernel[i][knum] = krn;
481
vocp->voc_oclkrnwait[i] = NULL;
482
}
483
return 1;
484
}
485
486
static void
487
vg_ocl_hash_program(vg_ocl_context_t *vocp, const char *opts,
488
const char *program, size_t size,
489
unsigned char *hash_out)
490
{
491
EVP_MD_CTX *mdctx;
492
cl_platform_id pid;
493
const char *str;
494
495
mdctx = EVP_MD_CTX_create();
496
EVP_DigestInit_ex(mdctx, EVP_md5(), NULL);
497
pid = vg_ocl_device_getplatform(vocp->voc_ocldid);
498
str = vg_ocl_platform_getstr(pid, CL_PLATFORM_NAME);
499
EVP_DigestUpdate(mdctx, str, strlen(str) + 1);
500
str = vg_ocl_platform_getstr(pid, CL_PLATFORM_VERSION);
501
EVP_DigestUpdate(mdctx, str, strlen(str) + 1);
502
str = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME);
503
EVP_DigestUpdate(mdctx, str, strlen(str) + 1);
504
if (opts)
505
EVP_DigestUpdate(mdctx, opts, strlen(opts) + 1);
506
if (size)
507
EVP_DigestUpdate(mdctx, program, size);
508
EVP_DigestFinal_ex(mdctx, hash_out, NULL);
509
EVP_MD_CTX_destroy(mdctx);
510
}
511
512
/* Minimal 32-bit ELF file header, used to locate AMD kernel images
 * inside compiled program binaries for BFI_INT patching. */
typedef struct {
	unsigned char	e_ident[16];	/* magic + class/encoding bytes */
	uint16_t	e_type;
	uint16_t	e_machine;
	uint32_t	e_version;
	uint32_t	e_entry;
	uint32_t	e_phoff;
	uint32_t	e_shoff;	/* section header table offset */
	uint32_t	e_flags;
	uint16_t	e_ehsize;
	uint16_t	e_phentsize;
	uint16_t	e_phnum;
	uint16_t	e_shentsize;	/* size of one section header */
	uint16_t	e_shnum;	/* number of section headers */
	uint16_t	e_shstrndx;	/* index of section-name strtab */
} vg_elf32_header_t;
528
529
/* Minimal 32-bit ELF section header (companion to vg_elf32_header_t). */
typedef struct {
	uint32_t	sh_name;	/* offset into section-name strtab */
	uint32_t	sh_type;
	uint32_t	sh_flags;
	uint32_t	sh_addr;
	uint32_t	sh_offset;	/* file offset of section contents */
	uint32_t	sh_size;	/* section size in bytes */
	uint32_t	sh_link;
	uint32_t	sh_info;
	uint32_t	sh_addralign;
	uint32_t	sh_entsize;
} vg_elf32_shdr_t;
541
542
static int
543
vg_ocl_amd_patch_inner(unsigned char *binary, size_t size)
544
{
545
vg_elf32_header_t *ehp;
546
vg_elf32_shdr_t *shp, *nshp;
547
uint32_t *instr;
548
size_t off;
549
int i, n, txt2idx, patched;
550
551
ehp = (vg_elf32_header_t *) binary;
552
if ((size < sizeof(*ehp)) ||
553
memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) ||
554
!ehp->e_shoff)
555
return 0;
556
557
off = ehp->e_shoff + (ehp->e_shstrndx * ehp->e_shentsize);
558
nshp = (vg_elf32_shdr_t *) (binary + off);
559
if ((off + sizeof(*nshp)) > size)
560
return 0;
561
562
shp = (vg_elf32_shdr_t *) (binary + ehp->e_shoff);
563
n = 0;
564
txt2idx = 0;
565
for (i = 0; i < ehp->e_shnum; i++) {
566
off = nshp->sh_offset + shp[i].sh_name;
567
if (((off + 6) >= size) ||
568
memcmp(binary + off, ".text", 6))
569
continue;
570
n++;
571
if (n == 2)
572
txt2idx = i;
573
}
574
if (n != 2)
575
return 0;
576
577
off = shp[txt2idx].sh_offset;
578
instr = (uint32_t *) (binary + off);
579
n = shp[txt2idx].sh_size / 4;
580
patched = 0;
581
for (i = 0; i < n; i += 2) {
582
if (((instr[i] & 0x02001000) == 0) &&
583
((instr[i+1] & 0x9003f000) == 0x0001a000)) {
584
instr[i+1] ^= (0x0001a000 ^ 0x0000c000);
585
patched++;
586
}
587
}
588
589
return patched;
590
}
591
592
static int
593
vg_ocl_amd_patch(vg_ocl_context_t *vocp, unsigned char *binary, size_t size)
594
{
595
vg_context_t *vcp = vocp->base.vxc_vc;
596
vg_elf32_header_t *ehp;
597
unsigned char *ptr;
598
size_t offset = 1;
599
int ninner = 0, nrun, npatched = 0;
600
601
ehp = (vg_elf32_header_t *) binary;
602
if ((size < sizeof(*ehp)) ||
603
memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\0", 8) ||
604
!ehp->e_shoff)
605
return 0;
606
607
offset = 1;
608
while (offset < (size - 8)) {
609
ptr = (unsigned char *) memchr(binary + offset,
610
0x7f,
611
size - offset);
612
if (!ptr)
613
return npatched;
614
offset = ptr - binary;
615
ehp = (vg_elf32_header_t *) ptr;
616
if (((size - offset) < sizeof(*ehp)) ||
617
memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) ||
618
!ehp->e_shoff) {
619
offset += 1;
620
continue;
621
}
622
623
ninner++;
624
nrun = vg_ocl_amd_patch_inner(ptr, size - offset);
625
npatched += nrun;
626
if (vcp->vc_verbose > 1)
627
fprintf(stderr, "AMD BFI_INT: patched %d instructions "
628
"in kernel %d\n",
629
nrun, ninner);
630
npatched++;
631
offset += 1;
632
}
633
return npatched;
634
}
635
636
637
static int
638
vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
639
const char *filename, const char *opts)
640
{
641
FILE *kfp;
642
char *buf, *tbuf;
643
int len, fromsource = 0, patched = 0;
644
size_t sz, szr;
645
cl_program prog;
646
cl_int ret, sts;
647
unsigned char prog_hash[16];
648
char bin_name[64];
649
650
if (vcp->vc_verbose > 1)
651
fprintf(stderr,
652
"OpenCL compiler flags: %s\n", opts ? opts : "");
653
654
sz = 128 * 1024;
655
buf = (char *) malloc(sz);
656
if (!buf) {
657
fprintf(stderr, "Could not allocate program buffer\n");
658
return 0;
659
}
660
661
kfp = fopen(filename, "r");
662
if (!kfp) {
663
fprintf(stderr, "Error loading kernel file '%s': %s\n",
664
filename, strerror(errno));
665
free(buf);
666
return 0;
667
}
668
669
len = fread(buf, 1, sz, kfp);
670
fclose(kfp);
671
672
if (!len) {
673
fprintf(stderr, "Short read on CL kernel\n");
674
free(buf);
675
return 0;
676
}
677
678
vg_ocl_hash_program(vocp, opts, buf, len, prog_hash);
679
snprintf(bin_name, sizeof(bin_name),
680
"%02x%02x%02x%02x%02x%02x%02x%02x"
681
"%02x%02x%02x%02x%02x%02x%02x%02x.oclbin",
682
prog_hash[0], prog_hash[1], prog_hash[2], prog_hash[3],
683
prog_hash[4], prog_hash[5], prog_hash[6], prog_hash[7],
684
prog_hash[8], prog_hash[9], prog_hash[10], prog_hash[11],
685
prog_hash[12], prog_hash[13], prog_hash[14], prog_hash[15]);
686
687
if (vocp->voc_quirks & VG_OCL_NO_BINARIES) {
688
kfp = NULL;
689
if (vcp->vc_verbose > 1)
690
fprintf(stderr, "Binary OpenCL programs disabled\n");
691
} else {
692
kfp = fopen(bin_name, "rb");
693
}
694
695
if (!kfp) {
696
/* No binary available, create with source */
697
fromsource = 1;
698
sz = len;
699
prog = clCreateProgramWithSource(vocp->voc_oclctx,
700
1, (const char **) &buf, &sz,
701
&ret);
702
} else {
703
if (vcp->vc_verbose > 1)
704
fprintf(stderr, "Loading kernel binary %s\n", bin_name);
705
szr = 0;
706
while (!feof(kfp)) {
707
len = fread(buf + szr, 1, sz - szr, kfp);
708
if (!len) {
709
fprintf(stderr,
710
"Short read on CL kernel binary\n");
711
fclose(kfp);
712
free(buf);
713
return 0;
714
}
715
szr += len;
716
if (szr == sz) {
717
tbuf = (char *) realloc(buf, sz*2);
718
if (!tbuf) {
719
fprintf(stderr,
720
"Could not expand CL kernel "
721
"binary buffer\n");
722
fclose(kfp);
723
free(buf);
724
return 0;
725
}
726
buf = tbuf;
727
sz *= 2;
728
}
729
}
730
fclose(kfp);
731
rebuild:
732
prog = clCreateProgramWithBinary(vocp->voc_oclctx,
733
1, &vocp->voc_ocldid,
734
&szr,
735
(const unsigned char **) &buf,
736
&sts,
737
&ret);
738
}
739
free(buf);
740
if (!prog) {
741
vg_ocl_error(vocp, ret, "clCreateProgramWithSource");
742
return 0;
743
}
744
745
if (vcp->vc_verbose > 0) {
746
if (fromsource && !patched) {
747
fprintf(stderr,
748
"Compiling kernel, can take minutes...");
749
fflush(stderr);
750
}
751
}
752
ret = clBuildProgram(prog, 1, &vocp->voc_ocldid, opts, NULL, NULL);
753
if (ret != CL_SUCCESS) {
754
if ((vcp->vc_verbose > 0) && fromsource && !patched)
755
fprintf(stderr, "failure.\n");
756
vg_ocl_error(NULL, ret, "clBuildProgram");
757
} else if ((vcp->vc_verbose > 0) && fromsource && !patched) {
758
fprintf(stderr, "done!\n");
759
}
760
if ((ret != CL_SUCCESS) ||
761
((vcp->vc_verbose > 1) && fromsource && !patched)) {
762
vg_ocl_buildlog(vocp, prog);
763
}
764
if (ret != CL_SUCCESS) {
765
vg_ocl_dump_info(vocp);
766
clReleaseProgram(prog);
767
return 0;
768
}
769
770
if (fromsource && !(vocp->voc_quirks & VG_OCL_NO_BINARIES)) {
771
ret = clGetProgramInfo(prog,
772
CL_PROGRAM_BINARY_SIZES,
773
sizeof(szr), &szr,
774
&sz);
775
if (ret != CL_SUCCESS) {
776
vg_ocl_error(vocp, ret,
777
"WARNING: clGetProgramInfo(BINARY_SIZES)");
778
goto out;
779
}
780
if (sz == 0) {
781
fprintf(stderr,
782
"WARNING: zero-length CL kernel binary\n");
783
goto out;
784
}
785
786
buf = (char *) malloc(szr);
787
if (!buf) {
788
fprintf(stderr,
789
"WARNING: Could not allocate %"PRSIZET"d bytes "
790
"for CL binary\n",
791
szr);
792
goto out;
793
}
794
795
ret = clGetProgramInfo(prog,
796
CL_PROGRAM_BINARIES,
797
sizeof(buf), &buf,
798
&sz);
799
if (ret != CL_SUCCESS) {
800
vg_ocl_error(vocp, ret,
801
"WARNING: clGetProgramInfo(BINARIES)");
802
free(buf);
803
goto out;
804
}
805
806
if ((vocp->voc_quirks & VG_OCL_AMD_BFI_INT) && !patched) {
807
patched = vg_ocl_amd_patch(vocp,
808
(unsigned char *) buf, szr);
809
if (patched > 0) {
810
if (vcp->vc_verbose > 1)
811
fprintf(stderr,
812
"AMD BFI_INT patch complete\n");
813
clReleaseProgram(prog);
814
goto rebuild;
815
}
816
fprintf(stderr,
817
"WARNING: AMD BFI_INT patching failed\n");
818
if (patched < 0) {
819
/* Program was incompletely modified */
820
free(buf);
821
goto out;
822
}
823
}
824
825
kfp = fopen(bin_name, "wb");
826
if (!kfp) {
827
fprintf(stderr, "WARNING: "
828
"could not save CL kernel binary: %s\n",
829
strerror(errno));
830
} else {
831
sz = fwrite(buf, 1, szr, kfp);
832
fclose(kfp);
833
if (sz != szr) {
834
fprintf(stderr,
835
"WARNING: short write on CL kernel "
836
"binary file: expected "
837
"%"PRSIZET"d, got %"PRSIZET"d\n",
838
szr, sz);
839
unlink(bin_name);
840
}
841
}
842
free(buf);
843
}
844
845
out:
846
vocp->voc_oclprog = prog;
847
if (!vg_ocl_create_kernel(vocp, 0, "ec_add_grid") ||
848
!vg_ocl_create_kernel(vocp, 1, "heap_invert")) {
849
clReleaseProgram(vocp->voc_oclprog);
850
vocp->voc_oclprog = NULL;
851
return 0;
852
}
853
854
return 1;
855
}
856
857
static void CL_CALLBACK
858
vg_ocl_context_callback(const char *errinfo,
859
const void *private_info,
860
size_t cb,
861
void *user_data)
862
{
863
fprintf(stderr, "vg_ocl_context_callback error: %s\n", errinfo);
864
}
865
866
static int
867
vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did,
868
int safe_mode)
869
{
870
cl_int ret;
871
char optbuf[128];
872
int end = 0;
873
874
memset(vocp, 0, sizeof(*vocp));
875
vg_exec_context_init(vcp, &vocp->base);
876
vocp->base.vxc_threadfunc = vg_opencl_loop;
877
878
pthread_mutex_init(&vocp->voc_lock, NULL);
879
pthread_cond_init(&vocp->voc_wait, NULL);
880
vocp->voc_ocl_slot = -1;
881
882
vocp->voc_ocldid = did;
883
884
if (vcp->vc_verbose > 1)
885
vg_ocl_dump_info(vocp);
886
887
vocp->voc_quirks = vg_ocl_get_quirks(vocp);
888
889
if ((vocp->voc_quirks & VG_OCL_BROKEN) && (vcp->vc_verbose > 0)) {
890
char yesbuf[16];
891
printf("Type 'yes' to continue: ");
892
fflush(stdout);
893
if (!fgets(yesbuf, sizeof(yesbuf), stdin) ||
894
strncmp(yesbuf, "yes", 3))
895
exit(1);
896
}
897
898
vocp->voc_oclctx = clCreateContext(NULL,
899
1, &did,
900
vg_ocl_context_callback,
901
NULL,
902
&ret);
903
if (!vocp->voc_oclctx) {
904
vg_ocl_error(vocp, ret, "clCreateContext");
905
return 0;
906
}
907
908
vocp->voc_oclcmdq = clCreateCommandQueue(vocp->voc_oclctx,
909
vocp->voc_ocldid,
910
0, &ret);
911
if (!vocp->voc_oclcmdq) {
912
vg_ocl_error(vocp, ret, "clCreateCommandQueue");
913
return 0;
914
}
915
916
if (safe_mode)
917
vocp->voc_quirks &= ~VG_OCL_OPTIMIZATIONS;
918
919
end = 0;
920
optbuf[end] = '\0';
921
if (vocp->voc_quirks & VG_OCL_DEEP_PREPROC_UNROLL)
922
end += snprintf(optbuf + end, sizeof(optbuf) - end,
923
"-DDEEP_PREPROC_UNROLL ");
924
if (vocp->voc_quirks & VG_OCL_PRAGMA_UNROLL)
925
end += snprintf(optbuf + end, sizeof(optbuf) - end,
926
"-DPRAGMA_UNROLL ");
927
if (vocp->voc_quirks & VG_OCL_EXPENSIVE_BRANCHES)
928
end += snprintf(optbuf + end, sizeof(optbuf) - end,
929
"-DVERY_EXPENSIVE_BRANCHES ");
930
if (vocp->voc_quirks & VG_OCL_DEEP_VLIW)
931
end += snprintf(optbuf + end, sizeof(optbuf) - end,
932
"-DDEEP_VLIW ");
933
if (vocp->voc_quirks & VG_OCL_AMD_BFI_INT)
934
end += snprintf(optbuf + end, sizeof(optbuf) - end,
935
"-DAMD_BFI_INT ");
936
if (vocp->voc_quirks & VG_OCL_NV_VERBOSE)
937
end += snprintf(optbuf + end, sizeof(optbuf) - end,
938
"-cl-nv-verbose ");
939
940
if (!vg_ocl_load_program(vcp, vocp, "calc_addrs.cl", optbuf))
941
return 0;
942
return 1;
943
}
944
945
static void
946
vg_ocl_del(vg_ocl_context_t *vocp)
947
{
948
vg_ocl_free_args(vocp);
949
if (vocp->voc_oclprog) {
950
clReleaseProgram(vocp->voc_oclprog);
951
vocp->voc_oclprog = NULL;
952
}
953
if (vocp->voc_oclcmdq) {
954
clReleaseCommandQueue(vocp->voc_oclcmdq);
955
vocp->voc_oclcmdq = NULL;
956
}
957
if (vocp->voc_oclctx) {
958
clReleaseContext(vocp->voc_oclctx);
959
vocp->voc_oclctx = NULL;
960
}
961
pthread_cond_destroy(&vocp->voc_wait);
962
pthread_mutex_destroy(&vocp->voc_lock);
963
vg_exec_context_del(&vocp->base);
964
}
965
966
/*
 * For each buffer argument index, the (kernel, arg-position) pairs it
 * must be bound to; each row is a flat pair list terminated by -1.
 */
static int vg_ocl_arg_map[][8] = {
	/* hashes_out / found */
	{ 2, 0, -1 },
	/* z_heap */
	{ 0, 1, 1, 0, 2, 2, -1 },
	/* point_tmp */
	{ 0, 0, 2, 1, -1 },
	/* row_in */
	{ 0, 2, -1 },
	/* col_in */
	{ 0, 3, -1 },
	/* target_table */
	{ 2, 3, -1 },
};
980
981
static int
982
vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot,
983
int arg, size_t size, int host)
984
{
985
cl_mem clbuf;
986
cl_int ret;
987
int i, j, knum, karg;
988
989
for (i = 0; i < MAX_SLOT; i++) {
990
if ((i != slot) && (slot >= 0))
991
continue;
992
if (vocp->voc_args[i][arg]) {
993
clReleaseMemObject(vocp->voc_args[i][arg]);
994
vocp->voc_args[i][arg] = NULL;
995
vocp->voc_arg_size[i][arg] = 0;
996
}
997
}
998
999
clbuf = clCreateBuffer(vocp->voc_oclctx,
1000
CL_MEM_READ_WRITE |
1001
(host ? CL_MEM_ALLOC_HOST_PTR : 0),
1002
size,
1003
NULL,
1004
&ret);
1005
if (!clbuf) {
1006
fprintf(stderr, "clCreateBuffer(%d,%d): ", slot, arg);
1007
vg_ocl_error(vocp, ret, NULL);
1008
return 0;
1009
}
1010
1011
for (i = 0; i < MAX_SLOT; i++) {
1012
if ((i != slot) && (slot >= 0))
1013
continue;
1014
1015
clRetainMemObject(clbuf);
1016
vocp->voc_args[i][arg] = clbuf;
1017
vocp->voc_arg_size[i][arg] = size;
1018
1019
for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) {
1020
knum = vg_ocl_arg_map[arg][j];
1021
karg = vg_ocl_arg_map[arg][j+1];
1022
ret = clSetKernelArg(vocp->voc_oclkernel[i][knum],
1023
karg,
1024
sizeof(clbuf),
1025
&clbuf);
1026
1027
if (ret) {
1028
fprintf(stderr,
1029
"clSetKernelArg(%d,%d): ", knum, karg);
1030
vg_ocl_error(vocp, ret, NULL);
1031
return 0;
1032
}
1033
}
1034
}
1035
1036
clReleaseMemObject(clbuf);
1037
return 1;
1038
}
1039
1040
int
1041
vg_ocl_copyout_arg(vg_ocl_context_t *vocp, int wslot, int arg,
1042
void *buffer, size_t size)
1043
{
1044
cl_int slot, ret;
1045
1046
slot = (wslot < 0) ? 0 : wslot;
1047
1048
assert((slot >= 0) && (slot < MAX_SLOT));
1049
assert(size <= vocp->voc_arg_size[slot][arg]);
1050
1051
ret = clEnqueueWriteBuffer(vocp->voc_oclcmdq,
1052
vocp->voc_args[slot][arg],
1053
CL_TRUE,
1054
0, size,
1055
buffer,
1056
0, NULL,
1057
NULL);
1058
1059
if (ret) {
1060
fprintf(stderr, "clEnqueueWriteBuffer(%d): ", arg);
1061
vg_ocl_error(vocp, ret, NULL);
1062
return 0;
1063
}
1064
1065
return 1;
1066
}
1067
1068
static void *
1069
vg_ocl_map_arg_buffer(vg_ocl_context_t *vocp, int slot,
1070
int arg, int rw)
1071
{
1072
void *buf;
1073
cl_int ret;
1074
1075
assert((slot >= 0) && (slot < MAX_SLOT));
1076
1077
buf = clEnqueueMapBuffer(vocp->voc_oclcmdq,
1078
vocp->voc_args[slot][arg],
1079
CL_TRUE,
1080
(rw == 2) ? (CL_MAP_READ|CL_MAP_WRITE)
1081
: (rw ? CL_MAP_WRITE : CL_MAP_READ),
1082
0, vocp->voc_arg_size[slot][arg],
1083
0, NULL,
1084
NULL,
1085
&ret);
1086
if (!buf) {
1087
fprintf(stderr, "clEnqueueMapBuffer(%d): ", arg);
1088
vg_ocl_error(vocp, ret, NULL);
1089
return NULL;
1090
}
1091
return buf;
1092
}
1093
1094
static void
1095
vg_ocl_unmap_arg_buffer(vg_ocl_context_t *vocp, int slot,
1096
int arg, void *buf)
1097
{
1098
cl_int ret;
1099
cl_event ev;
1100
1101
assert((slot >= 0) && (slot < MAX_SLOT));
1102
1103
ret = clEnqueueUnmapMemObject(vocp->voc_oclcmdq,
1104
vocp->voc_args[slot][arg],
1105
buf,
1106
0, NULL,
1107
&ev);
1108
if (ret != CL_SUCCESS) {
1109
fprintf(stderr, "clEnqueueUnmapMemObject(%d): ", arg);
1110
vg_ocl_error(vocp, ret, NULL);
1111
return;
1112
}
1113
1114
ret = clWaitForEvents(1, &ev);
1115
clReleaseEvent(ev);
1116
if (ret != CL_SUCCESS) {
1117
fprintf(stderr, "clWaitForEvent(clUnmapMemObject,%d): ", arg);
1118
vg_ocl_error(vocp, ret, NULL);
1119
}
1120
}
1121
1122
int
1123
vg_ocl_kernel_int_arg(vg_ocl_context_t *vocp, int slot,
1124
int arg, int value)
1125
{
1126
cl_int ret;
1127
int i;
1128
1129
for (i = 0; i < MAX_SLOT; i++) {
1130
if ((i != slot) && (slot >= 0))
1131
continue;
1132
ret = clSetKernelArg(vocp->voc_oclkernel[i][2],
1133
arg,
1134
sizeof(value),
1135
&value);
1136
if (ret) {
1137
fprintf(stderr, "clSetKernelArg(%d): ", arg);
1138
vg_ocl_error(vocp, ret, NULL);
1139
return 0;
1140
}
1141
}
1142
return 1;
1143
}
1144
1145
int
1146
vg_ocl_kernel_buffer_arg(vg_ocl_context_t *vocp, int slot,
1147
int arg, void *value, size_t size)
1148
{
1149
cl_int ret;
1150
int i, j, knum, karg;
1151
1152
for (i = 0; i < MAX_SLOT; i++) {
1153
if ((i != slot) && (slot >= 0))
1154
continue;
1155
for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) {
1156
knum = vg_ocl_arg_map[arg][j];
1157
karg = vg_ocl_arg_map[arg][j+1];
1158
ret = clSetKernelArg(vocp->voc_oclkernel[i][knum],
1159
karg,
1160
size,
1161
value);
1162
if (ret) {
1163
fprintf(stderr,
1164
"clSetKernelArg(%d,%d): ", knum, karg);
1165
vg_ocl_error(vocp, ret, NULL);
1166
return 0;
1167
}
1168
}
1169
}
1170
return 1;
1171
}
1172
1173
static void
1174
vg_ocl_free_args(vg_ocl_context_t *vocp)
1175
{
1176
int i, arg;
1177
for (i = 0; i < MAX_SLOT; i++) {
1178
for (arg = 0; arg < MAX_ARG; arg++) {
1179
if (vocp->voc_args[i][arg]) {
1180
clReleaseMemObject(vocp->voc_args[i][arg]);
1181
vocp->voc_args[i][arg] = NULL;
1182
vocp->voc_arg_size[i][arg] = 0;
1183
}
1184
}
1185
}
1186
}
1187
1188
int
1189
vg_ocl_kernel_dead(vg_ocl_context_t *vocp, int slot)
1190
{
1191
return (vocp->voc_oclkrnwait[slot] == NULL);
1192
}
1193
1194
static int
1195
vg_ocl_kernel_start(vg_ocl_context_t *vocp, int slot, int ncol, int nrow,
1196
int invsize)
1197
{
1198
cl_int val, ret;
1199
cl_event ev;
1200
size_t globalws[2] = { ncol, nrow };
1201
size_t invws = (ncol * nrow) / invsize;
1202
1203
assert(!vocp->voc_oclkrnwait[slot]);
1204
1205
/* heap_invert() preconditions */
1206
assert(is_pow2(invsize) && (invsize > 1));
1207
1208
val = invsize;
1209
ret = clSetKernelArg(vocp->voc_oclkernel[slot][1],
1210
1,
1211
sizeof(val),
1212
&val);
1213
if (ret != CL_SUCCESS) {
1214
vg_ocl_error(vocp, ret, "clSetKernelArg(ncol)");
1215
return 0;
1216
}
1217
ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq,
1218
vocp->voc_oclkernel[slot][0],
1219
2,
1220
NULL, globalws, NULL,
1221
0, NULL,
1222
&ev);
1223
if (ret != CL_SUCCESS) {
1224
vg_ocl_error(vocp, ret, "clEnqueueNDRange(0)");
1225
return 0;
1226
}
1227
1228
ret = clWaitForEvents(1, &ev);
1229
clReleaseEvent(ev);
1230
if (ret != CL_SUCCESS) {
1231
vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,0)");
1232
return 0;
1233
}
1234
1235
if (vocp->voc_verify_func[0] &&
1236
!(vocp->voc_verify_func[0])(vocp, slot)) {
1237
fprintf(stderr, "ERROR: Kernel 0 failed verification test\n");
1238
return 0;
1239
}
1240
1241
ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq,
1242
vocp->voc_oclkernel[slot][1],
1243
1,
1244
NULL, &invws, NULL,
1245
0, NULL,
1246
&ev);
1247
if (ret != CL_SUCCESS) {
1248
vg_ocl_error(vocp, ret, "clEnqueueNDRange(1)");
1249
return 0;
1250
}
1251
1252
ret = clWaitForEvents(1, &ev);
1253
clReleaseEvent(ev);
1254
if (ret != CL_SUCCESS) {
1255
vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,1)");
1256
return 0;
1257
}
1258
1259
if (vocp->voc_verify_func[1] &&
1260
!(vocp->voc_verify_func[1])(vocp, slot)) {
1261
fprintf(stderr, "ERROR: Kernel 1 failed verification test\n");
1262
return 0;
1263
}
1264
1265
ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq,
1266
vocp->voc_oclkernel[slot][2],
1267
2,
1268
NULL, globalws, NULL,
1269
0, NULL,
1270
&ev);
1271
if (ret != CL_SUCCESS) {
1272
vg_ocl_error(vocp, ret, "clEnqueueNDRange(2)");
1273
return 0;
1274
}
1275
1276
vocp->voc_oclkrnwait[slot] = ev;
1277
return 1;
1278
}
1279
1280
static int
1281
vg_ocl_kernel_wait(vg_ocl_context_t *vocp, int slot)
1282
{
1283
cl_event ev;
1284
cl_int ret;
1285
1286
ev = vocp->voc_oclkrnwait[slot];
1287
vocp->voc_oclkrnwait[slot] = NULL;
1288
if (ev) {
1289
ret = clWaitForEvents(1, &ev);
1290
clReleaseEvent(ev);
1291
if (ret != CL_SUCCESS) {
1292
vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,e)");
1293
return 0;
1294
}
1295
}
1296
return 1;
1297
}
1298
1299
1300
/*
 * Load a raw 32-byte (256-bit) limb image straight into an OpenSSL
 * BIGNUM by writing its internal fields, avoiding BN_bin2bn overhead.
 * NOTE(review): assumes the buffer matches the host's BN_ULONG limb
 * layout (little-endian limbs) and a pre-1.1 OpenSSL BIGNUM ABI where
 * bn->d/bn->top are accessible -- confirm against the linked OpenSSL.
 */
static INLINE void
vg_ocl_get_bignum_raw(BIGNUM *bn, const unsigned char *buf)
{
	bn_expand(bn, 256);  /* ensure capacity for 256 bits */
	memcpy(bn->d, buf, 32);
	bn->top = (32 / sizeof(BN_ULONG));
}
1307
1308
static INLINE void
1309
vg_ocl_put_bignum_raw(unsigned char *buf, const BIGNUM *bn)
1310
{
1311
int bnlen = (bn->top * sizeof(BN_ULONG));
1312
if (bnlen >= 32) {
1313
memcpy(buf, bn->d, 32);
1314
} else {
1315
memcpy(buf, bn->d, bnlen);
1316
memset(buf + bnlen, 0, 32 - bnlen);
1317
}
1318
}
1319
1320
#define ACCESS_BUNDLE 1024
1321
#define ACCESS_STRIDE (ACCESS_BUNDLE/8)
1322
1323
static void
1324
vg_ocl_get_bignum_tpa(BIGNUM *bn, const unsigned char *buf, int cell)
1325
{
1326
unsigned char bnbuf[32];
1327
int start, i;
1328
1329
start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
1330
(cell % ACCESS_STRIDE));
1331
for (i = 0; i < 8; i++)
1332
memcpy(bnbuf+(i*4),
1333
buf + 4*(start + i*ACCESS_STRIDE),
1334
4);
1335
1336
vg_ocl_get_bignum_raw(bn, bnbuf);
1337
}
1338
1339
/*
 * Absolutely disgusting.
 * We want points in Montgomery form, and it's a lot easier to read the
 * coordinates from the structure than to export and re-montgomeryize.
 */

/* Mirror of OpenSSL's private EC_POINT layout so the X/Y/Z members can
 * be touched directly.  NOTE(review): this must match the linked
 * OpenSSL's ABI exactly; the struct became opaque in OpenSSL >= 1.1,
 * where this trick no longer works -- confirm the build's version. */
struct ec_point_st {
	const EC_METHOD *meth;
	BIGNUM X;       /* X coordinate, Montgomery form */
	BIGNUM Y;       /* Y coordinate, Montgomery form */
	BIGNUM Z;       /* projective Z; Montgomery 1 when affine */
	int Z_is_one;   /* fast-path flag: Z == 1 */
};
1352
1353
/*
 * Load an affine point from a raw 64-byte (X||Y) buffer into ppnt.
 * Coordinates are in Montgomery form; Z is forced to the Montgomery
 * representation of 1 (0x1000003d1 -- presumably for secp256k1's
 * prime 2^256 - 2^32 - 977; confirm against the curve in use).
 */
static INLINE void
vg_ocl_get_point(EC_POINT *ppnt, const unsigned char *buf)
{
	static const unsigned char mont_one[] = { 0x01,0x00,0x00,0x03,0xd1 };
	vg_ocl_get_bignum_raw(&ppnt->X, buf);
	vg_ocl_get_bignum_raw(&ppnt->Y, buf + 32);
	if (!ppnt->Z_is_one) {
		/* Only rewrite Z when it isn't already marked affine */
		ppnt->Z_is_one = 1;
		BN_bin2bn(mont_one, sizeof(mont_one), &ppnt->Z);
	}
}
1364
1365
static INLINE void
1366
vg_ocl_put_point(unsigned char *buf, const EC_POINT *ppnt)
1367
{
1368
assert(ppnt->Z_is_one);
1369
vg_ocl_put_bignum_raw(buf, &ppnt->X);
1370
vg_ocl_put_bignum_raw(buf + 32, &ppnt->Y);
1371
}
1372
1373
static void
1374
vg_ocl_put_point_tpa(unsigned char *buf, int cell, const EC_POINT *ppnt)
1375
{
1376
unsigned char pntbuf[64];
1377
int start, i;
1378
1379
vg_ocl_put_point(pntbuf, ppnt);
1380
1381
start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
1382
(cell % (ACCESS_STRIDE/2)));
1383
for (i = 0; i < 8; i++)
1384
memcpy(buf + 4*(start + i*ACCESS_STRIDE),
1385
pntbuf+(i*4),
1386
4);
1387
for (i = 0; i < 8; i++)
1388
memcpy(buf + 4*(start + (ACCESS_STRIDE/2) + (i*ACCESS_STRIDE)),
1389
pntbuf+32+(i*4),
1390
4);
1391
}
1392
1393
static void
1394
vg_ocl_get_point_tpa(EC_POINT *ppnt, const unsigned char *buf, int cell)
1395
{
1396
unsigned char pntbuf[64];
1397
int start, i;
1398
1399
start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
1400
(cell % (ACCESS_STRIDE/2)));
1401
for (i = 0; i < 8; i++)
1402
memcpy(pntbuf+(i*4),
1403
buf + 4*(start + i*ACCESS_STRIDE),
1404
4);
1405
for (i = 0; i < 8; i++)
1406
memcpy(pntbuf+32+(i*4),
1407
buf + 4*(start + (ACCESS_STRIDE/2) + (i*ACCESS_STRIDE)),
1408
4);
1409
1410
vg_ocl_get_point(ppnt, pntbuf);
1411
}
1412
1413
/* Print to stderr the wall-clock time elapsed since *tv, tagged with
 * `place` for identification. */
void
show_elapsed(struct timeval *tv, const char *place)
{
	struct timeval end, diff;

	gettimeofday(&end, NULL);
	timersub(&end, tv, &diff);
	fprintf(stderr, "%s spent %ld.%06lds\n",
		place, diff.tv_sec, diff.tv_usec);
}
1422
1423
1424
/*
1425
* GPU address matching methods
1426
*
1427
* gethash: GPU computes and returns all address hashes.
1428
* + Works with any matching method, including regular expressions.
1429
* - The CPU will not be able to keep up with mid- to high-end GPUs.
1430
*
1431
* prefix: GPU computes hash, searches a range list, and discards.
1432
* + Fast, minimal work for CPU.
1433
*/
1434
1435
/*
 * Result-check function for "gethash" mode: the GPU returns one
 * 20-byte hash per grid point and the CPU runs the pattern test over
 * all of them, advancing vxc_delta in lockstep so a hit can be mapped
 * back to the exact private key offset.
 * Returns the test function's result (nonzero on match) or 2 on a
 * fatal buffer-mapping error.
 */
static int
vg_ocl_gethash_check(vg_ocl_context_t *vocp, int slot)
{
	vg_exec_context_t *vxcp = &vocp->base;
	vg_context_t *vcp = vocp->base.vxc_vc;
	vg_test_func_t test_func = vcp->vc_test;
	unsigned char *ocl_hashes_out;
	int i, res = 0, round;

	ocl_hashes_out = (unsigned char *)
		vg_ocl_map_arg_buffer(vocp, slot, 0, 0);

	if (!ocl_hashes_out) {
		fprintf(stderr,
			"ERROR: Could not map hash result buffer "
			"for slot %d\n", slot);
		return 2;
	}

	round = vocp->voc_ocl_cols * vocp->voc_ocl_rows;

	/* vxc_binres[0] holds the address type; the hash goes after it */
	for (i = 0; i < round; i++, vxcp->vxc_delta++) {
		memcpy(&vxcp->vxc_binres[1],
		       ocl_hashes_out + (20*i),
		       20);

		res = test_func(vxcp);
		if (res)
			break;
	}

	vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_hashes_out);
	return res;
}
1469
1470
static int
1471
vg_ocl_gethash_init(vg_ocl_context_t *vocp)
1472
{
1473
int i;
1474
1475
if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_get"))
1476
return 0;
1477
1478
for (i = 0; i < vocp->voc_nslots; i++) {
1479
/* Each slot gets its own hash output buffer */
1480
if (!vg_ocl_kernel_arg_alloc(vocp, i, 0,
1481
20 *
1482
vocp->voc_ocl_rows *
1483
vocp->voc_ocl_cols, 1))
1484
return 0;
1485
}
1486
1487
vocp->voc_rekey_func = NULL;
1488
vocp->voc_check_func = vg_ocl_gethash_check;
1489
return 1;
1490
}
1491
1492
1493
/*
 * Prepare the GPU for a fresh prefix-search batch: reset each slot's
 * "found" indicator to the no-match sentinel, and -- when the pattern
 * set changed -- upload the sorted hash160 range table to the device.
 * Returns 1 on success, 0 when no patterns remain, -1 on error.
 */
static int
vg_ocl_prefix_rekey(vg_ocl_context_t *vocp)
{
	vg_context_t *vcp = vocp->base.vxc_vc;
	unsigned char *ocl_targets_in;
	uint32_t *ocl_found_out;
	int i;

	/* Set the found indicator for each slot to -1 */
	for (i = 0; i < vocp->voc_nslots; i++) {
		ocl_found_out = (uint32_t *)
			vg_ocl_map_arg_buffer(vocp, i, 0, 1);
		if (!ocl_found_out) {
			fprintf(stderr,
				"ERROR: Could not map result buffer"
				" for slot %d (rekey)\n", i);
			return -1;
		}
		ocl_found_out[0] = 0xffffffff;  /* no-match sentinel */
		vg_ocl_unmap_arg_buffer(vocp, i, 0, ocl_found_out);
	}

	if (vocp->voc_pattern_rewrite) {
		/* Count number of range records */
		i = vg_context_hash160_sort(vcp, NULL);
		if (!i)
			return 0;

		if (i > vocp->voc_pattern_alloc) {
			/* (re)allocate target buffer; 40 bytes per
			 * range record (two 20-byte hash bounds) */
			if (!vg_ocl_kernel_arg_alloc(vocp, -1, 5, 40 * i, 0))
				return -1;
			vocp->voc_pattern_alloc = i;
		}

		/* Write range records */
		ocl_targets_in = (unsigned char *)
			vg_ocl_map_arg_buffer(vocp, 0, 5, 1);
		if (!ocl_targets_in) {
			fprintf(stderr,
				"ERROR: Could not map hash target buffer\n");
			return -1;
		}
		vg_context_hash160_sort(vcp, ocl_targets_in);
		vg_ocl_unmap_arg_buffer(vocp, 0, 5, ocl_targets_in);
		/* Tell the kernels how many ranges to search */
		vg_ocl_kernel_int_arg(vocp, -1, 4, i);

		vocp->voc_pattern_rewrite = 0;
	}
	return 1;
}
1544
1545
/*
 * Result-check function for prefix mode.  The GPU writes one record:
 *   [0] = delta of the first matching point, or 0xffffffff for none
 *   [1] = index of the matched range
 *   [2..6] = the 20-byte hash the GPU computed
 * A claimed match is re-derived on the CPU and fed through the pattern
 * test before being trusted; on a mismatch, diagnostics are dumped.
 * Returns 0 (no match), the test result on match, or 2 on fatal error.
 */
static int
vg_ocl_prefix_check(vg_ocl_context_t *vocp, int slot)
{
	vg_exec_context_t *vxcp = &vocp->base;
	vg_context_t *vcp = vocp->base.vxc_vc;
	vg_test_func_t test_func = vcp->vc_test;
	uint32_t *ocl_found_out;
	uint32_t found_delta;
	int orig_delta, tablesize;
	int res = 0;

	/* Retrieve the found indicator */
	ocl_found_out = (uint32_t *)
		vg_ocl_map_arg_buffer(vocp, slot, 0, 2);
	if (!ocl_found_out) {
		fprintf(stderr,
			"ERROR: Could not map result buffer"
			" for slot %d\n", slot);
		return 2;
	}
	found_delta = ocl_found_out[0];

	if (found_delta != 0xffffffff) {
		/* GPU code claims match, verify with CPU version */
		orig_delta = vxcp->vxc_delta;
		vxcp->vxc_delta += found_delta;
		vg_exec_context_calc_address(vxcp);

		/* Make sure the GPU produced the expected hash */
		res = 0;
		if (!memcmp(vxcp->vxc_binres + 1,
			    ocl_found_out + 2,
			    20)) {
			res = test_func(vxcp);
		}
		if (res == 0) {
			/*
			 * The GPU reported a hit but the CPU could not
			 * confirm it -- dump both hashes for debugging.
			 * NOTE(review): tablesize is assigned but never
			 * used; found_delta is uint32_t printed with %d
			 * -- harmless for sane deltas, but worth fixing.
			 */
			tablesize = ocl_found_out[2];
			fprintf(stderr, "Match idx: %d\n", ocl_found_out[1]);
			fprintf(stderr, "CPU hash: ");
			fdumphex(stderr, vxcp->vxc_binres + 1, 20);
			fprintf(stderr, "GPU hash: ");
			fdumphex(stderr,
				 (unsigned char *) (ocl_found_out + 2), 20);
			fprintf(stderr, "Found delta: %d "
				"Start delta: %d\n",
				found_delta, orig_delta);
			res = 1;
		}
	} else {
		/* No match: account for the whole batch of points */
		vxcp->vxc_delta += (vocp->voc_ocl_cols * vocp->voc_ocl_rows);
	}

	vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_found_out);
	return res;
}
1604
1605
static int
1606
vg_ocl_prefix_init(vg_ocl_context_t *vocp)
1607
{
1608
int i;
1609
1610
if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_search_prefix"))
1611
return 0;
1612
1613
for (i = 0; i < vocp->voc_nslots; i++) {
1614
if (!vg_ocl_kernel_arg_alloc(vocp, i, 0, 28, 1))
1615
return 0;
1616
}
1617
vocp->voc_rekey_func = vg_ocl_prefix_rekey;
1618
vocp->voc_check_func = vg_ocl_prefix_check;
1619
vocp->voc_pattern_rewrite = 1;
1620
vocp->voc_pattern_alloc = 0;
1621
return 1;
1622
}
1623
1624
1625
static int
1626
vg_ocl_config_pattern(vg_ocl_context_t *vocp)
1627
{
1628
vg_context_t *vcp = vocp->base.vxc_vc;
1629
int i;
1630
1631
i = vg_context_hash160_sort(vcp, NULL);
1632
if (i > 0) {
1633
if (vcp->vc_verbose > 1)
1634
fprintf(stderr, "Using OpenCL prefix matcher\n");
1635
/* Configure for prefix matching */
1636
return vg_ocl_prefix_init(vocp);
1637
}
1638
1639
if (vcp->vc_verbose > 0)
1640
fprintf(stderr, "WARNING: Using CPU pattern matcher\n");
1641
return vg_ocl_gethash_init(vocp);
1642
}
1643
1644
1645
/*
 * Temporary buffer content verification functions
 * This provides a simple test of the kernel, the OpenCL compiler,
 * and the hardware.
 */

/*
 * Recompute every point in the slot's grid on the CPU with OpenSSL and
 * compare against the GPU's intermediate buffers: X/Y of the batch-add
 * result, plus the Z coordinate -- either as produced by kernel 0
 * (z_inverted == 0) or after kernel 1's heap inversion (z_inverted
 * == 1, checked against a CPU-computed modular inverse in Montgomery
 * form).  Mismatches are dumped verbosely.
 * Returns 1 when everything matches, 0 otherwise.
 */
static int
vg_ocl_verify_temporary(vg_ocl_context_t *vocp, int slot, int z_inverted)
{
	vg_exec_context_t *vxcp = &vocp->base;
	unsigned char *point_tmp = NULL, *z_heap = NULL;
	unsigned char *ocl_points_in = NULL, *ocl_strides_in = NULL;
	const EC_GROUP *pgroup;
	EC_POINT *ppr = NULL, *ppc = NULL, *pps = NULL, *ppt = NULL;
	BIGNUM bnz, bnez, bnm, *bnzc;
	BN_CTX *bnctx = NULL;
	BN_MONT_CTX *bnmont;
	int ret = 0;
	int mismatches = 0, mm_r;
	int x, y, bx;
	/* secp256k1 field prime: 2^256 - 2^32 - 977 */
	static const unsigned char raw_modulus[] = {
		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
		0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F
	};

	BN_init(&bnz);
	BN_init(&bnez);
	BN_init(&bnm);

	bnctx = BN_CTX_new();
	bnmont = BN_MONT_CTX_new();
	pgroup = EC_KEY_get0_group(vxcp->vxc_key);
	ppr = EC_POINT_new(pgroup);
	ppc = EC_POINT_new(pgroup);
	pps = EC_POINT_new(pgroup);
	ppt = EC_POINT_new(pgroup);

	if (!bnctx || !bnmont || !ppr || !ppc || !pps || !ppt) {
		fprintf(stderr, "ERROR: out of memory\n");
		goto out;
	}

	BN_bin2bn(raw_modulus, sizeof(raw_modulus), &bnm);
	BN_MONT_CTX_set(bnmont, &bnm, bnctx);

	/* Pick which Z value the GPU buffer should contain */
	if (z_inverted) {
		bnzc = &bnez;
	} else {
		bnzc = &pps->Z;
	}

	z_heap = (unsigned char *)
		vg_ocl_map_arg_buffer(vocp, slot, 1, 0);
	point_tmp = (unsigned char *)
		vg_ocl_map_arg_buffer(vocp, slot, 2, 0);
	ocl_points_in = (unsigned char *)
		vg_ocl_map_arg_buffer(vocp, slot, 3, 0);
	ocl_strides_in = (unsigned char *)
		vg_ocl_map_arg_buffer(vocp, slot, 4, 0);

	if (!z_heap || !point_tmp || !ocl_points_in || !ocl_strides_in) {
		fprintf(stderr, "ERROR: could not map OpenCL point buffers\n");
		goto out;
	}

	for (y = 0; y < vocp->voc_ocl_rows; y++) {
		/* Row stride point, one per row */
		vg_ocl_get_point(ppr, ocl_strides_in + (64*y));
		bx = y * vocp->voc_ocl_cols;
		mm_r = 0;  /* row header printed once per bad row */

		for (x = 0; x < vocp->voc_ocl_cols; x++) {
			vg_ocl_get_point_tpa(ppc, ocl_points_in, x);
			assert(ppr->Z_is_one && ppc->Z_is_one);
			/* Reference result: column point + row point */
			EC_POINT_add(pgroup, pps, ppc, ppr, bnctx);
			assert(!pps->Z_is_one);
			vg_ocl_get_point_tpa(ppt, point_tmp, bx + x);
			vg_ocl_get_bignum_tpa(&bnz, z_heap, bx + x);
			if (z_inverted) {
				/* Expected Z^-1, double-Montgomeryized
				 * to match the GPU representation */
				BN_mod_inverse(&bnez, &pps->Z, &bnm, bnctx);
				BN_to_montgomery(&bnez, &bnez, bnmont, bnctx);
				BN_to_montgomery(&bnez, &bnez, bnmont, bnctx);
			}
			if (BN_cmp(&ppt->X, &pps->X) ||
			    BN_cmp(&ppt->Y, &pps->Y) ||
			    BN_cmp(&bnz, bnzc)) {
				if (!mismatches) {
					fprintf(stderr, "Base privkey: ");
					fdumpbn(stderr, EC_KEY_get0_private_key(
							vxcp->vxc_key));
				}
				mismatches++;
				fprintf(stderr, "Mismatch for kernel %d, "
					"offset %d (%d,%d)\n",
					z_inverted, bx + x, y, x);
				if (!mm_r) {
					mm_r = 1;
					fprintf(stderr, "Row X   : ");
					fdumpbn(stderr, &ppr->X);
					fprintf(stderr, "Row Y   : ");
					fdumpbn(stderr, &ppr->Y);
				}

				fprintf(stderr, "Column X: ");
				fdumpbn(stderr, &ppc->X);
				fprintf(stderr, "Column Y: ");
				fdumpbn(stderr, &ppc->Y);

				if (BN_cmp(&ppt->X, &pps->X)) {
					fprintf(stderr, "Expect X: ");
					fdumpbn(stderr, &pps->X);
					fprintf(stderr, "Device X: ");
					fdumpbn(stderr, &ppt->X);
				}
				if (BN_cmp(&ppt->Y, &pps->Y)) {
					fprintf(stderr, "Expect Y: ");
					fdumpbn(stderr, &pps->Y);
					fprintf(stderr, "Device Y: ");
					fdumpbn(stderr, &ppt->Y);
				}
				if (BN_cmp(&bnz, bnzc)) {
					fprintf(stderr, "Expect Z: ");
					fdumpbn(stderr, bnzc);
					fprintf(stderr, "Device Z: ");
					fdumpbn(stderr, &bnz);
				}
			}
		}
	}

	ret = !mismatches;

out:
	if (z_heap)
		vg_ocl_unmap_arg_buffer(vocp, slot, 1, z_heap);
	if (point_tmp)
		vg_ocl_unmap_arg_buffer(vocp, slot, 2, point_tmp);
	if (ocl_points_in)
		vg_ocl_unmap_arg_buffer(vocp, slot, 3, ocl_points_in);
	if (ocl_strides_in)
		vg_ocl_unmap_arg_buffer(vocp, slot, 4, ocl_strides_in);
	if (ppr)
		EC_POINT_free(ppr);
	if (ppc)
		EC_POINT_free(ppc);
	if (pps)
		EC_POINT_free(pps);
	if (ppt)
		EC_POINT_free(ppt);
	BN_clear_free(&bnz);
	BN_clear_free(&bnez);
	BN_clear_free(&bnm);
	if (bnmont)
		BN_MONT_CTX_free(bnmont);
	if (bnctx)
		BN_CTX_free(bnctx);
	return ret;
}
1803
1804
/* Verification hook for kernel 0: check batch point-add results with
 * the Z coordinates as produced (not yet inverted). */
static int
vg_ocl_verify_k0(vg_ocl_context_t *vocp, int slot)
{
	return vg_ocl_verify_temporary(vocp, slot, 0);
}
1809
1810
/* Verification hook for kernel 1: check results after heap_invert has
 * replaced each Z with its modular inverse. */
static int
vg_ocl_verify_k1(vg_ocl_context_t *vocp, int slot)
{
	return vg_ocl_verify_temporary(vocp, slot, 1);
}
1815
1816
/*
 * Dedicated GPU driver thread.  Sleeps on voc_wait until the search
 * loop posts a slot number in voc_ocl_slot, runs the kernel batch for
 * that slot, then clears voc_ocl_slot and signals completion.  Also
 * accumulates idle/busy time and periodically reports GPU idle
 * percentage when verbose.  Exits when voc_halt is set (either by the
 * search loop or after a kernel failure here).
 */
static void *
vg_opencl_thread(void *arg)
{
	vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg;
	vg_context_t *vcp = vocp->base.vxc_vc;
	int halt = 0;
	int slot = -1;
	int rows, cols, invsize;
	unsigned long long idleu, busyu;
	double pidle;
	struct timeval tv, tvt, tvd, idle, busy;

	memset(&idle, 0, sizeof(idle));
	memset(&busy, 0, sizeof(busy));

	while (1) {
		pthread_mutex_lock(&vocp->voc_lock);
		if (halt) {
			/* Kernel failure on the previous pass: tell
			 * the search loop to shut down */
			halt = 0;
			vocp->voc_halt = 1;
		}
		if (slot != -1) {
			/* Report completion of the previous batch */
			assert(vocp->voc_ocl_slot == slot);
			vocp->voc_ocl_slot = -1;
			slot = -1;
			pthread_cond_signal(&vocp->voc_wait);
		}
		if (vocp->voc_ocl_slot == -1) {
			/* Nothing queued: wait, accounting idle time */
			gettimeofday(&tv, NULL);
			while (vocp->voc_ocl_slot == -1) {
				if (vocp->voc_halt)
					goto out;
				pthread_cond_wait(&vocp->voc_wait,
						  &vocp->voc_lock);
			}
			gettimeofday(&tvt, NULL);
			timersub(&tvt, &tv, &tvd);
			timeradd(&tvd, &idle, &idle);
		}
		/* Snapshot work parameters under the lock */
		slot = vocp->voc_ocl_slot;
		rows = vocp->voc_ocl_rows;
		cols = vocp->voc_ocl_cols;
		invsize = vocp->voc_ocl_invsize;
		pthread_mutex_unlock(&vocp->voc_lock);

		gettimeofday(&tv, NULL);
		if (!vg_ocl_kernel_start(vocp, slot, cols, rows, invsize))
			halt = 1;

		if (!vg_ocl_kernel_wait(vocp, slot))
			halt = 1;

		if (vcp->vc_verbose > 1) {
			/* Accumulate busy time; report idle percentage
			 * roughly every wall-clock second */
			gettimeofday(&tvt, NULL);
			timersub(&tvt, &tv, &tvd);
			timeradd(&tvd, &busy, &busy);
			if ((busy.tv_sec + idle.tv_sec) > 1) {
				idleu = (1000000 * idle.tv_sec) + idle.tv_usec;
				busyu = (1000000 * busy.tv_sec) + busy.tv_usec;
				pidle = ((double) idleu) / (idleu + busyu);

				if (pidle > 0.01) {
					fprintf(stderr, "\rGPU idle: %.2f%%"
						"                              "
						"                \n",
						100 * pidle);
				}

				memset(&idle, 0, sizeof(idle));
				memset(&busy, 0, sizeof(busy));
			}
		}
	}
out:
	pthread_mutex_unlock(&vocp->voc_lock);
	return NULL;
}
1892
1893
1894
/*
1895
* Address search thread main loop
1896
*/
1897
1898
static void *
1899
vg_opencl_loop(vg_exec_context_t *arg)
1900
{
1901
vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg;
1902
int i;
1903
int round, nrows, ncols;
1904
int pattern_generation;
1905
1906
const BN_ULONG rekey_max = 100000000;
1907
BN_ULONG npoints, rekey_at;
1908
1909
EC_KEY *pkey = NULL;
1910
const EC_GROUP *pgroup;
1911
const EC_POINT *pgen;
1912
EC_POINT **ppbase = NULL, **pprow, *pbatchinc = NULL, *poffset = NULL;
1913
EC_POINT *pseek = NULL;
1914
1915
unsigned char *ocl_points_in, *ocl_strides_in;
1916
1917
vg_context_t *vcp = vocp->base.vxc_vc;
1918
vg_exec_context_t *vxcp = &vocp->base;
1919
1920
int slot, nslots;
1921
int slot_busy = 0, slot_done = 0, halt = 0;
1922
int c = 0, output_interval = 1000;
1923
1924
struct timeval tvstart;
1925
1926
pkey = vxcp->vxc_key;
1927
pgroup = EC_KEY_get0_group(pkey);
1928
pgen = EC_GROUP_get0_generator(pgroup);
1929
1930
round = vocp->voc_ocl_rows * vocp->voc_ocl_cols;
1931
1932
if (!vcp->vc_remove_on_match &&
1933
(vcp->vc_chance >= 1.0f) &&
1934
(vcp->vc_chance < round) &&
1935
(vcp->vc_verbose > 0)) {
1936
fprintf(stderr, "WARNING: low pattern difficulty\n");
1937
fprintf(stderr,
1938
"WARNING: better match throughput is possible "
1939
"using vanitygen on the CPU\n");
1940
}
1941
1942
slot = 0;
1943
nslots = 2;
1944
vocp->voc_nslots = nslots;
1945
1946
nrows = vocp->voc_ocl_rows;
1947
ncols = vocp->voc_ocl_cols;
1948
1949
ppbase = (EC_POINT **) malloc((nrows + ncols) *
1950
sizeof(EC_POINT*));
1951
if (!ppbase)
1952
goto enomem;
1953
1954
for (i = 0; i < (nrows + ncols); i++) {
1955
ppbase[i] = EC_POINT_new(pgroup);
1956
if (!ppbase[i])
1957
goto enomem;
1958
}
1959
1960
pprow = ppbase + ncols;
1961
pbatchinc = EC_POINT_new(pgroup);
1962
poffset = EC_POINT_new(pgroup);
1963
pseek = EC_POINT_new(pgroup);
1964
if (!pbatchinc || !poffset || !pseek)
1965
goto enomem;
1966
1967
BN_set_word(&vxcp->vxc_bntmp, ncols);
1968
EC_POINT_mul(pgroup, pbatchinc, &vxcp->vxc_bntmp, NULL, NULL,
1969
vxcp->vxc_bnctx);
1970
EC_POINT_make_affine(pgroup, pbatchinc, vxcp->vxc_bnctx);
1971
1972
BN_set_word(&vxcp->vxc_bntmp, round);
1973
EC_POINT_mul(pgroup, poffset, &vxcp->vxc_bntmp, NULL, NULL,
1974
vxcp->vxc_bnctx);
1975
EC_POINT_make_affine(pgroup, poffset, vxcp->vxc_bnctx);
1976
1977
if (!vg_ocl_config_pattern(vocp))
1978
goto enomem;
1979
1980
for (i = 0; i < nslots; i++) {
1981
/*
1982
* Each work group gets its own:
1983
* - Column point array
1984
*/
1985
if (!vg_ocl_kernel_arg_alloc(vocp, i, 4, 32 * 2 * nrows, 1))
1986
goto enomem;
1987
}
1988
1989
/*
1990
* All instances share:
1991
* - The z_heap and point scratch spaces
1992
* - The row point array
1993
*/
1994
if (!vg_ocl_kernel_arg_alloc(vocp, -1, 1,
1995
round_up_pow2(32 * 2 * round, 4096), 0) ||
1996
!vg_ocl_kernel_arg_alloc(vocp, -1, 2,
1997
round_up_pow2(32 * 2 * round, 4096), 0) ||
1998
!vg_ocl_kernel_arg_alloc(vocp, -1, 3,
1999
round_up_pow2(32 * 2 * ncols, 4096), 1))
2000
goto enomem;
2001
2002
npoints = 0;
2003
rekey_at = 0;
2004
vxcp->vxc_binres[0] = vcp->vc_addrtype;
2005
2006
if (pthread_create(&vocp->voc_ocl_thread, NULL,
2007
vg_opencl_thread, vocp))
2008
goto enomem;
2009
2010
gettimeofday(&tvstart, NULL);
2011
2012
l_rekey:
2013
if (vocp->voc_rekey_func) {
2014
switch (vocp->voc_rekey_func(vocp)) {
2015
case 1:
2016
break;
2017
case 0:
2018
goto nopatterns;
2019
default:
2020
goto enomem;
2021
}
2022
}
2023
2024
vg_exec_context_upgrade_lock(vxcp);
2025
2026
pattern_generation = vcp->vc_pattern_generation;
2027
2028
/* Generate a new random private key */
2029
EC_KEY_generate_key(pkey);
2030
npoints = 0;
2031
2032
/* Determine rekey interval */
2033
EC_GROUP_get_order(pgroup, &vxcp->vxc_bntmp, vxcp->vxc_bnctx);
2034
BN_sub(&vxcp->vxc_bntmp2,
2035
&vxcp->vxc_bntmp,
2036
EC_KEY_get0_private_key(pkey));
2037
rekey_at = BN_get_word(&vxcp->vxc_bntmp2);
2038
if ((rekey_at == BN_MASK2) || (rekey_at > rekey_max))
2039
rekey_at = rekey_max;
2040
assert(rekey_at > 0);
2041
2042
EC_POINT_copy(ppbase[0], EC_KEY_get0_public_key(pkey));
2043
2044
vg_exec_context_downgrade_lock(vxcp);
2045
2046
if (vcp->vc_pubkey_base) {
2047
EC_POINT_add(pgroup,
2048
ppbase[0],
2049
ppbase[0],
2050
vcp->vc_pubkey_base,
2051
vxcp->vxc_bnctx);
2052
}
2053
2054
/* Build the base array of sequential points */
2055
for (i = 1; i < ncols; i++) {
2056
EC_POINT_add(pgroup,
2057
ppbase[i],
2058
ppbase[i-1],
2059
pgen, vxcp->vxc_bnctx);
2060
}
2061
2062
EC_POINTs_make_affine(pgroup, ncols, ppbase, vxcp->vxc_bnctx);
2063
2064
/* Fill the sequential point array */
2065
ocl_points_in = (unsigned char *)
2066
vg_ocl_map_arg_buffer(vocp, 0, 3, 1);
2067
if (!ocl_points_in) {
2068
fprintf(stderr, "ERROR: Could not map column buffer\n");
2069
goto enomem;
2070
}
2071
for (i = 0; i < ncols; i++)
2072
vg_ocl_put_point_tpa(ocl_points_in, i, ppbase[i]);
2073
vg_ocl_unmap_arg_buffer(vocp, 0, 3, ocl_points_in);
2074
2075
/*
2076
* Set up the initial row increment table.
2077
* Set the first element to pgen -- effectively
2078
* skipping the exact key generated above.
2079
*/
2080
EC_POINT_copy(pprow[0], pgen);
2081
for (i = 1; i < nrows; i++) {
2082
EC_POINT_add(pgroup,
2083
pprow[i],
2084
pprow[i-1],
2085
pbatchinc, vxcp->vxc_bnctx);
2086
}
2087
EC_POINTs_make_affine(pgroup, nrows, pprow, vxcp->vxc_bnctx);
2088
vxcp->vxc_delta = 1;
2089
npoints = 1;
2090
slot = 0;
2091
slot_busy = 0;
2092
slot_done = 0;
2093
2094
while (1) {
2095
if (slot_done) {
2096
assert(rekey_at > 0);
2097
slot_done = 0;
2098
2099
/* Call the result check function */
2100
switch (vocp->voc_check_func(vocp, slot)) {
2101
case 1:
2102
rekey_at = 0;
2103
break;
2104
case 2:
2105
halt = 1;
2106
break;
2107
default:
2108
break;
2109
}
2110
2111
c += round;
2112
if (!halt && (c >= output_interval)) {
2113
output_interval =
2114
vg_output_timing(vcp, c, &tvstart);
2115
c = 0;
2116
}
2117
vg_exec_context_yield(vxcp);
2118
2119
/* If the patterns changed, reload it to the GPU */
2120
if (vocp->voc_rekey_func &&
2121
(pattern_generation !=
2122
vcp->vc_pattern_generation)) {
2123
vocp->voc_pattern_rewrite = 1;
2124
rekey_at = 0;
2125
}
2126
}
2127
2128
if (vcp->vc_halt)
2129
halt = 1;
2130
if (halt)
2131
break;
2132
2133
if ((npoints + round) < rekey_at) {
2134
if (npoints > 1) {
2135
/* Move the row increments forward */
2136
for (i = 0; i < nrows; i++) {
2137
EC_POINT_add(pgroup,
2138
pprow[i],
2139
pprow[i],
2140
poffset,
2141
vxcp->vxc_bnctx);
2142
}
2143
2144
EC_POINTs_make_affine(pgroup, nrows, pprow,
2145
vxcp->vxc_bnctx);
2146
}
2147
2148
/* Copy the row stride array to the device */
2149
ocl_strides_in = (unsigned char *)
2150
vg_ocl_map_arg_buffer(vocp, slot, 4, 1);
2151
if (!ocl_strides_in) {
2152
fprintf(stderr,
2153
"ERROR: Could not map row buffer "
2154
"for slot %d\n", slot);
2155
goto enomem;
2156
}
2157
memset(ocl_strides_in, 0, 64*nrows);
2158
for (i = 0; i < nrows; i++)
2159
vg_ocl_put_point(ocl_strides_in + (64*i),
2160
pprow[i]);
2161
vg_ocl_unmap_arg_buffer(vocp, slot, 4, ocl_strides_in);
2162
npoints += round;
2163
2164
pthread_mutex_lock(&vocp->voc_lock);
2165
while (vocp->voc_ocl_slot != -1) {
2166
assert(slot_busy);
2167
pthread_cond_wait(&vocp->voc_wait,
2168
&vocp->voc_lock);
2169
}
2170
2171
if (vocp->voc_halt) {
2172
pthread_mutex_unlock(&vocp->voc_lock);
2173
halt = 1;
2174
break;
2175
}
2176
2177
vocp->voc_ocl_slot = slot;
2178
pthread_cond_signal(&vocp->voc_wait);
2179
pthread_mutex_unlock(&vocp->voc_lock);
2180
2181
slot_done = slot_busy;
2182
slot_busy = 1;
2183
slot = (slot + 1) % nslots;
2184
2185
} else {
2186
if (slot_busy) {
2187
pthread_mutex_lock(&vocp->voc_lock);
2188
while (vocp->voc_ocl_slot != -1) {
2189
assert(vocp->voc_ocl_slot ==
2190
((slot + nslots - 1) % nslots));
2191
pthread_cond_wait(&vocp->voc_wait,
2192
&vocp->voc_lock);
2193
}
2194
pthread_mutex_unlock(&vocp->voc_lock);
2195
slot_busy = 0;
2196
slot_done = 1;
2197
}
2198
2199
if (!rekey_at ||
2200
(!slot_done && ((npoints + round) >= rekey_at)))
2201
goto l_rekey;
2202
}
2203
}
2204
2205
if (0) {
2206
enomem:
2207
fprintf(stderr, "ERROR: allocation failure?\n");
2208
nopatterns:
2209
;
2210
}
2211
2212
if (halt) {
2213
if (vcp->vc_verbose > 1) {
2214
printf("Halting...");
2215
fflush(stdout);
2216
}
2217
pthread_mutex_lock(&vocp->voc_lock);
2218
vocp->voc_halt = 1;
2219
pthread_cond_signal(&vocp->voc_wait);
2220
while (vocp->voc_ocl_slot != -1) {
2221
assert(slot_busy);
2222
pthread_cond_wait(&vocp->voc_wait,
2223
&vocp->voc_lock);
2224
}
2225
slot_busy = 0;
2226
pthread_mutex_unlock(&vocp->voc_lock);
2227
pthread_join(vocp->voc_ocl_thread, NULL);
2228
if (vcp->vc_verbose > 1)
2229
printf("done!\n");
2230
}
2231
2232
vg_exec_context_yield(vxcp);
2233
2234
if (ppbase) {
2235
for (i = 0; i < (nrows + ncols); i++)
2236
if (ppbase[i])
2237
EC_POINT_free(ppbase[i]);
2238
free(ppbase);
2239
}
2240
if (pbatchinc)
2241
EC_POINT_free(pbatchinc);
2242
2243
/* Release the argument buffers */
2244
vg_ocl_free_args(vocp);
2245
vocp->voc_halt = 0;
2246
vocp->voc_ocl_slot = -1;
2247
vg_context_thread_exit(vcp);
2248
return NULL;
2249
}
2250
2251
2252
2253
2254
/*
2255
* OpenCL platform/device selection junk
2256
*/
2257
2258
static int
2259
get_device_list(cl_platform_id pid, cl_device_id **list_out)
2260
{
2261
cl_uint nd;
2262
cl_int res;
2263
cl_device_id *ids;
2264
res = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 0, NULL, &nd);
2265
if (res != CL_SUCCESS) {
2266
vg_ocl_error(NULL, res, "clGetDeviceIDs(0)");
2267
*list_out = NULL;
2268
return -1;
2269
}
2270
if (nd) {
2271
ids = (cl_device_id *) malloc(nd * sizeof(*ids));
2272
if (ids == NULL) {
2273
fprintf(stderr, "Could not allocate device ID list\n");
2274
*list_out = NULL;
2275
return -1;
2276
}
2277
res = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, nd, ids, NULL);
2278
if (res != CL_SUCCESS) {
2279
vg_ocl_error(NULL, res, "clGetDeviceIDs(n)");
2280
free(ids);
2281
*list_out = NULL;
2282
return -1;
2283
}
2284
*list_out = ids;
2285
}
2286
return nd;
2287
}
2288
2289
static void
2290
show_devices(cl_platform_id pid, cl_device_id *ids, int nd, int base)
2291
{
2292
int i;
2293
char nbuf[128];
2294
char vbuf[128];
2295
size_t len;
2296
cl_int res;
2297
2298
for (i = 0; i < nd; i++) {
2299
res = clGetDeviceInfo(ids[i], CL_DEVICE_NAME,
2300
sizeof(nbuf), nbuf, &len);
2301
if (res != CL_SUCCESS)
2302
continue;
2303
if (len >= sizeof(nbuf))
2304
len = sizeof(nbuf) - 1;
2305
nbuf[len] = '\0';
2306
res = clGetDeviceInfo(ids[i], CL_DEVICE_VENDOR,
2307
sizeof(vbuf), vbuf, &len);
2308
if (res != CL_SUCCESS)
2309
continue;
2310
if (len >= sizeof(vbuf))
2311
len = sizeof(vbuf) - 1;
2312
vbuf[len] = '\0';
2313
fprintf(stderr, " %d: [%s] %s\n", i + base, vbuf, nbuf);
2314
}
2315
}
2316
2317
static cl_device_id
2318
get_device(cl_platform_id pid, int num)
2319
{
2320
int nd;
2321
cl_device_id id, *ids;
2322
2323
nd = get_device_list(pid, &ids);
2324
if (nd < 0)
2325
return NULL;
2326
if (!nd) {
2327
fprintf(stderr, "No OpenCL devices found\n");
2328
return NULL;
2329
}
2330
if (num < 0) {
2331
if (nd == 1)
2332
num = 0;
2333
else
2334
num = nd;
2335
}
2336
if (num < nd) {
2337
id = ids[num];
2338
free(ids);
2339
return id;
2340
}
2341
free(ids);
2342
return NULL;
2343
}
2344
2345
static int
2346
get_platform_list(cl_platform_id **list_out)
2347
{
2348
cl_uint np;
2349
cl_int res;
2350
cl_platform_id *ids;
2351
res = clGetPlatformIDs(0, NULL, &np);
2352
if (res != CL_SUCCESS) {
2353
vg_ocl_error(NULL, res, "clGetPlatformIDs(0)");
2354
*list_out = NULL;
2355
return -1;
2356
}
2357
if (np) {
2358
ids = (cl_platform_id *) malloc(np * sizeof(*ids));
2359
if (ids == NULL) {
2360
fprintf(stderr,
2361
"Could not allocate platform ID list\n");
2362
*list_out = NULL;
2363
return -1;
2364
}
2365
res = clGetPlatformIDs(np, ids, NULL);
2366
if (res != CL_SUCCESS) {
2367
vg_ocl_error(NULL, res, "clGetPlatformIDs(n)");
2368
free(ids);
2369
*list_out = NULL;
2370
return -1;
2371
}
2372
*list_out = ids;
2373
}
2374
return np;
2375
}
2376
2377
void
2378
show_platforms(cl_platform_id *ids, int np, int base)
2379
{
2380
int i;
2381
char nbuf[128];
2382
char vbuf[128];
2383
size_t len;
2384
cl_int res;
2385
2386
for (i = 0; i < np; i++) {
2387
res = clGetPlatformInfo(ids[i], CL_PLATFORM_NAME,
2388
sizeof(nbuf), nbuf, &len);
2389
if (res != CL_SUCCESS) {
2390
vg_ocl_error(NULL, res, "clGetPlatformInfo(NAME)");
2391
continue;
2392
}
2393
if (len >= sizeof(nbuf))
2394
len = sizeof(nbuf) - 1;
2395
nbuf[len] = '\0';
2396
res = clGetPlatformInfo(ids[i], CL_PLATFORM_VENDOR,
2397
sizeof(vbuf), vbuf, &len);
2398
if (res != CL_SUCCESS) {
2399
vg_ocl_error(NULL, res, "clGetPlatformInfo(VENDOR)");
2400
continue;
2401
}
2402
if (len >= sizeof(vbuf))
2403
len = sizeof(vbuf) - 1;
2404
vbuf[len] = '\0';
2405
fprintf(stderr, "%d: [%s] %s\n", i + base, vbuf, nbuf);
2406
}
2407
}
2408
2409
static cl_platform_id
2410
get_platform(int num)
2411
{
2412
int np;
2413
cl_platform_id id, *ids;
2414
2415
np = get_platform_list(&ids);
2416
if (np < 0)
2417
return NULL;
2418
if (!np) {
2419
fprintf(stderr, "No OpenCL platforms available\n");
2420
return NULL;
2421
}
2422
if (num < 0) {
2423
if (np == 1)
2424
num = 0;
2425
else
2426
num = np;
2427
}
2428
if (num < np) {
2429
id = ids[num];
2430
free(ids);
2431
return id;
2432
}
2433
free(ids);
2434
return NULL;
2435
}
2436
2437
void
2438
vg_ocl_enumerate_devices(void)
2439
{
2440
cl_platform_id *pids;
2441
cl_device_id *dids;
2442
int np, nd, i;
2443
2444
np = get_platform_list(&pids);
2445
if (!np) {
2446
fprintf(stderr, "No OpenCL platforms available\n");
2447
return;
2448
}
2449
fprintf(stderr, "Available OpenCL platforms:\n");
2450
for (i = 0; i < np; i++) {
2451
show_platforms(&pids[i], 1, i);
2452
nd = get_device_list(pids[i], &dids);
2453
if (!nd) {
2454
fprintf(stderr, " -- No devices\n");
2455
} else {
2456
show_devices(pids[i], dids, nd, 0);
2457
}
2458
}
2459
}
2460
2461
static cl_device_id
2462
get_opencl_device(int platformidx, int deviceidx)
2463
{
2464
cl_platform_id pid;
2465
cl_device_id did = NULL;
2466
2467
pid = get_platform(platformidx);
2468
if (pid) {
2469
did = get_device(pid, deviceidx);
2470
if (did)
2471
return did;
2472
}
2473
return NULL;
2474
}
2475
2476
2477
vg_ocl_context_t *
2478
vg_ocl_context_new(vg_context_t *vcp,
2479
int platformidx, int deviceidx, int safe_mode, int verify,
2480
int worksize, int nthreads, int nrows, int ncols,
2481
int invsize)
2482
{
2483
cl_device_id did;
2484
int round, full_threads, wsmult;
2485
cl_ulong memsize, allocsize;
2486
vg_ocl_context_t *vocp;
2487
2488
/* Find the device */
2489
did = get_opencl_device(platformidx, deviceidx);
2490
if (!did) {
2491
return 0;
2492
}
2493
2494
vocp = (vg_ocl_context_t *) malloc(sizeof(*vocp));
2495
if (!vocp)
2496
return NULL;
2497
2498
/* Open the device and compile the kernel */
2499
if (!vg_ocl_init(vcp, vocp, did, safe_mode)) {
2500
free(vocp);
2501
return NULL;
2502
}
2503
2504
if (verify) {
2505
if (vcp->vc_verbose > 0) {
2506
fprintf(stderr, "WARNING: "
2507
"Hardware verification mode enabled\n");
2508
}
2509
if (!nthreads)
2510
nthreads = 1;
2511
vocp->voc_verify_func[0] = vg_ocl_verify_k0;
2512
vocp->voc_verify_func[1] = vg_ocl_verify_k1;
2513
}
2514
2515
/*
2516
* nrows: number of point rows per job
2517
* ncols: number of point columns per job
2518
* invsize: number of modular inversion tasks per job
2519
* (each task performs (nrows*ncols)/invsize inversions)
2520
* nslots: number of kernels
2521
* (create two, keep one running while we service the other or wait)
2522
*/
2523
2524
if (!nthreads) {
2525
/* Pick nthreads sufficient to saturate one compute unit */
2526
if (vg_ocl_device_gettype(vocp->voc_ocldid) &
2527
CL_DEVICE_TYPE_CPU)
2528
nthreads = 1;
2529
else
2530
nthreads = vg_ocl_device_getsizet(vocp->voc_ocldid,
2531
CL_DEVICE_MAX_WORK_GROUP_SIZE);
2532
}
2533
2534
full_threads = vg_ocl_device_getsizet(vocp->voc_ocldid,
2535
CL_DEVICE_MAX_COMPUTE_UNITS);
2536
full_threads *= nthreads;
2537
2538
/*
2539
* The work size selection is complicated, and the most
2540
* important factor is the batch size of the heap_invert kernel.
2541
* Each value added to the batch trades one complete modular
2542
* inversion for four multiply operations. Ideally the work
2543
* size would be as large as possible. The practical limiting
2544
* factors are:
2545
* 1. Available memory
2546
* 2. Responsiveness and operational latency
2547
*
2548
* We take a naive approach and limit batch size to a point of
2549
* sufficiently diminishing returns, hoping that responsiveness
2550
* will be sufficient.
2551
*
2552
* The measured value for the OpenSSL implementations on my CPU
2553
* is 80:1. This causes heap_invert to get batches of 20 or so
2554
* for free, and receive 10% incremental returns at 200. The CPU
2555
* work size is therefore set to 256.
2556
*
2557
* The ratio on most GPUs with the oclvanitygen implementations
2558
* is closer to 500:1, and larger batches are required for
2559
* good performance.
2560
*/
2561
if (!worksize) {
2562
if (vg_ocl_device_gettype(vocp->voc_ocldid) &
2563
CL_DEVICE_TYPE_GPU)
2564
worksize = 2048;
2565
else
2566
worksize = 256;
2567
}
2568
2569
if (!ncols) {
2570
memsize = vg_ocl_device_getulong(vocp->voc_ocldid,
2571
CL_DEVICE_GLOBAL_MEM_SIZE);
2572
allocsize = vg_ocl_device_getulong(vocp->voc_ocldid,
2573
CL_DEVICE_MAX_MEM_ALLOC_SIZE);
2574
memsize /= 2;
2575
ncols = full_threads;
2576
nrows = 2;
2577
/* Find row and column counts close to sqrt(full_threads) */
2578
while ((ncols > nrows) && !(ncols & 1)) {
2579
ncols /= 2;
2580
nrows *= 2;
2581
}
2582
2583
/*
2584
* Increase row & column counts to satisfy work size
2585
* multiplier or fill available memory.
2586
*/
2587
wsmult = 1;
2588
while ((!worksize || ((wsmult * 2) <= worksize)) &&
2589
((ncols * nrows * 2 * 128) < memsize) &&
2590
((ncols * nrows * 2 * 64) < allocsize)) {
2591
if (ncols > nrows)
2592
nrows *= 2;
2593
else
2594
ncols *= 2;
2595
wsmult *= 2;
2596
}
2597
}
2598
2599
round = nrows * ncols;
2600
2601
if (!invsize) {
2602
invsize = 2;
2603
while (!(round % (invsize << 1)) &&
2604
((round / invsize) > full_threads))
2605
invsize <<= 1;
2606
}
2607
2608
if (vcp->vc_verbose > 1) {
2609
fprintf(stderr, "Grid size: %dx%d\n", ncols, nrows);
2610
fprintf(stderr, "Modular inverse: %d threads, %d ops each\n",
2611
round/invsize, invsize);
2612
}
2613
2614
if ((round % invsize) || !is_pow2(invsize) || (invsize < 2)) {
2615
if (vcp->vc_verbose <= 1) {
2616
fprintf(stderr, "Grid size: %dx%d\n", ncols, nrows);
2617
fprintf(stderr,
2618
"Modular inverse: %d threads, %d ops each\n",
2619
round/invsize, invsize);
2620
}
2621
if (round % invsize)
2622
fprintf(stderr,
2623
"Modular inverse work size must "
2624
"evenly divide points\n");
2625
else
2626
fprintf(stderr,
2627
"Modular inverse work per task (%d) "
2628
"must be a power of 2\n", invsize);
2629
goto out_fail;
2630
}
2631
2632
vocp->voc_ocl_rows = nrows;
2633
vocp->voc_ocl_cols = ncols;
2634
vocp->voc_ocl_invsize = invsize;
2635
2636
return vocp;
2637
2638
out_fail:
2639
vg_ocl_context_free(vocp);
2640
return NULL;
2641
}
2642
2643
vg_ocl_context_t *
2644
vg_ocl_context_new_from_devstr(vg_context_t *vcp, const char *devstr,
2645
int safemode, int verify)
2646
{
2647
int platformidx, deviceidx;
2648
int worksize = 0, nthreads = 0, nrows = 0, ncols = 0, invsize = 0;
2649
2650
char *dsd, *part, *part2, *save, *param;
2651
2652
dsd = strdup(devstr);
2653
if (!dsd)
2654
return NULL;
2655
2656
save = NULL;
2657
part = strtok_r(dsd, ",", &save);
2658
2659
part2 = strchr(part, ':');
2660
if (!part2) {
2661
fprintf(stderr, "Invalid device specifier '%s'\n", part);
2662
free(dsd);
2663
return NULL;
2664
}
2665
2666
*part2 = '\0';
2667
platformidx = atoi(part);
2668
deviceidx = atoi(part2 + 1);
2669
2670
while ((part = strtok_r(NULL, ",", &save)) != NULL) {
2671
param = strchr(part, '=');
2672
if (!param) {
2673
fprintf(stderr, "Unrecognized parameter '%s'\n", part);
2674
continue;
2675
}
2676
2677
*param = '\0';
2678
param++;
2679
2680
if (!strcmp(part, "grid")) {
2681
ncols = strtol(param, &part2, 0);
2682
if (part2 && *part2 == 'x') {
2683
nrows = strtol(part2+1, NULL, 0);
2684
}
2685
if (!nrows || !ncols) {
2686
fprintf(stderr,
2687
"Invalid grid size '%s'\n", param);
2688
nrows = 0;
2689
ncols = 0;
2690
continue;
2691
}
2692
}
2693
2694
else if (!strcmp(part, "invsize")) {
2695
invsize = atoi(param);
2696
if (!invsize) {
2697
fprintf(stderr,
2698
"Invalid modular inverse size '%s'\n",
2699
param);
2700
continue;
2701
}
2702
if (invsize & (invsize - 1)) {
2703
fprintf(stderr,
2704
"Modular inverse size %d must be "
2705
"a power of 2\n", invsize);
2706
invsize = 0;
2707
continue;
2708
}
2709
}
2710
2711
else if (!strcmp(part, "threads")) {
2712
nthreads = atoi(param);
2713
if (nthreads == 0) {
2714
fprintf(stderr,
2715
"Invalid thread count '%s'\n", param);
2716
continue;
2717
}
2718
}
2719
2720
else {
2721
fprintf(stderr, "Unrecognized parameter '%s'\n", part);
2722
}
2723
}
2724
2725
free(dsd);
2726
2727
return vg_ocl_context_new(vcp, platformidx, deviceidx, safemode,
2728
verify, worksize, nthreads, nrows, ncols,
2729
invsize);
2730
}
2731
2732
2733
void
2734
vg_ocl_context_free(vg_ocl_context_t *vocp)
2735
{
2736
vg_ocl_del(vocp);
2737
free(vocp);
2738
}
2739
2740