Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
tpruvot
GitHub Repository: tpruvot/cpuminer-multi
Path: blob/linux/algo/rainforest.c
1201 views
1
// RainForest hash algorithm
2
// Author: Bill Schneider
3
// Date: Feb 13th, 2018
4
//
5
// RainForest uses native integer operations which are extremely fast on
6
// modern 64-bit processors, significantly slower on 32-bit processors such
7
// as GPUs, and extremely slow if at all implementable on FPGAs and ASICs.
8
// It makes an intensive use of the L1 cache to maintain a heavy intermediary
9
// state favoring modern CPUs compared to GPUs (small L1 cache shared by many
10
// shaders) or FPGAs (very hard to implement the required low-latency cache)
11
// when scanning ranges for nonces. The purpose is to create a fair balance
12
// between all mining equipments, from mobile phones to extreme performance
13
// GPUs and to rule out farming factories relying on ASICs and FPGAs. The
14
// CRC32 instruction is used a lot as it is extremely fast on low-power ARM
15
// chips and allows such devices to rival high-end PCs mining performance.
16
//
17
// Tests on various devices have shown the following performance :
18
// +--------------------------------------------------------------------------+
19
// | CPU/GPU Clock Threads Full hash Nonce scan Watts Cost |
20
// | (MHz) (80 bytes) (4 bytes) total |
21
// | Core i7-6700k 4000 8 390 kH/s 1642 kH/s 200 ~$350+PC |
22
// | Radeon RX560 1300 1024 1100 kH/s 1650 kH/s 300 ~$180+PC |
23
// | RK3368 (8*A53) 1416 8 534 kH/s 1582 kH/s 6 $60 (Geekbox) |
24
// +--------------------------------------------------------------------------+
25
//
26
// Build instructions on Ubuntu 16.04 :
27
// - on x86: use gcc -march=native or -maes to enable AES-NI
28
// - on ARMv8: use gcc -march=native or -march=armv8-a+crypto+crc to enable
29
// CRC32 and AES extensions.
30
//
31
// Note: always use the same options to build all files!
32
33
#include <miner.h>
34
35
#include <stdlib.h>
36
#include <stdint.h>
37
#include <string.h>
38
#include <stdio.h>
39
40
//#define DEBUG_ALGO
41
42
/* Rijndael's substitution box for the sub_bytes step.
 * Made const: the table is read-only everywhere in this file, so marking it
 * const moves it to .rodata and prevents accidental modification. */
static const uint8_t SBOX[256] = {
	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
61
62
/*--- The parts below are not used when crypto extensions are available ---*/
63
/* Use -march=armv8-a+crypto on ARMv8 to use crypto extensions */
64
/* Use -maes on x86_64 to use AES-NI */
65
#if defined(RF_NOASM) || (!defined(__aarch64__) || !defined(__ARM_FEATURE_CRYPTO)) && (!defined(__x86_64__) || !defined(__AES__))
66
67
/* Byte-source indexes for the shift_rows step: output byte i is taken from
 * input byte shifts[i] (AES state stored column-major, 4 bytes per column).
 * Made const: the table is read-only. */
static const uint8_t shifts[16] = {
	0,  5, 10, 15,
	4,  9, 14,  3,
	8, 13,  2,  7,
	12, 1,  6, 11
};
74
75
/* AES AddRoundKey: XOR the 16-byte round key _rkey_ into _state_ in place. */
static void add_round_key(uint8_t * state, uint8_t * rkey) {
	uint8_t n;

	for (n = 16; n > 0; n--)
		*state++ ^= *rkey++;
}
81
82
/* substitute all bytes using Rijndael's substitution box */
83
static void sub_bytes(uint8_t * state) {
84
uint8_t i;
85
for (i = 0; i < 16; i++)
86
state[i] = SBOX[state[i]];
87
}
88
89
/* imagine the state not as 1-dimensional, but a 4x4 grid;
90
* this step shifts the rows of this grid around */
91
static void shift_rows(uint8_t * state) {
92
uint8_t temp[16];
93
uint8_t i;
94
95
for (i = 0; i < 16; i++) {
96
temp[i] = state[shifts[i]];
97
}
98
99
for (i = 0; i < 16; i++) {
100
state[i] = temp[i];
101
}
102
}
103
104
/* AES MixColumns: multiply each 4-byte column by the fixed polynomial
 * {03}x^3 + {01}x^2 + {01}x + {02} over GF(2^8). */
static void mix_columns(uint8_t * state) {
	uint8_t col, r;

	for (col = 0; col < 4; col++) {
		uint8_t *c = state + 4 * col;
		uint8_t orig[4], dbl[4];

		for (r = 0; r < 4; r++) {
			orig[r] = c[r];
			/* xtime: multiply by 2 in GF(2^8) — shift left, reduce
			 * by Rijndael's modulus 0x1b when the high bit was set */
			dbl[r] = (uint8_t)((c[r] << 1) ^ ((c[r] & 0x80) ? 0x1b : 0x00));
		}

		/* s'[i] = 2*a[i] ^ 3*a[i+1] ^ a[i+2] ^ a[i+3], with 3*x = 2*x ^ x */
		c[0] = dbl[0] ^ orig[3] ^ orig[2] ^ dbl[1] ^ orig[1];
		c[1] = dbl[1] ^ orig[0] ^ orig[3] ^ dbl[2] ^ orig[2];
		c[2] = dbl[2] ^ orig[1] ^ orig[0] ^ dbl[3] ^ orig[3];
		c[3] = dbl[3] ^ orig[2] ^ orig[1] ^ dbl[0] ^ orig[0];
	}
}
127
#endif // (!defined(__aarch64__) || !defined(__ARM_FEATURE_CRYPTO)) && (!defined(__x86_64__) || !defined(__AES__))
128
129
130
/* key schedule stuff */
131
132
/* Rotate the four bytes of _in_ one position toward lower memory addresses
 * (the AES key-schedule RotWord step, expressed on the in-memory layout). */
static inline uint32_t rotate32(uint32_t in) {
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	return (in >> 8) | (in << 24);
#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	return (in << 8) | (in >> 24);
#else
	/* unknown endianness: rotate the bytes directly */
	uint8_t *b = (uint8_t *)&in, first = b[0];

	b[0] = b[1];
	b[1] = b[2];
	b[2] = b[3];
	b[3] = first;
	return in;
#endif
}
144
145
/* key schedule core operation */
146
static inline uint32_t sbox(uint32_t in, uint8_t n) {
147
in = (SBOX[in & 255]) | (SBOX[(in >> 8) & 255] << 8) | (SBOX[(in >> 16) & 255] << 16) | (SBOX[(in >> 24) & 255] << 24);
148
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
149
in ^= n;
150
#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
151
in ^= n << 24;
152
#else
153
*(uint8_t *)&in ^= n;
154
#endif
155
return in;
156
}
157
158
// Two-round AES encryption of the 16-byte _state_ with the 16-byte _key_.
// This version is optimized for exactly two rounds: the key schedule is
// expanded inline (rounds 1 and 2 only, rcon = 1 and 2) and the rounds are
// then executed either with ARMv8 crypto instructions, with x86 AES-NI, or
// with the portable table-based helpers above.
// _state_ must be 16-byte aligned.
// NOTE(review): _key_ is read through a uint32_t* cast, so it must also be
// 4-byte aligned — the only caller passes halves of the 32-byte-aligned
// ctx->hash, which satisfies this.
static void aes2r_encrypt(uint8_t * state, uint8_t * key) {
	uint32_t _ALIGN(16) key_schedule[12];
	uint32_t t;

	/* initialize key schedule; its first 16 bytes are the key */
	key_schedule[0] = ((uint32_t *)key)[0];
	key_schedule[1] = ((uint32_t *)key)[1];
	key_schedule[2] = ((uint32_t *)key)[2];
	key_schedule[3] = ((uint32_t *)key)[3];
	t = key_schedule[3];

	/* round-1 key: RotWord + SubWord + rcon(1), then chained XORs */
	t = rotate32(t);
	t = sbox(t, 1);
	t = key_schedule[4] = key_schedule[0] ^ t;
	t = key_schedule[5] = key_schedule[1] ^ t;
	t = key_schedule[6] = key_schedule[2] ^ t;
	t = key_schedule[7] = key_schedule[3] ^ t;

	/* round-2 key: same core with rcon(2) */
	t = rotate32(t);
	t = sbox(t, 2);
	t = key_schedule[8] = key_schedule[4] ^ t;
	t = key_schedule[9] = key_schedule[5] ^ t;
	t = key_schedule[10] = key_schedule[6] ^ t;
	t = key_schedule[11] = key_schedule[7] ^ t;

	// Use -march=armv8-a+crypto+crc to get this one
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
	asm volatile(
		"ld1 {v0.16b},[%0] \n"
		"ld1 {v1.16b,v2.16b,v3.16b},[%1] \n"
		"aese v0.16b,v1.16b \n" // round1: add_round_key,sub_bytes,shift_rows
		"aesmc v0.16b,v0.16b \n" // round1: mix_columns
		"aese v0.16b,v2.16b \n" // round2: add_round_key,sub_bytes,shift_rows
		"eor v0.16b,v0.16b,v3.16b \n" // finish: add_round_key
		"st1 {v0.16b},[%0] \n"
		: /* only output is in *state */
		: "r"(state), "r"(key_schedule)
		: "v0", "v1", "v2", "v3", "cc", "memory");

	// Use -maes to get this one
#elif defined(__x86_64__) && defined(__AES__)
	asm volatile(
		"movups (%0), %%xmm0 \n"
		"movups (%1), %%xmm1 \n"
		"pxor %%xmm1,%%xmm0 \n" // add_round_key(state, key_schedule)
		"movups 16(%1),%%xmm2 \n"
		"movups 32(%1),%%xmm1 \n"
		"aesenc %%xmm2,%%xmm0 \n" // first round
		"aesenclast %%xmm1,%%xmm0 \n" // final round
		"movups %%xmm0, (%0) \n"
		: /* only output is in *state */
		: "r"(state), "r" (key_schedule)
		: "xmm0", "xmm1", "xmm2", "cc", "memory");

#else
	/* portable fallback, must produce bit-identical output to the asm paths */
	/* first round of the algorithm */
	add_round_key(state, (void*)&key_schedule[0]);
	sub_bytes(state);
	shift_rows(state);
	mix_columns(state);
	add_round_key(state, (void*)&key_schedule[4]);

	/* final round of the algorithm (no mix_columns, as in standard AES) */
	sub_bytes(state);
	shift_rows(state);
	add_round_key(state, (void*)&key_schedule[8]);

#endif
}
229
230
// this seems necessary only for gcc, otherwise hash is bogus
231
#ifdef _MSC_VER
232
typedef unsigned long ulong;
233
typedef uint8_t rf_u8;
234
typedef uint16_t rf_u16;
235
typedef uint32_t rf_u32;
236
typedef uint64_t rf_u64;
237
#else
238
typedef __attribute__((may_alias)) uint8_t rf_u8;
239
typedef __attribute__((may_alias)) uint16_t rf_u16;
240
typedef __attribute__((may_alias)) uint32_t rf_u32;
241
typedef __attribute__((may_alias)) uint64_t rf_u64;
242
#endif
243
244
// 2048 entries for the rambox => 16kB
245
#define RAMBOX_SIZE 2048
246
#define RAMBOX_LOOPS 4
247
#define RAMBOX_HIST 32
248
249
// 256-bit hash state, addressable at several granularities. The may_alias
// typedefs (rf_u8..rf_u64) let the different views be mixed without
// strict-aliasing miscompilation under gcc.
typedef union {
	rf_u8 b[32];   // byte view (AES block input, final digest copy)
	rf_u16 w[16];  // 16-bit view (early-reject check in scanhash)
	rf_u32 d[8];   // 32-bit view (CRC scrambling, rot32x256)
	rf_u64 q[4];   // 64-bit view (divbox/rotbox arithmetic)
} hash256_t;
255
256
// Full hashing context (~16 kB, dominated by the rambox). Aligned to 128
// bytes so the hot fields share as few cache lines as possible.
typedef struct _ALIGN(128) rf_ctx {
	uint32_t word; // LE pending message (up to 4 bytes not yet absorbed)
	uint32_t len;  // total message length in bytes
	uint32_t crc;  // running CRC32 state mixed into the hash each round
	uint32_t changes; // number of rambox cells modified; must remain lower than RAMBOX_HIST
	hash256_t _ALIGN(32) hash; // current 256-bit hash state
	uint16_t hist[RAMBOX_HIST]; // indexes of modified rambox cells, used by
	                            // scanhash to restore them cheaply per nonce
	uint64_t _ALIGN(64) rambox[RAMBOX_SIZE]; // 16 kB scratch memory (see header comment)
} rf256_ctx_t;
265
266
// these archs are fine with unaligned reads
267
#if defined(__x86_64__)||defined(__aarch64__)
268
#define RF_UNALIGNED_LE64
269
#define RF_UNALIGNED_LE32
270
#elif defined(__i386__)||defined(__ARM_ARCH_7A__)
271
#define RF_UNALIGNED_LE32
272
#endif
273
274
#define RF256_INIT_CRC 20180213
275
276
// the table is used as an 8 bit-aligned array of uint64_t for the first word,
277
// and as a 16 bit-aligned array of uint64_t for the second word. It is filled
278
// with the sha256 of "RainForestProCpuAntiAsic", iterated over and over until
279
// the table is filled. The highest offset being ((uint16_t *)table)[255] we
280
// need to add 6 extra bytes at the end to read an uint64_t. Maybe calculated
281
// on a UNIX system with this loop :
282
//
283
// ref="RainForestProCpuAntiAsic"
284
// for ((i=0;i<18;i++)); do
285
// set $(echo -n $ref|sha256sum)
286
// echo $1|sed 's/\(..\)/0x\1,/g'
287
// ref=$(printf $(echo $1|sed 's/\(..\)/\\x\1/g'))
288
// done
289
290
const uint8_t rf_table[256*2+6] = {
291
0x8e,0xc1,0xa8,0x04,0x38,0x78,0x7c,0x54,0x29,0x23,0x1b,0x78,0x9f,0xf9,0x27,0x54,
292
0x11,0x78,0x95,0xb6,0xaf,0x78,0x45,0x16,0x2b,0x9e,0x91,0xe8,0x97,0x25,0xf8,0x63,
293
0x82,0x56,0xcf,0x48,0x6f,0x82,0x14,0x0d,0x61,0xbe,0x47,0xd1,0x37,0xee,0x30,0xa9,
294
0x28,0x1e,0x4b,0xbf,0x07,0xcd,0x41,0xdf,0x23,0x21,0x12,0xb8,0x81,0x99,0x1d,0xe6,
295
0x68,0xcf,0xfa,0x2d,0x8e,0xb9,0x88,0xa7,0x15,0xce,0x9e,0x2f,0xeb,0x1b,0x0f,0x67,
296
0x20,0x68,0x6c,0xa9,0x5d,0xc1,0x7c,0x76,0xdf,0xbd,0x98,0x61,0xb4,0x14,0x65,0x40,
297
0x1e,0x72,0x51,0x74,0x93,0xd3,0xad,0xbe,0x46,0x0a,0x25,0xfb,0x6a,0x5e,0x1e,0x8a,
298
0x5a,0x03,0x3c,0xab,0x12,0xc2,0xd4,0x07,0x91,0xab,0xc9,0xdf,0x92,0x2c,0x85,0x6a,
299
0xa6,0x25,0x1e,0x66,0x50,0x26,0x4e,0xa8,0xbd,0xda,0x88,0x1b,0x95,0xd4,0x00,0xeb,
300
0x0d,0x1c,0x9b,0x3c,0x86,0xc7,0xb2,0xdf,0xb4,0x5a,0x36,0x15,0x8e,0x04,0xd2,0x54,
301
0x79,0xd2,0x3e,0x3d,0x99,0x50,0xa6,0x12,0x4c,0x32,0xc8,0x51,0x14,0x4d,0x4b,0x0e,
302
0xbb,0x17,0x80,0x8f,0xa4,0xc4,0x99,0x72,0xd7,0x14,0x4b,0xef,0xed,0x14,0xe9,0x17,
303
0xfa,0x9b,0x5d,0x37,0xd6,0x2f,0xef,0x02,0xd6,0x71,0x0a,0xbd,0xc5,0x40,0x11,0x90,
304
0x90,0x4e,0xb4,0x4c,0x72,0x51,0x7a,0xd8,0xba,0x30,0x4d,0x8c,0xe2,0x11,0xbb,0x6d,
305
0x4b,0xbc,0x6f,0x14,0x0c,0x9f,0xfa,0x5e,0x66,0x40,0x45,0xcb,0x7d,0x1b,0x3a,0xc5,
306
0x5e,0x9c,0x1e,0xcc,0xbd,0x16,0x3b,0xcf,0xfb,0x2a,0xd2,0x08,0x2a,0xf8,0x3d,0x46,
307
0x93,0x90,0xb3,0x66,0x81,0x34,0x7f,0x6d,0x9b,0x8c,0x99,0x03,0xc5,0x27,0xa3,0xd9,
308
0xce,0x90,0x88,0x0f,0x55,0xc3,0xa1,0x60,0x53,0xc8,0x0d,0x25,0xae,0x61,0xd9,0x72,
309
0x48,0x1d,0x6c,0x61,0xd2,0x87,0xdd,0x3d,0x23,0xf5,0xde,0x93,0x39,0x4c,0x43,0x9a,
310
0xf9,0x37,0xf2,0x61,0xd7,0xf8,0xea,0x65,0xf0,0xf1,0xde,0x3f,0x05,0x57,0x83,0x81,
311
0xde,0x02,0x62,0x49,0xd4,0x32,0x7e,0x4a,0xd4,0x9f,0x40,0x7e,0xb9,0x91,0xb1,0x35,
312
0xf7,0x62,0x3f,0x65,0x9e,0x4d,0x2b,0x10,0xde,0xd4,0x77,0x64,0x0f,0x84,0xad,0x92,
313
0xe7,0xa3,0x8a,0x10,0xc1,0x14,0xeb,0x57,0xc4,0xad,0x8e,0xc2,0xc7,0x32,0xa3,0x7e,
314
0x50,0x1f,0x7c,0xbb,0x2e,0x5f,0xf5,0x18,0x22,0xea,0xec,0x9d,0xa4,0x77,0xcd,0x85,
315
0x04,0x2f,0x20,0x61,0x72,0xa7,0x0c,0x92,0x06,0x4d,0x01,0x70,0x9b,0x35,0xa1,0x27,
316
0x32,0x6e,0xb9,0x78,0xe0,0xaa,0x5f,0x91,0xa6,0x51,0xe3,0x63,0xf8,0x97,0x2f,0x60,
317
0xd9,0xfb,0x15,0xe5,0x59,0xcf,0x31,0x3c,0x61,0xc7,0xb5,0x61,0x2a,0x6b,0xdd,0xd1,
318
0x09,0x70,0xc0,0xcf,0x94,0x7a,0xcc,0x31,0x94,0xb1,0xa2,0xf6,0x95,0xc0,0x38,0x3d,
319
0xef,0x19,0x30,0x70,0xdd,0x62,0x32,0x8f,0x7c,0x30,0xb9,0x18,0xf8,0xe7,0x8f,0x0a,
320
0xaa,0xb6,0x00,0x86,0xf2,0xe0,0x30,0x5f,0xa2,0xe8,0x00,0x8e,0x05,0xa0,0x22,0x18,
321
0x9f,0x83,0xd4,0x3a,0x85,0x10,0xb9,0x51,0x8d,0x07,0xf0,0xb3,0xcd,0x9b,0x55,0xa1,
322
0x14,0xce,0x0f,0xb2,0xcf,0xb8,0xce,0x2d,0xe6,0xe8,0x35,0x32,0x1f,0x22,0xb5,0xec,
323
0xd0,0xb9,0x72,0xa8,0xb4,0x97
324
//,0x6e,0x0a,0x47,0xcd,0x5a,0xf0,0xdc,0xeb,0xfd,0x46,
325
//0xe5,0x6e,0x83,0xe6,0x1a,0xcc,0x4a,0x8b,0xa5,0x28,0x9e,0x50,0x48,0xa9,0xa2,0x6b,
326
};
327
328
// this is made of the last iteration of the rf_table (18th transformation)
329
const uint8_t rf256_iv[32] = {
330
0x78,0xe9,0x90,0xd3,0xb3,0xc8,0x9b,0x7b,0x0a,0xc4,0x86,0x6e,0x4e,0x38,0xb3,0x6b,
331
0x33,0x68,0x7c,0xed,0x73,0x35,0x4b,0x0a,0x97,0x25,0x4c,0x77,0x7a,0xaa,0x61,0x1b
332
};
333
334
// crc32 lookup tables
335
#if !defined(__aarch64__) || !defined(__ARM_FEATURE_CRC32)
336
const uint32_t rf_crc32_table[256] = {
337
/* 0x00 */ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
338
/* 0x04 */ 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
339
/* 0x08 */ 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
340
/* 0x0c */ 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
341
/* 0x10 */ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
342
/* 0x14 */ 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
343
/* 0x18 */ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
344
/* 0x1c */ 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
345
/* 0x20 */ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
346
/* 0x24 */ 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
347
/* 0x28 */ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
348
/* 0x2c */ 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
349
/* 0x30 */ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
350
/* 0x34 */ 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
351
/* 0x38 */ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
352
/* 0x3c */ 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
353
/* 0x40 */ 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
354
/* 0x44 */ 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
355
/* 0x48 */ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
356
/* 0x4c */ 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
357
/* 0x50 */ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
358
/* 0x54 */ 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
359
/* 0x58 */ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
360
/* 0x5c */ 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
361
/* 0x60 */ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
362
/* 0x64 */ 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
363
/* 0x68 */ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
364
/* 0x6c */ 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
365
/* 0x70 */ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
366
/* 0x74 */ 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
367
/* 0x78 */ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
368
/* 0x7c */ 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
369
/* 0x80 */ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
370
/* 0x84 */ 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
371
/* 0x88 */ 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
372
/* 0x8c */ 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
373
/* 0x90 */ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
374
/* 0x94 */ 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
375
/* 0x98 */ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
376
/* 0x9c */ 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
377
/* 0xa0 */ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
378
/* 0xa4 */ 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
379
/* 0xa8 */ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
380
/* 0xac */ 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
381
/* 0xb0 */ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
382
/* 0xb4 */ 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
383
/* 0xb8 */ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
384
/* 0xbc */ 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
385
/* 0xc0 */ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
386
/* 0xc4 */ 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
387
/* 0xc8 */ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
388
/* 0xcc */ 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
389
/* 0xd0 */ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
390
/* 0xd4 */ 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
391
/* 0xd8 */ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
392
/* 0xdc */ 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
393
/* 0xe0 */ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
394
/* 0xe4 */ 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
395
/* 0xe8 */ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
396
/* 0xec */ 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
397
/* 0xf0 */ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
398
/* 0xf4 */ 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
399
/* 0xf8 */ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
400
/* 0xfc */ 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
401
};
402
#endif
403
404
// compute the crc32 of 32-bit message _msg_ from previous crc _crc_.
405
// build with -mcpu=cortex-a53+crc to enable native CRC instruction on ARM
406
static inline uint32_t rf_crc32_32(uint32_t crc, uint32_t msg) {
407
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
408
asm("crc32w %w0,%w0,%w1\n":"+r"(crc):"r"(msg));
409
#else
410
crc=crc^msg;
411
crc=rf_crc32_table[crc&0xff]^(crc>>8);
412
crc=rf_crc32_table[crc&0xff]^(crc>>8);
413
crc=rf_crc32_table[crc&0xff]^(crc>>8);
414
crc=rf_crc32_table[crc&0xff]^(crc>>8);
415
#endif
416
return crc;
417
}
418
419
//static inline uint32_t rf_crc32_24(uint32_t crc, uint32_t msg) {
420
//#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
421
// asm("crc32b %w0,%w0,%w1\n":"+r"(crc):"r"(msg));
422
// asm("crc32h %w0,%w0,%w1\n":"+r"(crc):"r"(msg>>8));
423
//#else
424
// crc=crc^msg;
425
// crc=rf_crc32_table[crc&0xff]^(crc>>8);
426
// crc=rf_crc32_table[crc&0xff]^(crc>>8);
427
// crc=rf_crc32_table[crc&0xff]^(crc>>8);
428
//#endif
429
// return crc;
430
//}
431
//
432
//static inline uint32_t rf_crc32_16(uint32_t crc, uint32_t msg) {
433
//#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
434
// asm("crc32h %w0,%w0,%w1\n":"+r"(crc):"r"(msg));
435
//#else
436
// crc=crc^msg;
437
// crc=rf_crc32_table[crc&0xff]^(crc>>8);
438
// crc=rf_crc32_table[crc&0xff]^(crc>>8);
439
//#endif
440
// return crc;
441
//}
442
//
443
//static inline uint32_t rf_crc32_8(uint32_t crc, uint32_t msg) {
444
//#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
445
// asm("crc32b %w0,%w0,%w1\n":"+r"(crc):"r"(msg));
446
//#else
447
// crc=crc^msg;
448
// crc=rf_crc32_table[crc&0xff]^(crc>>8);
449
//#endif
450
// return crc;
451
//}
452
453
// add to _msg_ its own crc32. use -mcpu=cortex-a53+crc to enable native CRC
454
// instruction on ARM.
455
static inline uint64_t rf_add64_crc32(uint64_t msg) {
456
uint64_t crc=0;
457
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
458
asm("crc32x %w0,%w0,%x1\n":"+r"(crc):"r"(msg));
459
#else
460
crc^=(uint32_t)msg;
461
crc=rf_crc32_table[crc&0xff]^(crc>>8);
462
crc=rf_crc32_table[crc&0xff]^(crc>>8);
463
crc=rf_crc32_table[crc&0xff]^(crc>>8);
464
crc=rf_crc32_table[crc&0xff]^(crc>>8);
465
466
crc^=msg>>32;
467
crc=rf_crc32_table[crc&0xff]^(crc>>8);
468
crc=rf_crc32_table[crc&0xff]^(crc>>8);
469
crc=rf_crc32_table[crc&0xff]^(crc>>8);
470
crc=rf_crc32_table[crc&0xff]^(crc>>8);
471
#endif
472
return msg+crc;
473
}
474
475
// mix the current state with the crc and return the new crc
476
static inline uint32_t rf_crc32x4(rf_u32 *state, uint32_t crc) {
477
crc=state[0]=rf_crc32_32(crc, state[0]);
478
crc=state[1]=rf_crc32_32(crc, state[1]);
479
crc=state[2]=rf_crc32_32(crc, state[2]);
480
crc=state[3]=rf_crc32_32(crc, state[3]);
481
return crc;
482
}
483
484
// Read a 64-bit little-endian value from the possibly unaligned address _p_.
static inline uint64_t rf_memr64(const uint8_t *p) {
#ifdef RF_UNALIGNED_LE64
	/* arch tolerates unaligned LE loads: read directly */
	return *(uint64_t *)p;
#else
	/* portable path: assemble from the highest byte downward */
	uint64_t val = 0;
	int i;

	for (i = 7; i >= 0; i--)
		val = (val << 8) | p[i];
	return val;
#endif
}
496
497
// return rainforest lower word entry for index
498
static inline uint64_t rf_wltable(uint8_t index) {
499
return rf_memr64(&rf_table[index]);
500
}
501
502
// return rainforest upper word entry for _index_
503
static inline uint64_t rf_whtable(uint8_t index) {
504
return rf_memr64(&rf_table[index*2]);
505
}
506
507
// Rotate the 64-bit value _v_ left by _bits_ bits (amount taken modulo 64).
// Fixed: the previous #if skipped the masking on aarch64 and meant to on
// x86-64 but tested the non-existent macro "x86_64"; worse, v >> (64 - bits)
// is undefined behavior in C when bits == 0 regardless of architecture.
// The masked idiom below is UB-free for every input and is recognized by
// gcc/clang, which emit a single rotate instruction.
static inline uint64_t rf_rotl64(uint64_t v, uint8_t bits) {
	bits &= 63;
	return (v << bits) | (v >> ((64 - bits) & 63));
}
514
515
// Rotate the 64-bit value _v_ right by _bits_ bits (amount taken modulo 64).
// Fixed for the same reasons as rf_rotl64: the "x86_64" macro typo and the
// undefined v << (64 - bits) when bits == 0. The masked idiom is UB-free
// and compiles to a single rotate instruction on gcc/clang.
static inline uint64_t rf_rotr64(uint64_t v, uint8_t bits) {
	bits &= 63;
	return (v >> bits) | (v << ((64 - bits) & 63));
}
522
523
// reverse all bytes in the word _v_
524
static inline uint64_t rf_bswap64(uint64_t v) {
525
#if defined(__x86_64__) && !defined(_MSC_VER)
526
asm("bswap %0":"+r"(v));
527
#elif defined(__aarch64__)
528
asm("rev %0,%0\n":"+r"(v));
529
#else
530
v=((v&0xff00ff00ff00ff00ULL)>>8)|((v&0x00ff00ff00ff00ffULL)<<8);
531
v=((v&0xffff0000ffff0000ULL)>>16)|((v&0x0000ffff0000ffffULL)<<16);
532
v=(v>>32)|(v<<32);
533
#endif
534
return v;
535
}
536
537
// Run RAMBOX_LOOPS dependent lookups of _old_ through the rambox, updating
// cells as it goes, and return the low 32 bits of the result. Each loop
// both depends on and conditionally rewrites a rambox cell, which is what
// forces the large L1-resident working set described in the file header.
// Modified cell indexes are recorded in ctx->hist (up to RAMBOX_HIST) so
// the scanner can restore only those cells between nonces.
static inline uint32_t rf_rambox(rf256_ctx_t *ctx, uint64_t old) {
	uint64_t *p, k;
	uint32_t idx;
	int loops;

	for (loops=0; loops<RAMBOX_LOOPS; loops++) {
		/* derive the next index from the value itself (crc-fed) */
		old=rf_add64_crc32(old);
		idx=old&(RAMBOX_SIZE-1);
		/* remember which cell we may dirty, while history space remains */
		if (ctx->changes < RAMBOX_HIST)
			ctx->hist[ctx->changes++] = idx;
		p=&ctx->rambox[idx];
		k = *p;
		old+=rf_rotr64(k, (uint8_t) (old/RAMBOX_SIZE));
		/* only store back when the new value is non-negative as int64,
		 * i.e. roughly half the time, keeping updates data-dependent */
		*p = (int64_t)old < 0 ? k : old;
	}
	return (uint32_t)old;
}
556
557
// Store the pair (_x_,_y_) into cell[ofs] and cell[ofs+1].
static inline void rf_w128(uint64_t *cell, uint64_t ofs, uint64_t x, uint64_t y) {
#if defined(__ARM_ARCH_8A) || defined(__AARCH64EL__)
	// 128 bit at once is faster when exactly two parallelizable instructions are
	// used between two calls to keep the pipe full.
	asm volatile("stp %0, %1, [%2,#%3]\n\t"
		: /* no output */
		: "r"(x), "r"(y), "r" (cell), "I" (ofs*8));
#else
	uint64_t *dst = cell + ofs;

	dst[0] = x;
	dst[1] = y;
#endif
}
570
571
// Initialize the 16 kB rambox with a deterministic pseudo-random pattern
// and reset the change history. The exact byte sequence is part of the
// hash definition, so this unrolled form must not be reordered.
static void rf_raminit(rf256_ctx_t *ctx) {
	uint64_t pat1 = 0x0123456789ABCDEFULL;
	uint64_t pat2 = 0xFEDCBA9876543210ULL;
	uint64_t pat3;
	uint32_t pos;
	uint64_t *rambox = ctx->rambox;

	// Note: no need to mask the higher bits on armv8 nor x86 :
	//
	// From ARMv8's ref manual :
	//     The register that is specified for a shift can be 32-bit or
	//     64-bit. The amount to be shifted can be specified either as
	//     an immediate, that is up to register size minus one, or by
	//     a register where the value is taken only from the bottom five
	//     (modulo-32) or six (modulo-64) bits.
	//
	// Here we rotate pat2 by pat1's bits and put it into pat1, and in
	// parallel we rotate pat1 by pat2's bits and put it into pat2. Thus
	// the two data blocks are exchanged in addition to being rotated.
	// What is stored each time is the previous and the rotated blocks,
	// which only requires one rotate and a register rename.

	ctx->changes = 0;
	// each iteration emits 16 qwords via eight 128-bit stores; the added
	// constants 0x111..0x888 de-correlate successive pairs
	for (pos = 0; pos < RAMBOX_SIZE; pos += 16) {
		pat3 = pat1;
		pat1 = rf_rotr64(pat2, (uint8_t)pat3) + 0x111;
		rf_w128(rambox + pos, 0, pat1, pat3);

		pat3 = pat2;
		pat2 = rf_rotr64(pat1, (uint8_t)pat3) + 0x222;
		rf_w128(rambox + pos, 2, pat2, pat3);

		pat3 = pat1;
		pat1 = rf_rotr64(pat2, (uint8_t)pat3) + 0x333;
		rf_w128(rambox + pos, 4, pat1, pat3);

		pat3 = pat2;
		pat2 = rf_rotr64(pat1, (uint8_t)pat3) + 0x444;
		rf_w128(rambox + pos, 6, pat2, pat3);

		pat3 = pat1;
		pat1 = rf_rotr64(pat2, (uint8_t)pat3) + 0x555;
		rf_w128(rambox + pos, 8, pat1, pat3);

		pat3 = pat2;
		pat2 = rf_rotr64(pat1, (uint8_t)pat3) + 0x666;
		rf_w128(rambox + pos, 10, pat2, pat3);

		pat3 = pat1;
		pat1 = rf_rotr64(pat2, (uint8_t)pat3) + 0x777;
		rf_w128(rambox + pos, 12, pat1, pat3);

		pat3 = pat2;
		pat2 = rf_rotr64(pat1, (uint8_t)pat3) + 0x888;
		rf_w128(rambox + pos, 14, pat2, pat3);
	}
}
629
630
// exec the div/mod box. _v0_ and _v1_ must be aligned.
631
static inline void rf256_divbox(rf_u64 *v0, rf_u64 *v1) {
632
uint64_t pl, ql, ph, qh;
633
634
//---- low word ---- ---- high word ----
635
pl=~*v0; ph=~*v1;
636
ql=rf_bswap64(*v0); qh=rf_bswap64(*v1);
637
638
if (!pl||!ql) { pl=ql=0; }
639
else if (pl>ql) { uint64_t p=pl; pl=p/ql; ql=p%ql; }
640
else { uint64_t p=pl; pl=ql/p; ql=ql%p; }
641
642
if (!ph||!qh) { ph=qh=0; }
643
else if (ph>qh) { uint64_t p=ph; ph=p/qh; qh=p%qh; }
644
else { uint64_t p=ph; ph=qh/p; qh=qh%p; }
645
646
pl+=qh; ph+=ql;
647
*v0-=pl; *v1-=ph;
648
}
649
650
// exec the rotation/add box. _v0_ and _v1_ must be aligned.
651
static inline void rf256_rotbox(rf_u64 *v0, rf_u64 *v1, uint8_t b0, uint8_t b1) {
652
uint64_t l, h;
653
654
//---- low word ---- ---- high word ----
655
l=*v0; h=*v1;
656
l=rf_rotr64(l,b0); h=rf_rotl64(h,b1);
657
l+=rf_wltable(b0); h+=rf_whtable(b1);
658
b0=(uint8_t)l; b1=(uint8_t)h;
659
l=rf_rotl64(l,b1); h=rf_rotr64(h,b0);
660
b0=(uint8_t)l; b1=(uint8_t)h;
661
l=rf_rotr64(l,b1); h=rf_rotl64(h,b0);
662
*v0=l; *v1=h;
663
}
664
665
// mix the current state with the current crc
666
static inline uint32_t rf256_scramble(rf256_ctx_t *ctx) {
667
return ctx->crc=rf_crc32x4(ctx->hash.d, ctx->crc);
668
}
669
670
// mix the state with the crc and the pending text, and update the crc
671
static inline void rf256_inject(rf256_ctx_t *ctx) {
672
// BS: never <4 bytes with 80 input bytes
673
//ctx->crc=
674
// (ctx->bytes&3)==0?rf_crc32_32(rf256_scramble(ctx), ctx->word):
675
// (ctx->bytes&3)==3?rf_crc32_24(rf256_scramble(ctx), ctx->word):
676
// (ctx->bytes&3)==2?rf_crc32_16(rf256_scramble(ctx), ctx->word):
677
// rf_crc32_8(rf256_scramble(ctx), ctx->word);
678
ctx->crc=rf_crc32_32(rf256_scramble(ctx), ctx->word);
679
ctx->word=0;
680
}
681
682
// rotate the hash by 32 bits. Not using streaming instructions (SSE/NEON) is
683
// faster because the compiler can follow moves an use register renames.
684
static inline void rf256_rot32x256(hash256_t *hash) {
685
#if defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_ARCH_7A__)
686
uint32_t t0, t1, t2;
687
688
t0=hash->d[0];
689
t1=hash->d[1];
690
t2=hash->d[2];
691
hash->d[1]=t0;
692
hash->d[2]=t1;
693
694
t0=hash->d[3];
695
t1=hash->d[4];
696
hash->d[3]=t2;
697
hash->d[4]=t0;
698
699
t2=hash->d[5];
700
t0=hash->d[6];
701
hash->d[5]=t1;
702
hash->d[6]=t2;
703
704
t1=hash->d[7];
705
hash->d[7]=t0;
706
hash->d[0]=t1;
707
#else
708
uint32_t tmp=hash->d[7];
709
710
memmove(&hash->d[1], &hash->d[0], 28);
711
hash->d[0]=tmp;
712
#endif
713
}
714
715
// encrypt the first 128 bits of the hash using the last 128 bits as the key
716
static inline void rf256_aesenc(rf256_ctx_t *ctx) {
717
aes2r_encrypt((uint8_t *)ctx->hash.b, (uint8_t *)ctx->hash.b+16);
718
}
719
720
// Absorb one 32-bit message word. Each round perturbates 128 bits of
// output, 96 of which overlap with the previous round and 32 of which are
// new; with 5 rounds or more every output bit depends on every input bit.
// The exact sequence of scramble/divbox/rambox/rotbox calls defines the
// hash and must not be reordered.
static inline void rf256_one_round(rf256_ctx_t *ctx) {
	uint64_t carry;

	rf256_rot32x256(&ctx->hash);

	/* seed the carry with the message position and current crc */
	carry=((uint64_t)ctx->len << 32) + ctx->crc;
	rf256_scramble(ctx);
	rf256_divbox(ctx->hash.q, ctx->hash.q+1);
	rf256_scramble(ctx);

	/* four rambox passes; each one feeds two bytes of the (truncated
	 * to 32-bit) carry into the rotbox as rotation amounts, walking
	 * from the outer bytes inward: (0,56), (8,48), (16,40), (24,32) */
	carry=rf_rambox(ctx, carry);
	rf256_rotbox(ctx->hash.q, ctx->hash.q+1, (uint8_t)carry, (uint8_t) (carry>>56));
	rf256_scramble(ctx);
	rf256_divbox(ctx->hash.q, ctx->hash.q+1);
	rf256_scramble(ctx);
	rf256_divbox(ctx->hash.q, ctx->hash.q+1);
	rf256_scramble(ctx);

	carry=rf_rambox(ctx, carry);
	rf256_rotbox(ctx->hash.q, ctx->hash.q+1, (uint8_t)(carry>>8), (uint8_t) (carry>>48));
	rf256_scramble(ctx);
	rf256_divbox(ctx->hash.q, ctx->hash.q+1);
	rf256_scramble(ctx);
	rf256_divbox(ctx->hash.q, ctx->hash.q+1);
	rf256_scramble(ctx);

	carry=rf_rambox(ctx, carry);
	rf256_rotbox(ctx->hash.q, ctx->hash.q+1, (uint8_t)(carry>>16), (uint8_t) (carry>>40));
	rf256_scramble(ctx);
	rf256_divbox(ctx->hash.q, ctx->hash.q+1);
	rf256_scramble(ctx);
	rf256_divbox(ctx->hash.q, ctx->hash.q+1);
	rf256_scramble(ctx);

	carry=rf_rambox(ctx,carry);
	rf256_rotbox(ctx->hash.q, ctx->hash.q+1, (uint8_t)(carry>>24), (uint8_t) (carry>>32));
	rf256_scramble(ctx);
	rf256_divbox(ctx->hash.q, ctx->hash.q+1);
	/* fold in the pending message word, then the AES mixing step */
	rf256_inject(ctx);
	rf256_aesenc(ctx);
	rf256_scramble(ctx);
}
766
767
// initialize the hash state
768
static void rf256_init(rf256_ctx_t *ctx) {
769
rf_raminit(ctx);
770
memcpy(ctx->hash.b, rf256_iv, sizeof(ctx->hash.b));
771
ctx->crc=RF256_INIT_CRC;
772
ctx->word=ctx->len=0;
773
}
774
775
// Absorb _len_ bytes from _msg_ into the context, running one round per
// complete 32-bit word. Bytes are accumulated little-endian into ctx->word
// until a word boundary is reached.
static void rf256_update(rf256_ctx_t *ctx, const void *msg, size_t len) {
	const uint8_t* ptr = (uint8_t*)msg;
	while (len > 0) {
#ifdef RF_UNALIGNED_LE32
		/* fast path on arches that allow unaligned LE loads: grab a
		 * whole word at once when we are word-aligned in the stream.
		 * NOTE(review): reads through a uint32_t* cast — relies on the
		 * target tolerating both the alignment and the aliasing. */
		if (!(ctx->len&3) && len>=4) {
			ctx->word=*(uint32_t*)ptr;
			ctx->len+=4;
			rf256_one_round(ctx);
			ptr+=4;
			len-=4;
			continue;
		}
#endif
		/* slow path: pack one byte into its LE slot of the pending word */
		ctx->word |= (uint32_t)*(ptr++) << (8 * (ctx->len++ & 3));
		len--;
		if (!(ctx->len&3))
			rf256_one_round(ctx);
	}
}
795
796
// finalize the hash and copy the result into _out_ if not null (256 bits)
797
static void rf256_final(void *out, rf256_ctx_t *ctx) {
798
// BS: never happens with 80 input bytes
799
//uint32_t pad;
800
801
//if (ctx->len&3)
802
// rf256_one_round(ctx);
803
804
// always work on at least 256 bits of input
805
//for (pad=0; pad+ctx->len < 32;pad+=4)
806
// rf256_one_round(ctx);
807
808
// always run 4 extra rounds to complete the last 128 bits
809
rf256_one_round(ctx);
810
rf256_one_round(ctx);
811
rf256_one_round(ctx);
812
rf256_one_round(ctx);
813
//if (out)
814
memcpy(out, ctx->hash.b, 32);
815
}
816
817
// One-shot convenience wrapper: hash _len_ bytes from _in_ into _out_
// (32-byte digest). The ~16 kB context lives on the stack for the call.
void rf256_hash(void *out, const void *in, size_t len)
{
	rf256_ctx_t ctx;
	rf256_init(&ctx);
	rf256_update(&ctx, in, len);
	rf256_final(out, &ctx);
}
825
826
// Scan nonces [pdata[19], max_nonce) for a share below the target.
// Returns 1 with pdata[19] set to the winning nonce, 0 when the range is
// exhausted or a restart is requested. The hash state for the constant
// 76-byte header prefix is computed once and re-used for every nonce;
// only the rambox cells dirtied by the previous nonce are restored.
int scanhash_rf256(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
{
	uint32_t _ALIGN(64) hash[8];
	uint32_t _ALIGN(64) endiandata[20];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;

	const uint32_t Htarg = ptarget[7];
	const uint32_t first_nonce = pdata[19];
	uint32_t nonce = first_nonce;
	volatile uint8_t *restart = &(work_restart[thr_id].restart);

	rf256_ctx_t ctx, ctx_common;

	if (opt_benchmark)
		ptarget[7] = 0x0cff;

	//printf("thd%d work=%p htarg=%08x ptarg7=%08x first_nonce=%08x max_nonce=%08x hashes_done=%Lu\n",
	//	thr_id, work, Htarg, ptarget[7], first_nonce, max_nonce, (unsigned long)*hashes_done);

	/* byte-swap the 19 constant header words once */
	for (int k=0; k < 19; k++)
		be32enc(&endiandata[k], pdata[k]);

	// pre-compute the hash state based on the constant part of the header
	rf256_init(&ctx_common);
	rf256_update(&ctx_common, endiandata, 76);
	ctx_common.changes = 0;

	memcpy(&ctx, &ctx_common, sizeof(ctx));

	do {
		be32enc(&endiandata[19], nonce);
#ifndef RF_DISABLE_CTX_MEMCPY
#ifndef RF_DISABLE_CTX_HISTORY
		/* restore the pre-computed context: when the change history
		 * overflowed, copy the whole 16 kB context; otherwise undo
		 * only the dirtied rambox cells and copy the small head of
		 * the struct (everything before hist/rambox) */
		if (ctx.changes == RAMBOX_HIST)
			memcpy(&ctx, &ctx_common, sizeof(ctx));
		else {
			for (int i=0; i<ctx.changes; i++) {
				int k = ctx.hist[i];
				ctx.rambox[k] = ctx_common.rambox[k];
			}
			memcpy(&ctx, &ctx_common, offsetof(rf256_ctx_t, hist));
		}
#else
		memcpy(&ctx, &ctx_common, sizeof(ctx));
#endif
		/* absorb the 4 nonce bytes on top of the shared prefix */
		rf256_update(&ctx, endiandata+19, 4);
		/* NOTE(review): early reject — skips the final rounds unless
		 * word 7 of the pre-final state is zero; this shortcut is part
		 * of this miner's scan strategy, verify against the coin's
		 * validation rules before reuse */
		if (ctx.hash.w[7])
			goto next;
		rf256_final(hash, &ctx);
#else
		rf256_hash(hash, endiandata, 80);
#endif

		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
			work_set_target_ratio(work, hash);
			pdata[19] = nonce;
			*hashes_done = pdata[19] - first_nonce;
			return 1;
		}
next:
		nonce++;
	} while (nonce < max_nonce && !(*restart));

	pdata[19] = nonce;
	*hashes_done = pdata[19] - first_nonce + 1;
	return 0;
}
894
895