Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/pkg
Path: blob/main/external/libucl/src/mum.h
2066 views
1
/* Copyright (c) 2016 Vladimir Makarov <[email protected]>

   Permission is hereby granted, free of charge, to any person
   obtaining a copy of this software and associated documentation
   files (the "Software"), to deal in the Software without
   restriction, including without limitation the rights to use, copy,
   modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/
23
24
/* This file implements MUM (MUltiply and Mix) hashing.  We randomize
   input data by 64x64-bit multiplication and mixing hi- and low-parts
   of the multiplication result by using an addition and then mix it
   into the current state.  We use prime numbers randomly generated
   with the equal probability of their bit values for the
   multiplication.  When all primes are used once, the state is
   randomized and the same prime numbers are used again for data
   randomization.

   The MUM hashing passes all SMHasher tests.  Pseudo Random Number
   Generator based on MUM also passes NIST Statistical Test Suite for
   Random and Pseudorandom Number Generators for Cryptographic
   Applications (version 2.2.1) with 1000 bitstreams each containing
   1M bits.  MUM hashing is also faster than Spooky64 and City64 on
   small strings (at least up to 512-bit) on Haswell and Power7.  The
   MUM bulk speed (speed on very long data) is bigger than Spooky and
   City on Power7.  On Haswell the bulk speed is bigger than Spooky
   one and close to City speed. */
42
43
#ifndef __MUM_HASH__
#define __MUM_HASH__

#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>

/* MSVC historically lacked <stdint.h>; map the fixed-width unsigned
   types onto its built-in sized integer types instead. */
#ifdef _MSC_VER
typedef unsigned __int16 uint16_t;
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif

/* Macro saying to use 128-bit integers implemented by GCC for some
   targets. */
#ifndef _MUM_USE_INT128
/* In GCC uint128_t is defined if HOST_BITS_PER_WIDE_INT >= 64.
   HOST_WIDE_INT is long if HOST_BITS_PER_LONG > HOST_BITS_PER_INT,
   otherwise int.  The UINT_MAX != ULONG_MAX test below is a proxy for
   a 64-bit `long`, i.e. a target where __uint128_t is available. */
#if defined(__GNUC__) && UINT_MAX != ULONG_MAX
#define _MUM_USE_INT128 1
#else
#define _MUM_USE_INT128 0
#endif
#endif

/* "Fresh" GCC means version >= 4.9, which supports the function
   attributes used below (e.g. __target__("arch=haswell")). */
#if defined(__GNUC__) && ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) || (__GNUC__ > 4))
#define _MUM_FRESH_GCC
#endif

/* Per-function attribute helpers.  Only enabled for genuine GCC (not
   clang pretending to be GCC via __GNUC__): clang does not honor
   __optimize__, so the macros expand to nothing elsewhere. */
#if defined(__GNUC__) && !defined(__llvm__) && defined(_MUM_FRESH_GCC)
#define _MUM_ATTRIBUTE_UNUSED __attribute__((unused))
#define _MUM_OPTIMIZE(opts) __attribute__((__optimize__ (opts)))
#define _MUM_TARGET(opts) __attribute__((__target__ (opts)))
#else
#define _MUM_ATTRIBUTE_UNUSED
#define _MUM_OPTIMIZE(opts)
#define _MUM_TARGET(opts)
#endif
85
86
87
/* Here are different primes randomly generated with the equal
   probability of their bit values.  They are used to randomize input
   values.  All of them are reassigned at once by
   mum_hash_randomize below. */
static uint64_t _mum_hash_step_prime = 0x2e0bb864e9ea7df5ULL;
static uint64_t _mum_key_step_prime = 0xcdb32970830fcaa1ULL;
static uint64_t _mum_block_start_prime = 0xc42b5e2e6480b23bULL;
static uint64_t _mum_unroll_prime = 0x7b51ec3d22f7096fULL;
static uint64_t _mum_tail_prime = 0xaf47d47c99b1461bULL;
static uint64_t _mum_finish_prime1 = 0xa9a7ae7ceff79f3fULL;
/* NOTE(review): _mum_finish_prime2 is the same constant as
   _mum_tail_prime above -- presumably intentional, but confirm it is
   not a copy-paste slip. */
static uint64_t _mum_finish_prime2 = 0xaf47d47c99b1461bULL;

/* Primes consumed by the unrolled inner loop of _mum_hash_aligned,
   one per 64-bit input word (at most 16 words per iteration). */
static uint64_t _mum_primes [] = {
  0X9ebdcae10d981691, 0X32b9b9b97a27ac7d, 0X29b5584d83d35bbd, 0X4b04e0e61401255f,
  0X25e8f7b1f1c9d027, 0X80d4c8c000f3e881, 0Xbd1255431904b9dd, 0X8a3bd4485eee6d81,
  0X3bc721b2aad05197, 0X71b1a19b907d6e33, 0X525e6c1084a8534b, 0X9e4c2cd340c1299f,
  0Xde3add92e94caa37, 0X7e14eadb1f65311d, 0X3f5aa40f89812853, 0X33b15a3b587d15c9,
};
104
105
/* Multiply 64-bit V and P and return sum of high and low parts of the
   result. */
static inline uint64_t
_mum (uint64_t v, uint64_t p) {
  uint64_t hi, lo;
#if _MUM_USE_INT128
#if defined(__aarch64__)
  /* AARCH64 needs 2 insns to calculate 128-bit result of the
     multiplication.  If we use a generic code we actually call a
     function doing 128x128->128 bit multiplication.  The function is
     very slow. */
  /* Fix: the original read `lo = v * p, hi;` -- the trailing `, hi`
     was a stray comma expression evaluating the still-uninitialized
     HI for no effect.  HI is produced by the umulh below. */
  lo = v * p;
  asm ("umulh %0, %1, %2" : "=r" (hi) : "r" (v), "r" (p));
#else
  __uint128_t r = (__uint128_t) v * (__uint128_t) p;
  hi = (uint64_t) (r >> 64);
  lo = (uint64_t) r;
#endif
#else
  /* Implementation of 64x64->128-bit multiplication by four 32x32->64
     bit multiplication. */
  uint64_t hv = v >> 32, hp = p >> 32;
  uint64_t lv = (uint32_t) v, lp = (uint32_t) p;
  uint64_t rh = hv * hp;
  uint64_t rm_0 = hv * lp;
  uint64_t rm_1 = hp * lv;
  uint64_t rl = lv * lp;
  uint64_t t, carry = 0;

  /* We could ignore a carry bit here if we did not care about the
     same hash for 32-bit and 64-bit targets. */
  t = rl + (rm_0 << 32);
#ifdef MUM_TARGET_INDEPENDENT_HASH
  carry = t < rl;
#endif
  lo = t + (rm_1 << 32);
#ifdef MUM_TARGET_INDEPENDENT_HASH
  carry += lo < t;
#endif
  hi = rh + (rm_0 >> 32) + (rm_1 >> 32) + carry;
#endif
  /* We could use XOR here too but, for some reasons, on Haswell and
     Power7 using an addition improves hashing performance by 10% for
     small strings. */
  return hi + lo;
}
151
152
/* Byte-swap helpers used by _mum_le/_mum_le32 on big-endian targets.
   Fixes two defects in the original:
   - the MSVC and Apple branches defined the macros under the wrong
     names (_mum_bswap_32/_mum_bswap_64 with an extra underscore),
     while every user calls _mum_bswap32/_mum_bswap64;
   - the MSVC intrinsics are _byteswap_ulong/_byteswap_uint64; the
     originally referenced _byteswap_uint32_t/_byteswap_uint64_t do
     not exist. */
#if defined(_MSC_VER)
#define _mum_bswap32(x) _byteswap_ulong (x)
#define _mum_bswap64(x) _byteswap_uint64 (x)
#elif defined(__APPLE__)
#include <libkern/OSByteOrder.h>
#define _mum_bswap32(x) OSSwapInt32 (x)
#define _mum_bswap64(x) OSSwapInt64 (x)
#elif defined(__GNUC__)
#define _mum_bswap32(x) __builtin_bswap32 (x)
#define _mum_bswap64(x) __builtin_bswap64 (x)
#else
#include <byteswap.h>
#define _mum_bswap32(x) bswap32 (x)
#define _mum_bswap64(x) bswap64 (x)
#endif
167
168
/* Return 64-bit V interpreted as little-endian, converted to host
   order.  When MUM_TARGET_INDEPENDENT_HASH is not defined the value
   is used as-is on any target, so hashes may then differ between
   little- and big-endian machines. */
static inline uint64_t
_mum_le (uint64_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return _mum_bswap64 (v);
#else
#error "Unknown endianness"
#endif
}
178
179
/* 32-bit counterpart of _mum_le: return V interpreted as
   little-endian, converted to host order (identity unless
   MUM_TARGET_INDEPENDENT_HASH is defined on a big-endian target). */
static inline uint32_t
_mum_le32 (uint32_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return _mum_bswap32 (v);
#else
#error "Unknown endianness"
#endif
}
189
190
/* Macro defining how many times the most nested loop in
   _mum_hash_aligned will be unrolled by the compiler (although it can
   make an own decision:).  Use only a constant here to help a
   compiler to unroll a major loop.

   The macro value affects the result hash for strings > 128 bit.  The
   unroll factor greatly affects the hashing speed.  We prefer the
   speed. */
#ifndef _MUM_UNROLL_FACTOR_POWER
#if defined(__PPC64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 3
#elif defined(__aarch64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 4
#else
#define _MUM_UNROLL_FACTOR_POWER 2
#endif
#endif

/* The factor also bounds how many entries of _mum_primes the inner
   loop consumes: a power above 4 (factor > 16) would overrun the
   16-entry table. */
#if _MUM_UNROLL_FACTOR_POWER < 1
#error "too small unroll factor"
#elif _MUM_UNROLL_FACTOR_POWER > 4
#error "We have not enough primes for such unroll factor"
#endif

#define _MUM_UNROLL_FACTOR (1 << _MUM_UNROLL_FACTOR_POWER)
216
/* Hash LEN bytes at KEY into state START and return the new state.
   KEY is read through uint64_t loads, so it must be suitably aligned
   unless the target tolerates unaligned access (the caller,
   _mum_hash_default, guarantees this).  The main loop consumes
   _MUM_UNROLL_FACTOR 64-bit words per iteration; the switch folds in
   the 1..7 trailing bytes. */
static inline uint64_t _MUM_OPTIMIZE("unroll-loops")
_mum_hash_aligned (uint64_t start, const void *key, size_t len) {
  uint64_t result = start;
  const unsigned char *str = (const unsigned char *) key;
  uint64_t u64;
  int i;
  size_t n;

  result = _mum (result, _mum_block_start_prime);
  while (len > _MUM_UNROLL_FACTOR * sizeof (uint64_t)) {
    /* This loop could be vectorized when we have vector insns for
       64x64->128-bit multiplication.  AVX2 currently only have a
       vector insn for 4 32x32->64-bit multiplication. */
    for (i = 0; i < _MUM_UNROLL_FACTOR; i++)
      /* NOTE(review): the (uint64_t *) cast of a byte pointer
         technically violates strict aliasing -- presumably tolerated
         by the compilers this targets; confirm before enabling
         aggressive alias analysis. */
      result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
    len -= _MUM_UNROLL_FACTOR * sizeof (uint64_t);
    str += _MUM_UNROLL_FACTOR * sizeof (uint64_t);
    /* We will use the same prime numbers on the next iterations --
       randomize the state. */
    result = _mum (result, _mum_unroll_prime);
  }
  /* Fold in the remaining whole 64-bit words (at most
     _MUM_UNROLL_FACTOR of them, so the primes table suffices). */
  n = len / sizeof (uint64_t);
  for (i = 0; i < (int)n; i++)
    result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
  len -= n * sizeof (uint64_t); str += n * sizeof (uint64_t);
  /* Mix in the 0..7 tail bytes; every case returns directly, so there
     is no fallthrough. */
  switch (len) {
  case 7:
    u64 = _mum_le32 (*(uint32_t *) str);
    u64 |= (uint64_t) str[4] << 32;
    u64 |= (uint64_t) str[5] << 40;
    u64 |= (uint64_t) str[6] << 48;
    return result ^ _mum (u64, _mum_tail_prime);
  case 6:
    u64 = _mum_le32 (*(uint32_t *) str);
    u64 |= (uint64_t) str[4] << 32;
    u64 |= (uint64_t) str[5] << 40;
    return result ^ _mum (u64, _mum_tail_prime);
  case 5:
    u64 = _mum_le32 (*(uint32_t *) str);
    u64 |= (uint64_t) str[4] << 32;
    return result ^ _mum (u64, _mum_tail_prime);
  case 4:
    u64 = _mum_le32 (*(uint32_t *) str);
    return result ^ _mum (u64, _mum_tail_prime);
  case 3:
    u64 = str[0];
    u64 |= (uint64_t) str[1] << 8;
    u64 |= (uint64_t) str[2] << 16;
    return result ^ _mum (u64, _mum_tail_prime);
  case 2:
    u64 = str[0];
    u64 |= (uint64_t) str[1] << 8;
    return result ^ _mum (u64, _mum_tail_prime);
  case 1:
    u64 = str[0];
    return result ^ _mum (u64, _mum_tail_prime);
  }
  return result;
}
275
276
/* Final randomization of H. */
277
static inline uint64_t
278
_mum_final (uint64_t h) {
279
h ^= _mum (h, _mum_finish_prime1);
280
h ^= _mum (h, _mum_finish_prime2);
281
return h;
282
}
283
284
#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)
285
286
/* We want to use AVX2 insn MULX instead of generic x86-64 MULQ where
287
it is possible. Although on modern Intel processors MULQ takes
288
3-cycles vs. 4 for MULX, MULX permits more freedom in insn
289
scheduling as it uses less fixed registers. */
290
static inline uint64_t _MUM_TARGET("arch=haswell")
291
_mum_hash_avx2 (const void * key, size_t len, uint64_t seed) {
292
return _mum_final (_mum_hash_aligned (seed + len, key, len));
293
}
294
#endif
295
296
/* Whether the target can load a 64-bit word from an address that is
   not 8-byte aligned.  NOTE(review): the list mixes modern and legacy
   architectures; the `cris` entry (no leading/trailing underscores)
   looks inconsistent with the others -- confirm it is intentional. */
#ifndef _MUM_UNALIGNED_ACCESS
#if defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) \
    || defined(__s390__) || defined(__m32c__) || defined(cris) \
    || defined(__CR16__) || defined(__vax__) || defined(__m68k__) \
    || defined(__aarch64__)
#define _MUM_UNALIGNED_ACCESS 1
#else
#define _MUM_UNALIGNED_ACCESS 0
#endif
#endif

/* When we need an aligned access to data being hashed we move part of
   the unaligned data to an aligned block of given size and then
   process it, repeating processing the data by the block. */
#ifndef _MUM_BLOCK_LEN
#define _MUM_BLOCK_LEN 1024
#endif

/* The copy buffer must hold at least one 64-bit word. */
#if _MUM_BLOCK_LEN < 8
#error "too small block length"
#endif
317
318
static inline uint64_t
319
#if defined(__x86_64__)
320
_MUM_TARGET("inline-all-stringops")
321
#endif
322
_mum_hash_default (const void *key, size_t len, uint64_t seed) {
323
uint64_t result;
324
const unsigned char *str = (const unsigned char *) key;
325
size_t block_len;
326
uint64_t buf[_MUM_BLOCK_LEN / sizeof (uint64_t)];
327
328
result = seed + len;
329
if (_MUM_UNALIGNED_ACCESS || ((size_t) str & 0x7) == 0)
330
result = _mum_hash_aligned (result, key, len);
331
else {
332
while (len != 0) {
333
block_len = len < _MUM_BLOCK_LEN ? len : _MUM_BLOCK_LEN;
334
memmove (buf, str, block_len);
335
result = _mum_hash_aligned (result, buf, block_len);
336
len -= block_len;
337
str += block_len;
338
}
339
}
340
return _mum_final (result);
341
}
342
343
/* Build a pseudo-random 64-bit multiplication factor from eight
   successive rand() bytes (most significant byte drawn first).
   Depends on the current rand() state seeded by srand(). */
static inline uint64_t
_mum_next_factor (void) {
  uint64_t factor = 0;
  int byte_ix;

  for (byte_ix = 0; byte_ix < 8; byte_ix++)
    factor = (factor << 8) | (rand () % 256);
  return factor;
}
352
353
/* ++++++++++++++++++++++++++ Interface functions: +++++++++++++++++++ */

/* Set random multiplicators depending on SEED.  The factors are drawn
   from rand() after srand(SEED), so the order of the assignments
   below determines which factor each prime receives -- reordering
   them would silently change all subsequent hashes.
   NOTE(review): srand takes unsigned int, so the 64-bit SEED is
   implicitly truncated; seeds differing only in the high bits
   presumably produce identical factors -- confirm this is accepted. */
static inline void
mum_hash_randomize (uint64_t seed) {
  int i;

  srand (seed);
  _mum_hash_step_prime = _mum_next_factor ();
  _mum_key_step_prime = _mum_next_factor ();
  _mum_finish_prime1 = _mum_next_factor ();
  _mum_finish_prime2 = _mum_next_factor ();
  _mum_block_start_prime = _mum_next_factor ();
  _mum_unroll_prime = _mum_next_factor ();
  _mum_tail_prime = _mum_next_factor ();
  for (i = 0; i < (int)(sizeof (_mum_primes) / sizeof (uint64_t)); i++)
    _mum_primes[i] = _mum_next_factor ();
}
371
372
/* Start hashing data with SEED.  Return the state (the seed itself
   serves as the initial state). */
static inline uint64_t
mum_hash_init (uint64_t seed) {
  uint64_t state = seed;

  return state;
}
377
378
/* Process data KEY with the state H and return the updated state. */
379
static inline uint64_t
380
mum_hash_step (uint64_t h, uint64_t key)
381
{
382
return _mum (h, _mum_hash_step_prime) ^ _mum (key, _mum_key_step_prime);
383
}
384
385
/* Return the result of hashing using the current state H, after the
   final two-round randomization. */
static inline uint64_t
mum_hash_finish (uint64_t h) {
  uint64_t final_value = _mum_final (h);

  return final_value;
}
390
391
/* Fast hashing of a single 64-bit KEY with SEED: the init/step/finish
   pipeline applied once.  The hash is always the same for the same
   key on any target. */
static inline size_t
mum_hash64 (uint64_t key, uint64_t seed) {
  uint64_t state = mum_hash_init (seed);

  state = mum_hash_step (state, key);
  return mum_hash_finish (state);
}
397
398
/* Hash data KEY of length LEN and SEED.  The hash depends on the
   target endianness and the unroll factor.  On x86-64 with a fresh
   GCC the AVX2/Haswell-compiled variant is dispatched at run time. */
static inline uint64_t
mum_hash (const void *key, size_t len, uint64_t seed) {
#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)
  /* Cached CPU probe: 0 = not probed yet, 1 = AVX2 present,
     -1 = AVX2 absent.  NOTE(review): lazy initialization of this
     static is unsynchronized -- presumably benign (worst case a
     redundant probe), but confirm for concurrent first use. */
  static int avx2_support = 0;

  if (avx2_support > 0)
    return _mum_hash_avx2 (key, len, seed);
  else if (! avx2_support) {
    __builtin_cpu_init ();
    avx2_support = __builtin_cpu_supports ("avx2") ? 1 : -1;
    if (avx2_support > 0)
      return _mum_hash_avx2 (key, len, seed);
  }
#endif
  return _mum_hash_default (key, len, seed);
}
416
417
#endif
418
419