Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
epidemian
GitHub Repository: epidemian/gravity
Path: blob/master/src/shared/gravity_hash.c
1214 views
1
//
2
// gravity_hash.c
3
// gravity
4
//
5
// Created by Marco Bambini on 23/04/15.
6
// Copyright (c) 2015 CreoLabs. All rights reserved.
7
//
8
9
#include "gravity_hash.h"
10
11
#if GRAVITYHASH_ENABLE_STATS
12
#define INC_COLLISION(tbl) ++tbl->ncollision
13
#define INC_RESIZE(tbl) ++tbl->nresize
14
#else
15
#define INC_COLLISION(tbl)
16
#define INC_RESIZE(tbl)
17
#endif
18
19
typedef struct hash_node_s {
20
uint32_t hash;
21
gravity_value_t key;
22
gravity_value_t value;
23
struct hash_node_s *next;
24
} hash_node_t;
25
26
struct gravity_hash_t {
27
// internals
28
uint32_t size;
29
uint32_t count;
30
hash_node_t **nodes;
31
gravity_hash_compute_fn compute_fn;
32
gravity_hash_isequal_fn isequal_fn;
33
gravity_hash_iterate_fn free_fn;
34
void *data;
35
36
// stats
37
#if GRAVITYHASH_ENABLE_STATS
38
uint32_t ncollision;
39
uint32_t nresize;
40
#endif
41
};
42
43
// http://programmers.stackexchange.com/questions/49550/which-hashing-algorithm-is-best-for-uniqueness-and-speed
44
45
/*
46
Hash algorithm used in Gravity Hash Table is DJB2 which does a pretty good job with string keys and it is fast.
47
Original algorithm is: http://www.cse.yorku.ca/~oz/hash.html
48
49
DJBX33A (Daniel J. Bernstein, Times 33 with Addition)
50
51
This is Daniel J. Bernstein's popular `times 33' hash function as
52
posted by him years ago on comp.lang.c. It basically uses a function
53
like ``hash(i) = hash(i-1) 33 + str[i]''. This is one of the best
54
known hash functions for strings. Because it is both computed very
55
fast and distributes very well.
56
57
Why 33? (<< 5 in the code)
58
The magic of number 33, i.e. why it works better than many other
59
constants, prime or not, has never been adequately explained by
60
anyone. So I try an explanation: if one experimentally tests all
61
multipliers between 1 and 256 (as RSE did now) one detects that even
62
numbers are not useable at all. The remaining 128 odd numbers
63
(except for the number 1) work more or less all equally well. They
64
all distribute in an acceptable way and this way fill a hash table
65
with an average percent of approx. 86%.
66
67
If one compares the Chi^2 values of the variants, the number 33 not
68
even has the best value. But the number 33 and a few other equally
69
good numbers like 17, 31, 63, 127 and 129 have nevertheless a great
70
advantage to the remaining numbers in the large set of possible
71
multipliers: their multiply operation can be replaced by a faster
72
operation based on just one shift plus either a single addition
73
or subtraction operation. And because a hash function has to both
74
distribute good _and_ has to be very fast to compute, those few
75
numbers should be preferred and seems to be the reason why Daniel J.
76
Bernstein also preferred it.
77
78
Why 5381?
79
1. odd number
80
2. prime number
81
3. deficient number (https://en.wikipedia.org/wiki/Deficient_number)
82
4. 001/010/100/000/101 b
83
84
Practically any good multiplier works. I think you're worrying about
85
the fact that 31c + d doesn't cover any reasonable range of hash values
86
if c and d are between 0 and 255. That's why, when I discovered the 33 hash
87
function and started using it in my compressors, I started with a hash value
88
of 5381. I think you'll find that this does just as well as a 261 multiplier.
89
90
Note that the starting value of the hash (5381) makes no difference for strings
91
of equal lengths, but will play a role in generating different hash values for
92
strings of different lengths.
93
*/
94
95
#define HASH_SEED_VALUE 5381
96
#define ROT32(x, y) ((x << y) | (x >> (32 - y)))
97
#define COMPUTE_HASH(tbl,key,hash) register uint32_t hash = murmur3_32(key, len, HASH_SEED_VALUE); hash = hash % tbl->size
98
#define COMPUTE_HASH_NOMODULO(key,hash) register uint32_t hash = murmur3_32(key, len, HASH_SEED_VALUE)
99
#define RECOMPUTE_HASH(tbl,key,hash) hash = murmur3_32(key, len, HASH_SEED_VALUE); hash = hash % tbl->size
100
101
static inline uint32_t murmur3_32 (const char *key, uint32_t len, uint32_t seed) {
102
static const uint32_t c1 = 0xcc9e2d51;
103
static const uint32_t c2 = 0x1b873593;
104
static const uint32_t r1 = 15;
105
static const uint32_t r2 = 13;
106
static const uint32_t m = 5;
107
static const uint32_t n = 0xe6546b64;
108
109
uint32_t hash = seed;
110
111
const int nblocks = len / 4;
112
const uint32_t *blocks = (const uint32_t *) key;
113
for (int i = 0; i < nblocks; i++) {
114
uint32_t k = blocks[i];
115
k *= c1;
116
k = ROT32(k, r1);
117
k *= c2;
118
119
hash ^= k;
120
hash = ROT32(hash, r2) * m + n;
121
}
122
123
const uint8_t *tail = (const uint8_t *) (key + nblocks * 4);
124
uint32_t k1 = 0;
125
126
switch (len & 3) {
127
case 3:
128
k1 ^= tail[2] << 16;
129
case 2:
130
k1 ^= tail[1] << 8;
131
case 1:
132
k1 ^= tail[0];
133
134
k1 *= c1;
135
k1 = ROT32(k1, r1);
136
k1 *= c2;
137
hash ^= k1;
138
}
139
140
hash ^= len;
141
hash ^= (hash >> 16);
142
hash *= 0x85ebca6b;
143
hash ^= (hash >> 13);
144
hash *= 0xc2b2ae35;
145
hash ^= (hash >> 16);
146
147
return hash;
148
}
149
150
static void table_dump (gravity_hash_t *hashtable, gravity_value_t key, gravity_value_t value, void *data) {
151
#pragma unused (hashtable, data)
152
const char *k = ((gravity_string_t *)key.p)->s;
153
printf("%-20s=>\t",k);
154
gravity_value_dump(value, NULL, 0);
155
}
156
157
// MARK: -
158
159
gravity_hash_t *gravity_hash_create (uint32_t size, gravity_hash_compute_fn compute, gravity_hash_isequal_fn isequal, gravity_hash_iterate_fn free_fn, void *data) {
160
if ((!compute) || (!isequal)) return NULL;
161
if (size == 0) size = GRAVITYHASH_DEFAULT_SIZE;
162
163
gravity_hash_t *hashtable = (gravity_hash_t *)mem_alloc(sizeof(gravity_hash_t));
164
if (!hashtable) return NULL;
165
if (!(hashtable->nodes = mem_calloc(size, sizeof(hash_node_t*)))) {mem_free(hashtable); return NULL;}
166
167
hashtable->compute_fn = compute;
168
hashtable->isequal_fn = isequal;
169
hashtable->free_fn = free_fn;
170
hashtable->data = data;
171
hashtable->size = size;
172
return hashtable;
173
}
174
175
void gravity_hash_free (gravity_hash_t *hashtable) {
176
if (!hashtable) return;
177
gravity_hash_iterate_fn free_fn = hashtable->free_fn;
178
179
for (uint32_t n = 0; n < hashtable->size; ++n) {
180
hash_node_t *node = hashtable->nodes[n];
181
while (node) {
182
if (free_fn) free_fn(hashtable, node->key, node->value, hashtable->data);
183
hash_node_t *old_node = node;
184
node = node->next;
185
mem_free(old_node);
186
}
187
}
188
mem_free(hashtable->nodes);
189
mem_free(hashtable);
190
}
191
192
uint32_t gravity_hash_memsize (gravity_hash_t *hashtable) {
193
uint32_t size = sizeof(gravity_hash_t);
194
size += hashtable->size * sizeof(hash_node_t);
195
return size;
196
}
197
198
bool gravity_hash_isempty (gravity_hash_t *hashtable) {
199
return (hashtable->count == 0);
200
}
201
202
static inline int gravity_hash_resize (gravity_hash_t *hashtable) {
203
uint32_t size = (hashtable->size * 2) + 1;
204
gravity_hash_t newtbl = {
205
.size = size,
206
.count = 0,
207
.isequal_fn = hashtable->isequal_fn,
208
.compute_fn = hashtable->compute_fn
209
};
210
if (!(newtbl.nodes = mem_calloc(size, sizeof(hash_node_t*)))) return -1;
211
212
hash_node_t *node, *next;
213
for (uint32_t n = 0; n < hashtable->size; ++n) {
214
for (node = hashtable->nodes[n]; node; node = next) {
215
next = node->next;
216
gravity_hash_insert(&newtbl, node->key, node->value);
217
// temporary disable free callback registered in hashtable
218
// because both key and values are reused in the new table
219
gravity_hash_iterate_fn free_fn = hashtable->free_fn;
220
hashtable->free_fn = NULL;
221
gravity_hash_remove(hashtable, node->key);
222
hashtable->free_fn = free_fn;
223
}
224
}
225
226
mem_free(hashtable->nodes);
227
hashtable->size = newtbl.size;
228
hashtable->count = newtbl.count;
229
hashtable->nodes = newtbl.nodes;
230
INC_RESIZE(hashtable);
231
232
return 0;
233
}
234
235
bool gravity_hash_remove (gravity_hash_t *hashtable, gravity_value_t key) {
236
register uint32_t hash = hashtable->compute_fn(key);
237
register uint32_t position = hash % hashtable->size;
238
239
gravity_hash_iterate_fn free_fn = hashtable->free_fn;
240
hash_node_t *node = hashtable->nodes[position];
241
hash_node_t *prevnode = NULL;
242
while (node) {
243
if ((node->hash == hash) && (hashtable->isequal_fn(key, node->key))) {
244
if (free_fn) free_fn(hashtable, node->key, node->value, hashtable->data);
245
if (prevnode) prevnode->next = node->next;
246
else hashtable->nodes[position] = node->next;
247
mem_free(node);
248
hashtable->count--;
249
return true;
250
}
251
252
prevnode = node;
253
node = node->next;
254
}
255
256
return false;
257
}
258
259
bool gravity_hash_insert (gravity_hash_t *hashtable, gravity_value_t key, gravity_value_t value) {
260
register uint32_t hash = hashtable->compute_fn(key);
261
register uint32_t position = hash % hashtable->size;
262
263
hash_node_t *node = hashtable->nodes[position];
264
if (node) INC_COLLISION(hashtable);
265
266
// check if the key is already in the table
267
while (node) {
268
if ((node->hash == hash) && (hashtable->isequal_fn(key, node->key))) {
269
node->value = value;
270
return false;
271
}
272
node = node->next;
273
}
274
275
// resize table if the threshold is exceeded
276
// default threshold is: <table size> * <load factor GRAVITYHASH_THRESHOLD>
277
if (hashtable->count >= hashtable->size * GRAVITYHASH_THRESHOLD) {
278
if (gravity_hash_resize(hashtable) == -1) return -1;
279
// recompute position here because hashtable->size has changed!
280
position = hash % hashtable->size;
281
}
282
283
// allocate new entry and set new data
284
if (!(node = mem_alloc(sizeof(hash_node_t)))) return -1;
285
node->key = key;
286
node->hash = hash;
287
node->value = value;
288
node->next = hashtable->nodes[position];
289
hashtable->nodes[position] = node;
290
++hashtable->count;
291
292
return true;
293
}
294
295
gravity_value_t *gravity_hash_lookup (gravity_hash_t *hashtable, gravity_value_t key) {
296
register uint32_t hash = hashtable->compute_fn(key);
297
register uint32_t position = hash % hashtable->size;
298
299
hash_node_t *node = hashtable->nodes[position];
300
while (node) {
301
if ((node->hash == hash) && (hashtable->isequal_fn(key, node->key))) return &node->value;
302
node = node->next;
303
}
304
305
return NULL;
306
}
307
308
uint32_t gravity_hash_count (gravity_hash_t *hashtable) {
309
return hashtable->count;
310
}
311
312
uint32_t gravity_hash_compute_buffer (const char *key, uint32_t len) {
313
return murmur3_32(key, len, HASH_SEED_VALUE);
314
}
315
316
uint32_t gravity_hash_compute_int (gravity_int_t n) {
317
char buffer[24];
318
snprintf(buffer, sizeof(buffer), "%lld", n);
319
return murmur3_32(buffer, (uint32_t)strlen(buffer), HASH_SEED_VALUE);
320
}
321
322
uint32_t gravity_hash_compute_float (gravity_float_t f) {
323
char buffer[24];
324
snprintf(buffer, sizeof(buffer), "%f", f);
325
return murmur3_32(buffer, (uint32_t)strlen(buffer), HASH_SEED_VALUE);
326
}
327
328
void gravity_hash_stat (gravity_hash_t *hashtable) {
329
#if GRAVITYHASH_ENABLE_STATS
330
printf("==============\n");
331
printf("Collision: %d\n", hashtable->ncollision);
332
printf("Resize: %d\n", hashtable->nresize);
333
printf("Size: %d\n", hashtable->size);
334
printf("Count: %d\n", hashtable->count);
335
printf("==============\n");
336
#endif
337
}
338
339
void gravity_hash_transform (gravity_hash_t *hashtable, gravity_hash_transform_fn transform, void *data) {
340
if ((!hashtable) || (!transform)) return;
341
342
for (uint32_t i=0; i<hashtable->size; ++i) {
343
hash_node_t *node = hashtable->nodes[i];
344
if (!node) continue;
345
346
while (node) {
347
transform(hashtable, node->key, &node->value, data);
348
node = node->next;
349
}
350
}
351
}
352
353
void gravity_hash_iterate (gravity_hash_t *hashtable, gravity_hash_iterate_fn iterate, void *data) {
354
if ((!hashtable) || (!iterate)) return;
355
356
for (uint32_t i=0; i<hashtable->size; ++i) {
357
hash_node_t *node = hashtable->nodes[i];
358
if (!node) continue;
359
360
while (node) {
361
iterate(hashtable, node->key, node->value, data);
362
node = node->next;
363
}
364
}
365
}
366
367
void gravity_hash_iterate2 (gravity_hash_t *hashtable, gravity_hash_iterate2_fn iterate, void *data1, void *data2) {
368
if ((!hashtable) || (!iterate)) return;
369
370
for (uint32_t i=0; i<hashtable->size; ++i) {
371
hash_node_t *node = hashtable->nodes[i];
372
if (!node) continue;
373
374
while (node) {
375
iterate(hashtable, node->key, node->value, data1, data2);
376
node = node->next;
377
}
378
}
379
}
380
381
void gravity_hash_dump (gravity_hash_t *hashtable) {
382
gravity_hash_iterate(hashtable, table_dump, NULL);
383
}
384
385
void gravity_hash_append (gravity_hash_t *hashtable1, gravity_hash_t *hashtable2) {
386
for (uint32_t i=0; i<hashtable2->size; ++i) {
387
hash_node_t *node = hashtable2->nodes[i];
388
if (!node) continue;
389
390
while (node) {
391
gravity_hash_insert(hashtable1, node->key, node->value);
392
node = node->next;
393
}
394
}
395
}
396
397
void gravity_hash_resetfree (gravity_hash_t *hashtable) {
398
hashtable->free_fn = NULL;
399
}
400
401