CoCalc -- jenkins

GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/libkern/jenkins_hash.c
¹⁰⁴⁸⁷⁴ views
1
/*
2
 * Taken from http://burtleburtle.net/bob/c/lookup3.c
3
 */
4

5
#include <sys/hash.h>
6
#include <machine/endian.h>
7

8
/*
9
-------------------------------------------------------------------------------
10
lookup3.c, by Bob Jenkins, May 2006, Public Domain.
11

12
These are functions for producing 32-bit hashes for hash table lookup.
13
hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() 
14
are externally useful functions.  Routines to test the hash are included 
15
if SELF_TEST is defined.  You can use this free for any purpose.  It's in
16
the public domain.  It has no warranty.
17

18
You probably want to use hashlittle().  hashlittle() and hashbig()
19
hash byte arrays.  hashlittle() is faster than hashbig() on
20
little-endian machines.  Intel and AMD are little-endian machines.
21
On second thought, you probably want hashlittle2(), which is identical to
22
hashlittle() except it returns two 32-bit hashes for the price of one.  
23
You could implement hashbig2() if you wanted but I haven't bothered here.
24

25
If you want to find a hash of, say, exactly 7 integers, do
26
  a = i1;  b = i2;  c = i3;
27
  mix(a,b,c);
28
  a += i4; b += i5; c += i6;
29
  mix(a,b,c);
30
  a += i7;
31
  final(a,b,c);
32
then use c as the hash value.  If you have a variable length array of
33
4-byte integers to hash, use hashword().  If you have a byte array (like
34
a character string), use hashlittle().  If you have several byte arrays, or
35
a mix of things, see the comments above hashlittle().  
36

37
Why is this so big?  I read 12 bytes at a time into 3 4-byte integers, 
38
then mix those integers.  This is fast (you can do a lot more thorough
39
mixing with 12*3 instructions on 3 integers than you can with 3 instructions
40
on 1 byte), but shoehorning those bytes into integers efficiently is messy.
41
-------------------------------------------------------------------------------
42
*/
43

44
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
45

46
/*
47
-------------------------------------------------------------------------------
48
mix -- mix 3 32-bit values reversibly.
49

50
This is reversible, so any information in (a,b,c) before mix() is
51
still in (a,b,c) after mix().
52

53
If four pairs of (a,b,c) inputs are run through mix(), or through
54
mix() in reverse, there are at least 32 bits of the output that
55
are sometimes the same for one pair and different for another pair.
56
This was tested for:
57
* pairs that differed by one bit, by two bits, in any combination
58
  of top bits of (a,b,c), or in any combination of bottom bits of
59
  (a,b,c).
60
* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
61
  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
62
  is commonly produced by subtraction) look like a single 1-bit
63
  difference.
64
* the base values were pseudorandom, all zero but one bit set, or 
65
  all zero plus a counter that starts at zero.
66

67
Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
68
satisfy this are
69
    4  6  8 16 19  4
70
    9 15  3 18 27 15
71
   14  9  3  7 17  3
72
Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
73
for "differ" defined as + with a one-bit base and a two-bit delta.  I
74
used http://burtleburtle.net/bob/hash/avalanche.html to choose 
75
the operations, constants, and arrangements of the variables.
76

77
This does not achieve avalanche.  There are input bits of (a,b,c)
78
that fail to affect some output bits of (a,b,c), especially of a.  The
79
most thoroughly mixed value is c, but it doesn't really even achieve
80
avalanche in c.
81

82
This allows some parallelism.  Read-after-writes are good at doubling
83
the number of bits affected, so the goal of mixing pulls in the opposite
84
direction as the goal of parallelism.  I did what I could.  Rotates
85
seem to cost as much as shifts on every machine I could lay my hands
86
on, and rotates are much kinder to the top and bottom bits, so I used
87
rotates.
88
-------------------------------------------------------------------------------
89
*/
90
#define mix(a,b,c) \
91
{ \
92
  a -= c;  a ^= rot(c, 4);  c += b; \
93
  b -= a;  b ^= rot(a, 6);  a += c; \
94
  c -= b;  c ^= rot(b, 8);  b += a; \
95
  a -= c;  a ^= rot(c,16);  c += b; \
96
  b -= a;  b ^= rot(a,19);  a += c; \
97
  c -= b;  c ^= rot(b, 4);  b += a; \
98
}
99

100
/*
101
-------------------------------------------------------------------------------
102
final -- final mixing of 3 32-bit values (a,b,c) into c
103

104
Pairs of (a,b,c) values differing in only a few bits will usually
105
produce values of c that look totally different.  This was tested for
106
* pairs that differed by one bit, by two bits, in any combination
107
  of top bits of (a,b,c), or in any combination of bottom bits of
108
  (a,b,c).
109
* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
110
  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
111
  is commonly produced by subtraction) look like a single 1-bit
112
  difference.
113
* the base values were pseudorandom, all zero but one bit set, or 
114
  all zero plus a counter that starts at zero.
115

116
These constants passed:
117
 14 11 25 16 4 14 24
118
 12 14 25 16 4 14 24
119
and these came close:
120
  4  8 15 26 3 22 24
121
 10  8 15 26 3 22 24
122
 11  8 15 26 3 22 24
123
-------------------------------------------------------------------------------
124
*/
125
#define final(a,b,c) \
126
{ \
127
  c ^= b; c -= rot(b,14); \
128
  a ^= c; a -= rot(c,11); \
129
  b ^= a; b -= rot(a,25); \
130
  c ^= b; c -= rot(b,16); \
131
  a ^= c; a -= rot(c,4);  \
132
  b ^= a; b -= rot(a,14); \
133
  c ^= b; c -= rot(b,24); \
134
}
135

136
/*
137
--------------------------------------------------------------------
138
 This works on all machines.  To be useful, it requires
139
 -- that the key be an array of uint32_t's, and
140
 -- that the length be the number of uint32_t's in the key
141

142
 The function hashword() is identical to hashlittle() on little-endian
143
 machines, and identical to hashbig() on big-endian machines,
144
 except that the length has to be measured in uint32_ts rather than in
145
 bytes.  hashlittle() is more complicated than hashword() only because
146
 hashlittle() has to dance around fitting the key bytes into registers.
147
--------------------------------------------------------------------
148
*/
149
uint32_t jenkins_hash32(
150
const uint32_t *k,                   /* the key, an array of uint32_t values */
151
size_t          length,               /* the length of the key, in uint32_ts */
152
uint32_t        initval)         /* the previous hash, or an arbitrary value */
153
{
154
  uint32_t a,b,c;
155

156
  /* Set up the internal state */
157
  a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;
158

159
  /*------------------------------------------------- handle most of the key */
160
  while (length > 3)
161
  {
162
    a += k[0];
163
    b += k[1];
164
    c += k[2];
165
    mix(a,b,c);
166
    length -= 3;
167
    k += 3;
168
  }
169

170
  /*------------------------------------------- handle the last 3 uint32_t's */
171
  switch(length)                     /* all the case statements fall through */
172
  { 
173
  case 3 : c+=k[2];
174
  case 2 : b+=k[1];
175
  case 1 : a+=k[0];
176
    final(a,b,c);
177
  case 0:     /* case 0: nothing left to add */
178
    break;
179
  }
180
  /*------------------------------------------------------ report the result */
181
  return c;
182
}
183

184
#if BYTE_ORDER == LITTLE_ENDIAN
185
/*
186
-------------------------------------------------------------------------------
187
hashlittle() -- hash a variable-length key into a 32-bit value
188
  k       : the key (the unaligned variable-length array of bytes)
189
  length  : the length of the key, counting by bytes
190
  initval : can be any 4-byte value
191
Returns a 32-bit value.  Every bit of the key affects every bit of
192
the return value.  Two keys differing by one or two bits will have
193
totally different hash values.
194

195
The best hash table sizes are powers of 2.  There is no need to do
196
mod a prime (mod is sooo slow!).  If you need less than 32 bits,
197
use a bitmask.  For example, if you need only 10 bits, do
198
  h = (h & hashmask(10));
199
In which case, the hash table should have hashsize(10) elements.
200

201
If you are hashing n strings (uint8_t **)k, do it like this:
202
  for (i=0, h=0; i<n; ++i) h = hashlittle( k[i], len[i], h);
203

204
By Bob Jenkins, 2006.  [email protected].  You may use this
205
code any way you wish, private, educational, or commercial.  It's free.
206

207
Use for hash table lookup, or anything where one collision in 2^^32 is
208
acceptable.  Do NOT use for cryptographic purposes.
209
-------------------------------------------------------------------------------
210
*/
211

212
uint32_t jenkins_hash( const void *key, size_t length, uint32_t initval)
213
{
214
  uint32_t a,b,c;                                          /* internal state */
215
  union { const void *ptr; size_t i; } u;     /* needed for Mac Powerbook G4 */
216

217
  /* Set up the internal state */
218
  a = b = c = 0xdeadbeef + ((uint32_t)length) + initval;
219

220
  u.ptr = key;
221
  if ((u.i & 0x3) == 0) {
222
    const uint32_t *k = (const uint32_t *)key;         /* read 32-bit chunks */
223

224
    /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
225
    while (length > 12)
226
    {
227
      a += k[0];
228
      b += k[1];
229
      c += k[2];
230
      mix(a,b,c);
231
      length -= 12;
232
      k += 3;
233
    }
234

235
    /*----------------------------- handle the last (probably partial) block */
236
    /* 
237
     * "k[2]&0xffffff" actually reads beyond the end of the string, but
238
     * then masks off the part it's not allowed to read.  Because the
239
     * string is aligned, the masked-off tail is in the same word as the
240
     * rest of the string.  Every machine with memory protection I've seen
241
     * does it on word boundaries, so is OK with this.  But VALGRIND will
242
     * still catch it and complain.  The masking trick does make the hash
243
     * noticeably faster for short strings (like English words).
244
     */
245

246
    switch(length)
247
    {
248
    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
249
    case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;
250
    case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;
251
    case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;
252
    case 8 : b+=k[1]; a+=k[0]; break;
253
    case 7 : b+=k[1]&0xffffff; a+=k[0]; break;
254
    case 6 : b+=k[1]&0xffff; a+=k[0]; break;
255
    case 5 : b+=k[1]&0xff; a+=k[0]; break;
256
    case 4 : a+=k[0]; break;
257
    case 3 : a+=k[0]&0xffffff; break;
258
    case 2 : a+=k[0]&0xffff; break;
259
    case 1 : a+=k[0]&0xff; break;
260
    case 0 : return c;              /* zero length strings require no mixing */
261
    }
262

263
  } else if ((u.i & 0x1) == 0) {
264
    const uint16_t *k = (const uint16_t *)key;         /* read 16-bit chunks */
265
    const uint8_t  *k8;
266

267
    /*--------------- all but last block: aligned reads and different mixing */
268
    while (length > 12)
269
    {
270
      a += k[0] + (((uint32_t)k[1])<<16);
271
      b += k[2] + (((uint32_t)k[3])<<16);
272
      c += k[4] + (((uint32_t)k[5])<<16);
273
      mix(a,b,c);
274
      length -= 12;
275
      k += 6;
276
    }
277

278
    /*----------------------------- handle the last (probably partial) block */
279
    k8 = (const uint8_t *)k;
280
    switch(length)
281
    {
282
    case 12: c+=k[4]+(((uint32_t)k[5])<<16);
283
             b+=k[2]+(((uint32_t)k[3])<<16);
284
             a+=k[0]+(((uint32_t)k[1])<<16);
285
             break;
286
    case 11: c+=((uint32_t)k8[10])<<16;     /* fall through */
287
    case 10: c+=k[4];
288
             b+=k[2]+(((uint32_t)k[3])<<16);
289
             a+=k[0]+(((uint32_t)k[1])<<16);
290
             break;
291
    case 9 : c+=k8[8];                      /* fall through */
292
    case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
293
             a+=k[0]+(((uint32_t)k[1])<<16);
294
             break;
295
    case 7 : b+=((uint32_t)k8[6])<<16;      /* fall through */
296
    case 6 : b+=k[2];
297
             a+=k[0]+(((uint32_t)k[1])<<16);
298
             break;
299
    case 5 : b+=k8[4];                      /* fall through */
300
    case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
301
             break;
302
    case 3 : a+=((uint32_t)k8[2])<<16;      /* fall through */
303
    case 2 : a+=k[0];
304
             break;
305
    case 1 : a+=k8[0];
306
             break;
307
    case 0 : return c;                     /* zero length requires no mixing */
308
    }
309

310
  } else {                        /* need to read the key one byte at a time */
311
    const uint8_t *k = (const uint8_t *)key;
312

313
    /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
314
    while (length > 12)
315
    {
316
      a += k[0];
317
      a += ((uint32_t)k[1])<<8;
318
      a += ((uint32_t)k[2])<<16;
319
      a += ((uint32_t)k[3])<<24;
320
      b += k[4];
321
      b += ((uint32_t)k[5])<<8;
322
      b += ((uint32_t)k[6])<<16;
323
      b += ((uint32_t)k[7])<<24;
324
      c += k[8];
325
      c += ((uint32_t)k[9])<<8;
326
      c += ((uint32_t)k[10])<<16;
327
      c += ((uint32_t)k[11])<<24;
328
      mix(a,b,c);
329
      length -= 12;
330
      k += 12;
331
    }
332

333
    /*-------------------------------- last block: affect all 32 bits of (c) */
334
    switch(length)                   /* all the case statements fall through */
335
    {
336
    case 12: c+=((uint32_t)k[11])<<24;
337
    case 11: c+=((uint32_t)k[10])<<16;
338
    case 10: c+=((uint32_t)k[9])<<8;
339
    case 9 : c+=k[8];
340
    case 8 : b+=((uint32_t)k[7])<<24;
341
    case 7 : b+=((uint32_t)k[6])<<16;
342
    case 6 : b+=((uint32_t)k[5])<<8;
343
    case 5 : b+=k[4];
344
    case 4 : a+=((uint32_t)k[3])<<24;
345
    case 3 : a+=((uint32_t)k[2])<<16;
346
    case 2 : a+=((uint32_t)k[1])<<8;
347
    case 1 : a+=k[0];
348
             break;
349
    case 0 : return c;
350
    }
351
  }
352

353
  final(a,b,c);
354
  return c;
355
}
356

357
#else /* !(BYTE_ORDER == LITTLE_ENDIAN) */
358

359
/*
360
 * hashbig():
361
 * This is the same as hashword() on big-endian machines.  It is different
362
 * from hashlittle() on all machines.  hashbig() takes advantage of
363
 * big-endian byte ordering. 
364
 */
365
uint32_t jenkins_hash( const void *key, size_t length, uint32_t initval)
366
{
367
  uint32_t a,b,c;
368
  union { const void *ptr; size_t i; } u; /* to cast key to (size_t) happily */
369

370
  /* Set up the internal state */
371
  a = b = c = 0xdeadbeef + ((uint32_t)length) + initval;
372

373
  u.ptr = key;
374
  if ((u.i & 0x3) == 0) {
375
    const uint32_t *k = (const uint32_t *)key;         /* read 32-bit chunks */
376

377
    /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
378
    while (length > 12)
379
    {
380
      a += k[0];
381
      b += k[1];
382
      c += k[2];
383
      mix(a,b,c);
384
      length -= 12;
385
      k += 3;
386
    }
387

388
    /*----------------------------- handle the last (probably partial) block */
389
    /* 
390
     * "k[2]<<8" actually reads beyond the end of the string, but
391
     * then shifts out the part it's not allowed to read.  Because the
392
     * string is aligned, the illegal read is in the same word as the
393
     * rest of the string.  Every machine with memory protection I've seen
394
     * does it on word boundaries, so is OK with this.  But VALGRIND will
395
     * still catch it and complain.  The masking trick does make the hash
396
     * noticeably faster for short strings (like English words).
397
     */
398

399
    switch(length)
400
    {
401
    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
402
    case 11: c+=k[2]&0xffffff00; b+=k[1]; a+=k[0]; break;
403
    case 10: c+=k[2]&0xffff0000; b+=k[1]; a+=k[0]; break;
404
    case 9 : c+=k[2]&0xff000000; b+=k[1]; a+=k[0]; break;
405
    case 8 : b+=k[1]; a+=k[0]; break;
406
    case 7 : b+=k[1]&0xffffff00; a+=k[0]; break;
407
    case 6 : b+=k[1]&0xffff0000; a+=k[0]; break;
408
    case 5 : b+=k[1]&0xff000000; a+=k[0]; break;
409
    case 4 : a+=k[0]; break;
410
    case 3 : a+=k[0]&0xffffff00; break;
411
    case 2 : a+=k[0]&0xffff0000; break;
412
    case 1 : a+=k[0]&0xff000000; break;
413
    case 0 : return c;              /* zero length strings require no mixing */
414
    }
415

416
  } else {                        /* need to read the key one byte at a time */
417
    const uint8_t *k = (const uint8_t *)key;
418

419
    /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
420
    while (length > 12)
421
    {
422
      a += ((uint32_t)k[0])<<24;
423
      a += ((uint32_t)k[1])<<16;
424
      a += ((uint32_t)k[2])<<8;
425
      a += ((uint32_t)k[3]);
426
      b += ((uint32_t)k[4])<<24;
427
      b += ((uint32_t)k[5])<<16;
428
      b += ((uint32_t)k[6])<<8;
429
      b += ((uint32_t)k[7]);
430
      c += ((uint32_t)k[8])<<24;
431
      c += ((uint32_t)k[9])<<16;
432
      c += ((uint32_t)k[10])<<8;
433
      c += ((uint32_t)k[11]);
434
      mix(a,b,c);
435
      length -= 12;
436
      k += 12;
437
    }
438

439
    /*-------------------------------- last block: affect all 32 bits of (c) */
440
    switch(length)                   /* all the case statements fall through */
441
    {
442
    case 12: c+=k[11];
443
    case 11: c+=((uint32_t)k[10])<<8;
444
    case 10: c+=((uint32_t)k[9])<<16;
445
    case 9 : c+=((uint32_t)k[8])<<24;
446
    case 8 : b+=k[7];
447
    case 7 : b+=((uint32_t)k[6])<<8;
448
    case 6 : b+=((uint32_t)k[5])<<16;
449
    case 5 : b+=((uint32_t)k[4])<<24;
450
    case 4 : a+=k[3];
451
    case 3 : a+=((uint32_t)k[2])<<8;
452
    case 2 : a+=((uint32_t)k[1])<<16;
453
    case 1 : a+=((uint32_t)k[0])<<24;
454
             break;
455
    case 0 : return c;
456
    }
457
  }
458

459
  final(a,b,c);
460
  return c;
461
}
462
#endif
463

464
Product

Resources

Company