Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-utils/src/hashing.rs
6939 views
1
use std::hash::{Hash, Hasher};
2
3
use crate::nulls::IsNull;
4
5
// Hash combine from c++' boost lib.
6
#[inline(always)]
7
pub fn _boost_hash_combine(l: u64, r: u64) -> u64 {
8
l ^ r.wrapping_add(0x9e3779b9u64.wrapping_add(l << 6).wrapping_add(r >> 2))
9
}
10
11
#[inline(always)]
12
pub const fn folded_multiply(a: u64, b: u64) -> u64 {
13
let full = (a as u128).wrapping_mul(b as u128);
14
(full as u64) ^ ((full >> 64) as u64)
15
}
16
17
/// Contains a byte slice and a precomputed hash for that string.
18
/// During rehashes, we will rehash the hash instead of the string, that makes
19
/// rehashing cheap and allows cache coherent small hash tables.
20
#[derive(Eq, Copy, Clone, Debug)]
21
pub struct BytesHash<'a> {
22
payload: Option<&'a [u8]>,
23
pub(super) hash: u64,
24
}
25
26
impl<'a> BytesHash<'a> {
27
#[inline]
28
pub fn new(s: Option<&'a [u8]>, hash: u64) -> Self {
29
Self { payload: s, hash }
30
}
31
}
32
33
impl<'a> IsNull for BytesHash<'a> {
34
const HAS_NULLS: bool = true;
35
type Inner = BytesHash<'a>;
36
37
#[inline(always)]
38
fn is_null(&self) -> bool {
39
self.payload.is_none()
40
}
41
42
fn unwrap_inner(self) -> Self::Inner {
43
assert!(self.payload.is_some());
44
self
45
}
46
}
47
48
impl Hash for BytesHash<'_> {
49
fn hash<H: Hasher>(&self, state: &mut H) {
50
state.write_u64(self.hash)
51
}
52
}
53
54
impl PartialEq for BytesHash<'_> {
55
#[inline]
56
fn eq(&self, other: &Self) -> bool {
57
(self.hash == other.hash) && (self.payload == other.payload)
58
}
59
}
60
61
#[inline(always)]
62
pub fn hash_to_partition(h: u64, n_partitions: usize) -> usize {
63
// Assuming h is a 64-bit random number, we note that
64
// h / 2^64 is almost a uniform random number in [0, 1), and thus
65
// floor(h * n_partitions / 2^64) is almost a uniform random integer in
66
// [0, n_partitions). Despite being written with u128 multiplication this
67
// compiles to a single mul / mulhi instruction on x86-x64/aarch64.
68
((h as u128 * n_partitions as u128) >> 64) as usize
69
}
70
71
#[derive(Clone)]
72
pub struct HashPartitioner {
73
num_partitions: usize,
74
seed: u64,
75
}
76
77
impl HashPartitioner {
78
/// Creates a new hash partitioner with the given number of partitions and
79
/// seed.
80
#[inline]
81
pub fn new(num_partitions: usize, mut seed: u64) -> Self {
82
assert!(num_partitions > 0);
83
// Make sure seeds bits are properly randomized, and is odd.
84
const ARBITRARY1: u64 = 0x85921e81c41226a0;
85
const ARBITRARY2: u64 = 0x3bc1d0faba166294;
86
const ARBITRARY3: u64 = 0xfbde893e21a73756;
87
seed = folded_multiply(seed ^ ARBITRARY1, ARBITRARY2);
88
seed = folded_multiply(seed, ARBITRARY3);
89
seed |= 1;
90
Self {
91
seed,
92
num_partitions,
93
}
94
}
95
96
/// Converts a hash to a partition. It is guaranteed that the output is
97
/// in the range [0, n_partitions), and that independent HashPartitioners
98
/// that we initialized with the same num_partitions and seed return the same
99
/// partition.
100
#[inline(always)]
101
pub fn hash_to_partition(&self, hash: u64) -> usize {
102
// Assuming r is a 64-bit random number, we note that
103
// r / 2^64 is almost a uniform random number in [0, 1), and thus
104
// floor(r * n_partitions / 2^64) is almost a uniform random integer in
105
// [0, n_partitions). Despite being written with u128 multiplication this
106
// compiles to a single mul / mulhi instruction on x86-x64/aarch64.
107
let shuffled = hash.wrapping_mul(self.seed);
108
((shuffled as u128 * self.num_partitions as u128) >> 64) as usize
109
}
110
111
/// The partition nulls are put into.
112
#[inline(always)]
113
pub fn null_partition(&self) -> usize {
114
0
115
}
116
117
#[inline(always)]
118
pub fn num_partitions(&self) -> usize {
119
self.num_partitions
120
}
121
}
122
123
// FIXME: use Hasher interface and support a random state.
124
pub trait DirtyHash {
125
// A quick and dirty hash. Only the top bits of the hash are decent, such as
126
// used in hash_to_partition.
127
fn dirty_hash(&self) -> u64;
128
}
129
130
// Multiplication by a 'random' odd number gives a universal hash function in
131
// the top bits.
132
const RANDOM_ODD: u64 = 0x55fbfd6bfc5458e9;
133
134
macro_rules! impl_hash_partition_as_u64 {
135
($T: ty) => {
136
impl DirtyHash for $T {
137
fn dirty_hash(&self) -> u64 {
138
(*self as u64).wrapping_mul(RANDOM_ODD)
139
}
140
}
141
};
142
}
143
144
impl_hash_partition_as_u64!(u8);
145
impl_hash_partition_as_u64!(u16);
146
impl_hash_partition_as_u64!(u32);
147
impl_hash_partition_as_u64!(u64);
148
impl_hash_partition_as_u64!(i8);
149
impl_hash_partition_as_u64!(i16);
150
impl_hash_partition_as_u64!(i32);
151
impl_hash_partition_as_u64!(i64);
152
153
impl DirtyHash for i128 {
154
fn dirty_hash(&self) -> u64 {
155
(*self as u64)
156
.wrapping_mul(RANDOM_ODD)
157
.wrapping_add((*self >> 64) as u64)
158
}
159
}
160
161
impl DirtyHash for BytesHash<'_> {
162
fn dirty_hash(&self) -> u64 {
163
self.hash
164
}
165
}
166
167
impl<T: DirtyHash + ?Sized> DirtyHash for &T {
168
fn dirty_hash(&self) -> u64 {
169
(*self).dirty_hash()
170
}
171
}
172
173
// FIXME: we should probably encourage explicit null handling, but for now we'll
174
// allow directly getting a partition from a nullable value.
175
impl<T: DirtyHash> DirtyHash for Option<T> {
176
fn dirty_hash(&self) -> u64 {
177
self.as_ref().map(|s| s.dirty_hash()).unwrap_or(0)
178
}
179
}
180
181