Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/chunked_array/list/hash.rs
6939 views
1
use std::hash::{BuildHasher, Hash};
2
3
use polars_core::series::BitRepr;
4
use polars_core::utils::NoNull;
5
use polars_core::{POOL, with_match_physical_float_polars_type};
6
use polars_utils::aliases::PlSeedableRandomStateQuality;
7
use polars_utils::hashing::_boost_hash_combine;
8
use polars_utils::total_ord::{ToTotalOrd, TotalHash};
9
use rayon::prelude::*;
10
11
use super::*;
12
13
fn hash_agg<T>(ca: &ChunkedArray<T>, random_state: &PlSeedableRandomStateQuality) -> u64
14
where
15
T: PolarsNumericType,
16
T::Native: TotalHash + ToTotalOrd,
17
<T::Native as ToTotalOrd>::TotalOrdItem: Hash,
18
{
19
// Note that we don't use the no null branch! This can break in unexpected ways.
20
// for instance with threading we split an array in n_threads, this may lead to
21
// splits that have no nulls and splits that have nulls. Then one array is hashed with
22
// Option<T> and the other array with T.
23
// Meaning that they cannot be compared. By always hashing on Option<T> the random_state is
24
// the only deterministic seed.
25
26
// just some large prime
27
let mut hash_agg = 9069731903u64;
28
29
// just some large prime
30
let null_hash = 2413670057;
31
32
ca.downcast_iter().for_each(|arr| {
33
for opt_v in arr.iter() {
34
match opt_v {
35
Some(v) => {
36
let r = random_state.hash_one(v.to_total_ord());
37
hash_agg = _boost_hash_combine(hash_agg, r);
38
},
39
None => {
40
hash_agg = _boost_hash_combine(hash_agg, null_hash);
41
},
42
}
43
}
44
});
45
hash_agg
46
}
47
48
pub(crate) fn hash(
49
ca: &mut ListChunked,
50
build_hasher: PlSeedableRandomStateQuality,
51
) -> UInt64Chunked {
52
if !ca.inner_dtype().to_physical().is_primitive_numeric() {
53
panic!(
54
"Hashing a list with a non-numeric inner type not supported. Got dtype: {:?}",
55
ca.dtype()
56
);
57
}
58
59
// just some large prime
60
let null_hash = 1969099309u64;
61
62
ca.set_inner_dtype(ca.inner_dtype().to_physical());
63
64
let out: NoNull<UInt64Chunked> = POOL.install(|| {
65
ca.par_iter()
66
.map(|opt_s: Option<Series>| match opt_s {
67
None => null_hash,
68
Some(s) => {
69
if s.dtype().is_float() {
70
with_match_physical_float_polars_type!(s.dtype(), |$T| {
71
let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
72
hash_agg(ca, &build_hasher)
73
})
74
} else {
75
match s.bit_repr() {
76
None => unimplemented!("Hash for lists without bit representation"),
77
Some(BitRepr::U8(ca)) => hash_agg(&ca, &build_hasher),
78
Some(BitRepr::U16(ca)) => hash_agg(&ca, &build_hasher),
79
Some(BitRepr::U32(ca)) => hash_agg(&ca, &build_hasher),
80
Some(BitRepr::U64(ca)) => hash_agg(&ca, &build_hasher),
81
#[cfg(feature = "dtype-i128")]
82
Some(BitRepr::I128(ca)) => hash_agg(&ca, &build_hasher),
83
}
84
}
85
},
86
})
87
.collect()
88
});
89
90
let mut out = out.into_inner();
91
out.rename(ca.name().clone());
92
out
93
}
94
95