Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-expr/src/reduce/approx_n_unique.rs
7884 views
1
use std::marker::PhantomData;
2
3
use polars_core::with_match_physical_numeric_polars_type;
4
use polars_utils::cardinality_sketch::CardinalitySketch;
5
use polars_utils::total_ord::{BuildHasherTotalExt, TotalHash};
6
7
use super::*;
8
9
pub fn new_approx_n_unique_reduction(dtype: DataType) -> PolarsResult<Box<dyn GroupedReduction>> {
10
// TODO: Move the error checks up and make this function infallible
11
use DataType::*;
12
use {ApproxNUniqueReducer as R, VecGroupedReduction as VGR};
13
Ok(match dtype {
14
Boolean => Box::new(VGR::new(dtype, R::<BooleanType>::default())),
15
_ if dtype.is_primitive_numeric() || dtype.is_temporal() => {
16
with_match_physical_numeric_polars_type!(dtype.to_physical(), |$T| {
17
Box::new(VGR::new(dtype, R::<$T>::default()))
18
})
19
},
20
String => Box::new(VGR::new(dtype, R::<StringType>::default())),
21
Binary => Box::new(VGR::new(dtype, R::<BinaryType>::default())),
22
#[cfg(feature = "dtype-decimal")]
23
Decimal(_, _) => Box::new(VGR::new(dtype, R::<Int128Type>::default())),
24
#[cfg(feature = "dtype-categorical")]
25
DataType::Enum(_, _) | DataType::Categorical(_, _) => match dtype.cat_physical().unwrap() {
26
CategoricalPhysical::U8 => Box::new(VGR::new(dtype, R::<UInt8Type>::default())),
27
CategoricalPhysical::U16 => Box::new(VGR::new(dtype, R::<UInt16Type>::default())),
28
CategoricalPhysical::U32 => Box::new(VGR::new(dtype, R::<UInt32Type>::default())),
29
},
30
Null => Box::new(super::NullGroupedReduction::new(Scalar::new_idxsize(1))),
31
_ => {
32
polars_bail!(InvalidOperation: "`approx_n_unique` operation not supported for dtype `{dtype}`")
33
},
34
})
35
}
36
37
struct ApproxNUniqueReducer<T> {
38
hasher: PlFixedStateQuality,
39
marker: PhantomData<T>,
40
}
41
42
impl<T> Default for ApproxNUniqueReducer<T> {
43
fn default() -> Self {
44
Self {
45
hasher: PlFixedStateQuality::default(),
46
marker: PhantomData,
47
}
48
}
49
}
50
51
impl<T> Clone for ApproxNUniqueReducer<T> {
52
fn clone(&self) -> Self {
53
Self {
54
hasher: self.hasher.clone(),
55
marker: PhantomData,
56
}
57
}
58
}
59
60
impl<T> Reducer for ApproxNUniqueReducer<T>
61
where
62
T: PolarsPhysicalType,
63
for<'a> T::Physical<'a>: TotalHash,
64
{
65
type Dtype = T;
66
type Value = CardinalitySketch;
67
68
#[inline(always)]
69
fn init(&self) -> Self::Value {
70
CardinalitySketch::new()
71
}
72
73
fn cast_series<'a>(&self, s: &'a Series) -> Cow<'a, Series> {
74
s.to_physical_repr()
75
}
76
77
#[inline(always)]
78
fn combine(&self, a: &mut Self::Value, b: &Self::Value) {
79
a.combine(b);
80
}
81
82
#[inline(always)]
83
fn reduce_one(
84
&self,
85
a: &mut Self::Value,
86
b: Option<<Self::Dtype as PolarsDataType>::Physical<'_>>,
87
_seq_id: u64,
88
) {
89
let hash = self.hasher.tot_hash_one(b);
90
a.insert(hash);
91
}
92
93
fn reduce_ca(&self, v: &mut Self::Value, ca: &ChunkedArray<Self::Dtype>, _seq_id: u64) {
94
for val in ca.iter() {
95
let hash = self.hasher.tot_hash_one(val);
96
v.insert(hash);
97
}
98
}
99
100
fn finish(
101
&self,
102
v: Vec<Self::Value>,
103
m: Option<Bitmap>,
104
_dtype: &DataType,
105
) -> PolarsResult<Series> {
106
assert!(m.is_none());
107
let ca: IdxCa = v
108
.into_iter()
109
.map(|sketch| sketch.estimate().min(IdxSize::MAX as usize) as IdxSize)
110
.collect_ca(PlSmallStr::EMPTY);
111
Ok(ca.into_series())
112
}
113
}
114
115