Path: blob/main/crates/polars-expr/src/reduce/approx_n_unique.rs
7884 views
use std::marker::PhantomData;12use polars_core::with_match_physical_numeric_polars_type;3use polars_utils::cardinality_sketch::CardinalitySketch;4use polars_utils::total_ord::{BuildHasherTotalExt, TotalHash};56use super::*;78pub fn new_approx_n_unique_reduction(dtype: DataType) -> PolarsResult<Box<dyn GroupedReduction>> {9// TODO: Move the error checks up and make this function infallible10use DataType::*;11use {ApproxNUniqueReducer as R, VecGroupedReduction as VGR};12Ok(match dtype {13Boolean => Box::new(VGR::new(dtype, R::<BooleanType>::default())),14_ if dtype.is_primitive_numeric() || dtype.is_temporal() => {15with_match_physical_numeric_polars_type!(dtype.to_physical(), |$T| {16Box::new(VGR::new(dtype, R::<$T>::default()))17})18},19String => Box::new(VGR::new(dtype, R::<StringType>::default())),20Binary => Box::new(VGR::new(dtype, R::<BinaryType>::default())),21#[cfg(feature = "dtype-decimal")]22Decimal(_, _) => Box::new(VGR::new(dtype, R::<Int128Type>::default())),23#[cfg(feature = "dtype-categorical")]24DataType::Enum(_, _) | DataType::Categorical(_, _) => match dtype.cat_physical().unwrap() {25CategoricalPhysical::U8 => Box::new(VGR::new(dtype, R::<UInt8Type>::default())),26CategoricalPhysical::U16 => Box::new(VGR::new(dtype, R::<UInt16Type>::default())),27CategoricalPhysical::U32 => Box::new(VGR::new(dtype, R::<UInt32Type>::default())),28},29Null => Box::new(super::NullGroupedReduction::new(Scalar::new_idxsize(1))),30_ => {31polars_bail!(InvalidOperation: "`approx_n_unique` operation not supported for dtype `{dtype}`")32},33})34}3536struct ApproxNUniqueReducer<T> {37hasher: PlFixedStateQuality,38marker: PhantomData<T>,39}4041impl<T> Default for ApproxNUniqueReducer<T> {42fn default() -> Self {43Self {44hasher: PlFixedStateQuality::default(),45marker: PhantomData,46}47}48}4950impl<T> Clone for ApproxNUniqueReducer<T> {51fn clone(&self) -> Self {52Self {53hasher: self.hasher.clone(),54marker: PhantomData,55}56}57}5859impl<T> Reducer for ApproxNUniqueReducer<T>60where61T: PolarsPhysicalType,62for<'a> T::Physical<'a>: TotalHash,63{64type Dtype = T;65type Value = CardinalitySketch;6667#[inline(always)]68fn init(&self) -> Self::Value {69CardinalitySketch::new()70}7172fn cast_series<'a>(&self, s: &'a Series) -> Cow<'a, Series> {73s.to_physical_repr()74}7576#[inline(always)]77fn combine(&self, a: &mut Self::Value, b: &Self::Value) {78a.combine(b);79}8081#[inline(always)]82fn reduce_one(83&self,84a: &mut Self::Value,85b: Option<<Self::Dtype as PolarsDataType>::Physical<'_>>,86_seq_id: u64,87) {88let hash = self.hasher.tot_hash_one(b);89a.insert(hash);90}9192fn reduce_ca(&self, v: &mut Self::Value, ca: &ChunkedArray<Self::Dtype>, _seq_id: u64) {93for val in ca.iter() {94let hash = self.hasher.tot_hash_one(val);95v.insert(hash);96}97}9899fn finish(100&self,101v: Vec<Self::Value>,102m: Option<Bitmap>,103_dtype: &DataType,104) -> PolarsResult<Series> {105assert!(m.is_none());106let ca: IdxCa = v107.into_iter()108.map(|sketch| sketch.estimate().min(IdxSize::MAX as usize) as IdxSize)109.collect_ca(PlSmallStr::EMPTY);110Ok(ca.into_series())111}112}113114115