Path: blob/main/crates/polars-compute/src/cardinality.rs
6939 views
use arrow::array::{1Array, BinaryArray, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, PrimitiveArray,2Utf8Array, Utf8ViewArray,3};4use arrow::datatypes::PhysicalType;5use arrow::types::Offset;6use arrow::with_match_primitive_type_full;7use polars_utils::total_ord::ToTotalOrd;89use crate::hyperloglogplus::HyperLogLog;1011/// Get an estimate for the *cardinality* of the array (i.e. the number of unique values)12///13/// This is not currently implemented for nested types.14pub fn estimate_cardinality(array: &dyn Array) -> usize {15if array.is_empty() {16return 0;17}1819if array.null_count() == array.len() {20return 1;21}2223// Estimate the cardinality with HyperLogLog24use PhysicalType as PT;25match array.dtype().to_physical_type() {26PT::Null => 1,2728PT::Boolean => {29let mut cardinality = 0;3031let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();3233cardinality += usize::from(array.has_nulls());3435if let Some(unset_bits) = array.values().lazy_unset_bits() {36cardinality += 1 + usize::from(unset_bits != array.len());37} else {38cardinality += 2;39}4041cardinality42},4344PT::Primitive(primitive_type) => with_match_primitive_type_full!(primitive_type, |$T| {45let mut hll = HyperLogLog::new();4647let array = array48.as_any()49.downcast_ref::<PrimitiveArray<$T>>()50.unwrap();5152if array.has_nulls() {53for v in array.iter() {54let v = v.copied().unwrap_or_default();55hll.add(&v.to_total_ord());56}57} else {58for v in array.values_iter() {59hll.add(&v.to_total_ord());60}61}6263hll.count()64}),65PT::FixedSizeBinary => {66let mut hll = HyperLogLog::new();6768let array = array69.as_any()70.downcast_ref::<FixedSizeBinaryArray>()71.unwrap();7273if array.has_nulls() {74for v in array.iter() {75let v = v.unwrap_or_default();76hll.add(v);77}78} else {79for v in array.values_iter() {80hll.add(v);81}82}8384hll.count()85},86PT::Binary => {87binary_offset_array_estimate(array.as_any().downcast_ref::<BinaryArray<i32>>().unwrap())88},89PT::LargeBinary => {90binary_offset_array_estimate(array.as_any().downcast_ref::<BinaryArray<i64>>().unwrap())91},92PT::Utf8 => binary_offset_array_estimate(93&array94.as_any()95.downcast_ref::<Utf8Array<i32>>()96.unwrap()97.to_binary(),98),99PT::LargeUtf8 => binary_offset_array_estimate(100&array101.as_any()102.downcast_ref::<Utf8Array<i64>>()103.unwrap()104.to_binary(),105),106PT::BinaryView => {107binary_view_array_estimate(array.as_any().downcast_ref::<BinaryViewArray>().unwrap())108},109PT::Utf8View => binary_view_array_estimate(110&array111.as_any()112.downcast_ref::<Utf8ViewArray>()113.unwrap()114.to_binview(),115),116PT::List => unimplemented!(),117PT::FixedSizeList => unimplemented!(),118PT::LargeList => unimplemented!(),119PT::Struct => unimplemented!(),120PT::Union => unimplemented!(),121PT::Map => unimplemented!(),122PT::Dictionary(_) => unimplemented!(),123}124}125126fn binary_offset_array_estimate<O: Offset>(array: &BinaryArray<O>) -> usize {127let mut hll = HyperLogLog::new();128129if array.has_nulls() {130for v in array.iter() {131let v = v.unwrap_or_default();132hll.add(v);133}134} else {135for v in array.values_iter() {136hll.add(v);137}138}139140hll.count()141}142143fn binary_view_array_estimate(array: &BinaryViewArray) -> usize {144let mut hll = HyperLogLog::new();145146if array.has_nulls() {147for v in array.iter() {148let v = v.unwrap_or_default();149hll.add(v);150}151} else {152for v in array.values_iter() {153hll.add(v);154}155}156157hll.count()158}159160161