Path: blob/main/crates/polars-ops/src/series/ops/unique.rs
8431 views
use std::borrow::Cow;1use std::hash::Hash;23use polars_core::hashing::_HASHMAP_INIT_SIZE;4use polars_core::prelude::row_encode::encode_rows_unordered;5use polars_core::prelude::*;6use polars_core::utils::NoNull;7use polars_core::with_match_physical_numeric_polars_type;8use polars_utils::total_ord::{ToTotalOrd, TotalEq, TotalHash};910fn unique_counts_helper<I, J>(items: I) -> IdxCa11where12I: Iterator<Item = J>,13J: TotalHash + TotalEq + ToTotalOrd,14<J as ToTotalOrd>::TotalOrdItem: Hash + Eq,15{16let mut map = PlIndexMap::with_capacity_and_hasher(_HASHMAP_INIT_SIZE, Default::default());17for item in items {18let item = item.to_total_ord();19map.entry(item)20.and_modify(|cnt| {21*cnt += 1;22})23.or_insert(1 as IdxSize);24}25let out: NoNull<IdxCa> = map.into_values().collect();26out.into_inner()27}2829/// Returns a count of the unique values in the order of appearance.30pub fn unique_counts(s: &Series) -> PolarsResult<Series> {31if s.is_empty() {32return Ok(IdxCa::new(s.name().clone(), [] as [IdxSize; 0]).into_series());33} else if s.null_count() == s.len() {34return Ok(IdxCa::new(s.name().clone(), [s.len() as IdxSize]).into_series());35}3637let mut s = Cow::Borrowed(s);3839if s.dtype().is_nested() {40s = Cow::Owned(encode_rows_unordered(&[s.into_owned().into_column()])?.into_series());41}4243match s.dtype().to_physical() {44dt if dt.is_primitive_numeric() => {45let s_physical = s.to_physical_repr();46with_match_physical_numeric_polars_type!(s_physical.dtype(), |$T| {47let ca: &ChunkedArray<$T> = s_physical.as_ref().as_ref().as_ref();48Ok(unique_counts_helper(ca.iter()).into_series())49})50},51DataType::Null => unreachable!("handled before"),52DataType::BinaryOffset => {53let ca = s.binary_offset()?;54Ok(unique_counts_helper(ca.into_iter()).into_series())55},56DataType::Binary => {57let ca = s.binary()?;58Ok(unique_counts_helper(ca.into_iter()).into_series())59},60DataType::String => {61let ca = s.str()?.as_binary();62Ok(unique_counts_helper(ca.into_iter()).into_series())63},64DataType::Boolean => {65let ca = s.bool()?;6667let num_trues = ca.num_trues() as IdxSize;68let num_nulls = ca.null_count() as IdxSize;69let num_falses = ca.len() as IdxSize - num_trues - num_nulls;7071let values: Vec<IdxSize> = match ca.get(0) {72Some(false) if num_nulls == 0 && num_trues == 0 => vec![num_falses],73Some(false) if num_nulls == 0 => vec![num_falses, num_trues],74Some(false) if num_trues == 0 => vec![num_falses, num_nulls],7576Some(true) if num_nulls == 0 && num_falses == 0 => vec![num_trues],77Some(true) if num_nulls == 0 => vec![num_trues, num_falses],78Some(true) if num_falses == 0 => vec![num_trues, num_nulls],7980None if num_trues == 0 && num_falses == 0 => unreachable!(),81None if num_trues == 0 => vec![num_nulls, num_falses],82None if num_falses == 0 => vec![num_nulls, num_trues],8384Some(false) => {85let first_true = ca.first_true_idx().unwrap();86let first_null = ca.first_null().unwrap();8788if first_true < first_null {89vec![num_falses, num_trues, num_nulls]90} else {91vec![num_falses, num_nulls, num_trues]92}93},94Some(true) => {95let first_false = ca.first_false_idx().unwrap();96let first_null = ca.first_null().unwrap();9798if first_false < first_null {99vec![num_trues, num_falses, num_nulls]100} else {101vec![num_trues, num_nulls, num_falses]102}103},104None => {105if ca.get(ca.first_non_null().unwrap()).unwrap() {106vec![num_nulls, num_trues, num_falses]107} else {108vec![num_nulls, num_falses, num_trues]109}110},111};112Ok(IdxCa::new(s.name().clone(), values).into_series())113},114115#[cfg(feature = "dtype-extension")]116DataType::Extension(_, _) => unique_counts(s.ext().unwrap().storage()),117118DataType::UInt8119| DataType::UInt16120| DataType::UInt32121| DataType::UInt64122| DataType::UInt128123| DataType::Int8124| DataType::Int16125| DataType::Int32126| DataType::Int64127| DataType::Int128128| DataType::Float16129| DataType::Float32130| DataType::Float64131| DataType::Date132| DataType::Datetime(..)133| DataType::Duration(..)134| DataType::Time => unreachable!("primitive numeric"),135#[cfg(feature = "dtype-decimal")]136DataType::Decimal(..) => unreachable!("primitive numeric"),137#[cfg(feature = "dtype-categorical")]138DataType::Categorical(..) | DataType::Enum(..) => unreachable!("primitive numeric"),139#[cfg(feature = "dtype-array")]140DataType::Array(..) => unreachable!("row encoded"),141#[cfg(feature = "dtype-struct")]142DataType::Struct(..) => unreachable!("row encoded"),143DataType::List(..) => {144unreachable!("row encoded")145},146#[cfg(feature = "object")]147dt @ DataType::Object(..) => polars_bail!(opq = unique_counts, dt),148dt @ DataType::Unknown(..) => polars_bail!(opq = unique_counts, dt),149}150}151152153