Path: blob/main/crates/polars-ops/src/series/ops/is_unique.rs
8475 views
use std::hash::Hash;12use arrow::array::BooleanArray;3use arrow::bitmap::MutableBitmap;4use polars_core::prelude::*;5use polars_core::with_match_physical_integer_polars_type;6use polars_utils::total_ord::{ToTotalOrd, TotalEq, TotalHash};78// If invert is true then this is an `is_duplicated`.9fn is_unique_ca<'a, T>(ca: &'a ChunkedArray<T>, invert: bool) -> BooleanChunked10where11T: PolarsDataType,12T::Physical<'a>: TotalHash + TotalEq + Copy + ToTotalOrd,13<Option<T::Physical<'a>> as ToTotalOrd>::TotalOrdItem: Hash + Eq,14{15let len = ca.len();16let mut idx_key = PlHashMap::new();1718// Instead of group_tuples, which allocates a full Vec per group, we now19// just toggle a boolean that's false if a group has multiple entries.20ca.iter().enumerate().for_each(|(idx, key)| {21idx_key22.entry(key.to_total_ord())23.and_modify(|v: &mut (IdxSize, bool)| v.1 = false)24.or_insert((idx as IdxSize, true));25});2627let unique_idx = idx_key28.into_iter()29.filter_map(|(_k, v)| if v.1 { Some(v.0) } else { None });3031let (default, setter) = if invert { (true, false) } else { (false, true) };32let mut values = MutableBitmap::with_capacity(len);33values.extend_constant(len, default);34for idx in unique_idx {35unsafe { values.set_unchecked(idx as usize, setter) }36}37let arr = BooleanArray::from_data_default(values.into(), None);38BooleanChunked::with_chunk(ca.name().clone(), arr)39}4041fn dispatcher(s: &Series, invert: bool) -> PolarsResult<BooleanChunked> {42let s = s.to_physical_repr();43use DataType::*;44let out = match s.dtype() {45Boolean => {46let ca = s.bool().unwrap();47is_unique_ca(ca, invert)48},49Binary => {50let ca = s.binary().unwrap();51is_unique_ca(ca, invert)52},53String => {54let s = s.cast(&Binary).unwrap();55let ca = s.binary().unwrap();56is_unique_ca(ca, invert)57},58#[cfg(feature = "dtype-f16")]59Float16 => {60let ca = s.f16().unwrap();61is_unique_ca(ca, invert)62},63Float32 => {64let ca = s.f32().unwrap();65is_unique_ca(ca, invert)66},67Float64 => {68let ca = s.f64().unwrap();69is_unique_ca(ca, invert)70},71#[cfg(feature = "dtype-struct")]72Struct(_) => {73let ca = s.struct_().unwrap().clone();74let df = ca.unnest();75return if invert {76df.is_duplicated()77} else {78df.is_unique()79};80},81Null => match s.len() {820 => BooleanChunked::new(s.name().clone(), [] as [bool; 0]),831 => BooleanChunked::new(s.name().clone(), [!invert]),84len => BooleanChunked::full(s.name().clone(), invert, len),85},86dt if dt.is_primitive_numeric() => {87with_match_physical_integer_polars_type!(s.dtype(), |$T| {88let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();89is_unique_ca(ca, invert)90})91},92dt => polars_bail!(opq = is_unique, dt),93};94Ok(out)95}9697pub fn is_unique(s: &Series) -> PolarsResult<BooleanChunked> {98dispatcher(s, false)99}100101pub fn is_duplicated(s: &Series) -> PolarsResult<BooleanChunked> {102dispatcher(s, true)103}104105106