Path: blob/main/crates/polars-ops/src/series/ops/is_unique.rs
6939 views
use std::hash::Hash;12use arrow::array::BooleanArray;3use arrow::bitmap::MutableBitmap;4use polars_core::prelude::*;5use polars_core::with_match_physical_integer_polars_type;6use polars_utils::total_ord::{ToTotalOrd, TotalEq, TotalHash};78// If invert is true then this is an `is_duplicated`.9fn is_unique_ca<'a, T>(ca: &'a ChunkedArray<T>, invert: bool) -> BooleanChunked10where11T: PolarsDataType,12T::Physical<'a>: TotalHash + TotalEq + Copy + ToTotalOrd,13<Option<T::Physical<'a>> as ToTotalOrd>::TotalOrdItem: Hash + Eq,14{15let len = ca.len();16let mut idx_key = PlHashMap::new();1718// Instead of group_tuples, which allocates a full Vec per group, we now19// just toggle a boolean that's false if a group has multiple entries.20ca.iter().enumerate().for_each(|(idx, key)| {21idx_key22.entry(key.to_total_ord())23.and_modify(|v: &mut (IdxSize, bool)| v.1 = false)24.or_insert((idx as IdxSize, true));25});2627let unique_idx = idx_key28.into_iter()29.filter_map(|(_k, v)| if v.1 { Some(v.0) } else { None });3031let (default, setter) = if invert { (true, false) } else { (false, true) };32let mut values = MutableBitmap::with_capacity(len);33values.extend_constant(len, default);34for idx in unique_idx {35unsafe { values.set_unchecked(idx as usize, setter) }36}37let arr = BooleanArray::from_data_default(values.into(), None);38BooleanChunked::with_chunk(ca.name().clone(), arr)39}4041fn dispatcher(s: &Series, invert: bool) -> PolarsResult<BooleanChunked> {42let s = s.to_physical_repr();43use DataType::*;44let out = match s.dtype() {45Boolean => {46let ca = s.bool().unwrap();47is_unique_ca(ca, invert)48},49Binary => {50let ca = s.binary().unwrap();51is_unique_ca(ca, invert)52},53String => {54let s = s.cast(&Binary).unwrap();55let ca = s.binary().unwrap();56is_unique_ca(ca, invert)57},58Float32 => {59let ca = s.f32().unwrap();60is_unique_ca(ca, invert)61},62Float64 => {63let ca = s.f64().unwrap();64is_unique_ca(ca, invert)65},66#[cfg(feature = "dtype-struct")]67Struct(_) => {68let ca = s.struct_().unwrap().clone();69let df = ca.unnest();70return if invert {71df.is_duplicated()72} else {73df.is_unique()74};75},76Null => match s.len() {770 => BooleanChunked::new(s.name().clone(), [] as [bool; 0]),781 => BooleanChunked::new(s.name().clone(), [!invert]),79len => BooleanChunked::full(s.name().clone(), invert, len),80},81dt if dt.is_primitive_numeric() => {82with_match_physical_integer_polars_type!(s.dtype(), |$T| {83let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();84is_unique_ca(ca, invert)85})86},87dt => polars_bail!(opq = is_unique, dt),88};89Ok(out)90}9192pub fn is_unique(s: &Series) -> PolarsResult<BooleanChunked> {93dispatcher(s, false)94}9596pub fn is_duplicated(s: &Series) -> PolarsResult<BooleanChunked> {97dispatcher(s, true)98}99100101