Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/series/ops/is_unique.rs
6939 views
1
use std::hash::Hash;
2
3
use arrow::array::BooleanArray;
4
use arrow::bitmap::MutableBitmap;
5
use polars_core::prelude::*;
6
use polars_core::with_match_physical_integer_polars_type;
7
use polars_utils::total_ord::{ToTotalOrd, TotalEq, TotalHash};
8
9
// If invert is true then this is an `is_duplicated`.
10
fn is_unique_ca<'a, T>(ca: &'a ChunkedArray<T>, invert: bool) -> BooleanChunked
11
where
12
T: PolarsDataType,
13
T::Physical<'a>: TotalHash + TotalEq + Copy + ToTotalOrd,
14
<Option<T::Physical<'a>> as ToTotalOrd>::TotalOrdItem: Hash + Eq,
15
{
16
let len = ca.len();
17
let mut idx_key = PlHashMap::new();
18
19
// Instead of group_tuples, which allocates a full Vec per group, we now
20
// just toggle a boolean that's false if a group has multiple entries.
21
ca.iter().enumerate().for_each(|(idx, key)| {
22
idx_key
23
.entry(key.to_total_ord())
24
.and_modify(|v: &mut (IdxSize, bool)| v.1 = false)
25
.or_insert((idx as IdxSize, true));
26
});
27
28
let unique_idx = idx_key
29
.into_iter()
30
.filter_map(|(_k, v)| if v.1 { Some(v.0) } else { None });
31
32
let (default, setter) = if invert { (true, false) } else { (false, true) };
33
let mut values = MutableBitmap::with_capacity(len);
34
values.extend_constant(len, default);
35
for idx in unique_idx {
36
unsafe { values.set_unchecked(idx as usize, setter) }
37
}
38
let arr = BooleanArray::from_data_default(values.into(), None);
39
BooleanChunked::with_chunk(ca.name().clone(), arr)
40
}
41
42
fn dispatcher(s: &Series, invert: bool) -> PolarsResult<BooleanChunked> {
43
let s = s.to_physical_repr();
44
use DataType::*;
45
let out = match s.dtype() {
46
Boolean => {
47
let ca = s.bool().unwrap();
48
is_unique_ca(ca, invert)
49
},
50
Binary => {
51
let ca = s.binary().unwrap();
52
is_unique_ca(ca, invert)
53
},
54
String => {
55
let s = s.cast(&Binary).unwrap();
56
let ca = s.binary().unwrap();
57
is_unique_ca(ca, invert)
58
},
59
Float32 => {
60
let ca = s.f32().unwrap();
61
is_unique_ca(ca, invert)
62
},
63
Float64 => {
64
let ca = s.f64().unwrap();
65
is_unique_ca(ca, invert)
66
},
67
#[cfg(feature = "dtype-struct")]
68
Struct(_) => {
69
let ca = s.struct_().unwrap().clone();
70
let df = ca.unnest();
71
return if invert {
72
df.is_duplicated()
73
} else {
74
df.is_unique()
75
};
76
},
77
Null => match s.len() {
78
0 => BooleanChunked::new(s.name().clone(), [] as [bool; 0]),
79
1 => BooleanChunked::new(s.name().clone(), [!invert]),
80
len => BooleanChunked::full(s.name().clone(), invert, len),
81
},
82
dt if dt.is_primitive_numeric() => {
83
with_match_physical_integer_polars_type!(s.dtype(), |$T| {
84
let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
85
is_unique_ca(ca, invert)
86
})
87
},
88
dt => polars_bail!(opq = is_unique, dt),
89
};
90
Ok(out)
91
}
92
93
pub fn is_unique(s: &Series) -> PolarsResult<BooleanChunked> {
94
dispatcher(s, false)
95
}
96
97
pub fn is_duplicated(s: &Series) -> PolarsResult<BooleanChunked> {
98
dispatcher(s, true)
99
}
100
101