Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/series/ops/unique.rs
8431 views
1
use std::borrow::Cow;
2
use std::hash::Hash;
3
4
use polars_core::hashing::_HASHMAP_INIT_SIZE;
5
use polars_core::prelude::row_encode::encode_rows_unordered;
6
use polars_core::prelude::*;
7
use polars_core::utils::NoNull;
8
use polars_core::with_match_physical_numeric_polars_type;
9
use polars_utils::total_ord::{ToTotalOrd, TotalEq, TotalHash};
10
11
fn unique_counts_helper<I, J>(items: I) -> IdxCa
12
where
13
I: Iterator<Item = J>,
14
J: TotalHash + TotalEq + ToTotalOrd,
15
<J as ToTotalOrd>::TotalOrdItem: Hash + Eq,
16
{
17
let mut map = PlIndexMap::with_capacity_and_hasher(_HASHMAP_INIT_SIZE, Default::default());
18
for item in items {
19
let item = item.to_total_ord();
20
map.entry(item)
21
.and_modify(|cnt| {
22
*cnt += 1;
23
})
24
.or_insert(1 as IdxSize);
25
}
26
let out: NoNull<IdxCa> = map.into_values().collect();
27
out.into_inner()
28
}
29
30
/// Returns a count of the unique values in the order of appearance.
31
pub fn unique_counts(s: &Series) -> PolarsResult<Series> {
32
if s.is_empty() {
33
return Ok(IdxCa::new(s.name().clone(), [] as [IdxSize; 0]).into_series());
34
} else if s.null_count() == s.len() {
35
return Ok(IdxCa::new(s.name().clone(), [s.len() as IdxSize]).into_series());
36
}
37
38
let mut s = Cow::Borrowed(s);
39
40
if s.dtype().is_nested() {
41
s = Cow::Owned(encode_rows_unordered(&[s.into_owned().into_column()])?.into_series());
42
}
43
44
match s.dtype().to_physical() {
45
dt if dt.is_primitive_numeric() => {
46
let s_physical = s.to_physical_repr();
47
with_match_physical_numeric_polars_type!(s_physical.dtype(), |$T| {
48
let ca: &ChunkedArray<$T> = s_physical.as_ref().as_ref().as_ref();
49
Ok(unique_counts_helper(ca.iter()).into_series())
50
})
51
},
52
DataType::Null => unreachable!("handled before"),
53
DataType::BinaryOffset => {
54
let ca = s.binary_offset()?;
55
Ok(unique_counts_helper(ca.into_iter()).into_series())
56
},
57
DataType::Binary => {
58
let ca = s.binary()?;
59
Ok(unique_counts_helper(ca.into_iter()).into_series())
60
},
61
DataType::String => {
62
let ca = s.str()?.as_binary();
63
Ok(unique_counts_helper(ca.into_iter()).into_series())
64
},
65
DataType::Boolean => {
66
let ca = s.bool()?;
67
68
let num_trues = ca.num_trues() as IdxSize;
69
let num_nulls = ca.null_count() as IdxSize;
70
let num_falses = ca.len() as IdxSize - num_trues - num_nulls;
71
72
let values: Vec<IdxSize> = match ca.get(0) {
73
Some(false) if num_nulls == 0 && num_trues == 0 => vec![num_falses],
74
Some(false) if num_nulls == 0 => vec![num_falses, num_trues],
75
Some(false) if num_trues == 0 => vec![num_falses, num_nulls],
76
77
Some(true) if num_nulls == 0 && num_falses == 0 => vec![num_trues],
78
Some(true) if num_nulls == 0 => vec![num_trues, num_falses],
79
Some(true) if num_falses == 0 => vec![num_trues, num_nulls],
80
81
None if num_trues == 0 && num_falses == 0 => unreachable!(),
82
None if num_trues == 0 => vec![num_nulls, num_falses],
83
None if num_falses == 0 => vec![num_nulls, num_trues],
84
85
Some(false) => {
86
let first_true = ca.first_true_idx().unwrap();
87
let first_null = ca.first_null().unwrap();
88
89
if first_true < first_null {
90
vec![num_falses, num_trues, num_nulls]
91
} else {
92
vec![num_falses, num_nulls, num_trues]
93
}
94
},
95
Some(true) => {
96
let first_false = ca.first_false_idx().unwrap();
97
let first_null = ca.first_null().unwrap();
98
99
if first_false < first_null {
100
vec![num_trues, num_falses, num_nulls]
101
} else {
102
vec![num_trues, num_nulls, num_falses]
103
}
104
},
105
None => {
106
if ca.get(ca.first_non_null().unwrap()).unwrap() {
107
vec![num_nulls, num_trues, num_falses]
108
} else {
109
vec![num_nulls, num_falses, num_trues]
110
}
111
},
112
};
113
Ok(IdxCa::new(s.name().clone(), values).into_series())
114
},
115
116
#[cfg(feature = "dtype-extension")]
117
DataType::Extension(_, _) => unique_counts(s.ext().unwrap().storage()),
118
119
DataType::UInt8
120
| DataType::UInt16
121
| DataType::UInt32
122
| DataType::UInt64
123
| DataType::UInt128
124
| DataType::Int8
125
| DataType::Int16
126
| DataType::Int32
127
| DataType::Int64
128
| DataType::Int128
129
| DataType::Float16
130
| DataType::Float32
131
| DataType::Float64
132
| DataType::Date
133
| DataType::Datetime(..)
134
| DataType::Duration(..)
135
| DataType::Time => unreachable!("primitive numeric"),
136
#[cfg(feature = "dtype-decimal")]
137
DataType::Decimal(..) => unreachable!("primitive numeric"),
138
#[cfg(feature = "dtype-categorical")]
139
DataType::Categorical(..) | DataType::Enum(..) => unreachable!("primitive numeric"),
140
#[cfg(feature = "dtype-array")]
141
DataType::Array(..) => unreachable!("row encoded"),
142
#[cfg(feature = "dtype-struct")]
143
DataType::Struct(..) => unreachable!("row encoded"),
144
DataType::List(..) => {
145
unreachable!("row encoded")
146
},
147
#[cfg(feature = "object")]
148
dt @ DataType::Object(..) => polars_bail!(opq = unique_counts, dt),
149
dt @ DataType::Unknown(..) => polars_bail!(opq = unique_counts, dt),
150
}
151
}
152
153