Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/series/ops/is_first_distinct.rs
6939 views
1
use std::hash::Hash;
2
3
use arrow::array::BooleanArray;
4
use arrow::bitmap::MutableBitmap;
5
use arrow::legacy::bit_util::*;
6
use arrow::legacy::utils::CustomIterTools;
7
use polars_core::prelude::*;
8
use polars_core::with_match_physical_numeric_polars_type;
9
use polars_utils::total_ord::{ToTotalOrd, TotalEq, TotalHash};
10
fn is_first_distinct_numeric<T>(ca: &ChunkedArray<T>) -> BooleanChunked
11
where
12
T: PolarsNumericType,
13
T::Native: TotalHash + TotalEq + ToTotalOrd,
14
<T::Native as ToTotalOrd>::TotalOrdItem: Hash + Eq,
15
{
16
let mut unique = PlHashSet::new();
17
let chunks = ca.downcast_iter().map(|arr| -> BooleanArray {
18
arr.into_iter()
19
.map(|opt_v| unique.insert(opt_v.to_total_ord()))
20
.collect_trusted()
21
});
22
23
BooleanChunked::from_chunk_iter(ca.name().clone(), chunks)
24
}
25
26
fn is_first_distinct_bin(ca: &BinaryChunked) -> BooleanChunked {
27
let mut unique = PlHashSet::new();
28
let chunks = ca.downcast_iter().map(|arr| -> BooleanArray {
29
arr.into_iter()
30
.map(|opt_v| unique.insert(opt_v))
31
.collect_trusted()
32
});
33
34
BooleanChunked::from_chunk_iter(ca.name().clone(), chunks)
35
}
36
37
fn is_first_distinct_boolean(ca: &BooleanChunked) -> BooleanChunked {
38
let mut out = MutableBitmap::with_capacity(ca.len());
39
out.extend_constant(ca.len(), false);
40
41
if ca.null_count() == ca.len() {
42
out.set(0, true);
43
} else {
44
let ca = ca.rechunk();
45
let arr = ca.downcast_as_array();
46
if ca.null_count() == 0 {
47
let (true_index, false_index) =
48
find_first_true_false_no_null(arr.values().chunks::<u64>());
49
if let Some(idx) = true_index {
50
out.set(idx, true)
51
}
52
if let Some(idx) = false_index {
53
out.set(idx, true)
54
}
55
} else {
56
let (true_index, false_index, null_index) = find_first_true_false_null(
57
arr.values().chunks::<u64>(),
58
arr.validity().unwrap().chunks::<u64>(),
59
);
60
if let Some(idx) = true_index {
61
out.set(idx, true)
62
}
63
if let Some(idx) = false_index {
64
out.set(idx, true)
65
}
66
if let Some(idx) = null_index {
67
out.set(idx, true)
68
}
69
}
70
}
71
let arr = BooleanArray::new(ArrowDataType::Boolean, out.into(), None);
72
BooleanChunked::with_chunk(ca.name().clone(), arr)
73
}
74
75
/// Builds the first-occurrence mask for a struct column by grouping equal
/// rows and flagging the first row index of every group.
///
/// Propagates any error returned by `group_tuples`.
#[cfg(feature = "dtype-struct")]
fn is_first_distinct_struct(s: &Series) -> PolarsResult<BooleanChunked> {
    let n_rows = s.len();

    // NOTE(review): the two flags are presumably (multithreaded, sorted) — confirm
    // against the `group_tuples` signature.
    let first_rows = s.group_tuples(true, false)?.take_group_firsts();

    // Start from an all-false mask and flip only the group-leading rows.
    let mut mask = MutableBitmap::with_capacity(n_rows);
    mask.extend_constant(n_rows, false);

    first_rows.into_iter().for_each(|row| {
        // SAFETY: group tuples are always in bounds
        unsafe { mask.set_unchecked(row as usize, true) }
    });

    let values = BooleanArray::new(ArrowDataType::Boolean, mask.into(), None);
    Ok(BooleanChunked::with_chunk(s.name().clone(), values))
}
90
91
fn is_first_distinct_list(ca: &ListChunked) -> PolarsResult<BooleanChunked> {
92
let groups = ca.group_tuples(true, false)?;
93
let first = groups.take_group_firsts();
94
let mut out = MutableBitmap::with_capacity(ca.len());
95
out.extend_constant(ca.len(), false);
96
97
for idx in first {
98
// Group tuples are always in bounds
99
unsafe { out.set_unchecked(idx as usize, true) }
100
}
101
102
let arr = BooleanArray::new(ArrowDataType::Boolean, out.into(), None);
103
Ok(BooleanChunked::with_chunk(ca.name().clone(), arr))
104
}
105
106
pub fn is_first_distinct(s: &Series) -> PolarsResult<BooleanChunked> {
107
// fast path.
108
if s.is_empty() {
109
return Ok(BooleanChunked::full_null(s.name().clone(), 0));
110
} else if s.len() == 1 {
111
return Ok(BooleanChunked::new(s.name().clone(), &[true]));
112
}
113
114
let s = s.to_physical_repr();
115
116
use DataType::*;
117
let out = match s.dtype() {
118
Boolean => {
119
let ca = s.bool().unwrap();
120
is_first_distinct_boolean(ca)
121
},
122
Binary => {
123
let ca = s.binary().unwrap();
124
is_first_distinct_bin(ca)
125
},
126
String => {
127
let s = s.cast(&Binary).unwrap();
128
return is_first_distinct(&s);
129
},
130
dt if dt.is_primitive_numeric() => {
131
with_match_physical_numeric_polars_type!(s.dtype(), |$T| {
132
let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
133
is_first_distinct_numeric(ca)
134
})
135
},
136
#[cfg(feature = "dtype-struct")]
137
Struct(_) => return is_first_distinct_struct(&s),
138
List(inner) => {
139
polars_ensure!(
140
!inner.is_nested(),
141
InvalidOperation: "`is_first_distinct` on list type is only allowed if the inner type is not nested."
142
);
143
let ca = s.list().unwrap();
144
return is_first_distinct_list(ca);
145
},
146
dt => polars_bail!(opq = is_first_distinct, dt),
147
};
148
Ok(out)
149
}
150
151