Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/series/ops/various.rs
6939 views
1
use num_traits::Bounded;
2
#[cfg(feature = "dtype-struct")]
3
use polars_core::chunked_array::ops::row_encode::_get_rows_encoded_ca;
4
use polars_core::prelude::arity::unary_elementwise_values;
5
use polars_core::prelude::*;
6
use polars_core::series::IsSorted;
7
use polars_core::with_match_physical_numeric_polars_type;
8
#[cfg(feature = "hash")]
9
use polars_utils::aliases::PlSeedableRandomStateQuality;
10
use polars_utils::total_ord::TotalOrd;
11
12
use crate::series::ops::SeriesSealed;
13
14
pub trait SeriesMethods: SeriesSealed {
15
/// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
16
/// with dtype [`IdxType`]
17
fn value_counts(
18
&self,
19
sort: bool,
20
parallel: bool,
21
name: PlSmallStr,
22
normalize: bool,
23
) -> PolarsResult<DataFrame> {
24
let s = self.as_series();
25
polars_ensure!(
26
s.name() != &name,
27
Duplicate: "using `value_counts` on a column/series named '{}' would lead to duplicate \
28
column names; change `name` to fix", name,
29
);
30
// we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined
31
let groups = s.group_tuples(parallel, sort)?;
32
let values = unsafe { s.agg_first(&groups) }
33
.with_name(s.name().clone())
34
.into();
35
let counts = groups.group_count().with_name(name.clone());
36
37
let counts = if normalize {
38
let len = s.len() as f64;
39
let counts: Float64Chunked =
40
unary_elementwise_values(&counts, |count| count as f64 / len);
41
counts.into_column()
42
} else {
43
counts.into_column()
44
};
45
46
let height = counts.len();
47
let cols = vec![values, counts];
48
let df = unsafe { DataFrame::new_no_checks(height, cols) };
49
if sort {
50
df.sort(
51
[name],
52
SortMultipleOptions::default()
53
.with_order_descending(true)
54
.with_multithreaded(parallel),
55
)
56
} else {
57
Ok(df)
58
}
59
}
60
61
#[cfg(feature = "hash")]
62
fn hash(&self, build_hasher: PlSeedableRandomStateQuality) -> UInt64Chunked {
63
let s = self.as_series().to_physical_repr();
64
match s.dtype() {
65
DataType::List(_) => {
66
let mut ca = s.list().unwrap().clone();
67
crate::chunked_array::hash::hash(&mut ca, build_hasher)
68
},
69
_ => {
70
let mut h = vec![];
71
s.0.vec_hash(build_hasher, &mut h).unwrap();
72
UInt64Chunked::from_vec(s.name().clone(), h)
73
},
74
}
75
}
76
77
fn ensure_sorted_arg(&self, operation: &str) -> PolarsResult<()> {
78
polars_ensure!(self.is_sorted(Default::default())?, InvalidOperation: "argument in operation '{}' is not sorted, please sort the 'expr/series/column' first", operation);
79
Ok(())
80
}
81
82
/// Checks if a [`Series`] is sorted. Tries to fail fast.
83
fn is_sorted(&self, options: SortOptions) -> PolarsResult<bool> {
84
let s = self.as_series();
85
let null_count = s.null_count();
86
87
// fast paths
88
if (options.descending
89
&& (options.nulls_last || null_count == 0)
90
&& matches!(s.is_sorted_flag(), IsSorted::Descending))
91
|| (!options.descending
92
&& (!options.nulls_last || null_count == 0)
93
&& matches!(s.is_sorted_flag(), IsSorted::Ascending))
94
{
95
return Ok(true);
96
}
97
98
// for struct types we row-encode and recurse
99
#[cfg(feature = "dtype-struct")]
100
if matches!(s.dtype(), DataType::Struct(_)) {
101
let encoded = _get_rows_encoded_ca(
102
PlSmallStr::EMPTY,
103
&[s.clone().into()],
104
&[options.descending],
105
&[options.nulls_last],
106
)?;
107
return encoded.into_series().is_sorted(options);
108
}
109
110
let s_len = s.len();
111
if null_count == s_len {
112
// All nulls is all equal
113
return Ok(true);
114
}
115
// Check if nulls are in the right location.
116
if null_count > 0 {
117
// The slice triggers a fast null count
118
if options.nulls_last {
119
if s.slice((s_len - null_count) as i64, null_count)
120
.null_count()
121
!= null_count
122
{
123
return Ok(false);
124
}
125
} else if s.slice(0, null_count).null_count() != null_count {
126
return Ok(false);
127
}
128
}
129
130
if s.dtype().is_primitive_numeric() {
131
with_match_physical_numeric_polars_type!(s.dtype(), |$T| {
132
let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
133
return Ok(is_sorted_ca_num::<$T>(ca, options))
134
})
135
}
136
137
let cmp_len = s_len - null_count - 1; // Number of comparisons we might have to do
138
// TODO! Change this, allocation of a full boolean series is too expensive and doesn't fail fast.
139
// Compare adjacent elements with no-copy slices that don't include any nulls
140
let offset = !options.nulls_last as i64 * null_count as i64;
141
let (s1, s2) = (s.slice(offset, cmp_len), s.slice(offset + 1, cmp_len));
142
let cmp_op = if options.descending {
143
Series::gt_eq
144
} else {
145
Series::lt_eq
146
};
147
Ok(cmp_op(&s1, &s2)?.all())
148
}
149
}
150
151
fn check_cmp<T: NumericNative, Cmp: Fn(&T, &T) -> bool>(
152
vals: &[T],
153
f: Cmp,
154
previous: &mut T,
155
) -> bool {
156
let mut sorted = true;
157
158
// Outer loop so we can fail fast
159
// Inner loop will auto vectorize
160
for c in vals.chunks(1024) {
161
// don't early stop or branch
162
// so it autovectorizes
163
for v in c {
164
sorted &= f(previous, v);
165
*previous = *v;
166
}
167
if !sorted {
168
return false;
169
}
170
}
171
sorted
172
}
173
174
// Assumes nulls last/first is already checked.
175
fn is_sorted_ca_num<T: PolarsNumericType>(ca: &ChunkedArray<T>, options: SortOptions) -> bool {
176
if let Ok(vals) = ca.cont_slice() {
177
let mut previous = vals[0];
178
return if options.descending {
179
check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)
180
} else {
181
check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)
182
};
183
};
184
185
if ca.null_count() == 0 {
186
let mut previous = if options.descending {
187
T::Native::max_value()
188
} else {
189
T::Native::min_value()
190
};
191
for arr in ca.downcast_iter() {
192
let vals = arr.values();
193
194
let sorted = if options.descending {
195
check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)
196
} else {
197
check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)
198
};
199
if !sorted {
200
return false;
201
}
202
}
203
return true;
204
};
205
206
// Slice off nulls and recurse.
207
let null_count = ca.null_count();
208
if options.nulls_last {
209
let ca = ca.slice(0, ca.len() - null_count);
210
is_sorted_ca_num(&ca, options)
211
} else {
212
let ca = ca.slice(null_count as i64, ca.len() - null_count);
213
is_sorted_ca_num(&ca, options)
214
}
215
}
216
217
impl SeriesMethods for Series {}
218
219