Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/series/ops/rle.rs
8467 views
1
use std::hash::Hash;
2
3
use polars_core::prelude::*;
4
use polars_core::series::IsSorted;
5
use polars_core::with_match_physical_numeric_polars_type;
6
use polars_utils::select::select_unpredictable;
7
use polars_utils::total_ord::{ToTotalOrd, TotalEq, TotalHash};
8
9
pub static RLE_VALUE_COLUMN_NAME: &str = "value";
10
pub static RLE_LENGTH_COLUMN_NAME: &str = "len";
11
12
/// Get the run-lengths of values.
13
pub fn rle_lengths(s: &Column, lengths: &mut Vec<IdxSize>) -> PolarsResult<()> {
14
lengths.clear();
15
if s.is_empty() {
16
return Ok(());
17
}
18
if s.len() == 1 {
19
lengths.push(1);
20
return Ok(());
21
}
22
23
if let Some(sc) = s.as_scalar_column() {
24
lengths.push(sc.len() as IdxSize);
25
return Ok(());
26
}
27
28
let s = s.as_materialized_series().to_physical_repr();
29
match s.dtype() {
30
DataType::Boolean => {
31
let ca: &BooleanChunked = s.as_ref().as_ref().as_ref();
32
rle_lengths_helper_ca(ca, lengths);
33
return Ok(());
34
},
35
dt if dt.is_numeric() => {
36
with_match_physical_numeric_polars_type!(dt, |$T| {
37
let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
38
rle_lengths_helper_ca(ca, lengths);
39
return Ok(());
40
})
41
},
42
DataType::String => {
43
let ca: &StringChunked = s.as_ref().as_ref().as_ref();
44
rle_lengths_helper_ca(&ca.as_binary(), lengths);
45
return Ok(());
46
},
47
DataType::Binary => {
48
let ca: &BinaryChunked = s.as_ref().as_ref().as_ref();
49
rle_lengths_helper_ca(ca, lengths);
50
return Ok(());
51
},
52
_ => {},
53
}
54
55
let (s1, s2) = (s.slice(0, s.len() - 1), s.slice(1, s.len()));
56
let s_neq = s1.not_equal_missing(&s2)?;
57
let n_runs = s_neq.sum().unwrap() + 1;
58
59
lengths.reserve(n_runs as usize);
60
lengths.push(1);
61
62
assert!(!s_neq.has_nulls());
63
for arr in s_neq.downcast_iter() {
64
let mut values = arr.values().clone();
65
while !values.is_empty() {
66
// @NOTE: This `as IdxSize` is safe because it is less than or equal to the a ChunkedArray
67
// length.
68
*lengths.last_mut().unwrap() += values.take_leading_zeros() as IdxSize;
69
70
if !values.is_empty() {
71
lengths.push(1);
72
values.slice(1, values.len() - 1);
73
}
74
}
75
}
76
Ok(())
77
}
78
79
fn rle_lengths_helper_ca<'a, T>(ca: &'a ChunkedArray<T>, lengths: &mut Vec<IdxSize>)
80
where
81
T: PolarsDataType,
82
T::Physical<'a>: TotalHash + TotalEq + ToTotalOrd + Copy,
83
<T::Physical<'a> as ToTotalOrd>::TotalOrdItem: Hash + Eq + Copy,
84
{
85
lengths.clear();
86
if ca.is_empty() {
87
return;
88
}
89
90
unsafe {
91
lengths.reserve(ca.len());
92
93
if ca.has_nulls() {
94
let mut prev = ca.get_unchecked(0).map(|v| v.to_total_ord());
95
let mut out_idx = 0;
96
let mut run_len = 0;
97
for arr in ca.downcast_iter() {
98
for val in arr.iter() {
99
let val = val.map(|v| v.to_total_ord());
100
let diff = val != prev;
101
run_len = 1 + select_unpredictable(diff, 0, run_len);
102
out_idx += diff as usize;
103
lengths.as_mut_ptr().add(out_idx).write(run_len);
104
prev = val;
105
}
106
}
107
lengths.set_len(out_idx + 1);
108
} else {
109
let mut prev = ca.value_unchecked(0).to_total_ord();
110
let mut out_idx = 0;
111
let mut run_len = 0;
112
for arr in ca.downcast_iter() {
113
for val in arr.values_iter() {
114
let val = val.to_total_ord();
115
let diff = val != prev;
116
run_len = 1 + select_unpredictable(diff, 0, run_len);
117
out_idx += diff as usize;
118
lengths.as_mut_ptr().add(out_idx).write(run_len);
119
prev = val;
120
}
121
}
122
lengths.set_len(out_idx + 1);
123
}
124
}
125
}
126
127
/// Get the lengths of runs of identical values.
128
pub fn rle(s: &Column) -> PolarsResult<Column> {
129
let mut lengths = Vec::new();
130
rle_lengths(s, &mut lengths)?;
131
132
let mut idxs = Vec::with_capacity(lengths.len());
133
if !lengths.is_empty() {
134
idxs.push(0);
135
for length in &lengths[..lengths.len() - 1] {
136
idxs.push(*idxs.last().unwrap() + length);
137
}
138
}
139
140
let vals = s
141
.take_slice(&idxs)
142
.unwrap()
143
.with_name(PlSmallStr::from_static(RLE_VALUE_COLUMN_NAME));
144
let outvals = vec![
145
Series::from_vec(PlSmallStr::from_static(RLE_LENGTH_COLUMN_NAME), lengths).into(),
146
vals,
147
];
148
Ok(StructChunked::from_columns(s.name().clone(), idxs.len(), &outvals)?.into_column())
149
}
150
151
/// Similar to `rle`, but maps values to run IDs.
152
pub fn rle_id(s: &Column) -> PolarsResult<Column> {
153
if s.is_empty() {
154
return Ok(Column::new_empty(s.name().clone(), &IDX_DTYPE));
155
}
156
157
let (s1, s2) = (s.slice(0, s.len() - 1), s.slice(1, s.len()));
158
let s_neq = s1
159
.as_materialized_series()
160
.not_equal_missing(s2.as_materialized_series())?;
161
162
let mut out = Vec::<IdxSize>::with_capacity(s.len());
163
let mut last = 0;
164
out.push(last); // Run numbers start at zero
165
assert_eq!(s_neq.null_count(), 0);
166
for a in s_neq.downcast_iter() {
167
for aa in a.values_iter() {
168
last += aa as IdxSize;
169
out.push(last);
170
}
171
}
172
Ok(IdxCa::from_vec(s.name().clone(), out)
173
.with_sorted_flag(IsSorted::Ascending)
174
.into_column())
175
}
176
177