Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/legacy/kernels/mod.rs
6939 views
1
use std::iter::Enumerate;
2
3
use crate::array::BooleanArray;
4
use crate::bitmap::utils::BitChunks;
5
pub mod ewm;
6
pub mod set;
7
pub mod sort_partition;
8
#[cfg(feature = "performant")]
9
pub mod sorted_join;
10
#[cfg(feature = "strings")]
11
pub mod string;
12
pub mod take_agg;
13
mod time;
14
15
pub use time::{Ambiguous, NonExistent};
16
#[cfg(feature = "timezones")]
17
pub use time::{convert_to_naive_local, convert_to_naive_local_opt};
18
19
/// Internal state of [SlicesIterator]
20
#[derive(Debug, PartialEq)]
21
enum State {
22
// it is iterating over bits of a mask (`u64`, steps of size of 1 slot)
23
Bits(u64),
24
// it is iterating over chunks (steps of size of 64 slots)
25
Chunks,
26
// it is iterating over the remaining bits (steps of size of 1 slot)
27
Remainder,
28
// nothing more to iterate.
29
Finish,
30
}
31
32
/// Forked and modified from arrow crate.
33
///
34
/// An iterator of `(usize, usize)` each representing an interval `[start,end[` whose
35
/// slots of a [BooleanArray] are true. Each interval corresponds to a contiguous region of memory to be
36
/// "taken" from an array to be filtered.
37
#[derive(Debug)]
38
struct MaskedSlicesIterator<'a> {
39
iter: Enumerate<BitChunks<'a, u64>>,
40
state: State,
41
remainder_mask: u64,
42
remainder_len: usize,
43
chunk_len: usize,
44
len: usize,
45
start: usize,
46
on_region: bool,
47
current_chunk: usize,
48
current_bit: usize,
49
total_len: usize,
50
}
51
52
impl<'a> MaskedSlicesIterator<'a> {
53
pub(crate) fn new(mask: &'a BooleanArray) -> Self {
54
let chunks = mask.values().chunks::<u64>();
55
56
let chunk_len = mask.len() / 64;
57
let remainder_len = chunks.remainder_len();
58
let remainder_mask = chunks.remainder();
59
60
Self {
61
iter: chunks.enumerate(),
62
state: State::Chunks,
63
remainder_len,
64
chunk_len,
65
remainder_mask,
66
len: 0,
67
start: 0,
68
on_region: false,
69
current_chunk: 0,
70
current_bit: 0,
71
total_len: mask.len(),
72
}
73
}
74
75
#[inline]
76
fn current_start(&self) -> usize {
77
self.current_chunk * 64 + self.current_bit
78
}
79
80
#[inline]
81
fn iterate_bits(&mut self, mask: u64, max: usize) -> Option<(usize, usize)> {
82
while self.current_bit < max {
83
if (mask & (1 << self.current_bit)) != 0 {
84
if !self.on_region {
85
self.start = self.current_start();
86
self.on_region = true;
87
}
88
self.len += 1;
89
} else if self.on_region {
90
let result = (self.start, self.start + self.len);
91
self.len = 0;
92
self.on_region = false;
93
self.current_bit += 1;
94
return Some(result);
95
}
96
self.current_bit += 1;
97
}
98
self.current_bit = 0;
99
None
100
}
101
102
/// iterates over chunks.
103
#[inline]
104
fn iterate_chunks(&mut self) -> Option<(usize, usize)> {
105
while let Some((i, mask)) = self.iter.next() {
106
self.current_chunk = i;
107
if mask == 0 {
108
if self.on_region {
109
let result = (self.start, self.start + self.len);
110
self.len = 0;
111
self.on_region = false;
112
return Some(result);
113
}
114
} else if mask == u64::MAX {
115
// = !0u64
116
if !self.on_region {
117
self.start = self.current_start();
118
self.on_region = true;
119
}
120
self.len += 64;
121
} else {
122
// there is a chunk that has a non-trivial mask => iterate over bits.
123
self.state = State::Bits(mask);
124
return None;
125
}
126
}
127
// no more chunks => start iterating over the remainder
128
self.current_chunk = self.chunk_len;
129
self.state = State::Remainder;
130
None
131
}
132
}
133
134
impl Iterator for MaskedSlicesIterator<'_> {
135
type Item = (usize, usize);
136
137
fn next(&mut self) -> Option<Self::Item> {
138
match self.state {
139
State::Chunks => {
140
match self.iterate_chunks() {
141
None => {
142
// iterating over chunks does not yield any new slice => continue to the next
143
self.current_bit = 0;
144
self.next()
145
},
146
other => other,
147
}
148
},
149
State::Bits(mask) => {
150
match self.iterate_bits(mask, 64) {
151
None => {
152
// iterating over bits does not yield any new slice => change back
153
// to chunks and continue to the next
154
self.state = State::Chunks;
155
self.next()
156
},
157
other => other,
158
}
159
},
160
State::Remainder => match self.iterate_bits(self.remainder_mask, self.remainder_len) {
161
None => {
162
self.state = State::Finish;
163
if self.on_region {
164
Some((self.start, self.start + self.len))
165
} else {
166
None
167
}
168
},
169
other => other,
170
},
171
State::Finish => None,
172
}
173
}
174
}
175
176
#[derive(Eq, PartialEq, Debug)]
177
enum BinaryMaskedState {
178
Start,
179
// Last masks were false values
180
LastFalse,
181
// Last masks were true values
182
LastTrue,
183
Finish,
184
}
185
186
pub(crate) struct BinaryMaskedSliceIterator<'a> {
187
slice_iter: MaskedSlicesIterator<'a>,
188
filled: usize,
189
low: usize,
190
high: usize,
191
state: BinaryMaskedState,
192
}
193
194
impl<'a> BinaryMaskedSliceIterator<'a> {
195
pub(crate) fn new(mask: &'a BooleanArray) -> Self {
196
Self {
197
slice_iter: MaskedSlicesIterator::new(mask),
198
filled: 0,
199
low: 0,
200
high: 0,
201
state: BinaryMaskedState::Start,
202
}
203
}
204
}
205
206
impl Iterator for BinaryMaskedSliceIterator<'_> {
207
type Item = (usize, usize, bool);
208
209
fn next(&mut self) -> Option<Self::Item> {
210
use BinaryMaskedState::*;
211
212
match self.state {
213
Start => {
214
// first iteration
215
if self.low == 0 && self.high == 0 {
216
match self.slice_iter.next() {
217
Some((low, high)) => {
218
self.low = low;
219
self.high = high;
220
221
if low > 0 {
222
// do another start iteration.
223
Some((0, low, false))
224
} else {
225
self.state = LastTrue;
226
self.filled = high;
227
Some((low, high, true))
228
}
229
},
230
None => {
231
self.state = Finish;
232
Some((self.filled, self.slice_iter.total_len, false))
233
},
234
}
235
} else {
236
self.filled = self.high;
237
self.state = LastTrue;
238
Some((self.low, self.high, true))
239
}
240
},
241
LastFalse => {
242
self.state = LastTrue;
243
self.filled = self.high;
244
Some((self.low, self.high, true))
245
},
246
LastTrue => match self.slice_iter.next() {
247
Some((low, high)) => {
248
self.low = low;
249
self.high = high;
250
self.state = LastFalse;
251
let last_filled = self.filled;
252
self.filled = low;
253
Some((last_filled, low, false))
254
},
255
None => {
256
self.state = Finish;
257
if self.filled != self.slice_iter.total_len {
258
Some((self.filled, self.slice_iter.total_len, false))
259
} else {
260
None
261
}
262
},
263
},
264
Finish => None,
265
}
266
}
267
}
268
269
#[cfg(test)]
270
mod test {
271
use super::*;
272
273
#[test]
274
fn test_binary_masked_slice_iter() {
275
let mask = BooleanArray::from_slice([false, false, true, true, true, false, false]);
276
277
let out = BinaryMaskedSliceIterator::new(&mask)
278
.map(|(_, _, b)| b)
279
.collect::<Vec<_>>();
280
assert_eq!(out, &[false, true, false]);
281
let out = BinaryMaskedSliceIterator::new(&mask).collect::<Vec<_>>();
282
assert_eq!(out, &[(0, 2, false), (2, 5, true), (5, 7, false)]);
283
let mask = BooleanArray::from_slice([true, true, false, true]);
284
let out = BinaryMaskedSliceIterator::new(&mask)
285
.map(|(_, _, b)| b)
286
.collect::<Vec<_>>();
287
assert_eq!(out, &[true, false, true]);
288
let mask = BooleanArray::from_slice([true, true, false, true, true]);
289
let out = BinaryMaskedSliceIterator::new(&mask)
290
.map(|(_, _, b)| b)
291
.collect::<Vec<_>>();
292
assert_eq!(out, &[true, false, true]);
293
}
294
295
#[test]
296
fn test_binary_slice_mask_iter_with_false() {
297
let mask = BooleanArray::from_slice([false, false]);
298
let out = BinaryMaskedSliceIterator::new(&mask).collect::<Vec<_>>();
299
assert_eq!(out, &[(0, 2, false)]);
300
}
301
}
302
303