Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/legacy/kernels/mod.rs
8430 views
1
use std::iter::Enumerate;
2
3
use crate::array::BooleanArray;
4
use crate::bitmap::utils::BitChunks;
5
pub mod set;
6
pub mod sort_partition;
7
#[cfg(feature = "performant")]
8
pub mod sorted_join;
9
#[cfg(feature = "strings")]
10
pub mod string;
11
pub mod take_agg;
12
mod time;
13
14
pub use time::{Ambiguous, NonExistent};
15
#[cfg(feature = "timezones")]
16
pub use time::{convert_to_naive_local, convert_to_naive_local_opt};
17
18
/// Internal state of [SlicesIterator]
19
#[derive(Debug, PartialEq)]
20
enum State {
21
// it is iterating over bits of a mask (`u64`, steps of size of 1 slot)
22
Bits(u64),
23
// it is iterating over chunks (steps of size of 64 slots)
24
Chunks,
25
// it is iterating over the remaining bits (steps of size of 1 slot)
26
Remainder,
27
// nothing more to iterate.
28
Finish,
29
}
30
31
/// Forked and modified from arrow crate.
32
///
33
/// An iterator of `(usize, usize)` each representing an interval `[start,end[` whose
34
/// slots of a [BooleanArray] are true. Each interval corresponds to a contiguous region of memory to be
35
/// "taken" from an array to be filtered.
36
#[derive(Debug)]
37
struct MaskedSlicesIterator<'a> {
38
iter: Enumerate<BitChunks<'a, u64>>,
39
state: State,
40
remainder_mask: u64,
41
remainder_len: usize,
42
chunk_len: usize,
43
len: usize,
44
start: usize,
45
on_region: bool,
46
current_chunk: usize,
47
current_bit: usize,
48
total_len: usize,
49
}
50
51
impl<'a> MaskedSlicesIterator<'a> {
52
pub(crate) fn new(mask: &'a BooleanArray) -> Self {
53
let chunks = mask.values().chunks::<u64>();
54
55
let chunk_len = mask.len() / 64;
56
let remainder_len = chunks.remainder_len();
57
let remainder_mask = chunks.remainder();
58
59
Self {
60
iter: chunks.enumerate(),
61
state: State::Chunks,
62
remainder_len,
63
chunk_len,
64
remainder_mask,
65
len: 0,
66
start: 0,
67
on_region: false,
68
current_chunk: 0,
69
current_bit: 0,
70
total_len: mask.len(),
71
}
72
}
73
74
#[inline]
75
fn current_start(&self) -> usize {
76
self.current_chunk * 64 + self.current_bit
77
}
78
79
#[inline]
80
fn iterate_bits(&mut self, mask: u64, max: usize) -> Option<(usize, usize)> {
81
while self.current_bit < max {
82
if (mask & (1 << self.current_bit)) != 0 {
83
if !self.on_region {
84
self.start = self.current_start();
85
self.on_region = true;
86
}
87
self.len += 1;
88
} else if self.on_region {
89
let result = (self.start, self.start + self.len);
90
self.len = 0;
91
self.on_region = false;
92
self.current_bit += 1;
93
return Some(result);
94
}
95
self.current_bit += 1;
96
}
97
self.current_bit = 0;
98
None
99
}
100
101
/// iterates over chunks.
102
#[inline]
103
fn iterate_chunks(&mut self) -> Option<(usize, usize)> {
104
while let Some((i, mask)) = self.iter.next() {
105
self.current_chunk = i;
106
if mask == 0 {
107
if self.on_region {
108
let result = (self.start, self.start + self.len);
109
self.len = 0;
110
self.on_region = false;
111
return Some(result);
112
}
113
} else if mask == u64::MAX {
114
// = !0u64
115
if !self.on_region {
116
self.start = self.current_start();
117
self.on_region = true;
118
}
119
self.len += 64;
120
} else {
121
// there is a chunk that has a non-trivial mask => iterate over bits.
122
self.state = State::Bits(mask);
123
return None;
124
}
125
}
126
// no more chunks => start iterating over the remainder
127
self.current_chunk = self.chunk_len;
128
self.state = State::Remainder;
129
None
130
}
131
}
132
133
impl Iterator for MaskedSlicesIterator<'_> {
134
type Item = (usize, usize);
135
136
fn next(&mut self) -> Option<Self::Item> {
137
match self.state {
138
State::Chunks => {
139
match self.iterate_chunks() {
140
None => {
141
// iterating over chunks does not yield any new slice => continue to the next
142
self.current_bit = 0;
143
self.next()
144
},
145
other => other,
146
}
147
},
148
State::Bits(mask) => {
149
match self.iterate_bits(mask, 64) {
150
None => {
151
// iterating over bits does not yield any new slice => change back
152
// to chunks and continue to the next
153
self.state = State::Chunks;
154
self.next()
155
},
156
other => other,
157
}
158
},
159
State::Remainder => match self.iterate_bits(self.remainder_mask, self.remainder_len) {
160
None => {
161
self.state = State::Finish;
162
if self.on_region {
163
Some((self.start, self.start + self.len))
164
} else {
165
None
166
}
167
},
168
other => other,
169
},
170
State::Finish => None,
171
}
172
}
173
}
174
175
#[derive(Eq, PartialEq, Debug)]
176
enum BinaryMaskedState {
177
Start,
178
// Last masks were false values
179
LastFalse,
180
// Last masks were true values
181
LastTrue,
182
Finish,
183
}
184
185
pub(crate) struct BinaryMaskedSliceIterator<'a> {
186
slice_iter: MaskedSlicesIterator<'a>,
187
filled: usize,
188
low: usize,
189
high: usize,
190
state: BinaryMaskedState,
191
}
192
193
impl<'a> BinaryMaskedSliceIterator<'a> {
194
pub(crate) fn new(mask: &'a BooleanArray) -> Self {
195
Self {
196
slice_iter: MaskedSlicesIterator::new(mask),
197
filled: 0,
198
low: 0,
199
high: 0,
200
state: BinaryMaskedState::Start,
201
}
202
}
203
}
204
205
impl Iterator for BinaryMaskedSliceIterator<'_> {
206
type Item = (usize, usize, bool);
207
208
fn next(&mut self) -> Option<Self::Item> {
209
use BinaryMaskedState::*;
210
211
match self.state {
212
Start => {
213
// first iteration
214
if self.low == 0 && self.high == 0 {
215
match self.slice_iter.next() {
216
Some((low, high)) => {
217
self.low = low;
218
self.high = high;
219
220
if low > 0 {
221
// do another start iteration.
222
Some((0, low, false))
223
} else {
224
self.state = LastTrue;
225
self.filled = high;
226
Some((low, high, true))
227
}
228
},
229
None => {
230
self.state = Finish;
231
Some((self.filled, self.slice_iter.total_len, false))
232
},
233
}
234
} else {
235
self.filled = self.high;
236
self.state = LastTrue;
237
Some((self.low, self.high, true))
238
}
239
},
240
LastFalse => {
241
self.state = LastTrue;
242
self.filled = self.high;
243
Some((self.low, self.high, true))
244
},
245
LastTrue => match self.slice_iter.next() {
246
Some((low, high)) => {
247
self.low = low;
248
self.high = high;
249
self.state = LastFalse;
250
let last_filled = self.filled;
251
self.filled = low;
252
Some((last_filled, low, false))
253
},
254
None => {
255
self.state = Finish;
256
if self.filled != self.slice_iter.total_len {
257
Some((self.filled, self.slice_iter.total_len, false))
258
} else {
259
None
260
}
261
},
262
},
263
Finish => None,
264
}
265
}
266
}
267
268
#[cfg(test)]
269
mod test {
270
use super::*;
271
272
#[test]
273
fn test_binary_masked_slice_iter() {
274
let mask = BooleanArray::from_slice([false, false, true, true, true, false, false]);
275
276
let out = BinaryMaskedSliceIterator::new(&mask)
277
.map(|(_, _, b)| b)
278
.collect::<Vec<_>>();
279
assert_eq!(out, &[false, true, false]);
280
let out = BinaryMaskedSliceIterator::new(&mask).collect::<Vec<_>>();
281
assert_eq!(out, &[(0, 2, false), (2, 5, true), (5, 7, false)]);
282
let mask = BooleanArray::from_slice([true, true, false, true]);
283
let out = BinaryMaskedSliceIterator::new(&mask)
284
.map(|(_, _, b)| b)
285
.collect::<Vec<_>>();
286
assert_eq!(out, &[true, false, true]);
287
let mask = BooleanArray::from_slice([true, true, false, true, true]);
288
let out = BinaryMaskedSliceIterator::new(&mask)
289
.map(|(_, _, b)| b)
290
.collect::<Vec<_>>();
291
assert_eq!(out, &[true, false, true]);
292
}
293
294
#[test]
295
fn test_binary_slice_mask_iter_with_false() {
296
let mask = BooleanArray::from_slice([false, false]);
297
let out = BinaryMaskedSliceIterator::new(&mask).collect::<Vec<_>>();
298
assert_eq!(out, &[(0, 2, false)]);
299
}
300
}
301
302