Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/bitmap/utils/slice_iterator.rs
6939 views
1
use crate::bitmap::Bitmap;
2
3
/// Internal state of [`SlicesIterator`]
4
#[derive(Debug, Clone, PartialEq)]
5
enum State {
6
// normal iteration
7
Nominal,
8
// nothing more to iterate.
9
Finished,
10
}
11
12
/// Iterator over a bitmap that returns slices of set regions.
13
///
14
/// This is the most efficient method to extract slices of values from arrays
15
/// with a validity bitmap.
16
/// For example, the bitmap `00101111` returns `[(0,4), (6,1)]`
17
#[derive(Debug, Clone)]
18
pub struct SlicesIterator<'a> {
19
values: std::slice::Iter<'a, u8>,
20
count: usize,
21
mask: u8,
22
max_len: usize,
23
current_byte: &'a u8,
24
state: State,
25
len: usize,
26
start: usize,
27
on_region: bool,
28
}
29
30
impl<'a> SlicesIterator<'a> {
31
/// Creates a new [`SlicesIterator`]
32
pub fn new(values: &'a Bitmap) -> Self {
33
let (buffer, offset, _) = values.as_slice();
34
let mut iter = buffer.iter();
35
36
let (current_byte, state) = match iter.next() {
37
Some(b) => (b, State::Nominal),
38
None => (&0, State::Finished),
39
};
40
41
Self {
42
state,
43
count: values.len() - values.unset_bits(),
44
max_len: values.len(),
45
values: iter,
46
mask: 1u8.rotate_left(offset as u32),
47
current_byte,
48
len: 0,
49
start: 0,
50
on_region: false,
51
}
52
}
53
54
#[inline]
55
fn finish(&mut self) -> Option<(usize, usize)> {
56
self.state = State::Finished;
57
if self.on_region {
58
Some((self.start, self.len))
59
} else {
60
None
61
}
62
}
63
64
#[inline]
65
fn current_len(&self) -> usize {
66
self.start + self.len
67
}
68
69
/// Returns the total number of slots.
70
/// It corresponds to the sum of all lengths of all slices.
71
#[inline]
72
pub fn slots(&self) -> usize {
73
self.count
74
}
75
}
76
77
impl Iterator for SlicesIterator<'_> {
78
type Item = (usize, usize);
79
80
#[inline]
81
fn next(&mut self) -> Option<Self::Item> {
82
loop {
83
if self.state == State::Finished {
84
return None;
85
}
86
if self.current_len() == self.max_len {
87
return self.finish();
88
}
89
90
if self.mask == 1 {
91
// at the beginning of a byte => try to skip it all together
92
match (self.on_region, self.current_byte) {
93
(true, &255u8) => {
94
self.len = std::cmp::min(self.max_len - self.start, self.len + 8);
95
if let Some(v) = self.values.next() {
96
self.current_byte = v;
97
};
98
continue;
99
},
100
(false, &0) => {
101
self.len = std::cmp::min(self.max_len - self.start, self.len + 8);
102
if let Some(v) = self.values.next() {
103
self.current_byte = v;
104
};
105
continue;
106
},
107
_ => (), // we need to run over all bits of this byte
108
}
109
};
110
111
let value = (self.current_byte & self.mask) != 0;
112
self.mask = self.mask.rotate_left(1);
113
114
match (self.on_region, value) {
115
(true, true) => self.len += 1,
116
(false, false) => self.len += 1,
117
(true, false) => {
118
self.on_region = false;
119
let result = (self.start, self.len);
120
self.start += self.len;
121
self.len = 1;
122
if self.mask == 1 {
123
// reached a new byte => try to fetch it from the iterator
124
if let Some(v) = self.values.next() {
125
self.current_byte = v;
126
};
127
}
128
return Some(result);
129
},
130
(false, true) => {
131
self.start += self.len;
132
self.len = 1;
133
self.on_region = true;
134
},
135
}
136
137
if self.mask == 1 {
138
// reached a new byte => try to fetch it from the iterator
139
match self.values.next() {
140
Some(v) => self.current_byte = v,
141
None => return self.finish(),
142
};
143
}
144
}
145
}
146
}
147
148