Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/bitmap/iterator.rs
6939 views
1
use polars_utils::slice::load_padded_le_u64;
2
3
use super::Bitmap;
4
use super::bitmask::BitMask;
5
use crate::trusted_len::TrustedLen;
6
7
/// Calculates how many iterations are remaining, assuming:
8
/// - We have length elements left.
9
/// - We need max(consume, min_length_for_iter) elements to start a new iteration.
10
/// - On each iteration we consume the given amount of elements.
11
fn calc_iters_remaining(length: usize, min_length_for_iter: usize, consume: usize) -> usize {
12
let min_length_for_iter = min_length_for_iter.max(consume);
13
if length < min_length_for_iter {
14
return 0;
15
}
16
17
let obvious_part = length - min_length_for_iter;
18
let obvious_iters = obvious_part / consume;
19
// let obvious_part_remaining = obvious_part % consume;
20
// let total_remaining = min_length_for_iter + obvious_part_remaining;
21
// assert!(total_remaining >= min_length_for_iter); // We have at least 1 more iter.
22
// assert!(obvious_part_remaining < consume); // Basic modulo property.
23
// assert!(total_remaining < min_length_for_iter + consume); // Add min_length_for_iter to both sides.
24
// assert!(total_remaining - consume < min_length_for_iter); // Not enough remaining after 1 iter.
25
1 + obvious_iters // Thus always exactly 1 more iter.
26
}
27
28
#[derive(Clone)]
29
pub struct TrueIdxIter<'a> {
30
mask: BitMask<'a>,
31
first_unknown: usize,
32
i: usize,
33
len: usize,
34
remaining: usize,
35
}
36
37
impl<'a> TrueIdxIter<'a> {
38
#[inline]
39
pub fn new(len: usize, validity: Option<&'a Bitmap>) -> Self {
40
if let Some(bitmap) = validity {
41
assert!(len == bitmap.len());
42
Self {
43
mask: BitMask::from_bitmap(bitmap),
44
first_unknown: 0,
45
i: 0,
46
remaining: bitmap.len() - bitmap.unset_bits(),
47
len,
48
}
49
} else {
50
Self {
51
mask: BitMask::default(),
52
first_unknown: len,
53
i: 0,
54
remaining: len,
55
len,
56
}
57
}
58
}
59
}
60
61
impl Iterator for TrueIdxIter<'_> {
62
type Item = usize;
63
64
#[inline]
65
fn next(&mut self) -> Option<Self::Item> {
66
// Fast path for many non-nulls in a row.
67
if self.i < self.first_unknown {
68
let ret = self.i;
69
self.i += 1;
70
self.remaining -= 1;
71
return Some(ret);
72
}
73
74
while self.i < self.len {
75
let mask = self.mask.get_u32(self.i);
76
let num_null = mask.trailing_zeros();
77
self.i += num_null as usize;
78
if num_null < 32 {
79
self.first_unknown = self.i + (mask >> num_null).trailing_ones() as usize;
80
let ret = self.i;
81
self.i += 1;
82
self.remaining -= 1;
83
return Some(ret);
84
}
85
}
86
87
None
88
}
89
90
#[inline]
91
fn size_hint(&self) -> (usize, Option<usize>) {
92
(self.remaining, Some(self.remaining))
93
}
94
}
95
96
unsafe impl TrustedLen for TrueIdxIter<'_> {}
97
98
pub struct FastU32BitmapIter<'a> {
99
bytes: &'a [u8],
100
shift: u32,
101
bits_left: usize,
102
}
103
104
impl<'a> FastU32BitmapIter<'a> {
105
pub fn new(bytes: &'a [u8], offset: usize, len: usize) -> Self {
106
assert!(bytes.len() * 8 >= offset + len);
107
let shift = (offset % 8) as u32;
108
let bytes = &bytes[offset / 8..];
109
Self {
110
bytes,
111
shift,
112
bits_left: len,
113
}
114
}
115
116
// The iteration logic that would normally follow the fast-path.
117
fn next_remainder(&mut self) -> Option<u32> {
118
if self.bits_left > 0 {
119
let word = load_padded_le_u64(self.bytes);
120
let mask;
121
if self.bits_left >= 32 {
122
mask = u32::MAX;
123
self.bits_left -= 32;
124
self.bytes = unsafe { self.bytes.get_unchecked(4..) };
125
} else {
126
mask = (1 << self.bits_left) - 1;
127
self.bits_left = 0;
128
}
129
130
return Some((word >> self.shift) as u32 & mask);
131
}
132
133
None
134
}
135
136
/// Returns the remainder bits and how many there are,
137
/// assuming the iterator was fully consumed.
138
pub fn remainder(mut self) -> (u64, usize) {
139
let bits_left = self.bits_left;
140
let lo = self.next_remainder().unwrap_or(0);
141
let hi = self.next_remainder().unwrap_or(0);
142
(((hi as u64) << 32) | (lo as u64), bits_left)
143
}
144
}
145
146
impl Iterator for FastU32BitmapIter<'_> {
147
type Item = u32;
148
149
#[inline]
150
fn next(&mut self) -> Option<Self::Item> {
151
// Fast path, can load a whole u64.
152
if self.bits_left >= 64 {
153
let chunk;
154
unsafe {
155
// SAFETY: bits_left ensures this is in-bounds.
156
chunk = self.bytes.get_unchecked(0..8);
157
self.bytes = self.bytes.get_unchecked(4..);
158
}
159
self.bits_left -= 32;
160
let word = u64::from_le_bytes(chunk.try_into().unwrap());
161
return Some((word >> self.shift) as u32);
162
}
163
164
None
165
}
166
167
#[inline]
168
fn size_hint(&self) -> (usize, Option<usize>) {
169
let hint = calc_iters_remaining(self.bits_left, 64, 32);
170
(hint, Some(hint))
171
}
172
}
173
174
unsafe impl TrustedLen for FastU32BitmapIter<'_> {}
175
176
#[derive(Clone)]
177
pub struct FastU56BitmapIter<'a> {
178
bytes: &'a [u8],
179
shift: u32,
180
bits_left: usize,
181
}
182
183
impl<'a> FastU56BitmapIter<'a> {
184
pub fn new(bytes: &'a [u8], offset: usize, len: usize) -> Self {
185
assert!(bytes.len() * 8 >= offset + len);
186
let shift = (offset % 8) as u32;
187
let bytes = &bytes[offset / 8..];
188
Self {
189
bytes,
190
shift,
191
bits_left: len,
192
}
193
}
194
195
// The iteration logic that would normally follow the fast-path.
196
fn next_remainder(&mut self) -> Option<u64> {
197
if self.bits_left > 0 {
198
let word = load_padded_le_u64(self.bytes);
199
let mask;
200
if self.bits_left >= 56 {
201
mask = (1 << 56) - 1;
202
self.bits_left -= 56;
203
self.bytes = unsafe { self.bytes.get_unchecked(7..) };
204
} else {
205
mask = (1 << self.bits_left) - 1;
206
self.bits_left = 0;
207
};
208
209
return Some((word >> self.shift) & mask);
210
}
211
212
None
213
}
214
215
/// Returns the remainder bits and how many there are,
216
/// assuming the iterator was fully consumed. Output is safe but
217
/// not specified if the iterator wasn't fully consumed.
218
pub fn remainder(mut self) -> (u64, usize) {
219
let bits_left = self.bits_left;
220
let lo = self.next_remainder().unwrap_or(0);
221
let hi = self.next_remainder().unwrap_or(0);
222
((hi << 56) | lo, bits_left)
223
}
224
}
225
226
impl Iterator for FastU56BitmapIter<'_> {
227
type Item = u64;
228
229
#[inline]
230
fn next(&mut self) -> Option<Self::Item> {
231
// Fast path, can load a whole u64.
232
if self.bits_left >= 64 {
233
let chunk;
234
unsafe {
235
// SAFETY: bits_left ensures this is in-bounds.
236
chunk = self.bytes.get_unchecked(0..8);
237
self.bytes = self.bytes.get_unchecked(7..);
238
self.bits_left -= 56;
239
}
240
241
let word = u64::from_le_bytes(chunk.try_into().unwrap());
242
let mask = (1 << 56) - 1;
243
return Some((word >> self.shift) & mask);
244
}
245
246
None
247
}
248
249
#[inline]
250
fn size_hint(&self) -> (usize, Option<usize>) {
251
let hint = calc_iters_remaining(self.bits_left, 64, 56);
252
(hint, Some(hint))
253
}
254
}
255
256
unsafe impl TrustedLen for FastU56BitmapIter<'_> {}
257
258
pub struct FastU64BitmapIter<'a> {
259
bytes: &'a [u8],
260
shift: u32,
261
bits_left: usize,
262
next_word: u64,
263
}
264
265
impl<'a> FastU64BitmapIter<'a> {
266
pub fn new(bytes: &'a [u8], offset: usize, len: usize) -> Self {
267
assert!(bytes.len() * 8 >= offset + len);
268
let shift = (offset % 8) as u32;
269
let bytes = &bytes[offset / 8..];
270
let next_word = load_padded_le_u64(bytes);
271
let bytes = bytes.get(8..).unwrap_or(&[]);
272
Self {
273
bytes,
274
shift,
275
bits_left: len,
276
next_word,
277
}
278
}
279
280
#[inline]
281
fn combine(&self, lo: u64, hi: u64) -> u64 {
282
// Compiles to 128-bit SHRD instruction on x86-64.
283
// Yes, the % 64 is important for the compiler to generate optimal code.
284
let wide = ((hi as u128) << 64) | lo as u128;
285
(wide >> (self.shift % 64)) as u64
286
}
287
288
// The iteration logic that would normally follow the fast-path.
289
fn next_remainder(&mut self) -> Option<u64> {
290
if self.bits_left > 0 {
291
let lo = self.next_word;
292
let hi = load_padded_le_u64(self.bytes);
293
let mask;
294
if self.bits_left >= 64 {
295
mask = u64::MAX;
296
self.bits_left -= 64;
297
self.bytes = self.bytes.get(8..).unwrap_or(&[]);
298
} else {
299
mask = (1 << self.bits_left) - 1;
300
self.bits_left = 0;
301
};
302
self.next_word = hi;
303
304
return Some(self.combine(lo, hi) & mask);
305
}
306
307
None
308
}
309
310
/// Returns the remainder bits and how many there are,
311
/// assuming the iterator was fully consumed. Output is safe but
312
/// not specified if the iterator wasn't fully consumed.
313
pub fn remainder(mut self) -> ([u64; 2], usize) {
314
let bits_left = self.bits_left;
315
let lo = self.next_remainder().unwrap_or(0);
316
let hi = self.next_remainder().unwrap_or(0);
317
([lo, hi], bits_left)
318
}
319
}
320
321
impl Iterator for FastU64BitmapIter<'_> {
322
type Item = u64;
323
324
#[inline]
325
fn next(&mut self) -> Option<Self::Item> {
326
// Fast path: can load two u64s in a row.
327
// (Note that we already loaded one in the form of self.next_word).
328
if self.bits_left >= 128 {
329
let chunk;
330
unsafe {
331
// SAFETY: bits_left ensures this is in-bounds.
332
chunk = self.bytes.get_unchecked(0..8);
333
self.bytes = self.bytes.get_unchecked(8..);
334
}
335
let lo = self.next_word;
336
let hi = u64::from_le_bytes(chunk.try_into().unwrap());
337
self.next_word = hi;
338
self.bits_left -= 64;
339
340
return Some(self.combine(lo, hi));
341
}
342
343
None
344
}
345
346
#[inline]
347
fn size_hint(&self) -> (usize, Option<usize>) {
348
let hint = calc_iters_remaining(self.bits_left, 128, 64);
349
(hint, Some(hint))
350
}
351
}
352
353
unsafe impl TrustedLen for FastU64BitmapIter<'_> {}
354
355
/// This crates' equivalent of [`std::vec::IntoIter`] for [`Bitmap`].
356
#[derive(Debug, Clone)]
357
pub struct IntoIter {
358
values: Bitmap,
359
index: usize,
360
end: usize,
361
}
362
363
impl IntoIter {
364
/// Creates a new [`IntoIter`] from a [`Bitmap`]
365
#[inline]
366
pub fn new(values: Bitmap) -> Self {
367
let end = values.len();
368
Self {
369
values,
370
index: 0,
371
end,
372
}
373
}
374
}
375
376
impl Iterator for IntoIter {
377
type Item = bool;
378
379
#[inline]
380
fn next(&mut self) -> Option<Self::Item> {
381
if self.index == self.end {
382
return None;
383
}
384
let old = self.index;
385
self.index += 1;
386
Some(unsafe { self.values.get_bit_unchecked(old) })
387
}
388
389
#[inline]
390
fn size_hint(&self) -> (usize, Option<usize>) {
391
(self.end - self.index, Some(self.end - self.index))
392
}
393
394
#[inline]
395
fn nth(&mut self, n: usize) -> Option<Self::Item> {
396
let new_index = self.index + n;
397
if new_index > self.end {
398
self.index = self.end;
399
None
400
} else {
401
self.index = new_index;
402
self.next()
403
}
404
}
405
}
406
407
impl DoubleEndedIterator for IntoIter {
408
#[inline]
409
fn next_back(&mut self) -> Option<Self::Item> {
410
if self.index == self.end {
411
None
412
} else {
413
self.end -= 1;
414
Some(unsafe { self.values.get_bit_unchecked(self.end) })
415
}
416
}
417
}
418
419
unsafe impl TrustedLen for IntoIter {}
420
421