Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/bitmap/builder.rs
6939 views
1
#![allow(unsafe_op_in_unsafe_fn)]
2
use polars_utils::IdxSize;
3
use polars_utils::slice::load_padded_le_u64;
4
5
use super::bitmask::BitMask;
6
use crate::bitmap::{Bitmap, MutableBitmap};
7
use crate::storage::SharedStorage;
8
use crate::trusted_len::TrustedLen;
9
10
/// Used to build bitmaps bool-by-bool in sequential order.
#[derive(Default, Clone)]
pub struct BitmapBuilder {
    buf: u64,                 // A buffer containing the last self.bit_len % 64 bits.
    bit_len: usize,           // Length in bits.
    bit_cap: usize,           // Capacity in bits (always multiple of 64).
    set_bits_in_bytes: usize, // The number of bits set in self.bytes, not including self.buf.
    bytes: Vec<u8>,           // Flushed bits, stored as little-endian 64-bit words.
}
19
20
impl BitmapBuilder {
21
pub fn new() -> Self {
22
Self::default()
23
}
24
25
/// Returns the number of bits pushed so far.
#[inline(always)]
pub fn len(&self) -> usize {
    self.bit_len
}
29
30
/// Returns true if no bits have been pushed yet.
#[inline(always)]
pub fn is_empty(&self) -> bool {
    self.bit_len == 0
}
34
35
/// Returns the capacity in bits (always a multiple of 64).
#[inline(always)]
pub fn capacity(&self) -> usize {
    self.bit_cap
}
39
40
/// Returns the number of set (true) bits, including those still in the
/// staging word `buf`.
#[inline(always)]
pub fn set_bits(&self) -> usize {
    self.set_bits_in_bytes + self.buf.count_ones() as usize
}
44
45
/// Returns the number of unset (false) bits.
#[inline(always)]
pub fn unset_bits(&self) -> usize {
    self.bit_len - self.set_bits()
}
49
50
pub fn with_capacity(bits: usize) -> Self {
51
let bytes = Vec::with_capacity(bits.div_ceil(64) * 8);
52
let words_available = bytes.capacity() / 8;
53
Self {
54
buf: 0,
55
bit_len: 0,
56
bit_cap: words_available * 64,
57
set_bits_in_bytes: 0,
58
bytes,
59
}
60
}
61
62
/// Ensures there is capacity for at least `additional` more bits.
#[inline(always)]
pub fn reserve(&mut self, additional: usize) {
    if self.bit_len + additional > self.bit_cap {
        self.reserve_slow(additional)
    }
}
68
69
/// Out-of-line growth path for `reserve`; kept cold so the hot path stays
/// small enough to inline.
#[cold]
#[inline(never)]
fn reserve_slow(&mut self, additional: usize) {
    // Round the requested bit count up to whole 64-bit words (8 bytes each).
    let bytes_needed = (self.bit_len + additional).div_ceil(64) * 8;
    self.bytes.reserve(bytes_needed - self.bytes.len());
    // The allocator may give more than requested; recompute the bit capacity
    // from the actual byte capacity.
    let words_available = self.bytes.capacity() / 8;
    self.bit_cap = words_available * 64;
}
77
78
pub fn clear(&mut self) {
79
self.buf = 0;
80
self.bit_len = 0;
81
self.set_bits_in_bytes = 0;
82
self.bytes.clear();
83
}
84
85
/// Appends a single bit, growing the buffer if needed.
#[inline(always)]
pub fn push(&mut self, x: bool) {
    self.reserve(1);
    // SAFETY: we just reserved capacity for at least one more bit.
    unsafe { self.push_unchecked(x) }
}
90
91
/// Does not update len/set_bits, simply writes to the output buffer.
/// # Safety
/// self.bytes.len() + 8 <= self.bytes.capacity() must hold.
#[inline(always)]
unsafe fn flush_word_unchecked(&mut self, w: u64) {
    let cur_len = self.bytes.len();
    // Unaligned little-endian store of the whole word just past the current
    // end of the byte buffer, then bump the length past it.
    let p = self.bytes.as_mut_ptr().add(cur_len).cast::<u64>();
    p.write_unaligned(w.to_le());
    self.bytes.set_len(cur_len + 8);
}
101
102
/// Appends a single bit without checking capacity.
/// # Safety
/// self.len() < self.capacity() must hold.
#[inline(always)]
pub unsafe fn push_unchecked(&mut self, x: bool) {
    debug_assert!(self.bit_len < self.bit_cap);
    // Set bit (bit_len % 64) of the staging word.
    self.buf |= (x as u64) << (self.bit_len % 64);
    self.bit_len += 1;
    // Flush the staging word to bytes whenever it fills up completely.
    if self.bit_len.is_multiple_of(64) {
        self.flush_word_unchecked(self.buf);
        self.set_bits_in_bytes += self.buf.count_ones() as usize;
        self.buf = 0;
    }
}
115
116
/// Appends `length` copies of `value`.
#[inline(always)]
pub fn extend_constant(&mut self, length: usize, value: bool) {
    // Fast path if the extension still fits in buf with room left to spare.
    let bits_in_buf = self.bit_len % 64;
    if bits_in_buf + length < 64 {
        // ((value << length) - value) yields `length` ones if value is true
        // and zero otherwise, without branching. length < 64 here, so the
        // shift is in-range.
        let bit_block = ((value as u64) << length) - (value as u64);
        self.buf |= bit_block << bits_in_buf;
        self.bit_len += length;
    } else {
        self.extend_constant_slow(length, value);
    }
}
128
129
/// Slow path of `extend_constant`; only called when the extension does not
/// fit in the staging word, i.e. bits_in_buf + length >= 64.
#[cold]
fn extend_constant_slow(&mut self, length: usize, value: bool) {
    unsafe {
        let value_spread = if value { u64::MAX } else { 0 }; // Branchless neg.

        // Extend and flush current buf.
        self.reserve(length);
        let bits_in_buf = self.bit_len % 64;
        let ext_buf = self.buf | (value_spread << bits_in_buf);
        self.flush_word_unchecked(ext_buf);
        self.set_bits_in_bytes += ext_buf.count_ones() as usize;

        // Write complete words.
        // No underflow: the caller guarantees bits_in_buf + length >= 64.
        let remaining_bits = length - (64 - bits_in_buf);
        let remaining_words = remaining_bits / 64;
        for _ in 0..remaining_words {
            self.flush_word_unchecked(value_spread);
        }
        // Branchless: adds remaining_words * 64 when value is true, 0 otherwise.
        self.set_bits_in_bytes += (remaining_words * 64) & value_spread as usize;

        // Put remainder in buf and update length.
        self.buf = ((value as u64) << (remaining_bits % 64)) - (value as u64);
        self.bit_len += length;
    }
}
154
155
/// Pushes the first length bits from the given word, assuming the rest of
/// the bits are zero.
/// # Safety
/// self.len + length <= self.cap and length <= 64 must hold.
pub unsafe fn push_word_with_len_unchecked(&mut self, word: u64, length: usize) {
    debug_assert!(self.bit_len + length <= self.bit_cap);
    debug_assert!(length <= 64);
    debug_assert!(length == 64 || (word >> length) == 0);
    let bits_in_buf = self.bit_len % 64;
    self.buf |= word << bits_in_buf;
    if bits_in_buf + length >= 64 {
        // The staging word filled up: flush it and keep the bits of `word`
        // that spilled past the top as the new staging word.
        self.flush_word_unchecked(self.buf);
        self.set_bits_in_bytes += self.buf.count_ones() as usize;
        self.buf = if bits_in_buf > 0 {
            word >> (64 - bits_in_buf)
        } else {
            // Avoid the out-of-range shift by 64; nothing spilled over.
            0
        };
    }
    self.bit_len += length;
}
176
177
/// Appends `length` bits from `slice` starting at bit `offset`.
/// # Safety
/// self.len() + length <= self.capacity() must hold, as well as
/// offset + length <= 8 * slice.len().
unsafe fn extend_from_slice_unchecked(
    &mut self,
    mut slice: &[u8],
    mut offset: usize,
    mut length: usize,
) {
    if length == 0 {
        return;
    }

    // Deal with slice offset so it's aligned to bytes.
    let slice_bit_offset = offset % 8;
    if slice_bit_offset > 0 {
        let bits_in_first_byte = (8 - slice_bit_offset).min(length);
        let first_byte = *slice.get_unchecked(offset / 8) >> slice_bit_offset;
        self.push_word_with_len_unchecked(
            first_byte as u64 & ((1 << bits_in_first_byte) - 1),
            bits_in_first_byte,
        );
        length -= bits_in_first_byte;
        offset += bits_in_first_byte;
    }
    slice = slice.get_unchecked(offset / 8..);

    // Write word-by-word.
    let bits_in_buf = self.bit_len % 64;
    if bits_in_buf > 0 {
        // Unaligned destination: each input word straddles two staging words.
        while length >= 64 {
            let word = u64::from_le_bytes(slice.get_unchecked(0..8).try_into().unwrap());
            self.buf |= word << bits_in_buf;
            self.flush_word_unchecked(self.buf);
            self.set_bits_in_bytes += self.buf.count_ones() as usize;
            self.buf = word >> (64 - bits_in_buf);
            self.bit_len += 64;
            length -= 64;
            slice = slice.get_unchecked(8..);
        }
    } else {
        // Aligned destination: input words can be flushed directly.
        while length >= 64 {
            let word = u64::from_le_bytes(slice.get_unchecked(0..8).try_into().unwrap());
            self.flush_word_unchecked(word);
            self.set_bits_in_bytes += word.count_ones() as usize;
            self.bit_len += 64;
            length -= 64;
            slice = slice.get_unchecked(8..);
        }
    }

    // Just the last word left.
    if length > 0 {
        // load_padded_le_u64 tolerates slices shorter than 8 bytes.
        let word = load_padded_le_u64(slice);
        self.push_word_with_len_unchecked(word & ((1 << length) - 1), length);
    }
}
234
235
/// # Safety
236
/// self.len() + length*repeats <= self.capacity() must hold, as well as
237
/// offset + length <= 8 * slice.len().
238
unsafe fn extend_each_repeated_from_slice_unchecked(
239
&mut self,
240
slice: &[u8],
241
offset: usize,
242
length: usize,
243
repeats: usize,
244
) {
245
if repeats == 0 {
246
return;
247
}
248
if repeats == 1 {
249
return self.extend_from_slice_unchecked(slice, offset, length);
250
}
251
for bit_idx in offset..length {
252
let bit = (*slice.get_unchecked(bit_idx / 8) >> (bit_idx % 8)) & 1 != 0;
253
self.extend_constant(repeats, bit);
254
}
255
}
256
257
/// Appends `length` bits from `slice` starting at bit `offset`.
pub fn extend_from_slice(&mut self, slice: &[u8], offset: usize, length: usize) {
    assert!(8 * slice.len() >= offset + length);
    self.reserve(length);
    // SAFETY: the assert checked the source bounds and we reserved capacity.
    unsafe {
        self.extend_from_slice_unchecked(slice, offset, length);
    }
}
264
265
/// Appends `length` bits from `slice` starting at bit `offset`, repeating
/// each bit `repeats` times.
pub fn extend_each_repeated_from_slice(
    &mut self,
    slice: &[u8],
    offset: usize,
    length: usize,
    repeats: usize,
) {
    assert!(8 * slice.len() >= offset + length);
    self.reserve(length * repeats);
    // SAFETY: the assert checked the source bounds and we reserved capacity
    // for all length * repeats output bits.
    unsafe {
        self.extend_each_repeated_from_slice_unchecked(slice, offset, length, repeats);
    }
}
278
279
/// Appends all bits of `bitmap`.
pub fn extend_from_bitmap(&mut self, bitmap: &Bitmap) {
    // TODO: we can perhaps use the bitmaps bitcount here instead of
    // recomputing it if it has a known bitcount.
    let (slice, offset, length) = bitmap.as_slice();
    self.extend_from_slice(slice, offset, length);
}
285
286
/// Appends all bits of the given bitmask view.
pub fn extend_from_bitmask(&mut self, bitmap: BitMask<'_>) {
    let (slice, offset, length) = bitmap.inner();
    self.extend_from_slice(slice, offset, length);
}
290
291
/// Extends this BitmapBuilder with a subslice of a bitmap.
pub fn subslice_extend_from_bitmap(&mut self, bitmap: &Bitmap, start: usize, length: usize) {
    let (slice, bm_offset, bm_length) = bitmap.as_slice();
    assert!(start + length <= bm_length);
    // The bitmap's own bit offset composes with the requested start.
    self.extend_from_slice(slice, bm_offset + start, length);
}
297
298
/// Extends this BitmapBuilder with a subslice of a bitmap, repeating each bit `repeats` times.
pub fn subslice_extend_each_repeated_from_bitmap(
    &mut self,
    bitmap: &Bitmap,
    start: usize,
    length: usize,
    repeats: usize,
) {
    let (slice, bm_offset, bm_length) = bitmap.as_slice();
    assert!(start + length <= bm_length);
    // The bitmap's own bit offset composes with the requested start.
    self.extend_each_repeated_from_slice(slice, bm_offset + start, length, repeats);
}
310
311
pub fn subslice_extend_from_opt_validity(
312
&mut self,
313
bitmap: Option<&Bitmap>,
314
start: usize,
315
length: usize,
316
) {
317
match bitmap {
318
Some(bm) => self.subslice_extend_from_bitmap(bm, start, length),
319
None => self.extend_constant(length, true),
320
}
321
}
322
323
pub fn subslice_extend_each_repeated_from_opt_validity(
324
&mut self,
325
bitmap: Option<&Bitmap>,
326
start: usize,
327
length: usize,
328
repeats: usize,
329
) {
330
match bitmap {
331
Some(bm) => self.subslice_extend_each_repeated_from_bitmap(bm, start, length, repeats),
332
None => self.extend_constant(length * repeats, true),
333
}
334
}
335
336
/// Appends the bit at `offset + idx` for each index, i.e. one output bit
/// per index.
/// # Safety
/// The indices must be in-bounds.
pub unsafe fn gather_extend_from_slice(
    &mut self,
    slice: &[u8],
    offset: usize,
    length: usize,
    idxs: &[IdxSize],
) {
    assert!(8 * slice.len() >= offset + length);

    self.reserve(idxs.len());
    // SAFETY: the caller guarantees every index < length, the assert above
    // bounds the byte accesses, and we reserved one bit per index.
    unsafe {
        for idx in idxs {
            debug_assert!((*idx as usize) < length);
            let idx_in_slice = offset + *idx as usize;
            // Extract bit idx_in_slice from the byte slice (LSB-first).
            let bit = (*slice.get_unchecked(idx_in_slice / 8) >> (idx_in_slice % 8)) & 1;
            self.push_unchecked(bit != 0);
        }
    }
}
357
358
/// Like gather_extend_from_slice, but out-of-bounds indices append false
/// instead of being a safety requirement.
pub fn opt_gather_extend_from_slice(
    &mut self,
    slice: &[u8],
    offset: usize,
    length: usize,
    idxs: &[IdxSize],
) {
    assert!(8 * slice.len() >= offset + length);

    self.reserve(idxs.len());
    // SAFETY: each index is bounds-checked before the unchecked byte access,
    // and we reserved one bit per index.
    unsafe {
        for idx in idxs {
            if (*idx as usize) < length {
                let idx_in_slice = offset + *idx as usize;
                let bit = (*slice.get_unchecked(idx_in_slice / 8) >> (idx_in_slice % 8)) & 1;
                self.push_unchecked(bit != 0);
            } else {
                self.push_unchecked(false);
            }
        }
    }
}
380
381
/// Appends bitmap[idx] for each index.
/// # Safety
/// The indices must be in-bounds.
pub unsafe fn gather_extend_from_bitmap(&mut self, bitmap: &Bitmap, idxs: &[IdxSize]) {
    let (slice, offset, length) = bitmap.as_slice();
    self.gather_extend_from_slice(slice, offset, length, idxs);
}
387
388
/// Like gather_extend_from_bitmap, but out-of-bounds indices append false.
pub fn opt_gather_extend_from_bitmap(&mut self, bitmap: &Bitmap, idxs: &[IdxSize]) {
    let (slice, offset, length) = bitmap.as_slice();
    self.opt_gather_extend_from_slice(slice, offset, length, idxs);
}
392
393
/// # Safety
394
/// The indices must be in-bounds.
395
pub unsafe fn gather_extend_from_opt_validity(
396
&mut self,
397
bitmap: Option<&Bitmap>,
398
idxs: &[IdxSize],
399
length: usize,
400
) {
401
if let Some(bm) = bitmap {
402
let (slice, offset, sl_length) = bm.as_slice();
403
debug_assert_eq!(sl_length, length);
404
self.gather_extend_from_slice(slice, offset, length, idxs);
405
} else {
406
self.extend_constant(length, true);
407
}
408
}
409
410
/// Like gather_extend_from_opt_validity, but out-of-bounds indices append
/// false instead of being a safety requirement.
pub fn opt_gather_extend_from_opt_validity(
    &mut self,
    bitmap: Option<&Bitmap>,
    idxs: &[IdxSize],
    length: usize,
) {
    if let Some(bm) = bitmap {
        let (slice, offset, sl_length) = bm.as_slice();
        debug_assert_eq!(sl_length, length);
        self.opt_gather_extend_from_slice(slice, offset, sl_length, idxs);
    } else {
        // No bitmap behaves as all-true: in-bounds indices append true,
        // out-of-bounds indices append false.
        unsafe {
            // SAFETY: we reserve one bit per index before the unchecked pushes.
            self.reserve(idxs.len());
            for idx in idxs {
                self.push_unchecked((*idx as usize) < length);
            }
        }
    }
}
429
430
/// Flushes any partial staging word into `bytes` so the byte buffer holds
/// all bit_len bits.
/// # Safety
/// May only be called once at the end.
unsafe fn finish(&mut self) {
    if !self.bit_len.is_multiple_of(64) {
        // buf holds bit_len % 64 trailing bits; append the whole word
        // (upper bits are zero) and account for its set bits.
        self.bytes.extend_from_slice(&self.buf.to_le_bytes());
        self.set_bits_in_bytes += self.buf.count_ones() as usize;
        self.buf = 0;
    }
}
439
440
/// Converts this BitmapBuilder into a mutable bitmap.
pub fn into_mut(mut self) -> MutableBitmap {
    unsafe {
        // SAFETY: finish is called exactly once, just before consuming self.
        self.finish();
        MutableBitmap::from_vec(self.bytes, self.bit_len)
    }
}
447
448
/// The same as into_mut, but returns None if the bitmap is all-ones.
pub fn into_opt_mut_validity(mut self) -> Option<MutableBitmap> {
    unsafe {
        self.finish();
        // After finish() every set bit is counted in set_bits_in_bytes, so
        // equality with bit_len means all-ones.
        if self.set_bits_in_bytes == self.bit_len {
            return None;
        }
        Some(MutableBitmap::from_vec(self.bytes, self.bit_len))
    }
}
458
459
/// Freezes this BitmapBuilder into an immutable Bitmap.
pub fn freeze(mut self) -> Bitmap {
    unsafe {
        self.finish();
        let storage = SharedStorage::from_vec(self.bytes);
        Bitmap::from_inner_unchecked(
            storage,
            0,
            self.bit_len,
            // After finish(), set_bits_in_bytes is the total popcount, so
            // this is the exact unset-bit count.
            Some(self.bit_len - self.set_bits_in_bytes),
        )
    }
}
472
473
/// The same as freeze, but returns None if the bitmap is all-ones.
pub fn into_opt_validity(mut self) -> Option<Bitmap> {
    unsafe {
        self.finish();
        // After finish() every set bit is counted in set_bits_in_bytes, so
        // equality with bit_len means all-ones.
        if self.set_bits_in_bytes == self.bit_len {
            return None;
        }
        let storage = SharedStorage::from_vec(self.bytes);
        let bitmap = Bitmap::from_inner_unchecked(
            storage,
            0,
            self.bit_len,
            Some(self.bit_len - self.set_bits_in_bytes),
        );
        Some(bitmap)
    }
}
490
491
/// Appends every bool yielded by a trusted-length iterator.
pub fn extend_trusted_len_iter<I>(&mut self, iterator: I)
where
    I: Iterator<Item = bool> + TrustedLen,
{
    // TrustedLen guarantees the upper size hint is Some and exact, so a
    // single reserve covers the whole iteration.
    self.reserve(iterator.size_hint().1.unwrap());
    for b in iterator {
        // SAFETY: we reserved and the iterator's length is trusted.
        unsafe {
            self.push_unchecked(b);
        }
    }
}
503
504
#[inline]
505
pub fn from_trusted_len_iter<I>(iterator: I) -> Self
506
where
507
I: Iterator<Item = bool> + TrustedLen,
508
{
509
let mut builder = Self::new();
510
builder.extend_trusted_len_iter(iterator);
511
builder
512
}
513
}
514
515
/// A wrapper for BitmapBuilder that does not allocate until the first false is
/// pushed. Less efficient if you know there are false values because it must
/// check if it has allocated for each push.
pub enum OptBitmapBuilder {
    // Only lengths are tracked here; no bits are materialized until the
    // first false forces a real BitmapBuilder (see get_builder).
    AllTrue { bit_len: usize, bit_cap: usize },
    MayHaveFalse(BitmapBuilder),
}
522
523
impl Default for OptBitmapBuilder {
524
fn default() -> Self {
525
Self::AllTrue {
526
bit_len: 0,
527
bit_cap: 0,
528
}
529
}
530
}
531
532
impl OptBitmapBuilder {
533
/// Reserves capacity for `additional` more bits. In the all-true state this
/// only records the requested capacity for later materialization.
pub fn reserve(&mut self, additional: usize) {
    match self {
        Self::AllTrue { bit_len, bit_cap } => {
            *bit_cap = usize::max(*bit_cap, *bit_len + additional);
        },
        Self::MayHaveFalse(inner) => inner.reserve(additional),
    }
}
541
542
/// Appends `length` copies of `value`.
pub fn extend_constant(&mut self, length: usize, value: bool) {
    match self {
        Self::AllTrue { bit_len, bit_cap } => {
            if value {
                // Still all-true: just bump the virtual length/capacity.
                *bit_cap = usize::max(*bit_cap, *bit_len + length);
                *bit_len += length;
            } else {
                // First false: materialize the real builder and delegate.
                self.get_builder().extend_constant(length, value);
            }
        },
        Self::MayHaveFalse(inner) => inner.extend_constant(length, value),
    }
}
555
556
pub fn into_opt_validity(self) -> Option<Bitmap> {
557
match self {
558
Self::AllTrue { .. } => None,
559
Self::MayHaveFalse(inner) => inner.into_opt_validity(),
560
}
561
}
562
563
pub fn subslice_extend_from_opt_validity(
564
&mut self,
565
bitmap: Option<&Bitmap>,
566
start: usize,
567
length: usize,
568
) {
569
match bitmap {
570
Some(bm) => {
571
self.get_builder()
572
.subslice_extend_from_bitmap(bm, start, length);
573
},
574
None => {
575
self.extend_constant(length, true);
576
},
577
}
578
}
579
580
/// Like subslice_extend_from_opt_validity, repeating each bit `repeats`
/// times.
pub fn subslice_extend_each_repeated_from_opt_validity(
    &mut self,
    bitmap: Option<&Bitmap>,
    start: usize,
    length: usize,
    repeats: usize,
) {
    match bitmap {
        Some(bm) => {
            self.get_builder()
                .subslice_extend_each_repeated_from_bitmap(bm, start, length, repeats);
        },
        None => {
            // All-true extension; may keep the cheap AllTrue state.
            self.extend_constant(length * repeats, true);
        },
    }
}
597
598
/// Gathers bitmap[idx] for each index; a missing bitmap behaves as all-true.
/// # Safety
/// The indices must be in-bounds.
pub unsafe fn gather_extend_from_opt_validity(
    &mut self,
    bitmap: Option<&Bitmap>,
    idxs: &[IdxSize],
) {
    match bitmap {
        Some(bm) => {
            self.get_builder().gather_extend_from_bitmap(bm, idxs);
        },
        None => {
            // One true bit per gathered index; may keep the AllTrue state.
            self.extend_constant(idxs.len(), true);
        },
    }
}
614
615
/// Like gather_extend_from_opt_validity, but out-of-bounds indices append
/// false instead of being a safety requirement.
pub fn opt_gather_extend_from_opt_validity(
    &mut self,
    bitmap: Option<&Bitmap>,
    idxs: &[IdxSize],
    length: usize,
) {
    match bitmap {
        Some(bm) => {
            self.get_builder().opt_gather_extend_from_bitmap(bm, idxs);
        },
        None => {
            // Only materialize a builder if some index is actually out of
            // bounds (those produce false bits); otherwise stay all-true.
            if let Some(first_oob) = idxs.iter().position(|idx| *idx as usize >= length) {
                let builder = self.get_builder();
                // Everything before the first out-of-bounds index is true.
                builder.extend_constant(first_oob, true);
                for idx in idxs.iter().skip(first_oob) {
                    builder.push((*idx as usize) < length);
                }
            } else {
                self.extend_constant(idxs.len(), true);
            }
        },
    }
}
638
639
/// Returns the inner BitmapBuilder, first materializing it (filled with the
/// recorded number of true bits) if we are still in the all-true state.
fn get_builder(&mut self) -> &mut BitmapBuilder {
    match self {
        Self::AllTrue { bit_len, bit_cap } => {
            let mut builder = BitmapBuilder::with_capacity(*bit_cap);
            builder.extend_constant(*bit_len, true);
            *self = Self::MayHaveFalse(builder);
            // Re-borrow through self so the returned reference has the
            // lifetime of &mut self rather than of the local `builder`.
            let Self::MayHaveFalse(inner) = self else {
                unreachable!()
            };
            inner
        },
        Self::MayHaveFalse(inner) => inner,
    }
}
653
}
654
655