Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/bitmap/utils/mod.rs
6939 views
1
#![allow(unsafe_op_in_unsafe_fn)]
2
//! General utilities for bitmaps representing items where LSB is the first item.
3
mod chunk_iterator;
4
mod chunks_exact_mut;
5
mod fmt;
6
mod iterator;
7
mod slice_iterator;
8
mod zip_validity;
9
10
pub(crate) use chunk_iterator::merge_reversed;
11
pub use chunk_iterator::{BitChunk, BitChunkIterExact, BitChunks, BitChunksExact};
12
pub use chunks_exact_mut::BitChunksExactMut;
13
pub use fmt::fmt;
14
pub use iterator::BitmapIter;
15
use polars_utils::slice::load_padded_le_u64;
16
pub use slice_iterator::SlicesIterator;
17
pub use zip_validity::{ZipValidity, ZipValidityIter};
18
19
use crate::bitmap::aligned::AlignedBitmapSlice;
20
21
/// Returns whether bit at position `i` in `byte` is set or not
22
#[inline]
23
pub fn is_set(byte: u8, i: usize) -> bool {
24
debug_assert!(i < 8);
25
byte & (1 << i) != 0
26
}
27
28
/// Sets bit at position `i` in `byte`.
29
#[inline(always)]
30
pub fn set_bit_in_byte(byte: u8, i: usize, value: bool) -> u8 {
31
debug_assert!(i < 8);
32
let mask = !(1 << i);
33
let insert = (value as u8) << i;
34
(byte & mask) | insert
35
}
36
37
/// Returns whether bit at position `i` in `bytes` is set or not.
38
///
39
/// # Safety
40
/// `i >= bytes.len() * 8` results in undefined behavior.
41
#[inline(always)]
42
pub unsafe fn get_bit_unchecked(bytes: &[u8], i: usize) -> bool {
43
let byte = *bytes.get_unchecked(i / 8);
44
let bit = (byte >> (i % 8)) & 1;
45
bit != 0
46
}
47
48
/// Sets bit at position `i` in `bytes` without doing bound checks.
49
/// # Safety
50
/// `i >= bytes.len() * 8` results in undefined behavior.
51
#[inline(always)]
52
pub unsafe fn set_bit_unchecked(bytes: &mut [u8], i: usize, value: bool) {
53
let byte = bytes.get_unchecked_mut(i / 8);
54
*byte = set_bit_in_byte(*byte, i % 8, value);
55
}
56
57
/// Returns the number of bytes required to hold `bits` bits.
58
#[inline]
59
pub fn bytes_for(bits: usize) -> usize {
60
bits.saturating_add(7) / 8
61
}
62
63
/// Returns the number of zero bits in the slice offsetted by `offset` and a length of `length`.
64
/// # Panics
65
/// This function panics iff `offset + len > 8 * slice.len()``.
66
pub fn count_zeros(slice: &[u8], offset: usize, len: usize) -> usize {
67
if len == 0 {
68
return 0;
69
}
70
71
assert!(8 * slice.len() >= offset + len);
72
73
// Fast-path: fits in a single u64 load.
74
let first_byte_idx = offset / 8;
75
let offset_in_byte = offset % 8;
76
if offset_in_byte + len <= 64 {
77
let mut word = load_padded_le_u64(&slice[first_byte_idx..]);
78
word >>= offset_in_byte;
79
word <<= 64 - len;
80
return len - word.count_ones() as usize;
81
}
82
83
let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);
84
let ones_in_prefix = aligned.prefix().count_ones() as usize;
85
let ones_in_bulk: usize = aligned.bulk_iter().map(|w| w.count_ones() as usize).sum();
86
let ones_in_suffix = aligned.suffix().count_ones() as usize;
87
len - ones_in_prefix - ones_in_bulk - ones_in_suffix
88
}
89
90
/// Returns the number of zero bits before seeing a one bit in the slice offsetted by `offset` and
91
/// a length of `length`.
92
///
93
/// # Panics
94
/// This function panics iff `offset + len > 8 * slice.len()``.
95
pub fn leading_zeros(slice: &[u8], offset: usize, len: usize) -> usize {
96
if len == 0 {
97
return 0;
98
}
99
100
assert!(8 * slice.len() >= offset + len);
101
102
let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);
103
let leading_zeros_in_prefix =
104
(aligned.prefix().trailing_zeros() as usize).min(aligned.prefix_bitlen());
105
if leading_zeros_in_prefix < aligned.prefix_bitlen() {
106
return leading_zeros_in_prefix;
107
}
108
if let Some(full_zero_bulk_words) = aligned.bulk_iter().position(|w| w != 0) {
109
return aligned.prefix_bitlen()
110
+ full_zero_bulk_words * 64
111
+ aligned.bulk()[full_zero_bulk_words].trailing_zeros() as usize;
112
}
113
114
aligned.prefix_bitlen()
115
+ aligned.bulk_bitlen()
116
+ (aligned.suffix().trailing_zeros() as usize).min(aligned.suffix_bitlen())
117
}
118
119
/// Returns the number of one bits before seeing a zero bit in the slice offsetted by `offset` and
120
/// a length of `length`.
121
///
122
/// # Panics
123
/// This function panics iff `offset + len > 8 * slice.len()``.
124
pub fn leading_ones(slice: &[u8], offset: usize, len: usize) -> usize {
125
if len == 0 {
126
return 0;
127
}
128
129
assert!(8 * slice.len() >= offset + len);
130
131
let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);
132
let leading_ones_in_prefix = aligned.prefix().trailing_ones() as usize;
133
if leading_ones_in_prefix < aligned.prefix_bitlen() {
134
return leading_ones_in_prefix;
135
}
136
if let Some(full_one_bulk_words) = aligned.bulk_iter().position(|w| w != u64::MAX) {
137
return aligned.prefix_bitlen()
138
+ full_one_bulk_words * 64
139
+ aligned.bulk()[full_one_bulk_words].trailing_ones() as usize;
140
}
141
142
aligned.prefix_bitlen() + aligned.bulk_bitlen() + aligned.suffix().trailing_ones() as usize
143
}
144
145
/// Returns the number of zero bits before seeing a one bit in the slice offsetted by `offset` and
146
/// a length of `length`.
147
///
148
/// # Panics
149
/// This function panics iff `offset + len > 8 * slice.len()``.
150
pub fn trailing_zeros(slice: &[u8], offset: usize, len: usize) -> usize {
151
if len == 0 {
152
return 0;
153
}
154
155
assert!(8 * slice.len() >= offset + len);
156
157
let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);
158
let trailing_zeros_in_suffix = ((aligned.suffix() << ((64 - aligned.suffix_bitlen()) % 64))
159
.leading_zeros() as usize)
160
.min(aligned.suffix_bitlen());
161
if trailing_zeros_in_suffix < aligned.suffix_bitlen() {
162
return trailing_zeros_in_suffix;
163
}
164
if let Some(full_zero_bulk_words) = aligned.bulk_iter().rev().position(|w| w != 0) {
165
return aligned.suffix_bitlen()
166
+ full_zero_bulk_words * 64
167
+ aligned.bulk()[aligned.bulk().len() - full_zero_bulk_words - 1].leading_zeros()
168
as usize;
169
}
170
171
let trailing_zeros_in_prefix = ((aligned.prefix() << ((64 - aligned.prefix_bitlen()) % 64))
172
.leading_zeros() as usize)
173
.min(aligned.prefix_bitlen());
174
aligned.suffix_bitlen() + aligned.bulk_bitlen() + trailing_zeros_in_prefix
175
}
176
177
/// Returns the number of one bits before seeing a zero bit in the slice offsetted by `offset` and
178
/// a length of `length`.
179
///
180
/// # Panics
181
/// This function panics iff `offset + len > 8 * slice.len()``.
182
pub fn trailing_ones(slice: &[u8], offset: usize, len: usize) -> usize {
183
if len == 0 {
184
return 0;
185
}
186
187
assert!(8 * slice.len() >= offset + len);
188
189
let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);
190
let trailing_ones_in_suffix =
191
(aligned.suffix() << ((64 - aligned.suffix_bitlen()) % 64)).leading_ones() as usize;
192
if trailing_ones_in_suffix < aligned.suffix_bitlen() {
193
return trailing_ones_in_suffix;
194
}
195
if let Some(full_one_bulk_words) = aligned.bulk_iter().rev().position(|w| w != u64::MAX) {
196
return aligned.suffix_bitlen()
197
+ full_one_bulk_words * 64
198
+ aligned.bulk()[aligned.bulk().len() - full_one_bulk_words - 1].leading_ones()
199
as usize;
200
}
201
202
let trailing_ones_in_prefix =
203
(aligned.prefix() << ((64 - aligned.prefix_bitlen()) % 64)).leading_ones() as usize;
204
aligned.suffix_bitlen() + aligned.bulk_bitlen() + trailing_ones_in_prefix
205
}
206
207
#[cfg(test)]
208
mod tests {
209
use rand::Rng;
210
211
use super::*;
212
use crate::bitmap::Bitmap;
213
214
#[test]
215
fn leading_trailing() {
216
macro_rules! testcase {
217
($slice:expr, $offset:expr, $length:expr => lz=$lz:expr,lo=$lo:expr,tz=$tz:expr,to=$to:expr) => {
218
assert_eq!(
219
leading_zeros($slice, $offset, $length),
220
$lz,
221
"leading_zeros"
222
);
223
assert_eq!(leading_ones($slice, $offset, $length), $lo, "leading_ones");
224
assert_eq!(
225
trailing_zeros($slice, $offset, $length),
226
$tz,
227
"trailing_zeros"
228
);
229
assert_eq!(
230
trailing_ones($slice, $offset, $length),
231
$to,
232
"trailing_ones"
233
);
234
};
235
}
236
237
testcase!(&[], 0, 0 => lz=0,lo=0,tz=0,to=0);
238
testcase!(&[0], 0, 1 => lz=1,lo=0,tz=1,to=0);
239
testcase!(&[1], 0, 1 => lz=0,lo=1,tz=0,to=1);
240
241
testcase!(&[0b010], 0, 3 => lz=1,lo=0,tz=1,to=0);
242
testcase!(&[0b101], 0, 3 => lz=0,lo=1,tz=0,to=1);
243
testcase!(&[0b100], 0, 3 => lz=2,lo=0,tz=0,to=1);
244
testcase!(&[0b110], 0, 3 => lz=1,lo=0,tz=0,to=2);
245
testcase!(&[0b001], 0, 3 => lz=0,lo=1,tz=2,to=0);
246
testcase!(&[0b011], 0, 3 => lz=0,lo=2,tz=1,to=0);
247
248
testcase!(&[0b010], 1, 2 => lz=0,lo=1,tz=1,to=0);
249
testcase!(&[0b101], 1, 2 => lz=1,lo=0,tz=0,to=1);
250
testcase!(&[0b100], 1, 2 => lz=1,lo=0,tz=0,to=1);
251
testcase!(&[0b110], 1, 2 => lz=0,lo=2,tz=0,to=2);
252
testcase!(&[0b001], 1, 2 => lz=2,lo=0,tz=2,to=0);
253
testcase!(&[0b011], 1, 2 => lz=0,lo=1,tz=1,to=0);
254
}
255
256
#[ignore = "Fuzz test. Too slow"]
257
#[test]
258
fn leading_trailing_fuzz() {
259
let mut rng = rand::rng();
260
261
const SIZE: usize = 1000;
262
const REPEATS: usize = 10_000;
263
264
let mut v = Vec::<bool>::with_capacity(SIZE);
265
266
for _ in 0..REPEATS {
267
v.clear();
268
let offset = rng.random_range(0..SIZE);
269
let length = rng.random_range(0..SIZE - offset);
270
let extra_padding = rng.random_range(0..64);
271
272
let mut num_remaining = usize::min(SIZE, offset + length + extra_padding);
273
while num_remaining > 0 {
274
let chunk_size = rng.random_range(1..=num_remaining);
275
v.extend(
276
rng.clone()
277
.sample_iter(rand::distr::slice::Choose::new(&[false, true]).unwrap())
278
.take(chunk_size),
279
);
280
num_remaining -= chunk_size;
281
}
282
283
let v_slice = &v[offset..offset + length];
284
let lz = v_slice.iter().take_while(|&v| !*v).count();
285
let lo = v_slice.iter().take_while(|&v| *v).count();
286
let tz = v_slice.iter().rev().take_while(|&v| !*v).count();
287
let to = v_slice.iter().rev().take_while(|&v| *v).count();
288
289
let bm = Bitmap::from_iter(v.iter().copied());
290
let (slice, _, _) = bm.as_slice();
291
292
assert_eq!(leading_zeros(slice, offset, length), lz);
293
assert_eq!(leading_ones(slice, offset, length), lo);
294
assert_eq!(trailing_zeros(slice, offset, length), tz);
295
assert_eq!(trailing_ones(slice, offset, length), to);
296
}
297
}
298
}
299
300