Path: blob/main/crates/polars-arrow/src/bitmap/utils/mod.rs
6939 views
#![allow(unsafe_op_in_unsafe_fn)]1//! General utilities for bitmaps representing items where LSB is the first item.2mod chunk_iterator;3mod chunks_exact_mut;4mod fmt;5mod iterator;6mod slice_iterator;7mod zip_validity;89pub(crate) use chunk_iterator::merge_reversed;10pub use chunk_iterator::{BitChunk, BitChunkIterExact, BitChunks, BitChunksExact};11pub use chunks_exact_mut::BitChunksExactMut;12pub use fmt::fmt;13pub use iterator::BitmapIter;14use polars_utils::slice::load_padded_le_u64;15pub use slice_iterator::SlicesIterator;16pub use zip_validity::{ZipValidity, ZipValidityIter};1718use crate::bitmap::aligned::AlignedBitmapSlice;1920/// Returns whether bit at position `i` in `byte` is set or not21#[inline]22pub fn is_set(byte: u8, i: usize) -> bool {23debug_assert!(i < 8);24byte & (1 << i) != 025}2627/// Sets bit at position `i` in `byte`.28#[inline(always)]29pub fn set_bit_in_byte(byte: u8, i: usize, value: bool) -> u8 {30debug_assert!(i < 8);31let mask = !(1 << i);32let insert = (value as u8) << i;33(byte & mask) | insert34}3536/// Returns whether bit at position `i` in `bytes` is set or not.37///38/// # Safety39/// `i >= bytes.len() * 8` results in undefined behavior.40#[inline(always)]41pub unsafe fn get_bit_unchecked(bytes: &[u8], i: usize) -> bool {42let byte = *bytes.get_unchecked(i / 8);43let bit = (byte >> (i % 8)) & 1;44bit != 045}4647/// Sets bit at position `i` in `bytes` without doing bound checks.48/// # Safety49/// `i >= bytes.len() * 8` results in undefined behavior.50#[inline(always)]51pub unsafe fn set_bit_unchecked(bytes: &mut [u8], i: usize, value: bool) {52let byte = bytes.get_unchecked_mut(i / 8);53*byte = set_bit_in_byte(*byte, i % 8, value);54}5556/// Returns the number of bytes required to hold `bits` bits.57#[inline]58pub fn bytes_for(bits: usize) -> usize {59bits.saturating_add(7) / 860}6162/// Returns the number of zero bits in the slice offsetted by `offset` and a length of `length`.63/// # Panics64/// This function panics iff `offset + len > 8 * slice.len()``.65pub fn count_zeros(slice: &[u8], offset: usize, len: usize) -> usize {66if len == 0 {67return 0;68}6970assert!(8 * slice.len() >= offset + len);7172// Fast-path: fits in a single u64 load.73let first_byte_idx = offset / 8;74let offset_in_byte = offset % 8;75if offset_in_byte + len <= 64 {76let mut word = load_padded_le_u64(&slice[first_byte_idx..]);77word >>= offset_in_byte;78word <<= 64 - len;79return len - word.count_ones() as usize;80}8182let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);83let ones_in_prefix = aligned.prefix().count_ones() as usize;84let ones_in_bulk: usize = aligned.bulk_iter().map(|w| w.count_ones() as usize).sum();85let ones_in_suffix = aligned.suffix().count_ones() as usize;86len - ones_in_prefix - ones_in_bulk - ones_in_suffix87}8889/// Returns the number of zero bits before seeing a one bit in the slice offsetted by `offset` and90/// a length of `length`.91///92/// # Panics93/// This function panics iff `offset + len > 8 * slice.len()``.94pub fn leading_zeros(slice: &[u8], offset: usize, len: usize) -> usize {95if len == 0 {96return 0;97}9899assert!(8 * slice.len() >= offset + len);100101let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);102let leading_zeros_in_prefix =103(aligned.prefix().trailing_zeros() as usize).min(aligned.prefix_bitlen());104if leading_zeros_in_prefix < aligned.prefix_bitlen() {105return leading_zeros_in_prefix;106}107if let Some(full_zero_bulk_words) = aligned.bulk_iter().position(|w| w != 0) {108return aligned.prefix_bitlen()109+ full_zero_bulk_words * 64110+ aligned.bulk()[full_zero_bulk_words].trailing_zeros() as usize;111}112113aligned.prefix_bitlen()114+ aligned.bulk_bitlen()115+ (aligned.suffix().trailing_zeros() as usize).min(aligned.suffix_bitlen())116}117118/// Returns the number of one bits before seeing a zero bit in the slice offsetted by `offset` and119/// a length of `length`.120///121/// # Panics122/// This function panics iff `offset + len > 8 * slice.len()``.123pub fn leading_ones(slice: &[u8], offset: usize, len: usize) -> usize {124if len == 0 {125return 0;126}127128assert!(8 * slice.len() >= offset + len);129130let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);131let leading_ones_in_prefix = aligned.prefix().trailing_ones() as usize;132if leading_ones_in_prefix < aligned.prefix_bitlen() {133return leading_ones_in_prefix;134}135if let Some(full_one_bulk_words) = aligned.bulk_iter().position(|w| w != u64::MAX) {136return aligned.prefix_bitlen()137+ full_one_bulk_words * 64138+ aligned.bulk()[full_one_bulk_words].trailing_ones() as usize;139}140141aligned.prefix_bitlen() + aligned.bulk_bitlen() + aligned.suffix().trailing_ones() as usize142}143144/// Returns the number of zero bits before seeing a one bit in the slice offsetted by `offset` and145/// a length of `length`.146///147/// # Panics148/// This function panics iff `offset + len > 8 * slice.len()``.149pub fn trailing_zeros(slice: &[u8], offset: usize, len: usize) -> usize {150if len == 0 {151return 0;152}153154assert!(8 * slice.len() >= offset + len);155156let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);157let trailing_zeros_in_suffix = ((aligned.suffix() << ((64 - aligned.suffix_bitlen()) % 64))158.leading_zeros() as usize)159.min(aligned.suffix_bitlen());160if trailing_zeros_in_suffix < aligned.suffix_bitlen() {161return trailing_zeros_in_suffix;162}163if let Some(full_zero_bulk_words) = aligned.bulk_iter().rev().position(|w| w != 0) {164return aligned.suffix_bitlen()165+ full_zero_bulk_words * 64166+ aligned.bulk()[aligned.bulk().len() - full_zero_bulk_words - 1].leading_zeros()167as usize;168}169170let trailing_zeros_in_prefix = ((aligned.prefix() << ((64 - aligned.prefix_bitlen()) % 64))171.leading_zeros() as usize)172.min(aligned.prefix_bitlen());173aligned.suffix_bitlen() + aligned.bulk_bitlen() + trailing_zeros_in_prefix174}175176/// Returns the number of one bits before seeing a zero bit in the slice offsetted by `offset` and177/// a length of `length`.178///179/// # Panics180/// This function panics iff `offset + len > 8 * slice.len()``.181pub fn trailing_ones(slice: &[u8], offset: usize, len: usize) -> usize {182if len == 0 {183return 0;184}185186assert!(8 * slice.len() >= offset + len);187188let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);189let trailing_ones_in_suffix =190(aligned.suffix() << ((64 - aligned.suffix_bitlen()) % 64)).leading_ones() as usize;191if trailing_ones_in_suffix < aligned.suffix_bitlen() {192return trailing_ones_in_suffix;193}194if let Some(full_one_bulk_words) = aligned.bulk_iter().rev().position(|w| w != u64::MAX) {195return aligned.suffix_bitlen()196+ full_one_bulk_words * 64197+ aligned.bulk()[aligned.bulk().len() - full_one_bulk_words - 1].leading_ones()198as usize;199}200201let trailing_ones_in_prefix =202(aligned.prefix() << ((64 - aligned.prefix_bitlen()) % 64)).leading_ones() as usize;203aligned.suffix_bitlen() + aligned.bulk_bitlen() + trailing_ones_in_prefix204}205206#[cfg(test)]207mod tests {208use rand::Rng;209210use super::*;211use crate::bitmap::Bitmap;212213#[test]214fn leading_trailing() {215macro_rules! testcase {216($slice:expr, $offset:expr, $length:expr => lz=$lz:expr,lo=$lo:expr,tz=$tz:expr,to=$to:expr) => {217assert_eq!(218leading_zeros($slice, $offset, $length),219$lz,220"leading_zeros"221);222assert_eq!(leading_ones($slice, $offset, $length), $lo, "leading_ones");223assert_eq!(224trailing_zeros($slice, $offset, $length),225$tz,226"trailing_zeros"227);228assert_eq!(229trailing_ones($slice, $offset, $length),230$to,231"trailing_ones"232);233};234}235236testcase!(&[], 0, 0 => lz=0,lo=0,tz=0,to=0);237testcase!(&[0], 0, 1 => lz=1,lo=0,tz=1,to=0);238testcase!(&[1], 0, 1 => lz=0,lo=1,tz=0,to=1);239240testcase!(&[0b010], 0, 3 => lz=1,lo=0,tz=1,to=0);241testcase!(&[0b101], 0, 3 => lz=0,lo=1,tz=0,to=1);242testcase!(&[0b100], 0, 3 => lz=2,lo=0,tz=0,to=1);243testcase!(&[0b110], 0, 3 => lz=1,lo=0,tz=0,to=2);244testcase!(&[0b001], 0, 3 => lz=0,lo=1,tz=2,to=0);245testcase!(&[0b011], 0, 3 => lz=0,lo=2,tz=1,to=0);246247testcase!(&[0b010], 1, 2 => lz=0,lo=1,tz=1,to=0);248testcase!(&[0b101], 1, 2 => lz=1,lo=0,tz=0,to=1);249testcase!(&[0b100], 1, 2 => lz=1,lo=0,tz=0,to=1);250testcase!(&[0b110], 1, 2 => lz=0,lo=2,tz=0,to=2);251testcase!(&[0b001], 1, 2 => lz=2,lo=0,tz=2,to=0);252testcase!(&[0b011], 1, 2 => lz=0,lo=1,tz=1,to=0);253}254255#[ignore = "Fuzz test. Too slow"]256#[test]257fn leading_trailing_fuzz() {258let mut rng = rand::rng();259260const SIZE: usize = 1000;261const REPEATS: usize = 10_000;262263let mut v = Vec::<bool>::with_capacity(SIZE);264265for _ in 0..REPEATS {266v.clear();267let offset = rng.random_range(0..SIZE);268let length = rng.random_range(0..SIZE - offset);269let extra_padding = rng.random_range(0..64);270271let mut num_remaining = usize::min(SIZE, offset + length + extra_padding);272while num_remaining > 0 {273let chunk_size = rng.random_range(1..=num_remaining);274v.extend(275rng.clone()276.sample_iter(rand::distr::slice::Choose::new(&[false, true]).unwrap())277.take(chunk_size),278);279num_remaining -= chunk_size;280}281282let v_slice = &v[offset..offset + length];283let lz = v_slice.iter().take_while(|&v| !*v).count();284let lo = v_slice.iter().take_while(|&v| *v).count();285let tz = v_slice.iter().rev().take_while(|&v| !*v).count();286let to = v_slice.iter().rev().take_while(|&v| *v).count();287288let bm = Bitmap::from_iter(v.iter().copied());289let (slice, _, _) = bm.as_slice();290291assert_eq!(leading_zeros(slice, offset, length), lz);292assert_eq!(leading_ones(slice, offset, length), lo);293assert_eq!(trailing_zeros(slice, offset, length), tz);294assert_eq!(trailing_ones(slice, offset, length), to);295}296}297}298299300