Path: blob/main/crates/polars-arrow/src/bitmap/builder.rs
6939 views
#![allow(unsafe_op_in_unsafe_fn)]1use polars_utils::IdxSize;2use polars_utils::slice::load_padded_le_u64;34use super::bitmask::BitMask;5use crate::bitmap::{Bitmap, MutableBitmap};6use crate::storage::SharedStorage;7use crate::trusted_len::TrustedLen;89/// Used to build bitmaps bool-by-bool in sequential order.10#[derive(Default, Clone)]11pub struct BitmapBuilder {12buf: u64, // A buffer containing the last self.bit_len % 64 bits.13bit_len: usize, // Length in bits.14bit_cap: usize, // Capacity in bits (always multiple of 64).15set_bits_in_bytes: usize, // The number of bits set in self.bytes, not including self.buf.16bytes: Vec<u8>,17}1819impl BitmapBuilder {20pub fn new() -> Self {21Self::default()22}2324#[inline(always)]25pub fn len(&self) -> usize {26self.bit_len27}2829#[inline(always)]30pub fn is_empty(&self) -> bool {31self.bit_len == 032}3334#[inline(always)]35pub fn capacity(&self) -> usize {36self.bit_cap37}3839#[inline(always)]40pub fn set_bits(&self) -> usize {41self.set_bits_in_bytes + self.buf.count_ones() as usize42}4344#[inline(always)]45pub fn unset_bits(&self) -> usize {46self.bit_len - self.set_bits()47}4849pub fn with_capacity(bits: usize) -> Self {50let bytes = Vec::with_capacity(bits.div_ceil(64) * 8);51let words_available = bytes.capacity() / 8;52Self {53buf: 0,54bit_len: 0,55bit_cap: words_available * 64,56set_bits_in_bytes: 0,57bytes,58}59}6061#[inline(always)]62pub fn reserve(&mut self, additional: usize) {63if self.bit_len + additional > self.bit_cap {64self.reserve_slow(additional)65}66}6768#[cold]69#[inline(never)]70fn reserve_slow(&mut self, additional: usize) {71let bytes_needed = (self.bit_len + additional).div_ceil(64) * 8;72self.bytes.reserve(bytes_needed - self.bytes.len());73let words_available = self.bytes.capacity() / 8;74self.bit_cap = words_available * 64;75}7677pub fn clear(&mut self) {78self.buf = 0;79self.bit_len = 0;80self.set_bits_in_bytes = 0;81self.bytes.clear();82}8384#[inline(always)]85pub fn push(&mut self, x: bool) {86self.reserve(1);87unsafe { self.push_unchecked(x) }88}8990/// Does not update len/set_bits, simply writes to the output buffer.91/// # Safety92/// self.bytes.len() + 8 <= self.bytes.capacity() must hold.93#[inline(always)]94unsafe fn flush_word_unchecked(&mut self, w: u64) {95let cur_len = self.bytes.len();96let p = self.bytes.as_mut_ptr().add(cur_len).cast::<u64>();97p.write_unaligned(w.to_le());98self.bytes.set_len(cur_len + 8);99}100101/// # Safety102/// self.len() < self.capacity() must hold.103#[inline(always)]104pub unsafe fn push_unchecked(&mut self, x: bool) {105debug_assert!(self.bit_len < self.bit_cap);106self.buf |= (x as u64) << (self.bit_len % 64);107self.bit_len += 1;108if self.bit_len.is_multiple_of(64) {109self.flush_word_unchecked(self.buf);110self.set_bits_in_bytes += self.buf.count_ones() as usize;111self.buf = 0;112}113}114115#[inline(always)]116pub fn extend_constant(&mut self, length: usize, value: bool) {117// Fast path if the extension still fits in buf with room left to spare.118let bits_in_buf = self.bit_len % 64;119if bits_in_buf + length < 64 {120let bit_block = ((value as u64) << length) - (value as u64);121self.buf |= bit_block << bits_in_buf;122self.bit_len += length;123} else {124self.extend_constant_slow(length, value);125}126}127128#[cold]129fn extend_constant_slow(&mut self, length: usize, value: bool) {130unsafe {131let value_spread = if value { u64::MAX } else { 0 }; // Branchless neg.132133// Extend and flush current buf.134self.reserve(length);135let bits_in_buf = self.bit_len % 64;136let ext_buf = self.buf | (value_spread << bits_in_buf);137self.flush_word_unchecked(ext_buf);138self.set_bits_in_bytes += ext_buf.count_ones() as usize;139140// Write complete words.141let remaining_bits = length - (64 - bits_in_buf);142let remaining_words = remaining_bits / 64;143for _ in 0..remaining_words {144self.flush_word_unchecked(value_spread);145}146self.set_bits_in_bytes += (remaining_words * 64) & value_spread as usize;147148// Put remainder in buf and update length.149self.buf = ((value as u64) << (remaining_bits % 64)) - (value as u64);150self.bit_len += length;151}152}153154/// Pushes the first length bits from the given word, assuming the rest of155/// the bits are zero.156/// # Safety157/// self.len + length <= self.cap and length <= 64 must hold.158pub unsafe fn push_word_with_len_unchecked(&mut self, word: u64, length: usize) {159debug_assert!(self.bit_len + length <= self.bit_cap);160debug_assert!(length <= 64);161debug_assert!(length == 64 || (word >> length) == 0);162let bits_in_buf = self.bit_len % 64;163self.buf |= word << bits_in_buf;164if bits_in_buf + length >= 64 {165self.flush_word_unchecked(self.buf);166self.set_bits_in_bytes += self.buf.count_ones() as usize;167self.buf = if bits_in_buf > 0 {168word >> (64 - bits_in_buf)169} else {1700171};172}173self.bit_len += length;174}175176/// # Safety177/// self.len() + length <= self.capacity() must hold, as well as178/// offset + length <= 8 * slice.len().179unsafe fn extend_from_slice_unchecked(180&mut self,181mut slice: &[u8],182mut offset: usize,183mut length: usize,184) {185if length == 0 {186return;187}188189// Deal with slice offset so it's aligned to bytes.190let slice_bit_offset = offset % 8;191if slice_bit_offset > 0 {192let bits_in_first_byte = (8 - slice_bit_offset).min(length);193let first_byte = *slice.get_unchecked(offset / 8) >> slice_bit_offset;194self.push_word_with_len_unchecked(195first_byte as u64 & ((1 << bits_in_first_byte) - 1),196bits_in_first_byte,197);198length -= bits_in_first_byte;199offset += bits_in_first_byte;200}201slice = slice.get_unchecked(offset / 8..);202203// Write word-by-word.204let bits_in_buf = self.bit_len % 64;205if bits_in_buf > 0 {206while length >= 64 {207let word = u64::from_le_bytes(slice.get_unchecked(0..8).try_into().unwrap());208self.buf |= word << bits_in_buf;209self.flush_word_unchecked(self.buf);210self.set_bits_in_bytes += self.buf.count_ones() as usize;211self.buf = word >> (64 - bits_in_buf);212self.bit_len += 64;213length -= 64;214slice = slice.get_unchecked(8..);215}216} else {217while length >= 64 {218let word = u64::from_le_bytes(slice.get_unchecked(0..8).try_into().unwrap());219self.flush_word_unchecked(word);220self.set_bits_in_bytes += word.count_ones() as usize;221self.bit_len += 64;222length -= 64;223slice = slice.get_unchecked(8..);224}225}226227// Just the last word left.228if length > 0 {229let word = load_padded_le_u64(slice);230self.push_word_with_len_unchecked(word & ((1 << length) - 1), length);231}232}233234/// # Safety235/// self.len() + length*repeats <= self.capacity() must hold, as well as236/// offset + length <= 8 * slice.len().237unsafe fn extend_each_repeated_from_slice_unchecked(238&mut self,239slice: &[u8],240offset: usize,241length: usize,242repeats: usize,243) {244if repeats == 0 {245return;246}247if repeats == 1 {248return self.extend_from_slice_unchecked(slice, offset, length);249}250for bit_idx in offset..length {251let bit = (*slice.get_unchecked(bit_idx / 8) >> (bit_idx % 8)) & 1 != 0;252self.extend_constant(repeats, bit);253}254}255256pub fn extend_from_slice(&mut self, slice: &[u8], offset: usize, length: usize) {257assert!(8 * slice.len() >= offset + length);258self.reserve(length);259unsafe {260self.extend_from_slice_unchecked(slice, offset, length);261}262}263264pub fn extend_each_repeated_from_slice(265&mut self,266slice: &[u8],267offset: usize,268length: usize,269repeats: usize,270) {271assert!(8 * slice.len() >= offset + length);272self.reserve(length * repeats);273unsafe {274self.extend_each_repeated_from_slice_unchecked(slice, offset, length, repeats);275}276}277278pub fn extend_from_bitmap(&mut self, bitmap: &Bitmap) {279// TODO: we can perhaps use the bitmaps bitcount here instead of280// recomputing it if it has a known bitcount.281let (slice, offset, length) = bitmap.as_slice();282self.extend_from_slice(slice, offset, length);283}284285pub fn extend_from_bitmask(&mut self, bitmap: BitMask<'_>) {286let (slice, offset, length) = bitmap.inner();287self.extend_from_slice(slice, offset, length);288}289290/// Extends this BitmapBuilder with a subslice of a bitmap.291pub fn subslice_extend_from_bitmap(&mut self, bitmap: &Bitmap, start: usize, length: usize) {292let (slice, bm_offset, bm_length) = bitmap.as_slice();293assert!(start + length <= bm_length);294self.extend_from_slice(slice, bm_offset + start, length);295}296297/// Extends this BitmapBuilder with a subslice of a bitmap, repeating each bit `repeats` times.298pub fn subslice_extend_each_repeated_from_bitmap(299&mut self,300bitmap: &Bitmap,301start: usize,302length: usize,303repeats: usize,304) {305let (slice, bm_offset, bm_length) = bitmap.as_slice();306assert!(start + length <= bm_length);307self.extend_each_repeated_from_slice(slice, bm_offset + start, length, repeats);308}309310pub fn subslice_extend_from_opt_validity(311&mut self,312bitmap: Option<&Bitmap>,313start: usize,314length: usize,315) {316match bitmap {317Some(bm) => self.subslice_extend_from_bitmap(bm, start, length),318None => self.extend_constant(length, true),319}320}321322pub fn subslice_extend_each_repeated_from_opt_validity(323&mut self,324bitmap: Option<&Bitmap>,325start: usize,326length: usize,327repeats: usize,328) {329match bitmap {330Some(bm) => self.subslice_extend_each_repeated_from_bitmap(bm, start, length, repeats),331None => self.extend_constant(length * repeats, true),332}333}334335/// # Safety336/// The indices must be in-bounds.337pub unsafe fn gather_extend_from_slice(338&mut self,339slice: &[u8],340offset: usize,341length: usize,342idxs: &[IdxSize],343) {344assert!(8 * slice.len() >= offset + length);345346self.reserve(idxs.len());347unsafe {348for idx in idxs {349debug_assert!((*idx as usize) < length);350let idx_in_slice = offset + *idx as usize;351let bit = (*slice.get_unchecked(idx_in_slice / 8) >> (idx_in_slice % 8)) & 1;352self.push_unchecked(bit != 0);353}354}355}356357pub fn opt_gather_extend_from_slice(358&mut self,359slice: &[u8],360offset: usize,361length: usize,362idxs: &[IdxSize],363) {364assert!(8 * slice.len() >= offset + length);365366self.reserve(idxs.len());367unsafe {368for idx in idxs {369if (*idx as usize) < length {370let idx_in_slice = offset + *idx as usize;371let bit = (*slice.get_unchecked(idx_in_slice / 8) >> (idx_in_slice % 8)) & 1;372self.push_unchecked(bit != 0);373} else {374self.push_unchecked(false);375}376}377}378}379380/// # Safety381/// The indices must be in-bounds.382pub unsafe fn gather_extend_from_bitmap(&mut self, bitmap: &Bitmap, idxs: &[IdxSize]) {383let (slice, offset, length) = bitmap.as_slice();384self.gather_extend_from_slice(slice, offset, length, idxs);385}386387pub fn opt_gather_extend_from_bitmap(&mut self, bitmap: &Bitmap, idxs: &[IdxSize]) {388let (slice, offset, length) = bitmap.as_slice();389self.opt_gather_extend_from_slice(slice, offset, length, idxs);390}391392/// # Safety393/// The indices must be in-bounds.394pub unsafe fn gather_extend_from_opt_validity(395&mut self,396bitmap: Option<&Bitmap>,397idxs: &[IdxSize],398length: usize,399) {400if let Some(bm) = bitmap {401let (slice, offset, sl_length) = bm.as_slice();402debug_assert_eq!(sl_length, length);403self.gather_extend_from_slice(slice, offset, length, idxs);404} else {405self.extend_constant(length, true);406}407}408409pub fn opt_gather_extend_from_opt_validity(410&mut self,411bitmap: Option<&Bitmap>,412idxs: &[IdxSize],413length: usize,414) {415if let Some(bm) = bitmap {416let (slice, offset, sl_length) = bm.as_slice();417debug_assert_eq!(sl_length, length);418self.opt_gather_extend_from_slice(slice, offset, sl_length, idxs);419} else {420unsafe {421self.reserve(idxs.len());422for idx in idxs {423self.push_unchecked((*idx as usize) < length);424}425}426}427}428429/// # Safety430/// May only be called once at the end.431unsafe fn finish(&mut self) {432if !self.bit_len.is_multiple_of(64) {433self.bytes.extend_from_slice(&self.buf.to_le_bytes());434self.set_bits_in_bytes += self.buf.count_ones() as usize;435self.buf = 0;436}437}438439/// Converts this BitmapBuilder into a mutable bitmap.440pub fn into_mut(mut self) -> MutableBitmap {441unsafe {442self.finish();443MutableBitmap::from_vec(self.bytes, self.bit_len)444}445}446447/// The same as into_mut, but returns None if the bitmap is all-ones.448pub fn into_opt_mut_validity(mut self) -> Option<MutableBitmap> {449unsafe {450self.finish();451if self.set_bits_in_bytes == self.bit_len {452return None;453}454Some(MutableBitmap::from_vec(self.bytes, self.bit_len))455}456}457458/// Freezes this BitmapBuilder into an immutable Bitmap.459pub fn freeze(mut self) -> Bitmap {460unsafe {461self.finish();462let storage = SharedStorage::from_vec(self.bytes);463Bitmap::from_inner_unchecked(464storage,4650,466self.bit_len,467Some(self.bit_len - self.set_bits_in_bytes),468)469}470}471472/// The same as freeze, but returns None if the bitmap is all-ones.473pub fn into_opt_validity(mut self) -> Option<Bitmap> {474unsafe {475self.finish();476if self.set_bits_in_bytes == self.bit_len {477return None;478}479let storage = SharedStorage::from_vec(self.bytes);480let bitmap = Bitmap::from_inner_unchecked(481storage,4820,483self.bit_len,484Some(self.bit_len - self.set_bits_in_bytes),485);486Some(bitmap)487}488}489490pub fn extend_trusted_len_iter<I>(&mut self, iterator: I)491where492I: Iterator<Item = bool> + TrustedLen,493{494self.reserve(iterator.size_hint().1.unwrap());495for b in iterator {496// SAFETY: we reserved and the iterator's length is trusted.497unsafe {498self.push_unchecked(b);499}500}501}502503#[inline]504pub fn from_trusted_len_iter<I>(iterator: I) -> Self505where506I: Iterator<Item = bool> + TrustedLen,507{508let mut builder = Self::new();509builder.extend_trusted_len_iter(iterator);510builder511}512}513514/// A wrapper for BitmapBuilder that does not allocate until the first false is515/// pushed. Less efficient if you know there are false values because it must516/// check if it has allocated for each push.517pub enum OptBitmapBuilder {518AllTrue { bit_len: usize, bit_cap: usize },519MayHaveFalse(BitmapBuilder),520}521522impl Default for OptBitmapBuilder {523fn default() -> Self {524Self::AllTrue {525bit_len: 0,526bit_cap: 0,527}528}529}530531impl OptBitmapBuilder {532pub fn reserve(&mut self, additional: usize) {533match self {534Self::AllTrue { bit_len, bit_cap } => {535*bit_cap = usize::max(*bit_cap, *bit_len + additional);536},537Self::MayHaveFalse(inner) => inner.reserve(additional),538}539}540541pub fn extend_constant(&mut self, length: usize, value: bool) {542match self {543Self::AllTrue { bit_len, bit_cap } => {544if value {545*bit_cap = usize::max(*bit_cap, *bit_len + length);546*bit_len += length;547} else {548self.get_builder().extend_constant(length, value);549}550},551Self::MayHaveFalse(inner) => inner.extend_constant(length, value),552}553}554555pub fn into_opt_validity(self) -> Option<Bitmap> {556match self {557Self::AllTrue { .. } => None,558Self::MayHaveFalse(inner) => inner.into_opt_validity(),559}560}561562pub fn subslice_extend_from_opt_validity(563&mut self,564bitmap: Option<&Bitmap>,565start: usize,566length: usize,567) {568match bitmap {569Some(bm) => {570self.get_builder()571.subslice_extend_from_bitmap(bm, start, length);572},573None => {574self.extend_constant(length, true);575},576}577}578579pub fn subslice_extend_each_repeated_from_opt_validity(580&mut self,581bitmap: Option<&Bitmap>,582start: usize,583length: usize,584repeats: usize,585) {586match bitmap {587Some(bm) => {588self.get_builder()589.subslice_extend_each_repeated_from_bitmap(bm, start, length, repeats);590},591None => {592self.extend_constant(length * repeats, true);593},594}595}596597/// # Safety598/// The indices must be in-bounds.599pub unsafe fn gather_extend_from_opt_validity(600&mut self,601bitmap: Option<&Bitmap>,602idxs: &[IdxSize],603) {604match bitmap {605Some(bm) => {606self.get_builder().gather_extend_from_bitmap(bm, idxs);607},608None => {609self.extend_constant(idxs.len(), true);610},611}612}613614pub fn opt_gather_extend_from_opt_validity(615&mut self,616bitmap: Option<&Bitmap>,617idxs: &[IdxSize],618length: usize,619) {620match bitmap {621Some(bm) => {622self.get_builder().opt_gather_extend_from_bitmap(bm, idxs);623},624None => {625if let Some(first_oob) = idxs.iter().position(|idx| *idx as usize >= length) {626let builder = self.get_builder();627builder.extend_constant(first_oob, true);628for idx in idxs.iter().skip(first_oob) {629builder.push((*idx as usize) < length);630}631} else {632self.extend_constant(idxs.len(), true);633}634},635}636}637638fn get_builder(&mut self) -> &mut BitmapBuilder {639match self {640Self::AllTrue { bit_len, bit_cap } => {641let mut builder = BitmapBuilder::with_capacity(*bit_cap);642builder.extend_constant(*bit_len, true);643*self = Self::MayHaveFalse(builder);644let Self::MayHaveFalse(inner) = self else {645unreachable!()646};647inner648},649Self::MayHaveFalse(inner) => inner,650}651}652}653654655