Path: blob/main/crates/polars-parquet/src/arrow/read/deserialize/boolean.rs
6940 views
use arrow::array::{BooleanArray, Splitable};1use arrow::bitmap::bitmask::BitMask;2use arrow::bitmap::utils::BitmapIter;3use arrow::bitmap::{Bitmap, BitmapBuilder};4use arrow::datatypes::ArrowDataType;5use polars_compute::filter::filter_boolean_kernel;67use super::dictionary_encoded::{append_validity, constrain_page_validity};8use super::utils::{9self, Decoded, Decoder, decode_hybrid_rle_into_bitmap, filter_from_range, freeze_validity,10};11use super::{Filter, PredicateFilter};12use crate::parquet::encoding::Encoding;13use crate::parquet::encoding::hybrid_rle::{HybridRleChunk, HybridRleDecoder};14use crate::parquet::error::ParquetResult;15use crate::parquet::page::{DataPage, DictPage, split_buffer};1617#[allow(clippy::large_enum_variant)]18#[derive(Debug)]19pub(crate) enum StateTranslation<'a> {20Plain(BitMask<'a>),21Rle(HybridRleDecoder<'a>),22}2324impl<'a> utils::StateTranslation<'a, BooleanDecoder> for StateTranslation<'a> {25type PlainDecoder = BitmapIter<'a>;2627fn new(28_decoder: &BooleanDecoder,29page: &'a DataPage,30_dict: Option<&'a <BooleanDecoder as Decoder>::Dict>,31page_validity: Option<&Bitmap>,32) -> ParquetResult<Self> {33let values = split_buffer(page)?.values;3435match page.encoding() {36Encoding::Plain => {37let max_num_values = values.len() * u8::BITS as usize;38let num_values = if page_validity.is_some() {39// @NOTE: We overestimate the amount of values here, but in the V140// specification we don't really have a way to know the number of valid items.41// Without traversing the list.42max_num_values43} else {44// @NOTE: We cannot really trust the value from this as it might relate to the45// number of top-level nested values. Therefore, we do a `min` with the maximum46// number of possible values.47usize::min(page.num_values(), max_num_values)48};4950Ok(Self::Plain(BitMask::new(values, 0, num_values)))51},52Encoding::Rle => {53// @NOTE: For a nullable list, we might very well overestimate the amount of54// values, but we never collect those items. We don't really have a way to know the55// number of valid items in the V1 specification.5657// For RLE boolean values the length in bytes is pre-pended.58// https://github.com/apache/parquet-format/blob/e517ac4dbe08d518eb5c2e58576d4c711973db94/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--359let (_len_in_bytes, values) = values.split_at(4);60Ok(Self::Rle(HybridRleDecoder::new(61values,621,63page.num_values(),64)))65},66_ => Err(utils::not_implemented(page)),67}68}69fn num_rows(&self) -> usize {70match self {71Self::Plain(m) => m.len(),72Self::Rle(m) => m.len(),73}74}75}7677fn decode_required_rle(78values: HybridRleDecoder<'_>,79limit: Option<usize>,80target: &mut BitmapBuilder,81) -> ParquetResult<()> {82decode_hybrid_rle_into_bitmap(values, limit, target)?;83Ok(())84}8586fn decode_optional_rle(87values: HybridRleDecoder<'_>,88target: &mut BitmapBuilder,89page_validity: &Bitmap,90) -> ParquetResult<()> {91debug_assert!(page_validity.set_bits() <= values.len());9293if page_validity.unset_bits() == 0 {94return decode_required_rle(values, Some(page_validity.len()), target);95}9697target.reserve(page_validity.len());9899let mut validity_mask = BitMask::from_bitmap(page_validity);100101for chunk in values.into_chunk_iter() {102let chunk = chunk?;103104match chunk {105HybridRleChunk::Rle(value, size) => {106let offset = validity_mask107.nth_set_bit_idx(size, 0)108.unwrap_or(validity_mask.len());109110let t;111(t, validity_mask) = validity_mask.split_at(offset);112113target.extend_constant(t.len(), value != 0);114},115HybridRleChunk::Bitpacked(decoder) => {116let decoder_slice = decoder.as_slice();117let offset = validity_mask118.nth_set_bit_idx(decoder.len(), 0)119.unwrap_or(validity_mask.len());120121let decoder_validity;122(decoder_validity, validity_mask) = validity_mask.split_at(offset);123124let mut offset = 0;125let mut validity_iter = decoder_validity.iter();126while validity_iter.num_remaining() > 0 {127let num_valid = validity_iter.take_leading_ones();128target.extend_from_slice(decoder_slice, offset, num_valid);129offset += num_valid;130131let num_invalid = validity_iter.take_leading_zeros();132target.extend_constant(num_invalid, false);133}134},135}136}137138if cfg!(debug_assertions) {139assert_eq!(validity_mask.set_bits(), 0);140}141target.extend_constant(validity_mask.len(), false);142143Ok(())144}145146fn decode_masked_required_rle(147values: HybridRleDecoder<'_>,148target: &mut BitmapBuilder,149mask: &Bitmap,150) -> ParquetResult<()> {151debug_assert!(mask.len() <= values.len());152153if mask.unset_bits() == 0 {154return decode_required_rle(values, Some(mask.len()), target);155}156157let mut im_target = BitmapBuilder::new();158decode_required_rle(values, Some(mask.len()), &mut im_target)?;159160target.extend_from_bitmap(&filter_boolean_kernel(&im_target.freeze(), mask));161162Ok(())163}164165fn decode_masked_optional_rle(166values: HybridRleDecoder<'_>,167target: &mut BitmapBuilder,168page_validity: &Bitmap,169mask: &Bitmap,170) -> ParquetResult<()> {171debug_assert_eq!(page_validity.len(), mask.len());172debug_assert!(mask.len() <= values.len());173174if mask.unset_bits() == 0 {175return decode_optional_rle(values, target, page_validity);176}177178if page_validity.unset_bits() == 0 {179return decode_masked_required_rle(values, target, mask);180}181182let mut im_target = BitmapBuilder::new();183decode_optional_rle(values, &mut im_target, page_validity)?;184185target.extend_from_bitmap(&filter_boolean_kernel(&im_target.freeze(), mask));186187Ok(())188}189190fn decode_required_plain(values: BitMask<'_>, target: &mut BitmapBuilder) -> ParquetResult<()> {191target.extend_from_bitmask(values);192Ok(())193}194195fn decode_optional_plain(196mut values: BitMask<'_>,197target: &mut BitmapBuilder,198mut page_validity: Bitmap,199) -> ParquetResult<()> {200debug_assert!(page_validity.set_bits() <= values.len());201202if page_validity.unset_bits() == 0 {203return decode_required_plain(values.sliced(0, page_validity.len()), target);204}205206target.reserve(page_validity.len());207208while !page_validity.is_empty() {209let num_valid = page_validity.take_leading_ones();210let iv;211(iv, values) = values.split_at(num_valid);212target.extend_from_bitmask(iv);213214let num_invalid = page_validity.take_leading_zeros();215target.extend_constant(num_invalid, false);216}217218Ok(())219}220221fn decode_masked_required_plain(222mut values: BitMask,223target: &mut BitmapBuilder,224mut mask: Bitmap,225) -> ParquetResult<()> {226debug_assert!(mask.len() <= values.len());227228let leading_zeros = mask.take_leading_zeros();229mask.take_trailing_zeros();230231values = values.sliced(leading_zeros, mask.len());232233if mask.unset_bits() == 0 {234return decode_required_plain(values, target);235}236237let mut im_target = BitmapBuilder::new();238decode_required_plain(values, &mut im_target)?;239240target.extend_from_bitmap(&filter_boolean_kernel(&im_target.freeze(), &mask));241242Ok(())243}244245fn decode_masked_optional_plain(246mut values: BitMask<'_>,247target: &mut BitmapBuilder,248mut page_validity: Bitmap,249mut mask: Bitmap,250) -> ParquetResult<()> {251debug_assert_eq!(page_validity.len(), mask.len());252debug_assert!(page_validity.set_bits() <= values.len());253254let leading_zeros = mask.take_leading_zeros();255mask.take_trailing_zeros();256257let (skipped, truncated);258(skipped, page_validity) = page_validity.split_at(leading_zeros);259(page_validity, truncated) = page_validity.split_at(mask.len());260261let skipped_values = skipped.set_bits();262let truncated_values = truncated.set_bits();263values = values.sliced(264skipped_values,265values.len() - skipped_values - truncated_values,266);267268if mask.unset_bits() == 0 {269return decode_optional_plain(values, target, page_validity);270}271272if page_validity.unset_bits() == 0 {273return decode_masked_required_plain(values, target, mask);274}275276let mut im_target = BitmapBuilder::new();277decode_optional_plain(values, &mut im_target, page_validity)?;278279target.extend_from_bitmap(&filter_boolean_kernel(&im_target.freeze(), &mask));280281Ok(())282}283284impl Decoded for (BitmapBuilder, BitmapBuilder) {285fn len(&self) -> usize {286self.0.len()287}288fn extend_nulls(&mut self, n: usize) {289self.0.extend_constant(n, false);290self.1.extend_constant(n, false);291}292}293294pub(crate) struct BooleanDecoder;295296impl Decoder for BooleanDecoder {297type Translation<'a> = StateTranslation<'a>;298type Dict = BooleanArray;299type DecodedState = (BitmapBuilder, BitmapBuilder);300type Output = BooleanArray;301302fn with_capacity(&self, capacity: usize) -> Self::DecodedState {303(304BitmapBuilder::with_capacity(capacity),305BitmapBuilder::with_capacity(capacity),306)307}308309fn deserialize_dict(&mut self, _: DictPage) -> ParquetResult<Self::Dict> {310Ok(BooleanArray::new_empty(ArrowDataType::Boolean))311}312313fn finalize(314&self,315dtype: ArrowDataType,316_dict: Option<Self::Dict>,317(values, validity): Self::DecodedState,318) -> ParquetResult<Self::Output> {319let validity = freeze_validity(validity);320Ok(BooleanArray::new(dtype, values.freeze(), validity))321}322323fn has_predicate_specialization(324&self,325_state: &utils::State<'_, Self>,326_predicate: &PredicateFilter,327) -> ParquetResult<bool> {328// @TODO: This can be enabled for the fast paths329Ok(false)330}331332fn extend_decoded(333&self,334decoded: &mut Self::DecodedState,335additional: &dyn arrow::array::Array,336is_optional: bool,337) -> ParquetResult<()> {338let additional = additional.as_any().downcast_ref::<BooleanArray>().unwrap();339decoded.0.extend_from_bitmap(additional.values());340match additional.validity() {341Some(v) => decoded.1.extend_from_bitmap(v),342None if is_optional => decoded.1.extend_constant(additional.len(), true),343None => {},344}345346Ok(())347}348349fn extend_filtered_with_state(350&mut self,351state: utils::State<'_, Self>,352(target, validity): &mut Self::DecodedState,353_pred_true_mask: &mut BitmapBuilder,354filter: Option<super::Filter>,355) -> ParquetResult<()> {356match state.translation {357StateTranslation::Plain(values) => {358if state.is_optional {359append_validity(360state.page_validity.as_ref(),361filter.as_ref(),362validity,363values.len(),364);365}366367let page_validity = constrain_page_validity(368values.len(),369state.page_validity.as_ref(),370filter.as_ref(),371);372373match (filter, page_validity) {374(None, None) => decode_required_plain(values, target),375(Some(Filter::Range(rng)), None) => {376decode_required_plain(values.sliced(rng.start, rng.len()), target)377},378(None, Some(page_validity)) => {379decode_optional_plain(values, target, page_validity)380},381(Some(Filter::Range(rng)), Some(mut page_validity)) => {382let (skipped, truncated);383(skipped, page_validity) = page_validity.split_at(rng.start);384(page_validity, truncated) = page_validity.split_at(rng.len());385386let skipped_values = skipped.set_bits();387let truncated_values = truncated.set_bits();388let values = values.sliced(389skipped_values,390values.len() - skipped_values - truncated_values,391);392393decode_optional_plain(values, target, page_validity)394},395(Some(Filter::Mask(mask)), None) => {396decode_masked_required_plain(values, target, mask)397},398(Some(Filter::Mask(mask)), Some(page_validity)) => {399decode_masked_optional_plain(values, target, page_validity, mask)400},401(Some(Filter::Predicate(_)), _) => todo!(),402}?;403404Ok(())405},406StateTranslation::Rle(values) => {407if state.is_optional {408append_validity(409state.page_validity.as_ref(),410filter.as_ref(),411validity,412values.len(),413);414}415416let page_validity = constrain_page_validity(417values.len(),418state.page_validity.as_ref(),419filter.as_ref(),420);421422match (filter, page_validity) {423(None, None) => decode_required_rle(values, None, target),424(Some(Filter::Range(rng)), None) if rng.start == 0 => {425decode_required_rle(values, Some(rng.end), target)426},427(None, Some(page_validity)) => {428decode_optional_rle(values, target, &page_validity)429},430(Some(Filter::Range(rng)), Some(page_validity)) if rng.start == 0 => {431decode_optional_rle(values, target, &page_validity)432},433(Some(Filter::Mask(filter)), None) => {434decode_masked_required_rle(values, target, &filter)435},436(Some(Filter::Mask(filter)), Some(page_validity)) => {437decode_masked_optional_rle(values, target, &page_validity, &filter)438},439(Some(Filter::Range(rng)), None) => {440decode_masked_required_rle(values, target, &filter_from_range(rng))441},442(Some(Filter::Range(rng)), Some(page_validity)) => decode_masked_optional_rle(443values,444target,445&page_validity,446&filter_from_range(rng),447),448(Some(Filter::Predicate(_)), _) => todo!(),449}?;450451Ok(())452},453}454}455}456457458