Path: blob/main/crates/polars-parquet/src/arrow/write/boolean/basic.rs
6940 views
use arrow::array::*;1use polars_error::{PolarsResult, polars_bail};23use super::super::{WriteOptions, utils};4use crate::arrow::read::schema::is_nullable;5use crate::parquet::encoding::Encoding;6use crate::parquet::encoding::hybrid_rle::{self, bitpacked_encode};7use crate::parquet::page::DataPage;8use crate::parquet::schema::types::PrimitiveType;9use crate::parquet::statistics::{BooleanStatistics, ParquetStatistics};10use crate::write::{EncodeNullability, StatisticsOptions};1112fn encode(iterator: impl Iterator<Item = bool>, buffer: &mut Vec<u8>) -> PolarsResult<()> {13// encode values using bitpacking14let len = buffer.len();15let mut buffer = std::io::Cursor::new(buffer);16buffer.set_position(len as u64);17Ok(bitpacked_encode(&mut buffer, iterator)?)18}1920pub(super) fn encode_plain(21array: &BooleanArray,22encode_options: EncodeNullability,23buffer: &mut Vec<u8>,24) -> PolarsResult<()> {25if encode_options.is_optional() && array.validity().is_some() {26encode(array.non_null_values_iter(), buffer)27} else {28encode(array.values().iter(), buffer)29}30}3132pub(super) fn encode_hybrid_rle(33array: &BooleanArray,34encode_options: EncodeNullability,35buffer: &mut Vec<u8>,36) -> PolarsResult<()> {37buffer.extend_from_slice(&[0; 4]);38let start = buffer.len();3940if encode_options.is_optional() && array.validity().is_some() {41hybrid_rle::encode(buffer, array.non_null_values_iter(), 1)?;42} else {43hybrid_rle::encode(buffer, array.values().iter(), 1)?;44}4546let length = buffer.len() - start;4748// write the first 4 bytes as length49let length = (length as i32).to_le_bytes();50(0..4).for_each(|i| buffer[start - 4 + i] = length[i]);5152Ok(())53}5455pub fn array_to_page(56array: &BooleanArray,57options: WriteOptions,58type_: PrimitiveType,59encoding: Encoding,60) -> PolarsResult<DataPage> {61let is_optional = is_nullable(&type_.field_info);62let encode_nullability = EncodeNullability::new(is_optional);6364let validity = array.validity();6566let mut buffer = vec![];67utils::write_def_levels(68&mut buffer,69is_optional,70validity,71array.len(),72options.version,73)?;7475let definition_levels_byte_length = buffer.len();7677match encoding {78Encoding::Plain => encode_plain(array, encode_nullability, &mut buffer)?,79Encoding::Rle => encode_hybrid_rle(array, encode_nullability, &mut buffer)?,80other => polars_bail!(nyi = "Encoding boolean as {other:?}"),81}8283let statistics = if options.has_statistics() {84Some(build_statistics(array, &options.statistics))85} else {86None87};8889utils::build_plain_page(90buffer,91array.len(),92array.len(),93array.null_count(),940,95definition_levels_byte_length,96statistics,97type_,98options,99encoding,100)101}102103pub(super) fn build_statistics(104array: &BooleanArray,105options: &StatisticsOptions,106) -> ParquetStatistics {107use polars_compute::min_max::MinMaxKernel;108use polars_compute::unique::GenericUniqueKernel;109110BooleanStatistics {111null_count: options.null_count.then(|| array.null_count() as i64),112distinct_count: options113.distinct_count114.then(|| array.n_unique_non_null().try_into().ok())115.flatten(),116max_value: options117.max_value118.then(|| array.max_propagate_nan_kernel())119.flatten(),120min_value: options121.min_value122.then(|| array.min_propagate_nan_kernel())123.flatten(),124}125.serialize()126}127128129