Path: blob/main/crates/polars-parquet/src/parquet/write/indexes/serialize.rs
6940 views
use polars_parquet_format::{BoundaryOrder, ColumnIndex, OffsetIndex, PageLocation};12use crate::parquet::error::{ParquetError, ParquetResult};3use crate::parquet::write::page::{PageWriteSpec, is_data_page};45pub fn serialize_column_index(pages: &[PageWriteSpec]) -> ParquetResult<ColumnIndex> {6let mut null_pages = Vec::with_capacity(pages.len());7let mut min_values = Vec::with_capacity(pages.len());8let mut max_values = Vec::with_capacity(pages.len());9let mut null_counts = Vec::with_capacity(pages.len());1011pages12.iter()13.filter(|x| is_data_page(x))14.try_for_each(|spec| {15if let Some(stats) = &spec.statistics {16let stats = stats.serialize();1718let null_count = stats19.null_count20.ok_or_else(|| ParquetError::oos("null count of a page is required"))?;21null_counts.push(null_count);2223if let Some(min_value) = stats.min_value {24min_values.push(min_value);25max_values.push(26stats27.max_value28.ok_or_else(|| ParquetError::oos("max value of a page is required"))?,29);30null_pages.push(false)31} else {32min_values.push(vec![0]);33max_values.push(vec![0]);34null_pages.push(true)35}3637ParquetResult::Ok(())38} else {39Err(ParquetError::oos(40"options were set to write statistics but some pages miss them",41))42}43})?;44Ok(ColumnIndex {45null_pages,46min_values,47max_values,48boundary_order: BoundaryOrder::UNORDERED,49null_counts: Some(null_counts),50repetition_level_histograms: None,51definition_level_histograms: None,52})53}5455pub fn serialize_offset_index(pages: &[PageWriteSpec]) -> ParquetResult<OffsetIndex> {56let mut first_row_index = 0;57let page_locations = pages58.iter()59.filter(|x| is_data_page(x))60.map(|spec| {61let location = PageLocation {62offset: spec.offset.try_into()?,63compressed_page_size: spec.bytes_written.try_into()?,64first_row_index,65};66let num_rows = spec.num_rows;67first_row_index += num_rows as i64;68Ok(location)69})70.collect::<ParquetResult<Vec<_>>>()?;7172Ok(OffsetIndex {73page_locations,74unencoded_byte_array_data_bytes: None,75})76}777879