Path: blob/main/crates/polars-parquet/src/parquet/metadata/file_metadata.rs
6940 views
use polars_parquet_format::ColumnOrder as TColumnOrder;12use super::RowGroupMetadata;3use super::column_order::ColumnOrder;4use super::schema_descriptor::SchemaDescriptor;5use crate::parquet::error::ParquetError;6use crate::parquet::metadata::get_sort_order;7pub use crate::parquet::thrift_format::KeyValue;89/// Metadata for a Parquet file.10// This is almost equal to [`polars_parquet_format::FileMetaData`] but contains the descriptors,11// which are crucial to deserialize pages.12#[derive(Debug, Clone)]13pub struct FileMetadata {14/// version of this file.15pub version: i32,16/// number of rows in the file.17pub num_rows: usize,18/// Max row group height, useful for sharing column materializations.19pub max_row_group_height: usize,20/// String message for application that wrote this file.21///22/// This should have the following format:23/// `<application> version <application version> (build <application build hash>)`.24///25/// ```shell26/// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)27/// ```28pub created_by: Option<String>,29/// The row groups of this file30pub row_groups: Vec<RowGroupMetadata>,31/// key_value_metadata of this file.32pub key_value_metadata: Option<Vec<KeyValue>>,33/// schema descriptor.34pub schema_descr: SchemaDescriptor,35/// Column (sort) order used for `min` and `max` values of each column in this file.36///37/// Each column order corresponds to one column, determined by its position in the38/// list, matching the position of the column in the schema.39///40/// When `None` is returned, there are no column orders available, and each column41/// should be assumed to have undefined (legacy) column order.42pub column_orders: Option<Vec<ColumnOrder>>,43}4445impl FileMetadata {46/// Returns the [`SchemaDescriptor`] that describes schema of this file.47pub fn schema(&self) -> &SchemaDescriptor {48&self.schema_descr49}5051/// returns the metadata52pub fn key_value_metadata(&self) -> &Option<Vec<KeyValue>> {53&self.key_value_metadata54}5556/// Returns column order for `i`th column in this file.57/// If column orders are not available, returns undefined (legacy) column order.58pub fn column_order(&self, i: usize) -> ColumnOrder {59self.column_orders60.as_ref()61.map(|data| data[i])62.unwrap_or(ColumnOrder::Undefined)63}6465/// Deserializes [`crate::parquet::thrift_format::FileMetadata`] into this struct66pub fn try_from_thrift(67metadata: polars_parquet_format::FileMetaData,68) -> Result<Self, ParquetError> {69let schema_descr = SchemaDescriptor::try_from_thrift(&metadata.schema)?;7071let mut max_row_group_height = 0;7273let row_groups = metadata74.row_groups75.into_iter()76.map(|rg| {77let md = RowGroupMetadata::try_from_thrift(&schema_descr, rg)?;78max_row_group_height = max_row_group_height.max(md.num_rows());79Ok(md)80})81.collect::<Result<_, ParquetError>>()?;8283let column_orders = metadata84.column_orders85.map(|orders| parse_column_orders(&orders, &schema_descr));8687Ok(FileMetadata {88version: metadata.version,89num_rows: metadata.num_rows.try_into()?,90max_row_group_height,91created_by: metadata.created_by,92row_groups,93key_value_metadata: metadata.key_value_metadata,94schema_descr,95column_orders,96})97}98}99100/// Parses [`ColumnOrder`] from Thrift definition.101fn parse_column_orders(102orders: &[TColumnOrder],103schema_descr: &SchemaDescriptor,104) -> Vec<ColumnOrder> {105schema_descr106.columns()107.iter()108.zip(orders.iter())109.map(|(column, order)| match order {110TColumnOrder::TYPEORDER(_) => {111let sort_order = get_sort_order(112&column.descriptor.primitive_type.logical_type,113&column.descriptor.primitive_type.converted_type,114&column.descriptor.primitive_type.physical_type,115);116ColumnOrder::TypeDefinedOrder(sort_order)117},118})119.collect()120}121122123