Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/parquet/metadata/file_metadata.rs
6940 views
1
use polars_parquet_format::ColumnOrder as TColumnOrder;
2
3
use super::RowGroupMetadata;
4
use super::column_order::ColumnOrder;
5
use super::schema_descriptor::SchemaDescriptor;
6
use crate::parquet::error::ParquetError;
7
use crate::parquet::metadata::get_sort_order;
8
pub use crate::parquet::thrift_format::KeyValue;
9
10
/// Metadata for a Parquet file.
11
// This is almost equal to [`polars_parquet_format::FileMetaData`] but contains the descriptors,
12
// which are crucial to deserialize pages.
13
#[derive(Debug, Clone)]
14
pub struct FileMetadata {
15
/// version of this file.
16
pub version: i32,
17
/// number of rows in the file.
18
pub num_rows: usize,
19
/// Max row group height, useful for sharing column materializations.
20
pub max_row_group_height: usize,
21
/// String message for application that wrote this file.
22
///
23
/// This should have the following format:
24
/// `<application> version <application version> (build <application build hash>)`.
25
///
26
/// ```shell
27
/// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
28
/// ```
29
pub created_by: Option<String>,
30
/// The row groups of this file
31
pub row_groups: Vec<RowGroupMetadata>,
32
/// key_value_metadata of this file.
33
pub key_value_metadata: Option<Vec<KeyValue>>,
34
/// schema descriptor.
35
pub schema_descr: SchemaDescriptor,
36
/// Column (sort) order used for `min` and `max` values of each column in this file.
37
///
38
/// Each column order corresponds to one column, determined by its position in the
39
/// list, matching the position of the column in the schema.
40
///
41
/// When `None` is returned, there are no column orders available, and each column
42
/// should be assumed to have undefined (legacy) column order.
43
pub column_orders: Option<Vec<ColumnOrder>>,
44
}
45
46
impl FileMetadata {
47
/// Returns the [`SchemaDescriptor`] that describes schema of this file.
48
pub fn schema(&self) -> &SchemaDescriptor {
49
&self.schema_descr
50
}
51
52
/// returns the metadata
53
pub fn key_value_metadata(&self) -> &Option<Vec<KeyValue>> {
54
&self.key_value_metadata
55
}
56
57
/// Returns column order for `i`th column in this file.
58
/// If column orders are not available, returns undefined (legacy) column order.
59
pub fn column_order(&self, i: usize) -> ColumnOrder {
60
self.column_orders
61
.as_ref()
62
.map(|data| data[i])
63
.unwrap_or(ColumnOrder::Undefined)
64
}
65
66
/// Deserializes [`crate::parquet::thrift_format::FileMetadata`] into this struct
67
pub fn try_from_thrift(
68
metadata: polars_parquet_format::FileMetaData,
69
) -> Result<Self, ParquetError> {
70
let schema_descr = SchemaDescriptor::try_from_thrift(&metadata.schema)?;
71
72
let mut max_row_group_height = 0;
73
74
let row_groups = metadata
75
.row_groups
76
.into_iter()
77
.map(|rg| {
78
let md = RowGroupMetadata::try_from_thrift(&schema_descr, rg)?;
79
max_row_group_height = max_row_group_height.max(md.num_rows());
80
Ok(md)
81
})
82
.collect::<Result<_, ParquetError>>()?;
83
84
let column_orders = metadata
85
.column_orders
86
.map(|orders| parse_column_orders(&orders, &schema_descr));
87
88
Ok(FileMetadata {
89
version: metadata.version,
90
num_rows: metadata.num_rows.try_into()?,
91
max_row_group_height,
92
created_by: metadata.created_by,
93
row_groups,
94
key_value_metadata: metadata.key_value_metadata,
95
schema_descr,
96
column_orders,
97
})
98
}
99
}
100
101
/// Parses [`ColumnOrder`] from Thrift definition.
102
fn parse_column_orders(
103
orders: &[TColumnOrder],
104
schema_descr: &SchemaDescriptor,
105
) -> Vec<ColumnOrder> {
106
schema_descr
107
.columns()
108
.iter()
109
.zip(orders.iter())
110
.map(|(column, order)| match order {
111
TColumnOrder::TYPEORDER(_) => {
112
let sort_order = get_sort_order(
113
&column.descriptor.primitive_type.logical_type,
114
&column.descriptor.primitive_type.converted_type,
115
&column.descriptor.primitive_type.physical_type,
116
);
117
ColumnOrder::TypeDefinedOrder(sort_order)
118
},
119
})
120
.collect()
121
}
122
123