Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/parquet/metadata/schema_descriptor.rs
6940 views
1
use polars_parquet_format::SchemaElement;
2
use polars_utils::pl_str::PlSmallStr;
3
#[cfg(feature = "serde")]
4
use serde::{Deserialize, Serialize};
5
6
use super::column_descriptor::{BaseType, ColumnDescriptor, Descriptor};
7
use crate::parquet::error::{ParquetError, ParquetResult};
8
use crate::parquet::schema::Repetition;
9
use crate::parquet::schema::io_message::from_message;
10
use crate::parquet::schema::types::{FieldInfo, ParquetType};
11
12
/// A schema descriptor. This encapsulates the top-level schemas for all the columns,
13
/// as well as all descriptors for all the primitive columns.
14
#[derive(Debug, Clone)]
15
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
16
pub struct SchemaDescriptor {
17
name: PlSmallStr,
18
// The top-level schema (the "message" type).
19
fields: Vec<ParquetType>,
20
21
// All the descriptors for primitive columns in this schema, constructed from
22
// `schema` in DFS order.
23
leaves: Vec<ColumnDescriptor>,
24
}
25
26
impl SchemaDescriptor {
27
/// Creates new schema descriptor from Parquet schema.
28
pub fn new(name: PlSmallStr, fields: Vec<ParquetType>) -> Self {
29
let mut leaves = vec![];
30
for f in &fields {
31
let mut path = vec![];
32
build_tree(f, BaseType::Owned(f.clone()), 0, 0, &mut leaves, &mut path);
33
}
34
35
Self {
36
name,
37
fields,
38
leaves,
39
}
40
}
41
42
/// The [`ColumnDescriptor`] (leaves) of this schema.
43
///
44
/// Note that, for nested fields, this may contain more entries than the number of fields
45
/// in the file - e.g. a struct field may have two columns.
46
pub fn columns(&self) -> &[ColumnDescriptor] {
47
&self.leaves
48
}
49
50
/// The schemas' name.
51
pub fn name(&self) -> &str {
52
&self.name
53
}
54
55
/// The schemas' fields.
56
pub fn fields(&self) -> &[ParquetType] {
57
&self.fields
58
}
59
60
/// The schemas' leaves.
61
pub fn leaves(&self) -> &[ColumnDescriptor] {
62
&self.leaves
63
}
64
65
pub(crate) fn into_thrift(self) -> Vec<SchemaElement> {
66
ParquetType::GroupType {
67
field_info: FieldInfo {
68
name: self.name,
69
repetition: Repetition::Optional,
70
id: None,
71
},
72
logical_type: None,
73
converted_type: None,
74
fields: self.fields,
75
}
76
.to_thrift()
77
}
78
79
fn try_from_type(type_: ParquetType) -> ParquetResult<Self> {
80
match type_ {
81
ParquetType::GroupType {
82
field_info, fields, ..
83
} => Ok(Self::new(field_info.name, fields)),
84
_ => Err(ParquetError::oos("The parquet schema MUST be a group type")),
85
}
86
}
87
88
pub(crate) fn try_from_thrift(elements: &[SchemaElement]) -> ParquetResult<Self> {
89
let schema = ParquetType::try_from_thrift(elements)?;
90
Self::try_from_type(schema)
91
}
92
93
/// Creates a schema from
94
pub fn try_from_message(message: &str) -> ParquetResult<Self> {
95
let schema = from_message(message)?;
96
Self::try_from_type(schema)
97
}
98
}
99
100
fn build_tree<'a>(
101
tp: &'a ParquetType,
102
base_tp: BaseType,
103
mut max_rep_level: i16,
104
mut max_def_level: i16,
105
leaves: &mut Vec<ColumnDescriptor>,
106
path_so_far: &mut Vec<&'a str>,
107
) {
108
path_so_far.push(tp.name());
109
match tp.get_field_info().repetition {
110
Repetition::Optional => {
111
max_def_level += 1;
112
},
113
Repetition::Repeated => {
114
max_def_level += 1;
115
max_rep_level += 1;
116
},
117
_ => {},
118
}
119
120
match tp {
121
ParquetType::PrimitiveType(p) => {
122
let path_in_schema = path_so_far.iter().copied().map(Into::into).collect();
123
leaves.push(ColumnDescriptor::new(
124
Descriptor {
125
primitive_type: p.clone(),
126
max_def_level,
127
max_rep_level,
128
},
129
path_in_schema,
130
base_tp,
131
));
132
},
133
ParquetType::GroupType { fields, .. } => {
134
let base_tp = base_tp.into_arc();
135
for f in fields {
136
build_tree(
137
f,
138
base_tp.clone(),
139
max_rep_level,
140
max_def_level,
141
leaves,
142
path_so_far,
143
);
144
path_so_far.pop();
145
}
146
},
147
}
148
}
149
150