Path: blob/main/crates/polars-parquet/src/parquet/metadata/schema_descriptor.rs
6940 views
use polars_parquet_format::SchemaElement;1use polars_utils::pl_str::PlSmallStr;2#[cfg(feature = "serde")]3use serde::{Deserialize, Serialize};45use super::column_descriptor::{BaseType, ColumnDescriptor, Descriptor};6use crate::parquet::error::{ParquetError, ParquetResult};7use crate::parquet::schema::Repetition;8use crate::parquet::schema::io_message::from_message;9use crate::parquet::schema::types::{FieldInfo, ParquetType};1011/// A schema descriptor. This encapsulates the top-level schemas for all the columns,12/// as well as all descriptors for all the primitive columns.13#[derive(Debug, Clone)]14#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]15pub struct SchemaDescriptor {16name: PlSmallStr,17// The top-level schema (the "message" type).18fields: Vec<ParquetType>,1920// All the descriptors for primitive columns in this schema, constructed from21// `schema` in DFS order.22leaves: Vec<ColumnDescriptor>,23}2425impl SchemaDescriptor {26/// Creates new schema descriptor from Parquet schema.27pub fn new(name: PlSmallStr, fields: Vec<ParquetType>) -> Self {28let mut leaves = vec![];29for f in &fields {30let mut path = vec![];31build_tree(f, BaseType::Owned(f.clone()), 0, 0, &mut leaves, &mut path);32}3334Self {35name,36fields,37leaves,38}39}4041/// The [`ColumnDescriptor`] (leaves) of this schema.42///43/// Note that, for nested fields, this may contain more entries than the number of fields44/// in the file - e.g. a struct field may have two columns.45pub fn columns(&self) -> &[ColumnDescriptor] {46&self.leaves47}4849/// The schemas' name.50pub fn name(&self) -> &str {51&self.name52}5354/// The schemas' fields.55pub fn fields(&self) -> &[ParquetType] {56&self.fields57}5859/// The schemas' leaves.60pub fn leaves(&self) -> &[ColumnDescriptor] {61&self.leaves62}6364pub(crate) fn into_thrift(self) -> Vec<SchemaElement> {65ParquetType::GroupType {66field_info: FieldInfo {67name: self.name,68repetition: Repetition::Optional,69id: None,70},71logical_type: None,72converted_type: None,73fields: self.fields,74}75.to_thrift()76}7778fn try_from_type(type_: ParquetType) -> ParquetResult<Self> {79match type_ {80ParquetType::GroupType {81field_info, fields, ..82} => Ok(Self::new(field_info.name, fields)),83_ => Err(ParquetError::oos("The parquet schema MUST be a group type")),84}85}8687pub(crate) fn try_from_thrift(elements: &[SchemaElement]) -> ParquetResult<Self> {88let schema = ParquetType::try_from_thrift(elements)?;89Self::try_from_type(schema)90}9192/// Creates a schema from93pub fn try_from_message(message: &str) -> ParquetResult<Self> {94let schema = from_message(message)?;95Self::try_from_type(schema)96}97}9899fn build_tree<'a>(100tp: &'a ParquetType,101base_tp: BaseType,102mut max_rep_level: i16,103mut max_def_level: i16,104leaves: &mut Vec<ColumnDescriptor>,105path_so_far: &mut Vec<&'a str>,106) {107path_so_far.push(tp.name());108match tp.get_field_info().repetition {109Repetition::Optional => {110max_def_level += 1;111},112Repetition::Repeated => {113max_def_level += 1;114max_rep_level += 1;115},116_ => {},117}118119match tp {120ParquetType::PrimitiveType(p) => {121let path_in_schema = path_so_far.iter().copied().map(Into::into).collect();122leaves.push(ColumnDescriptor::new(123Descriptor {124primitive_type: p.clone(),125max_def_level,126max_rep_level,127},128path_in_schema,129base_tp,130));131},132ParquetType::GroupType { fields, .. } => {133let base_tp = base_tp.into_arc();134for f in fields {135build_tree(136f,137base_tp.clone(),138max_rep_level,139max_def_level,140leaves,141path_so_far,142);143path_so_far.pop();144}145},146}147}148149150