Path: blob/main/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs
7885 views
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Parquet schema parser.
//! Provides methods to parse and validate string message type into Parquet
//! [`ParquetType`](crate::parquet::schema::types::ParquetType).
//!
//! # Example
//!
//! ```rust
//! use polars_parquet::parquet::schema::io_message::from_message;
//!
//! let message_type = "
//!   message spark_schema {
//!     OPTIONAL BYTE_ARRAY a (UTF8);
//!     REQUIRED INT32 b;
//!     REQUIRED DOUBLE c;
//!     REQUIRED BOOLEAN d;
//!     OPTIONAL group e (LIST) {
//!       REPEATED group list {
//!         REQUIRED INT32 element;
//!       }
//!     }
//!   }
//! ";
//!
//! let schema = from_message(message_type).expect("Expected valid schema");
//! println!("{:?}", schema);
```4344use polars_parquet_format::Type;45use polars_utils::pl_str::PlSmallStr;46use types::PrimitiveLogicalType;4748use super::super::types::{ParquetType, TimeUnit};49use super::super::*;50use crate::parquet::error::{ParquetError, ParquetResult};51use crate::parquet::schema::types::{GroupConvertedType, PrimitiveConvertedType};5253fn is_logical_type(s: &str) -> bool {54matches!(55s,56"INTEGER"57| "MAP"58| "LIST"59| "ENUM"60| "DECIMAL"61| "DATE"62| "TIME"63| "TIMESTAMP"64| "STRING"65| "JSON"66| "BSON"67| "UUID"68| "UNKNOWN"69| "INTERVAL"70)71}7273fn is_converted_type(s: &str) -> bool {74matches!(75s,76"UTF8"77| "ENUM"78| "DECIMAL"79| "DATE"80| "TIME_MILLIS"81| "TIME_MICROS"82| "TIMESTAMP_MILLIS"83| "TIMESTAMP_MICROS"84| "UINT_8"85| "UINT_16"86| "UINT_32"87| "UINT_64"88| "INT_8"89| "INT_16"90| "INT_32"91| "INT_64"92| "JSON"93| "BSON"94| "INTERVAL"95)96}9798fn converted_group_from_str(s: &str) -> ParquetResult<GroupConvertedType> {99Ok(match s {100"MAP" => GroupConvertedType::Map,101"MAP_KEY_VALUE" => GroupConvertedType::MapKeyValue,102"LIST" => GroupConvertedType::List,103other => {104return Err(ParquetError::oos(format!("Invalid converted type {other}")));105},106})107}108109fn converted_primitive_from_str(s: &str) -> Option<PrimitiveConvertedType> {110use PrimitiveConvertedType::*;111Some(match s {112"UTF8" => Utf8,113"ENUM" => Enum,114"DECIMAL" => Decimal(0, 0),115"DATE" => Date,116"TIME_MILLIS" => TimeMillis,117"TIME_MICROS" => TimeMicros,118"TIMESTAMP_MILLIS" => TimestampMillis,119"TIMESTAMP_MICROS" => TimestampMicros,120"UINT_8" => Uint8,121"UINT_16" => Uint16,122"UINT_32" => Uint32,123"UINT_64" => Uint64,124"INT_8" => Int8,125"INT_16" => Int16,126"INT_32" => Int32,127"INT_64" => Int64,128"JSON" => Json,129"BSON" => Bson,130"INTERVAL" => Interval,131_ => return None,132})133}134135fn repetition_from_str(s: &str) -> ParquetResult<Repetition> {136Ok(match s {137"REQUIRED" => Repetition::Required,138"OPTIONAL" => Repetition::Optional,139"REPEATED" => 
Repetition::Repeated,140other => return Err(ParquetError::oos(format!("Invalid repetition {other}"))),141})142}143144fn type_from_str(s: &str) -> ParquetResult<Type> {145match s {146"BOOLEAN" => Ok(Type::BOOLEAN),147"INT32" => Ok(Type::INT32),148"INT64" => Ok(Type::INT64),149"INT96" => Ok(Type::INT96),150"FLOAT" => Ok(Type::FLOAT),151"DOUBLE" => Ok(Type::DOUBLE),152"BYTE_ARRAY" | "BINARY" => Ok(Type::BYTE_ARRAY),153"FIXED_LEN_BYTE_ARRAY" => Ok(Type::FIXED_LEN_BYTE_ARRAY),154other => Err(ParquetError::oos(format!("Invalid type {other}"))),155}156}157158/// Parses message type as string into a Parquet [`ParquetType`](crate::parquet::schema::types::ParquetType).159///160/// This could, for example, be used to extract individual columns.161///162/// Returns Parquet general error when parsing or validation fails.163pub fn from_message(message_type: &str) -> ParquetResult<ParquetType> {164let mut parser = Parser {165tokenizer: &mut Tokenizer::from_str(message_type),166};167parser.parse_message_type()168}169170/// Tokenizer to split message type string into tokens that are separated using characters171/// defined in `is_schema_delim` method. 
Tokenizer also preserves delimiters as tokens.172/// Tokenizer provides Iterator interface to process tokens; it also allows to step back173/// to reprocess previous tokens.174struct Tokenizer<'a> {175// List of all tokens for a string176tokens: Vec<&'a str>,177// Current index of vector178index: usize,179}180181impl<'a> Tokenizer<'a> {182// Create tokenizer from message type string183pub fn from_str(string: &'a str) -> Self {184let vec = string185.split_whitespace()186.flat_map(Self::split_token)187.collect();188Tokenizer {189tokens: vec,190index: 0,191}192}193194// List of all special characters in schema195fn is_schema_delim(c: char) -> bool {196c == ';' || c == '{' || c == '}' || c == '(' || c == ')' || c == '=' || c == ','197}198199/// Splits string into tokens; input string can already be token or can contain200/// delimiters, e.g. required" -> Vec("required") and201/// "(UTF8);" -> Vec("(", "UTF8", ")", ";")202fn split_token(string: &str) -> Vec<&str> {203let mut buffer: Vec<&str> = Vec::new();204let mut tail = string;205while let Some(index) = tail.find(Self::is_schema_delim) {206let (h, t) = tail.split_at(index);207if !h.is_empty() {208buffer.push(h);209}210buffer.push(&t[0..1]);211tail = &t[1..];212}213if !tail.is_empty() {214buffer.push(tail);215}216buffer217}218219// Move pointer to a previous element220fn backtrack(&mut self) {221self.index -= 1;222}223}224225impl<'a> Iterator for Tokenizer<'a> {226type Item = &'a str;227228fn next(&mut self) -> Option<&'a str> {229if self.index < self.tokens.len() {230self.index += 1;231Some(self.tokens[self.index - 1])232} else {233None234}235}236}237238/// Internal Schema parser.239/// Traverses message type using tokenizer and parses each group/primitive type240/// recursively.241struct Parser<'a> {242tokenizer: &'a mut Tokenizer<'a>,243}244245// Utility function to assert token on validity.246fn assert_token(token: Option<&str>, expected: &str) -> ParquetResult<()> {247match token {248Some(value) if value == 
expected => Ok(()),249Some(other) => Err(ParquetError::oos(format!(250"Expected '{expected}', found token '{other}'"251))),252None => Err(ParquetError::oos(format!(253"Expected '{expected}', but no token found (None)"254))),255}256}257258// Utility function to parse i32 or return general error.259fn parse_i32(value: Option<&str>, not_found_msg: &str, parse_fail_msg: &str) -> ParquetResult<i32> {260value261.ok_or_else(|| ParquetError::oos(not_found_msg))262.and_then(|v| {263v.parse::<i32>()264.map_err(|_| ParquetError::oos(parse_fail_msg))265})266}267268// Utility function to parse boolean or return general error.269#[inline]270fn parse_bool(271value: Option<&str>,272not_found_msg: &str,273parse_fail_msg: &str,274) -> ParquetResult<bool> {275value276.ok_or_else(|| ParquetError::oos(not_found_msg))277.and_then(|v| {278v.to_lowercase()279.parse::<bool>()280.map_err(|_| ParquetError::oos(parse_fail_msg))281})282}283284// Utility function to parse TimeUnit or return general error.285fn parse_timeunit(286value: Option<&str>,287not_found_msg: &str,288parse_fail_msg: &str,289) -> ParquetResult<TimeUnit> {290value291.ok_or_else(|| ParquetError::oos(not_found_msg))292.and_then(|v| match v.to_uppercase().as_str() {293"MILLIS" => Ok(TimeUnit::Milliseconds),294"MICROS" => Ok(TimeUnit::Microseconds),295"NANOS" => Ok(TimeUnit::Nanoseconds),296_ => Err(ParquetError::oos(parse_fail_msg)),297})298}299300impl Parser<'_> {301// Entry function to parse message type, uses internal tokenizer.302fn parse_message_type(&mut self) -> ParquetResult<ParquetType> {303// Check that message type starts with "message".304match self.tokenizer.next() {305Some("message") => {306let name = self307.tokenizer308.next()309.ok_or_else(|| ParquetError::oos("Expected name, found None"))?;310let fields = self.parse_child_types()?;311Ok(ParquetType::new_root(PlSmallStr::from_str(name), fields))312},313_ => Err(ParquetError::oos(314"Message type does not start with 'message'",315)),316}317}318319// Parses 
child types for a current group type.320// This is only invoked on root and group types.321fn parse_child_types(&mut self) -> ParquetResult<Vec<ParquetType>> {322assert_token(self.tokenizer.next(), "{")?;323let mut vec = Vec::new();324while let Some(value) = self.tokenizer.next() {325if value == "}" {326break;327} else {328self.tokenizer.backtrack();329vec.push(self.add_type()?);330}331}332Ok(vec)333}334335fn add_type(&mut self) -> ParquetResult<ParquetType> {336// Parse repetition337let repetition = self338.tokenizer339.next()340.ok_or_else(|| ParquetError::oos("Expected repetition, found None"))341.and_then(|v| repetition_from_str(&v.to_uppercase()))?;342343match self.tokenizer.next() {344Some(group) if group.to_uppercase() == "GROUP" => self.add_group_type(repetition),345Some(type_string) => {346let physical_type = type_from_str(&type_string.to_uppercase())?;347self.add_primitive_type(repetition, physical_type)348},349None => Err(ParquetError::oos(350"Invalid type, could not extract next token",351)),352}353}354355fn add_group_type(&mut self, repetition: Repetition) -> ParquetResult<ParquetType> {356// Parse name of the group type357let name = self358.tokenizer359.next()360.ok_or_else(|| ParquetError::oos("Expected name, found None"))?;361362// Parse converted type if exists363let converted_type = if let Some("(") = self.tokenizer.next() {364let converted_type = self365.tokenizer366.next()367.ok_or_else(|| ParquetError::oos("Expected converted type, found None"))368.and_then(|v| converted_group_from_str(&v.to_uppercase()))?;369assert_token(self.tokenizer.next(), ")")?;370Some(converted_type)371} else {372self.tokenizer.backtrack();373None374};375376// Parse optional id377let id = if let Some("=") = self.tokenizer.next() {378self.tokenizer.next().and_then(|v| v.parse::<i32>().ok())379} else {380self.tokenizer.backtrack();381None382};383384let fields = 
self.parse_child_types()?;385386Ok(ParquetType::from_converted(387PlSmallStr::from_str(name),388fields,389repetition,390converted_type,391id,392))393}394395fn add_primitive_type(396&mut self,397repetition: Repetition,398physical_type: Type,399) -> ParquetResult<ParquetType> {400// Read type length if the type is FIXED_LEN_BYTE_ARRAY.401let length = if physical_type == Type::FIXED_LEN_BYTE_ARRAY {402assert_token(self.tokenizer.next(), "(")?;403let length = parse_i32(404self.tokenizer.next(),405"Expected length for FIXED_LEN_BYTE_ARRAY, found None",406"Failed to parse length for FIXED_LEN_BYTE_ARRAY",407)?;408assert_token(self.tokenizer.next(), ")")?;409Some(length)410} else {411None412};413414// Parse name of the primitive type415let name = self416.tokenizer417.next()418.ok_or_else(|| ParquetError::oos("Expected name, found None"))?;419420// Parse logical types421let (converted_type, logical_type) = if let Some("(") = self.tokenizer.next() {422let (is_logical_type, converted_type, token) = self423.tokenizer424.next()425.ok_or_else(|| ParquetError::oos("Expected converted or logical type, found None"))426.and_then(|v| {427let string = v.to_uppercase();428Ok(if is_logical_type(&string) {429(true, None, string)430} else if is_converted_type(&string) {431(false, converted_primitive_from_str(&string), string)432} else {433return Err(ParquetError::oos(format!(434"Expected converted or logical type, found {string}"435)));436})437})?;438439let logical_type = if is_logical_type {440Some(self.parse_logical_type(&token)?)441} else {442None443};444445// converted type decimal446let converted_type = match converted_type {447Some(PrimitiveConvertedType::Decimal(_, _)) => {448Some(self.parse_converted_decimal()?)449},450other => other,451};452453assert_token(self.tokenizer.next(), ")")?;454(converted_type, logical_type)455} else {456self.tokenizer.backtrack();457(None, None)458};459460// Parse optional id461let id = if let Some("=") = self.tokenizer.next() 
{462self.tokenizer.next().and_then(|v| v.parse::<i32>().ok())463} else {464self.tokenizer.backtrack();465None466};467assert_token(self.tokenizer.next(), ";")?;468469ParquetType::try_from_primitive(470PlSmallStr::from_str(name),471(physical_type, length).try_into()?,472repetition,473converted_type,474logical_type,475id,476)477}478479fn parse_converted_decimal(&mut self) -> ParquetResult<PrimitiveConvertedType> {480assert_token(self.tokenizer.next(), "(")?;481// Parse precision482let precision = parse_i32(483self.tokenizer.next(),484"Expected precision, found None",485"Failed to parse precision for DECIMAL type",486)?;487488// Parse scale489let scale = if let Some(",") = self.tokenizer.next() {490parse_i32(491self.tokenizer.next(),492"Expected scale, found None",493"Failed to parse scale for DECIMAL type",494)?495} else {496// Scale is not provided, set it to 0.497self.tokenizer.backtrack();4980499};500501assert_token(self.tokenizer.next(), ")")?;502Ok(PrimitiveConvertedType::Decimal(503precision.try_into()?,504scale.try_into()?,505))506}507508fn parse_logical_type(&mut self, tpe: &str) -> ParquetResult<PrimitiveLogicalType> {509Ok(match tpe {510"ENUM" => PrimitiveLogicalType::Enum,511"DATE" => PrimitiveLogicalType::Date,512"DECIMAL" => {513let (precision, scale) = if let Some("(") = self.tokenizer.next() {514let precision = parse_i32(515self.tokenizer.next(),516"Expected precision, found None",517"Failed to parse precision for DECIMAL type",518)?;519let scale = if let Some(",") = self.tokenizer.next() {520parse_i32(521self.tokenizer.next(),522"Expected scale, found None",523"Failed to parse scale for DECIMAL type",524)?525} else {526self.tokenizer.backtrack();5270528};529assert_token(self.tokenizer.next(), ")")?;530(precision, scale)531} else {532self.tokenizer.backtrack();533(0, 0)534};535PrimitiveLogicalType::Decimal(precision.try_into()?, scale.try_into()?)536},537"TIME" => {538let (unit, is_adjusted_to_utc) = if let Some("(") = self.tokenizer.next() {539let unit 
= parse_timeunit(540self.tokenizer.next(),541"Invalid timeunit found",542"Failed to parse timeunit for TIME type",543)?;544let is_adjusted_to_utc = if let Some(",") = self.tokenizer.next() {545parse_bool(546self.tokenizer.next(),547"Invalid boolean found",548"Failed to parse timezone info for TIME type",549)?550} else {551self.tokenizer.backtrack();552false553};554assert_token(self.tokenizer.next(), ")")?;555(unit, is_adjusted_to_utc)556} else {557self.tokenizer.backtrack();558(TimeUnit::Milliseconds, false)559};560PrimitiveLogicalType::Time {561is_adjusted_to_utc,562unit,563}564},565"TIMESTAMP" => {566let (unit, is_adjusted_to_utc) = if let Some("(") = self.tokenizer.next() {567let unit = parse_timeunit(568self.tokenizer.next(),569"Invalid timeunit found",570"Failed to parse timeunit for TIMESTAMP type",571)?;572let is_adjusted_to_utc = if let Some(",") = self.tokenizer.next() {573parse_bool(574self.tokenizer.next(),575"Invalid boolean found",576"Failed to parse timezone info for TIMESTAMP type",577)?578} else {579// Invalid token for unit580self.tokenizer.backtrack();581false582};583assert_token(self.tokenizer.next(), ")")?;584(unit, is_adjusted_to_utc)585} else {586self.tokenizer.backtrack();587(TimeUnit::Milliseconds, false)588};589PrimitiveLogicalType::Timestamp {590is_adjusted_to_utc,591unit,592}593},594"INTEGER" => {595let (bit_width, is_signed) = if let Some("(") = self.tokenizer.next() {596let bit_width = parse_i32(597self.tokenizer.next(),598"Invalid bit_width found",599"Failed to parse bit_width for INTEGER type",600)?;601let is_signed = if let Some(",") = self.tokenizer.next() {602parse_bool(603self.tokenizer.next(),604"Invalid boolean found",605"Failed to parse is_signed for INTEGER type",606)?607} else {608// Invalid token for unit609self.tokenizer.backtrack();610return Err(ParquetError::oos("INTEGER requires sign"));611};612assert_token(self.tokenizer.next(), ")")?;613(bit_width, is_signed)614} else {615// Invalid token for 
unit616self.tokenizer.backtrack();617return Err(ParquetError::oos("INTEGER requires width and sign"));618};619PrimitiveLogicalType::Integer((bit_width, is_signed).into())620},621"STRING" => PrimitiveLogicalType::String,622"JSON" => PrimitiveLogicalType::Json,623"BSON" => PrimitiveLogicalType::Bson,624"UUID" => PrimitiveLogicalType::Uuid,625"UNKNOWN" => PrimitiveLogicalType::Unknown,626"INTERVAL" => return Err(ParquetError::oos("Interval logical type not yet supported")),627_ => unreachable!(),628})629}630}631632#[cfg(test)]633mod tests {634use types::IntegerType;635636use super::*;637use crate::parquet::schema::types::PhysicalType;638639#[test]640fn test_tokenize_empty_string() {641assert_eq!(Tokenizer::from_str("").next(), None);642}643644#[test]645fn test_tokenize_delimiters() {646let mut iter = Tokenizer::from_str(",;{}()=");647assert_eq!(iter.next(), Some(","));648assert_eq!(iter.next(), Some(";"));649assert_eq!(iter.next(), Some("{"));650assert_eq!(iter.next(), Some("}"));651assert_eq!(iter.next(), Some("("));652assert_eq!(iter.next(), Some(")"));653assert_eq!(iter.next(), Some("="));654assert_eq!(iter.next(), None);655}656657#[test]658fn test_tokenize_delimiters_with_whitespaces() {659let mut iter = Tokenizer::from_str(" , ; { } ( ) = ");660assert_eq!(iter.next(), Some(","));661assert_eq!(iter.next(), Some(";"));662assert_eq!(iter.next(), Some("{"));663assert_eq!(iter.next(), Some("}"));664assert_eq!(iter.next(), Some("("));665assert_eq!(iter.next(), Some(")"));666assert_eq!(iter.next(), Some("="));667assert_eq!(iter.next(), None);668}669670#[test]671fn test_tokenize_words() {672let mut iter = Tokenizer::from_str("abc def ghi jkl mno");673assert_eq!(iter.next(), Some("abc"));674assert_eq!(iter.next(), Some("def"));675assert_eq!(iter.next(), Some("ghi"));676assert_eq!(iter.next(), Some("jkl"));677assert_eq!(iter.next(), Some("mno"));678assert_eq!(iter.next(), None);679}680681#[test]682fn test_tokenize_backtrack() {683let mut iter = 
Tokenizer::from_str("abc;");684assert_eq!(iter.next(), Some("abc"));685assert_eq!(iter.next(), Some(";"));686iter.backtrack();687assert_eq!(iter.next(), Some(";"));688assert_eq!(iter.next(), None);689}690691#[test]692fn test_tokenize_message_type() {693let schema = "694message schema {695required int32 a;696optional binary c (UTF8);697required group d {698required int32 a;699optional binary c (UTF8);700}701required group e (LIST) {702repeated group list {703required int32 element;704}705}706}707";708let iter = Tokenizer::from_str(schema);709let mut res = Vec::new();710for token in iter {711res.push(token);712}713assert_eq!(714res,715vec![716"message", "schema", "{", "required", "int32", "a", ";", "optional", "binary", "c",717"(", "UTF8", ")", ";", "required", "group", "d", "{", "required", "int32", "a",718";", "optional", "binary", "c", "(", "UTF8", ")", ";", "}", "required", "group",719"e", "(", "LIST", ")", "{", "repeated", "group", "list", "{", "required", "int32",720"element", ";", "}", "}", "}"721]722);723}724725#[test]726fn test_assert_token() {727assert!(assert_token(Some("a"), "a").is_ok());728assert!(assert_token(Some("a"), "b").is_err());729assert!(assert_token(None, "b").is_err());730}731732#[test]733fn test_parse_message_type_invalid() {734let mut iter = Tokenizer::from_str("test");735let result = Parser {736tokenizer: &mut iter,737}738.parse_message_type();739assert!(result.is_err());740assert_eq!(741result.unwrap_err().to_string(),742"File out of specification: Message type does not start with 'message'"743);744}745746#[test]747fn test_parse_message_type_no_name() {748let mut iter = Tokenizer::from_str("message");749let result = Parser {750tokenizer: &mut iter,751}752.parse_message_type();753assert!(result.is_err());754assert_eq!(755result.unwrap_err().to_string(),756"File out of specification: Expected name, found None"757);758}759760#[test]761fn test_parse_message_type_fixed_byte_array() {762let schema = "763message schema {764REQUIRED 
FIXED_LEN_BYTE_ARRAY col;765}766";767let mut iter = Tokenizer::from_str(schema);768let result = Parser {769tokenizer: &mut iter,770}771.parse_message_type();772assert!(result.is_err());773774let schema = "775message schema {776REQUIRED FIXED_LEN_BYTE_ARRAY(16) col;777}778";779let mut iter = Tokenizer::from_str(schema);780let result = Parser {781tokenizer: &mut iter,782}783.parse_message_type();784assert!(result.is_ok());785}786787#[test]788fn test_parse_message_type_decimal() {789// It is okay for decimal to omit precision and scale with right syntax.790// Here we test wrong syntax of decimal type791792// Invalid decimal syntax793let schema = "794message root {795optional int32 f1 (DECIMAL();796}797";798let mut iter = Tokenizer::from_str(schema);799let result = Parser {800tokenizer: &mut iter,801}802.parse_message_type();803assert!(result.is_err());804805// Invalid decimal, need precision and scale806let schema = "807message root {808optional int32 f1 (DECIMAL());809}810";811let mut iter = Tokenizer::from_str(schema);812let result = Parser {813tokenizer: &mut iter,814}815.parse_message_type();816assert!(result.is_err());817818// Invalid decimal because of `,` - has precision, needs scale819let schema = "820message root {821optional int32 f1 (DECIMAL(8,));822}823";824let mut iter = Tokenizer::from_str(schema);825let result = Parser {826tokenizer: &mut iter,827}828.parse_message_type();829assert!(result.is_err());830}831832#[test]833fn test_parse_decimal_wrong() {834// Invalid decimal because, we always require either precision or scale to be835// specified as part of converted type836let schema = "837message root {838optional int32 f3 (DECIMAL);839}840";841let mut iter = Tokenizer::from_str(schema);842let result = Parser {843tokenizer: &mut iter,844}845.parse_message_type();846assert!(result.is_err());847848// Valid decimal (precision, scale)849let schema = "850message root {851optional int32 f1 (DECIMAL(8, 3));852optional int32 f2 (DECIMAL(8));853}854";855let mut 
iter = Tokenizer::from_str(schema);856let result = Parser {857tokenizer: &mut iter,858}859.parse_message_type();860assert!(result.is_ok());861}862863#[test]864fn test_parse_message_type_compare_1() -> ParquetResult<()> {865let schema = "866message root {867optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3));868optional fixed_len_byte_array (16) f2 (DECIMAL (38, 18));869}870";871let mut iter = Tokenizer::from_str(schema);872let message = Parser {873tokenizer: &mut iter,874}875.parse_message_type()876.unwrap();877878let fields = vec![879ParquetType::try_from_primitive(880PlSmallStr::from_static("f1"),881PhysicalType::FixedLenByteArray(5),882Repetition::Optional,883None,884Some(PrimitiveLogicalType::Decimal(9, 3)),885None,886)?,887ParquetType::try_from_primitive(888PlSmallStr::from_static("f2"),889PhysicalType::FixedLenByteArray(16),890Repetition::Optional,891None,892Some(PrimitiveLogicalType::Decimal(38, 18)),893None,894)?,895];896897let expected = ParquetType::new_root(PlSmallStr::from_static("root"), fields);898899assert_eq!(message, expected);900Ok(())901}902903#[test]904fn test_parse_message_type_compare_2() -> ParquetResult<()> {905let schema = "906message root {907required group a0 {908optional group a1 (LIST) {909repeated binary a2 (UTF8);910}911912optional group b1 (LIST) {913repeated group b2 {914optional int32 b3;915optional double b4;916}917}918}919}920";921let mut iter = Tokenizer::from_str(schema);922let message = Parser {923tokenizer: &mut iter,924}925.parse_message_type()926.unwrap();927928let a2 = ParquetType::try_from_primitive(929"a2".into(),930PhysicalType::ByteArray,931Repetition::Repeated,932Some(PrimitiveConvertedType::Utf8),933None,934None,935)?;936let a1 = ParquetType::from_converted(937"a1".into(),938vec![a2],939Repetition::Optional,940Some(GroupConvertedType::List),941None,942);943let b2 = ParquetType::from_converted(944"b2".into(),945vec![946ParquetType::from_physical("b3".into(), 
PhysicalType::Int32),947ParquetType::from_physical("b4".into(), PhysicalType::Double),948],949Repetition::Repeated,950None,951None,952);953let b1 = ParquetType::from_converted(954"b1".into(),955vec![b2],956Repetition::Optional,957Some(GroupConvertedType::List),958None,959);960let a0 = ParquetType::from_converted(961"a0".into(),962vec![a1, b1],963Repetition::Required,964None,965None,966);967968let expected = ParquetType::new_root("root".into(), vec![a0]);969970assert_eq!(message, expected);971Ok(())972}973974#[test]975fn test_parse_message_type_compare_3() -> ParquetResult<()> {976let schema = "977message root {978required int32 _1 (INT_8);979required int32 _2 (INT_16);980required float _3;981required double _4;982optional int32 _5 (DATE);983optional binary _6 (UTF8);984}985";986let mut iter = Tokenizer::from_str(schema);987let message = Parser {988tokenizer: &mut iter,989}990.parse_message_type()991.unwrap();992993let f1 = ParquetType::try_from_primitive(994"_1".into(),995PhysicalType::Int32,996Repetition::Required,997Some(PrimitiveConvertedType::Int8),998None,999None,1000)?;1001let f2 = ParquetType::try_from_primitive(1002"_2".into(),1003PhysicalType::Int32,1004Repetition::Required,1005Some(PrimitiveConvertedType::Int16),1006None,1007None,1008)?;1009let f3 = ParquetType::try_from_primitive(1010"_3".into(),1011PhysicalType::Float,1012Repetition::Required,1013None,1014None,1015None,1016)?;1017let f4 = ParquetType::try_from_primitive(1018"_4".into(),1019PhysicalType::Double,1020Repetition::Required,1021None,1022None,1023None,1024)?;1025let f5 = ParquetType::try_from_primitive(1026"_5".into(),1027PhysicalType::Int32,1028Repetition::Optional,1029None,1030Some(PrimitiveLogicalType::Date),1031None,1032)?;1033let f6 = ParquetType::try_from_primitive(1034"_6".into(),1035PhysicalType::ByteArray,1036Repetition::Optional,1037Some(PrimitiveConvertedType::Utf8),1038None,1039None,1040)?;10411042let fields = vec![f1, f2, f3, f4, f5, f6];10431044let expected = 
ParquetType::new_root("root".into(), fields);1045assert_eq!(message, expected);1046Ok(())1047}10481049#[test]1050fn test_parse_message_type_compare_4() -> ParquetResult<()> {1051let schema = "1052message root {1053required int32 _1 (INTEGER(8,true));1054required int32 _2 (INTEGER(16,false));1055required float _3;1056required double _4;1057optional int32 _5 (DATE);1058optional int32 _6 (TIME(MILLIS,false));1059optional int64 _7 (TIME(MICROS,true));1060optional int64 _8 (TIMESTAMP(MILLIS,true));1061optional int64 _9 (TIMESTAMP(NANOS,false));1062optional binary _10 (STRING);1063}1064";1065let mut iter = Tokenizer::from_str(schema);1066let message = Parser {1067tokenizer: &mut iter,1068}1069.parse_message_type()?;10701071let f1 = ParquetType::try_from_primitive(1072"_1".into(),1073PhysicalType::Int32,1074Repetition::Required,1075None,1076Some(PrimitiveLogicalType::Integer(IntegerType::Int8)),1077None,1078)?;1079let f2 = ParquetType::try_from_primitive(1080"_2".into(),1081PhysicalType::Int32,1082Repetition::Required,1083None,1084Some(PrimitiveLogicalType::Integer(IntegerType::UInt16)),1085None,1086)?;1087let f3 = ParquetType::try_from_primitive(1088"_3".into(),1089PhysicalType::Float,1090Repetition::Required,1091None,1092None,1093None,1094)?;1095let f4 = ParquetType::try_from_primitive(1096"_4".into(),1097PhysicalType::Double,1098Repetition::Required,1099None,1100None,1101None,1102)?;1103let f5 = ParquetType::try_from_primitive(1104"_5".into(),1105PhysicalType::Int32,1106Repetition::Optional,1107None,1108Some(PrimitiveLogicalType::Date),1109None,1110)?;1111let f6 = ParquetType::try_from_primitive(1112"_6".into(),1113PhysicalType::Int32,1114Repetition::Optional,1115None,1116Some(PrimitiveLogicalType::Time {1117is_adjusted_to_utc: false,1118unit: TimeUnit::Milliseconds,1119}),1120None,1121)?;1122let f7 = ParquetType::try_from_primitive(1123"_7".into(),1124PhysicalType::Int64,1125Repetition::Optional,1126None,1127Some(PrimitiveLogicalType::Time {1128is_adjusted_to_utc: 
true,1129unit: TimeUnit::Microseconds,1130}),1131None,1132)?;1133let f8 = ParquetType::try_from_primitive(1134"_8".into(),1135PhysicalType::Int64,1136Repetition::Optional,1137None,1138Some(PrimitiveLogicalType::Timestamp {1139is_adjusted_to_utc: true,1140unit: TimeUnit::Milliseconds,1141}),1142None,1143)?;1144let f9 = ParquetType::try_from_primitive(1145"_9".into(),1146PhysicalType::Int64,1147Repetition::Optional,1148None,1149Some(PrimitiveLogicalType::Timestamp {1150is_adjusted_to_utc: false,1151unit: TimeUnit::Nanoseconds,1152}),1153None,1154)?;11551156let f10 = ParquetType::try_from_primitive(1157"_10".into(),1158PhysicalType::ByteArray,1159Repetition::Optional,1160None,1161Some(PrimitiveLogicalType::String),1162None,1163)?;11641165let fields = vec![f1, f2, f3, f4, f5, f6, f7, f8, f9, f10];11661167let expected = ParquetType::new_root("root".into(), fields);1168assert_eq!(message, expected);1169Ok(())1170}1171}117211731174