Path: blob/main/crates/polars-parquet/src/arrow/read/schema/convert.rs
8489 views
//! This module has entry points, [`parquet_to_arrow_schema`] and the more configurable [`parquet_to_arrow_schema_with_options`].1use std::sync::Arc;23use arrow::datatypes::{ArrowDataType, ArrowSchema, Field, IntervalUnit, Metadata, TimeUnit};4use polars_utils::format_pl_smallstr;5use polars_utils::pl_str::PlSmallStr;67use crate::arrow::read::schema::SchemaInferenceOptions;8use crate::parquet::schema::Repetition;9use crate::parquet::schema::types::{10FieldInfo, GroupConvertedType, GroupLogicalType, IntegerType, ParquetType, PhysicalType,11PrimitiveConvertedType, PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit,12};1314/// Converts [`ParquetType`]s to a [`Field`], ignoring parquet fields that do not contain15/// any physical column.16pub fn parquet_to_arrow_schema(fields: &[ParquetType]) -> ArrowSchema {17parquet_to_arrow_schema_with_options(fields, &None)18}1920/// Like [`parquet_to_arrow_schema`] but with configurable options which affect the behavior of schema inference21pub fn parquet_to_arrow_schema_with_options(22fields: &[ParquetType],23options: &Option<SchemaInferenceOptions>,24) -> ArrowSchema {25fields26.iter()27.filter_map(|f| to_field(f, options.as_ref().unwrap_or(&Default::default())))28.map(|x| (x.name.clone(), x))29.collect()30}3132fn from_int32(33logical_type: Option<PrimitiveLogicalType>,34converted_type: Option<PrimitiveConvertedType>,35) -> ArrowDataType {36use PrimitiveLogicalType::*;37match (logical_type, converted_type) {38// handle logical types first39(Some(Integer(t)), _) => match t {40IntegerType::Int8 => ArrowDataType::Int8,41IntegerType::Int16 => ArrowDataType::Int16,42IntegerType::Int32 => ArrowDataType::Int32,43IntegerType::UInt8 => ArrowDataType::UInt8,44IntegerType::UInt16 => ArrowDataType::UInt16,45IntegerType::UInt32 => ArrowDataType::UInt32,46// The above are the only possible annotations for parquet's int32. Anything else47// is a deviation to the parquet specification and we ignore48_ => ArrowDataType::Int32,49},50(Some(Decimal(precision, scale)), _) => ArrowDataType::Decimal(precision, scale),51(Some(Date), _) => ArrowDataType::Date32,52(Some(Time { unit, .. }), _) => match unit {53ParquetTimeUnit::Milliseconds => ArrowDataType::Time32(TimeUnit::Millisecond),54// MILLIS is the only possible annotation for parquet's int32. Anything else55// is a deviation to the parquet specification and we ignore56_ => ArrowDataType::Int32,57},58// handle converted types:59(_, Some(PrimitiveConvertedType::Uint8)) => ArrowDataType::UInt8,60(_, Some(PrimitiveConvertedType::Uint16)) => ArrowDataType::UInt16,61(_, Some(PrimitiveConvertedType::Uint32)) => ArrowDataType::UInt32,62(_, Some(PrimitiveConvertedType::Int8)) => ArrowDataType::Int8,63(_, Some(PrimitiveConvertedType::Int16)) => ArrowDataType::Int16,64(_, Some(PrimitiveConvertedType::Int32)) => ArrowDataType::Int32,65(_, Some(PrimitiveConvertedType::Date)) => ArrowDataType::Date32,66(_, Some(PrimitiveConvertedType::TimeMillis)) => {67ArrowDataType::Time32(TimeUnit::Millisecond)68},69(_, Some(PrimitiveConvertedType::Decimal(precision, scale))) => {70ArrowDataType::Decimal(precision, scale)71},72(_, _) => ArrowDataType::Int32,73}74}7576fn from_int64(77logical_type: Option<PrimitiveLogicalType>,78converted_type: Option<PrimitiveConvertedType>,79) -> ArrowDataType {80use PrimitiveLogicalType::*;81match (logical_type, converted_type) {82// handle logical types first83(Some(Integer(integer)), _) => match integer {84IntegerType::UInt64 => ArrowDataType::UInt64,85IntegerType::Int64 => ArrowDataType::Int64,86_ => ArrowDataType::Int64,87},88(89Some(Timestamp {90is_adjusted_to_utc,91unit,92}),93_,94) => {95let timezone = if is_adjusted_to_utc {96// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md97// A TIMESTAMP with isAdjustedToUTC=true is defined as [...] elapsed since the Unix epoch98Some(PlSmallStr::from_static("+00:00"))99} else {100// PARQUET:101// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md102// A TIMESTAMP with isAdjustedToUTC=false represents [...] such103// timestamps should always be displayed the same way, regardless of the local time zone in effect104// ARROW:105// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md106// If the time zone is null or equal to an empty string, the data is "time107// zone naive" and shall be displayed *as is* to the user, not localized108// to the locale of the user.109None110};111112match unit {113ParquetTimeUnit::Milliseconds => {114ArrowDataType::Timestamp(TimeUnit::Millisecond, timezone)115},116ParquetTimeUnit::Microseconds => {117ArrowDataType::Timestamp(TimeUnit::Microsecond, timezone)118},119ParquetTimeUnit::Nanoseconds => {120ArrowDataType::Timestamp(TimeUnit::Nanosecond, timezone)121},122}123},124(Some(Time { unit, .. }), _) => match unit {125ParquetTimeUnit::Microseconds => ArrowDataType::Time64(TimeUnit::Microsecond),126ParquetTimeUnit::Nanoseconds => ArrowDataType::Time64(TimeUnit::Nanosecond),127// MILLIS is only possible for int32. Appearing in int64 is a deviation128// to parquet's spec, which we ignore129_ => ArrowDataType::Int64,130},131(Some(Decimal(precision, scale)), _) => ArrowDataType::Decimal(precision, scale),132// handle converted types:133(_, Some(PrimitiveConvertedType::TimeMicros)) => {134ArrowDataType::Time64(TimeUnit::Microsecond)135},136(_, Some(PrimitiveConvertedType::TimestampMillis)) => {137ArrowDataType::Timestamp(TimeUnit::Millisecond, None)138},139(_, Some(PrimitiveConvertedType::TimestampMicros)) => {140ArrowDataType::Timestamp(TimeUnit::Microsecond, None)141},142(_, Some(PrimitiveConvertedType::Int64)) => ArrowDataType::Int64,143(_, Some(PrimitiveConvertedType::Uint64)) => ArrowDataType::UInt64,144(_, Some(PrimitiveConvertedType::Decimal(precision, scale))) => {145ArrowDataType::Decimal(precision, scale)146},147148(_, _) => ArrowDataType::Int64,149}150}151152fn from_byte_array(153logical_type: &Option<PrimitiveLogicalType>,154converted_type: &Option<PrimitiveConvertedType>,155) -> ArrowDataType {156match (logical_type, converted_type) {157(Some(PrimitiveLogicalType::Decimal(precision, scale)), _) => {158ArrowDataType::Decimal(*precision, *scale)159},160(None, Some(PrimitiveConvertedType::Decimal(precision, scale))) => {161ArrowDataType::Decimal(*precision, *scale)162},163(Some(PrimitiveLogicalType::String), _) => ArrowDataType::Utf8View,164(Some(PrimitiveLogicalType::Json), _) => ArrowDataType::BinaryView,165(Some(PrimitiveLogicalType::Bson), _) => ArrowDataType::BinaryView,166(Some(PrimitiveLogicalType::Enum), _) => ArrowDataType::BinaryView,167(_, Some(PrimitiveConvertedType::Json)) => ArrowDataType::BinaryView,168(_, Some(PrimitiveConvertedType::Bson)) => ArrowDataType::BinaryView,169(_, Some(PrimitiveConvertedType::Enum)) => ArrowDataType::BinaryView,170(_, Some(PrimitiveConvertedType::Utf8)) => ArrowDataType::Utf8View,171(_, _) => ArrowDataType::BinaryView,172}173}174175fn from_fixed_len_byte_array(176length: usize,177logical_type: Option<PrimitiveLogicalType>,178converted_type: Option<PrimitiveConvertedType>,179) -> ArrowDataType {180match (logical_type, converted_type) {181(Some(PrimitiveLogicalType::Decimal(precision, scale)), _) => {182ArrowDataType::Decimal(precision, scale)183},184(None, Some(PrimitiveConvertedType::Decimal(precision, scale))) => {185ArrowDataType::Decimal(precision, scale)186},187(None, Some(PrimitiveConvertedType::Interval)) => {188ArrowDataType::Interval(IntervalUnit::MonthDayMillis)189},190_ => ArrowDataType::FixedSizeBinary(length),191}192}193194/// Maps a [`PhysicalType`] with optional metadata to a [`ArrowDataType`]195fn to_primitive_type_inner(196primitive_type: &PrimitiveType,197options: &SchemaInferenceOptions,198) -> ArrowDataType {199match primitive_type.physical_type {200PhysicalType::Boolean => ArrowDataType::Boolean,201PhysicalType::Int32 => {202from_int32(primitive_type.logical_type, primitive_type.converted_type)203},204PhysicalType::Int64 => {205from_int64(primitive_type.logical_type, primitive_type.converted_type)206},207PhysicalType::Int96 => ArrowDataType::Timestamp(options.int96_coerce_to_timeunit, None),208PhysicalType::Float => ArrowDataType::Float32,209PhysicalType::Double => ArrowDataType::Float64,210PhysicalType::ByteArray => {211from_byte_array(&primitive_type.logical_type, &primitive_type.converted_type)212},213PhysicalType::FixedLenByteArray(length) => from_fixed_len_byte_array(214length,215primitive_type.logical_type,216primitive_type.converted_type,217),218}219}220221/// Entry point for converting parquet primitive type to arrow type.222///223/// This function takes care of repetition.224fn to_primitive_type(225primitive_type: &PrimitiveType,226options: &SchemaInferenceOptions,227) -> ArrowDataType {228let base_type = to_primitive_type_inner(primitive_type, options);229230if primitive_type.field_info.repetition == Repetition::Repeated {231ArrowDataType::LargeList(Box::new(Field::new(232primitive_type.field_info.name.clone(),233base_type,234is_nullable(&primitive_type.field_info),235)))236} else {237base_type238}239}240241fn non_repeated_group(242logical_type: &Option<GroupLogicalType>,243converted_type: &Option<GroupConvertedType>,244fields: &[ParquetType],245parent_name: &str,246options: &SchemaInferenceOptions,247) -> Option<ArrowDataType> {248debug_assert!(!fields.is_empty());249match (logical_type, converted_type) {250(Some(GroupLogicalType::List), _) => to_list(fields, parent_name, options),251(None, Some(GroupConvertedType::List)) => to_list(fields, parent_name, options),252(Some(GroupLogicalType::Map), _) => to_list(fields, parent_name, options),253(None, Some(GroupConvertedType::Map) | Some(GroupConvertedType::MapKeyValue)) => {254to_map(fields, options)255},256_ => to_struct(fields, options),257}258}259260/// Converts a parquet group type to an arrow [`ArrowDataType::Struct`].261/// Returns [`None`] if all its fields are empty262fn to_struct(fields: &[ParquetType], options: &SchemaInferenceOptions) -> Option<ArrowDataType> {263let fields = fields264.iter()265.filter_map(|f| to_field(f, options))266.collect::<Vec<Field>>();267if fields.is_empty() {268None269} else {270Some(ArrowDataType::Struct(fields))271}272}273274/// Converts a parquet group type to an arrow [`ArrowDataType::Struct`].275/// Returns [`None`] if all its fields are empty276fn to_map(fields: &[ParquetType], options: &SchemaInferenceOptions) -> Option<ArrowDataType> {277let inner = to_field(&fields[0], options)?;278Some(ArrowDataType::Map(Box::new(inner), false))279}280281/// Entry point for converting parquet group type.282///283/// This function takes care of logical type and repetition.284fn to_group_type(285field_info: &FieldInfo,286logical_type: &Option<GroupLogicalType>,287converted_type: &Option<GroupConvertedType>,288fields: &[ParquetType],289parent_name: &str,290options: &SchemaInferenceOptions,291) -> Option<ArrowDataType> {292debug_assert!(!fields.is_empty());293if field_info.repetition == Repetition::Repeated {294Some(ArrowDataType::LargeList(Box::new(Field::new(295field_info.name.clone(),296to_struct(fields, options)?,297is_nullable(field_info),298))))299} else {300non_repeated_group(logical_type, converted_type, fields, parent_name, options)301}302}303304/// Checks whether this schema is nullable.305pub(crate) fn is_nullable(field_info: &FieldInfo) -> bool {306match field_info.repetition {307Repetition::Optional => true,308Repetition::Repeated => true,309Repetition::Required => false,310}311}312313/// Converts parquet schema to arrow field.314/// Returns `None` iff the parquet type has no associated primitive types,315/// i.e. if it is a column-less group type.316fn to_field(type_: &ParquetType, options: &SchemaInferenceOptions) -> Option<Field> {317let field_info = type_.get_field_info();318319let metadata: Option<Arc<Metadata>> = field_info.id.map(|x: i32| {320Arc::new(321[(322PlSmallStr::from_static("PARQUET:field_id"),323format_pl_smallstr!("{x}"),324)]325.into(),326)327});328329let mut arrow_field = Field::new(330field_info.name.clone(),331to_dtype(type_, options)?,332is_nullable(type_.get_field_info()),333);334335arrow_field.metadata = metadata;336337Some(arrow_field)338}339340/// Converts a parquet list to arrow list.341///342/// To fully understand this algorithm, please refer to343/// [parquet doc](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md).344fn to_list(345fields: &[ParquetType],346parent_name: &str,347options: &SchemaInferenceOptions,348) -> Option<ArrowDataType> {349let item = fields.first().unwrap();350351let item_type = match item {352ParquetType::PrimitiveType(primitive) => Some(to_primitive_type_inner(primitive, options)),353ParquetType::GroupType { fields, .. } => {354if fields.len() == 1 && item.name() != "array" && {355// item.name() != format!("{parent_name}_tuple")356let cmp = [parent_name, "_tuple"];357let len_1 = parent_name.len();358let len = len_1 + "_tuple".len();359360item.name().len() != len || [&item.name()[..len_1], &item.name()[len_1..]] != cmp361} {362// extract the repetition field363let nested_item = fields.first().unwrap();364to_dtype(nested_item, options)365} else {366to_struct(fields, options)367}368},369}?;370371// Check that the name of the list child is "list", in which case we372// get the child nullability and name (normally "element") from the nested373// group type.374// Without this step, the child incorrectly inherits the parent's optionality375let (list_item_name, item_is_optional) = match item {376ParquetType::GroupType {377field_info, fields, ..378} if field_info.name.as_str() == "list" && fields.len() == 1 => {379let field = fields.first().unwrap();380(381field.get_field_info().name.clone(),382field.get_field_info().repetition == Repetition::Optional,383)384},385_ => (386item.get_field_info().name.clone(),387item.get_field_info().repetition == Repetition::Optional,388),389};390391Some(ArrowDataType::LargeList(Box::new(Field::new(392list_item_name,393item_type,394item_is_optional,395))))396}397398/// Converts parquet schema to arrow data type.399///400/// This function discards schema name.401///402/// If this schema is a primitive type and not included in the leaves, the result is403/// Ok(None).404///405/// If this schema is a group type and none of its children is reserved in the406/// conversion, the result is Ok(None).407pub(crate) fn to_dtype(408type_: &ParquetType,409options: &SchemaInferenceOptions,410) -> Option<ArrowDataType> {411match type_ {412ParquetType::PrimitiveType(primitive) => Some(to_primitive_type(primitive, options)),413ParquetType::GroupType {414field_info,415logical_type,416converted_type,417fields,418} => {419if fields.is_empty() {420None421} else {422to_group_type(423field_info,424logical_type,425converted_type,426fields,427field_info.name.as_str(),428options,429)430}431},432}433}434435#[cfg(test)]436mod tests {437use polars_error::*;438439use super::*;440use crate::parquet::metadata::SchemaDescriptor;441442#[test]443fn test_flat_primitives() -> PolarsResult<()> {444let message = "445message test_schema {446REQUIRED BOOLEAN boolean;447REQUIRED INT32 int8 (INT_8);448REQUIRED INT32 int16 (INT_16);449REQUIRED INT32 uint8 (INTEGER(8,false));450REQUIRED INT32 uint16 (INTEGER(16,false));451REQUIRED INT32 int32;452REQUIRED INT64 int64 ;453OPTIONAL DOUBLE double;454OPTIONAL FLOAT float;455OPTIONAL BINARY string (UTF8);456OPTIONAL BINARY string_2 (STRING);457}458";459let expected = &[460Field::new("boolean".into(), ArrowDataType::Boolean, false),461Field::new("int8".into(), ArrowDataType::Int8, false),462Field::new("int16".into(), ArrowDataType::Int16, false),463Field::new("uint8".into(), ArrowDataType::UInt8, false),464Field::new("uint16".into(), ArrowDataType::UInt16, false),465Field::new("int32".into(), ArrowDataType::Int32, false),466Field::new("int64".into(), ArrowDataType::Int64, false),467Field::new("double".into(), ArrowDataType::Float64, true),468Field::new("float".into(), ArrowDataType::Float32, true),469Field::new("string".into(), ArrowDataType::Utf8View, true),470Field::new("string_2".into(), ArrowDataType::Utf8View, true),471];472473let parquet_schema = SchemaDescriptor::try_from_message(message)?;474let fields = parquet_to_arrow_schema(parquet_schema.fields());475let fields = fields.iter_values().cloned().collect::<Vec<_>>();476477assert_eq!(fields, expected);478Ok(())479}480481#[test]482fn test_byte_array_fields() -> PolarsResult<()> {483let message = "484message test_schema {485REQUIRED BYTE_ARRAY binary;486REQUIRED FIXED_LEN_BYTE_ARRAY (20) fixed_binary;487}488";489let expected = vec![490Field::new("binary".into(), ArrowDataType::BinaryView, false),491Field::new(492"fixed_binary".into(),493ArrowDataType::FixedSizeBinary(20),494false,495),496];497498let parquet_schema = SchemaDescriptor::try_from_message(message)?;499let fields = parquet_to_arrow_schema(parquet_schema.fields());500let fields = fields.iter_values().cloned().collect::<Vec<_>>();501502assert_eq!(fields, expected);503Ok(())504}505506#[test]507fn test_duplicate_fields() -> PolarsResult<()> {508let message = "509message test_schema {510REQUIRED BOOLEAN boolean;511REQUIRED INT32 int8 (INT_8);512}513";514let expected = &[515Field::new("boolean".into(), ArrowDataType::Boolean, false),516Field::new("int8".into(), ArrowDataType::Int8, false),517];518519let parquet_schema = SchemaDescriptor::try_from_message(message)?;520let fields = parquet_to_arrow_schema(parquet_schema.fields());521let fields = fields.iter_values().cloned().collect::<Vec<_>>();522523assert_eq!(fields, expected);524Ok(())525}526527#[ignore]528#[test]529fn test_parquet_lists() -> PolarsResult<()> {530let mut arrow_fields = Vec::new();531532// LIST encoding example taken from parquet-format/LogicalTypes.md533let message_type = "534message test_schema {535REQUIRED GROUP my_list (LIST) {536REPEATED GROUP list {537OPTIONAL BINARY element (UTF8);538}539}540OPTIONAL GROUP my_list (LIST) {541REPEATED GROUP list {542REQUIRED BINARY element (UTF8);543}544}545OPTIONAL GROUP array_of_arrays (LIST) {546REPEATED GROUP list {547REQUIRED GROUP element (LIST) {548REPEATED GROUP list {549REQUIRED INT32 element;550}551}552}553}554OPTIONAL GROUP my_list (LIST) {555REPEATED GROUP element {556REQUIRED BINARY str (UTF8);557}558}559OPTIONAL GROUP my_list (LIST) {560REPEATED INT32 element;561}562OPTIONAL GROUP my_list (LIST) {563REPEATED GROUP element {564REQUIRED BINARY str (UTF8);565REQUIRED INT32 num;566}567}568OPTIONAL GROUP my_list (LIST) {569REPEATED GROUP array {570REQUIRED BINARY str (UTF8);571}572573}574OPTIONAL GROUP my_list (LIST) {575REPEATED GROUP my_list_tuple {576REQUIRED BINARY str (UTF8);577}578}579REPEATED INT32 name;580}581";582583// // List<String> (list non-null, elements nullable)584// required group my_list (LIST) {585// repeated group list {586// optional binary element (UTF8);587// }588// }589{590arrow_fields.push(Field::new(591"my_list".into(),592ArrowDataType::LargeList(Box::new(Field::new(593"element".into(),594ArrowDataType::Utf8,595true,596))),597false,598));599}600601// // List<String> (list nullable, elements non-null)602// optional group my_list (LIST) {603// repeated group list {604// required binary element (UTF8);605// }606// }607{608arrow_fields.push(Field::new(609"my_list".into(),610ArrowDataType::LargeList(Box::new(Field::new(611"element".into(),612ArrowDataType::Utf8,613false,614))),615true,616));617}618619// Element types can be nested structures. For example, a list of lists:620//621// // List<List<Integer>>622// optional group array_of_arrays (LIST) {623// repeated group list {624// required group element (LIST) {625// repeated group list {626// required int32 element;627// }628// }629// }630// }631{632let arrow_inner_list = ArrowDataType::LargeList(Box::new(Field::new(633"element".into(),634ArrowDataType::Int32,635false,636)));637arrow_fields.push(Field::new(638"array_of_arrays".into(),639ArrowDataType::LargeList(Box::new(Field::new(640PlSmallStr::from_static("element"),641arrow_inner_list,642false,643))),644true,645));646}647648// // List<String> (list nullable, elements non-null)649// optional group my_list (LIST) {650// repeated group element {651// required binary str (UTF8);652// };653// }654{655arrow_fields.push(Field::new(656"my_list".into(),657ArrowDataType::LargeList(Box::new(Field::new(658"element".into(),659ArrowDataType::Utf8,660false,661))),662true,663));664}665666// // List<Integer> (nullable list, non-null elements)667// optional group my_list (LIST) {668// repeated int32 element;669// }670{671arrow_fields.push(Field::new(672"my_list".into(),673ArrowDataType::LargeList(Box::new(Field::new(674"element".into(),675ArrowDataType::Int32,676false,677))),678true,679));680}681682// // List<Tuple<String, Integer>> (nullable list, non-null elements)683// optional group my_list (LIST) {684// repeated group element {685// required binary str (UTF8);686// required int32 num;687// };688// }689{690let arrow_struct = ArrowDataType::Struct(vec![691Field::new("str".into(), ArrowDataType::Utf8, false),692Field::new("num".into(), ArrowDataType::Int32, false),693]);694arrow_fields.push(Field::new(695"my_list".into(),696ArrowDataType::LargeList(Box::new(Field::new(697"element".into(),698arrow_struct,699false,700))),701true,702));703}704705// // List<OneTuple<String>> (nullable list, non-null elements)706// optional group my_list (LIST) {707// repeated group array {708// required binary str (UTF8);709// };710// }711// Special case: group is named array712{713let arrow_struct =714ArrowDataType::Struct(vec![Field::new("str".into(), ArrowDataType::Utf8, false)]);715arrow_fields.push(Field::new(716"my_list".into(),717ArrowDataType::LargeList(Box::new(Field::new("array".into(), arrow_struct, false))),718true,719));720}721722// // List<OneTuple<String>> (nullable list, non-null elements)723// optional group my_list (LIST) {724// repeated group my_list_tuple {725// required binary str (UTF8);726// };727// }728// Special case: group named ends in _tuple729{730let arrow_struct =731ArrowDataType::Struct(vec![Field::new("str".into(), ArrowDataType::Utf8, false)]);732arrow_fields.push(Field::new(733"my_list".into(),734ArrowDataType::LargeList(Box::new(Field::new(735"my_list_tuple".into(),736arrow_struct,737false,738))),739true,740));741}742743// One-level encoding: Only allows required lists with required cells744// repeated value_type name745{746arrow_fields.push(Field::new(747"name".into(),748ArrowDataType::LargeList(Box::new(Field::new(749"name".into(),750ArrowDataType::Int32,751false,752))),753false,754));755}756757let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;758let fields = parquet_to_arrow_schema(parquet_schema.fields());759let fields = fields.iter_values().cloned().collect::<Vec<_>>();760761assert_eq!(arrow_fields, fields);762Ok(())763}764765#[test]766fn test_parquet_list_with_struct() -> PolarsResult<()> {767let mut arrow_fields = Vec::new();768769let message_type = "770message eventlog {771REQUIRED group events (LIST) {772REPEATED group array {773REQUIRED BYTE_ARRAY event_name (STRING);774REQUIRED INT64 event_time (TIMESTAMP(MILLIS,true));775}776}777}778";779780{781let struct_fields = vec![782Field::new("event_name".into(), ArrowDataType::Utf8View, false),783Field::new(784"event_time".into(),785ArrowDataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),786false,787),788];789arrow_fields.push(Field::new(790"events".into(),791ArrowDataType::LargeList(Box::new(Field::new(792"array".into(),793ArrowDataType::Struct(struct_fields),794false,795))),796false,797));798}799800let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;801let fields = parquet_to_arrow_schema(parquet_schema.fields());802let fields = fields.iter_values().cloned().collect::<Vec<_>>();803804assert_eq!(arrow_fields, fields);805Ok(())806}807808#[test]809fn test_parquet_list_nullable() -> PolarsResult<()> {810let mut arrow_fields = Vec::new();811812let message_type = "813message test_schema {814REQUIRED GROUP my_list1 (LIST) {815REPEATED GROUP list {816OPTIONAL BINARY element (UTF8);817}818}819OPTIONAL GROUP my_list2 (LIST) {820REPEATED GROUP list {821REQUIRED BINARY element (UTF8);822}823}824REQUIRED GROUP my_list3 (LIST) {825REPEATED GROUP list {826REQUIRED BINARY element (UTF8);827}828}829}830";831832// // List<String> (list non-null, elements nullable)833// required group my_list1 (LIST) {834// repeated group list {835// optional binary element (UTF8);836// }837// }838{839arrow_fields.push(Field::new(840"my_list1".into(),841ArrowDataType::LargeList(Box::new(Field::new(842"element".into(),843ArrowDataType::Utf8View,844true,845))),846false,847));848}849850// // List<String> (list nullable, elements non-null)851// optional group my_list2 (LIST) {852// repeated group list {853// required binary element (UTF8);854// }855// }856{857arrow_fields.push(Field::new(858"my_list2".into(),859ArrowDataType::LargeList(Box::new(Field::new(860"element".into(),861ArrowDataType::Utf8View,862false,863))),864true,865));866}867868// // List<String> (list non-null, elements non-null)869// repeated group my_list3 (LIST) {870// repeated group list {871// required binary element (UTF8);872// }873// }874{875arrow_fields.push(Field::new(876"my_list3".into(),877ArrowDataType::LargeList(Box::new(Field::new(878"element".into(),879ArrowDataType::Utf8View,880false,881))),882false,883));884}885886let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;887let fields = parquet_to_arrow_schema(parquet_schema.fields());888let fields = fields.iter_values().cloned().collect::<Vec<_>>();889890assert_eq!(arrow_fields, fields);891Ok(())892}893894#[test]895fn test_nested_schema() -> PolarsResult<()> {896let mut arrow_fields = Vec::new();897{898let group1_fields = vec![899Field::new("leaf1".into(), ArrowDataType::Boolean, false),900Field::new("leaf2".into(), ArrowDataType::Int32, false),901];902let group1_struct =903Field::new("group1".into(), ArrowDataType::Struct(group1_fields), false);904arrow_fields.push(group1_struct);905906let leaf3_field = Field::new("leaf3".into(), ArrowDataType::Int64, false);907arrow_fields.push(leaf3_field);908}909910let message_type = "911message test_schema {912REQUIRED GROUP group1 {913REQUIRED BOOLEAN leaf1;914REQUIRED INT32 leaf2;915}916REQUIRED INT64 leaf3;917}918";919920let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;921let fields = parquet_to_arrow_schema(parquet_schema.fields());922let fields = fields.iter_values().cloned().collect::<Vec<_>>();923924assert_eq!(arrow_fields, fields);925Ok(())926}927928#[ignore]929#[test]930fn test_repeated_nested_schema() -> PolarsResult<()> {931let mut arrow_fields = Vec::new();932{933arrow_fields.push(Field::new("leaf1".into(), ArrowDataType::Int32, true));934935let inner_group_list = Field::new(936"innerGroup".into(),937ArrowDataType::LargeList(Box::new(Field::new(938"innerGroup".into(),939ArrowDataType::Struct(vec![Field::new(940"leaf3".into(),941ArrowDataType::Int32,942true,943)]),944false,945))),946false,947);948949let outer_group_list = Field::new(950"outerGroup".into(),951ArrowDataType::LargeList(Box::new(Field::new(952"outerGroup".into(),953ArrowDataType::Struct(vec![954Field::new("leaf2".into(), ArrowDataType::Int32, true),955inner_group_list,956]),957false,958))),959false,960);961arrow_fields.push(outer_group_list);962}963964let message_type = "965message test_schema {966OPTIONAL INT32 leaf1;967REPEATED GROUP outerGroup {968OPTIONAL INT32 leaf2;969REPEATED GROUP innerGroup {970OPTIONAL INT32 leaf3;971}972}973}974";975976let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;977let fields = parquet_to_arrow_schema(parquet_schema.fields());978let fields = fields.iter_values().cloned().collect::<Vec<_>>();979980assert_eq!(arrow_fields, fields);981Ok(())982}983984#[ignore]985#[test]986fn test_column_desc_to_field() -> PolarsResult<()> {987let message_type = "988message test_schema {989REQUIRED BOOLEAN boolean;990REQUIRED INT32 int8 (INT_8);991REQUIRED INT32 uint8 (INTEGER(8,false));992REQUIRED INT32 int16 (INT_16);993REQUIRED INT32 uint16 (INTEGER(16,false));994REQUIRED INT32 int32;995REQUIRED INT64 int64;996OPTIONAL DOUBLE double;997OPTIONAL FLOAT float;998OPTIONAL BINARY string (UTF8);999REPEATED BOOLEAN bools;1000OPTIONAL INT32 date (DATE);1001OPTIONAL INT32 time_milli (TIME_MILLIS);1002OPTIONAL INT64 time_micro (TIME_MICROS);1003OPTIONAL INT64 time_nano (TIME(NANOS,false));1004OPTIONAL INT64 ts_milli (TIMESTAMP_MILLIS);1005REQUIRED INT64 ts_micro (TIMESTAMP_MICROS);1006REQUIRED INT64 ts_nano (TIMESTAMP(NANOS,true));1007}1008";1009let arrow_fields = vec![1010Field::new("boolean".into(), ArrowDataType::Boolean, false),1011Field::new("int8".into(), ArrowDataType::Int8, false),1012Field::new("uint8".into(), ArrowDataType::UInt8, false),1013Field::new("int16".into(), ArrowDataType::Int16, false),1014Field::new("uint16".into(), ArrowDataType::UInt16, false),1015Field::new("int32".into(), ArrowDataType::Int32, false),1016Field::new("int64".into(), ArrowDataType::Int64, false),1017Field::new("double".into(), ArrowDataType::Float64, true),1018Field::new("float".into(), ArrowDataType::Float32, true),1019Field::new("string".into(), ArrowDataType::Utf8, true),1020Field::new(1021"bools".into(),1022ArrowDataType::LargeList(Box::new(Field::new(1023"bools".into(),1024ArrowDataType::Boolean,1025false,1026))),1027false,1028),1029Field::new("date".into(), ArrowDataType::Date32, true),1030Field::new(1031"time_milli".into(),1032ArrowDataType::Time32(TimeUnit::Millisecond),1033true,1034),1035Field::new(1036"time_micro".into(),1037ArrowDataType::Time64(TimeUnit::Microsecond),1038true,1039),1040Field::new(1041"time_nano".into(),1042ArrowDataType::Time64(TimeUnit::Nanosecond),1043true,1044),1045Field::new(1046"ts_milli".into(),1047ArrowDataType::Timestamp(TimeUnit::Millisecond, None),1048true,1049),1050Field::new(1051"ts_micro".into(),1052ArrowDataType::Timestamp(TimeUnit::Microsecond, None),1053false,1054),1055Field::new(1056"ts_nano".into(),1057ArrowDataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),1058false,1059),1060];10611062let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;1063let fields = parquet_to_arrow_schema(parquet_schema.fields());1064let fields = fields.iter_values().cloned().collect::<Vec<_>>();10651066assert_eq!(arrow_fields, fields);1067Ok(())1068}10691070#[test]1071fn test_field_to_column_desc() -> PolarsResult<()> {1072let message_type = "1073message arrow_schema {1074REQUIRED BOOLEAN boolean;1075REQUIRED INT32 int8 (INT_8);1076REQUIRED INT32 int16 (INTEGER(16,true));1077REQUIRED INT32 int32;1078REQUIRED INT64 int64;1079OPTIONAL DOUBLE double;1080OPTIONAL FLOAT float;1081OPTIONAL BINARY string (STRING);1082OPTIONAL GROUP bools (LIST) {1083REPEATED GROUP list {1084OPTIONAL BOOLEAN element;1085}1086}1087REQUIRED GROUP bools_non_null (LIST) {1088REPEATED GROUP list {1089REQUIRED BOOLEAN element;1090}1091}1092OPTIONAL INT32 date (DATE);1093OPTIONAL INT32 time_milli (TIME(MILLIS,false));1094OPTIONAL INT64 time_micro (TIME_MICROS);1095OPTIONAL INT64 ts_milli (TIMESTAMP_MILLIS);1096REQUIRED INT64 ts_micro (TIMESTAMP(MICROS,false));1097REQUIRED GROUP struct {1098REQUIRED BOOLEAN bools;1099REQUIRED INT32 uint32 (INTEGER(32,false));1100REQUIRED GROUP int32 (LIST) {1101REPEATED GROUP list {1102OPTIONAL INT32 element;1103}1104}1105}1106REQUIRED BINARY dictionary_strings (STRING);1107}1108";11091110let arrow_fields = vec![1111Field::new("boolean".into(), ArrowDataType::Boolean, false),1112Field::new("int8".into(), ArrowDataType::Int8, false),1113Field::new("int16".into(), ArrowDataType::Int16, false),1114Field::new("int32".into(), ArrowDataType::Int32, false),1115Field::new("int64".into(), ArrowDataType::Int64, false),1116Field::new("double".into(), ArrowDataType::Float64, true),1117Field::new("float".into(), ArrowDataType::Float32, true),1118Field::new("string".into(), ArrowDataType::Utf8View, true),1119Field::new(1120"bools".into(),1121ArrowDataType::LargeList(Box::new(Field::new(1122"element".into(),1123ArrowDataType::Boolean,1124true,1125))),1126true,1127),1128Field::new(1129"bools_non_null".into(),1130ArrowDataType::LargeList(Box::new(Field::new(1131"element".into(),1132ArrowDataType::Boolean,1133false,1134))),1135false,1136),1137Field::new("date".into(), ArrowDataType::Date32, true),1138Field::new(1139"time_milli".into(),1140ArrowDataType::Time32(TimeUnit::Millisecond),1141true,1142),1143Field::new(1144"time_micro".into(),1145ArrowDataType::Time64(TimeUnit::Microsecond),1146true,1147),1148Field::new(1149"ts_milli".into(),1150ArrowDataType::Timestamp(TimeUnit::Millisecond, None),1151true,1152),1153Field::new(1154"ts_micro".into(),1155ArrowDataType::Timestamp(TimeUnit::Microsecond, None),1156false,1157),1158Field::new(1159"struct".into(),1160ArrowDataType::Struct(vec![1161Field::new("bools".into(), ArrowDataType::Boolean, false),1162Field::new("uint32".into(), ArrowDataType::UInt32, false),1163Field::new(1164"int32".into(),1165ArrowDataType::LargeList(Box::new(Field::new(1166"element".into(),1167ArrowDataType::Int32,1168true,1169))),1170false,1171),1172]),1173false,1174),1175Field::new("dictionary_strings".into(), ArrowDataType::Utf8View, false),1176];11771178let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;1179let fields = parquet_to_arrow_schema(parquet_schema.fields());1180let fields = fields.iter_values().cloned().collect::<Vec<_>>();11811182assert_eq!(arrow_fields, fields);1183Ok(())1184}11851186#[test]1187fn test_int96_options() -> PolarsResult<()> {1188for tu in [1189TimeUnit::Second,1190TimeUnit::Microsecond,1191TimeUnit::Millisecond,1192TimeUnit::Nanosecond,1193] {1194let message_type = "1195message arrow_schema {1196REQUIRED INT96 int96_field;1197OPTIONAL GROUP int96_list (LIST) {1198REPEATED GROUP list {1199OPTIONAL INT96 element;1200}1201}1202REQUIRED GROUP int96_struct {1203REQUIRED INT96 int96_field;1204}1205}1206";1207let coerced_to = ArrowDataType::Timestamp(tu, None);1208let arrow_fields = vec![1209Field::new("int96_field".into(), coerced_to.clone(), false),1210Field::new(1211"int96_list".into(),1212ArrowDataType::LargeList(Box::new(Field::new(1213"element".into(),1214coerced_to.clone(),1215true,1216))),1217true,1218),1219Field::new(1220"int96_struct".into(),1221ArrowDataType::Struct(vec![Field::new(1222"int96_field".into(),1223coerced_to.clone(),1224false,1225)]),1226false,1227),1228];12291230let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;1231let fields = parquet_to_arrow_schema_with_options(1232parquet_schema.fields(),1233&Some(SchemaInferenceOptions {1234int96_coerce_to_timeunit: tu,1235}),1236);1237let fields = fields.iter_values().cloned().collect::<Vec<_>>();1238assert_eq!(arrow_fields, fields);1239}1240Ok(())1241}1242}124312441245