// Path: blob/main/crates/polars-parquet/src/arrow/read/schema/convert.rs
// 6940 views
//! This module has entry points, [`parquet_to_arrow_schema`] and the more configurable [`parquet_to_arrow_schema_with_options`].1use std::sync::Arc;23use arrow::datatypes::{ArrowDataType, ArrowSchema, Field, IntervalUnit, Metadata, TimeUnit};4use polars_utils::format_pl_smallstr;5use polars_utils::pl_str::PlSmallStr;67use crate::arrow::read::schema::SchemaInferenceOptions;8use crate::parquet::schema::Repetition;9use crate::parquet::schema::types::{10FieldInfo, GroupConvertedType, GroupLogicalType, IntegerType, ParquetType, PhysicalType,11PrimitiveConvertedType, PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit,12};1314/// Converts [`ParquetType`]s to a [`Field`], ignoring parquet fields that do not contain15/// any physical column.16pub fn parquet_to_arrow_schema(fields: &[ParquetType]) -> ArrowSchema {17parquet_to_arrow_schema_with_options(fields, &None)18}1920/// Like [`parquet_to_arrow_schema`] but with configurable options which affect the behavior of schema inference21pub fn parquet_to_arrow_schema_with_options(22fields: &[ParquetType],23options: &Option<SchemaInferenceOptions>,24) -> ArrowSchema {25fields26.iter()27.filter_map(|f| to_field(f, options.as_ref().unwrap_or(&Default::default())))28.map(|x| (x.name.clone(), x))29.collect()30}3132fn from_int32(33logical_type: Option<PrimitiveLogicalType>,34converted_type: Option<PrimitiveConvertedType>,35) -> ArrowDataType {36use PrimitiveLogicalType::*;37match (logical_type, converted_type) {38// handle logical types first39(Some(Integer(t)), _) => match t {40IntegerType::Int8 => ArrowDataType::Int8,41IntegerType::Int16 => ArrowDataType::Int16,42IntegerType::Int32 => ArrowDataType::Int32,43IntegerType::UInt8 => ArrowDataType::UInt8,44IntegerType::UInt16 => ArrowDataType::UInt16,45IntegerType::UInt32 => ArrowDataType::UInt32,46// The above are the only possible annotations for parquet's int32. 
Anything else47// is a deviation to the parquet specification and we ignore48_ => ArrowDataType::Int32,49},50(Some(Decimal(precision, scale)), _) => ArrowDataType::Decimal(precision, scale),51(Some(Date), _) => ArrowDataType::Date32,52(Some(Time { unit, .. }), _) => match unit {53ParquetTimeUnit::Milliseconds => ArrowDataType::Time32(TimeUnit::Millisecond),54// MILLIS is the only possible annotation for parquet's int32. Anything else55// is a deviation to the parquet specification and we ignore56_ => ArrowDataType::Int32,57},58// handle converted types:59(_, Some(PrimitiveConvertedType::Uint8)) => ArrowDataType::UInt8,60(_, Some(PrimitiveConvertedType::Uint16)) => ArrowDataType::UInt16,61(_, Some(PrimitiveConvertedType::Uint32)) => ArrowDataType::UInt32,62(_, Some(PrimitiveConvertedType::Int8)) => ArrowDataType::Int8,63(_, Some(PrimitiveConvertedType::Int16)) => ArrowDataType::Int16,64(_, Some(PrimitiveConvertedType::Int32)) => ArrowDataType::Int32,65(_, Some(PrimitiveConvertedType::Date)) => ArrowDataType::Date32,66(_, Some(PrimitiveConvertedType::TimeMillis)) => {67ArrowDataType::Time32(TimeUnit::Millisecond)68},69(_, Some(PrimitiveConvertedType::Decimal(precision, scale))) => {70ArrowDataType::Decimal(precision, scale)71},72(_, _) => ArrowDataType::Int32,73}74}7576fn from_int64(77logical_type: Option<PrimitiveLogicalType>,78converted_type: Option<PrimitiveConvertedType>,79) -> ArrowDataType {80use PrimitiveLogicalType::*;81match (logical_type, converted_type) {82// handle logical types first83(Some(Integer(integer)), _) => match integer {84IntegerType::UInt64 => ArrowDataType::UInt64,85IntegerType::Int64 => ArrowDataType::Int64,86_ => ArrowDataType::Int64,87},88(89Some(Timestamp {90is_adjusted_to_utc,91unit,92}),93_,94) => {95let timezone = if is_adjusted_to_utc {96// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md97// A TIMESTAMP with isAdjustedToUTC=true is defined as [...] 
elapsed since the Unix epoch98Some(PlSmallStr::from_static("+00:00"))99} else {100// PARQUET:101// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md102// A TIMESTAMP with isAdjustedToUTC=false represents [...] such103// timestamps should always be displayed the same way, regardless of the local time zone in effect104// ARROW:105// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md106// If the time zone is null or equal to an empty string, the data is "time107// zone naive" and shall be displayed *as is* to the user, not localized108// to the locale of the user.109None110};111112match unit {113ParquetTimeUnit::Milliseconds => {114ArrowDataType::Timestamp(TimeUnit::Millisecond, timezone)115},116ParquetTimeUnit::Microseconds => {117ArrowDataType::Timestamp(TimeUnit::Microsecond, timezone)118},119ParquetTimeUnit::Nanoseconds => {120ArrowDataType::Timestamp(TimeUnit::Nanosecond, timezone)121},122}123},124(Some(Time { unit, .. }), _) => match unit {125ParquetTimeUnit::Microseconds => ArrowDataType::Time64(TimeUnit::Microsecond),126ParquetTimeUnit::Nanoseconds => ArrowDataType::Time64(TimeUnit::Nanosecond),127// MILLIS is only possible for int32. 
Appearing in int64 is a deviation128// to parquet's spec, which we ignore129_ => ArrowDataType::Int64,130},131(Some(Decimal(precision, scale)), _) => ArrowDataType::Decimal(precision, scale),132// handle converted types:133(_, Some(PrimitiveConvertedType::TimeMicros)) => {134ArrowDataType::Time64(TimeUnit::Microsecond)135},136(_, Some(PrimitiveConvertedType::TimestampMillis)) => {137ArrowDataType::Timestamp(TimeUnit::Millisecond, None)138},139(_, Some(PrimitiveConvertedType::TimestampMicros)) => {140ArrowDataType::Timestamp(TimeUnit::Microsecond, None)141},142(_, Some(PrimitiveConvertedType::Int64)) => ArrowDataType::Int64,143(_, Some(PrimitiveConvertedType::Uint64)) => ArrowDataType::UInt64,144(_, Some(PrimitiveConvertedType::Decimal(precision, scale))) => {145ArrowDataType::Decimal(precision, scale)146},147148(_, _) => ArrowDataType::Int64,149}150}151152fn from_byte_array(153logical_type: &Option<PrimitiveLogicalType>,154converted_type: &Option<PrimitiveConvertedType>,155) -> ArrowDataType {156match (logical_type, converted_type) {157(Some(PrimitiveLogicalType::String), _) => ArrowDataType::Utf8View,158(Some(PrimitiveLogicalType::Json), _) => ArrowDataType::BinaryView,159(Some(PrimitiveLogicalType::Bson), _) => ArrowDataType::BinaryView,160(Some(PrimitiveLogicalType::Enum), _) => ArrowDataType::BinaryView,161(_, Some(PrimitiveConvertedType::Json)) => ArrowDataType::BinaryView,162(_, Some(PrimitiveConvertedType::Bson)) => ArrowDataType::BinaryView,163(_, Some(PrimitiveConvertedType::Enum)) => ArrowDataType::BinaryView,164(_, Some(PrimitiveConvertedType::Utf8)) => ArrowDataType::Utf8View,165(_, _) => ArrowDataType::BinaryView,166}167}168169fn from_fixed_len_byte_array(170length: usize,171logical_type: Option<PrimitiveLogicalType>,172converted_type: Option<PrimitiveConvertedType>,173) -> ArrowDataType {174match (logical_type, converted_type) {175(Some(PrimitiveLogicalType::Decimal(precision, scale)), _) => {176ArrowDataType::Decimal(precision, scale)177},178(None, 
Some(PrimitiveConvertedType::Decimal(precision, scale))) => {179ArrowDataType::Decimal(precision, scale)180},181(None, Some(PrimitiveConvertedType::Interval)) => {182// There is currently no reliable way of determining which IntervalUnit183// to return. Thus without the original Arrow schema, the results184// would be incorrect if all 12 bytes of the interval are populated185ArrowDataType::Interval(IntervalUnit::DayTime)186},187_ => ArrowDataType::FixedSizeBinary(length),188}189}190191/// Maps a [`PhysicalType`] with optional metadata to a [`ArrowDataType`]192fn to_primitive_type_inner(193primitive_type: &PrimitiveType,194options: &SchemaInferenceOptions,195) -> ArrowDataType {196match primitive_type.physical_type {197PhysicalType::Boolean => ArrowDataType::Boolean,198PhysicalType::Int32 => {199from_int32(primitive_type.logical_type, primitive_type.converted_type)200},201PhysicalType::Int64 => {202from_int64(primitive_type.logical_type, primitive_type.converted_type)203},204PhysicalType::Int96 => ArrowDataType::Timestamp(options.int96_coerce_to_timeunit, None),205PhysicalType::Float => ArrowDataType::Float32,206PhysicalType::Double => ArrowDataType::Float64,207PhysicalType::ByteArray => {208from_byte_array(&primitive_type.logical_type, &primitive_type.converted_type)209},210PhysicalType::FixedLenByteArray(length) => from_fixed_len_byte_array(211length,212primitive_type.logical_type,213primitive_type.converted_type,214),215}216}217218/// Entry point for converting parquet primitive type to arrow type.219///220/// This function takes care of repetition.221fn to_primitive_type(222primitive_type: &PrimitiveType,223options: &SchemaInferenceOptions,224) -> ArrowDataType {225let base_type = to_primitive_type_inner(primitive_type, options);226227if primitive_type.field_info.repetition == Repetition::Repeated {228ArrowDataType::LargeList(Box::new(Field::new(229primitive_type.field_info.name.clone(),230base_type,231is_nullable(&primitive_type.field_info),232)))233} else 
{234base_type235}236}237238fn non_repeated_group(239logical_type: &Option<GroupLogicalType>,240converted_type: &Option<GroupConvertedType>,241fields: &[ParquetType],242parent_name: &str,243options: &SchemaInferenceOptions,244) -> Option<ArrowDataType> {245debug_assert!(!fields.is_empty());246match (logical_type, converted_type) {247(Some(GroupLogicalType::List), _) => to_list(fields, parent_name, options),248(None, Some(GroupConvertedType::List)) => to_list(fields, parent_name, options),249(Some(GroupLogicalType::Map), _) => to_list(fields, parent_name, options),250(None, Some(GroupConvertedType::Map) | Some(GroupConvertedType::MapKeyValue)) => {251to_map(fields, options)252},253_ => to_struct(fields, options),254}255}256257/// Converts a parquet group type to an arrow [`ArrowDataType::Struct`].258/// Returns [`None`] if all its fields are empty259fn to_struct(fields: &[ParquetType], options: &SchemaInferenceOptions) -> Option<ArrowDataType> {260let fields = fields261.iter()262.filter_map(|f| to_field(f, options))263.collect::<Vec<Field>>();264if fields.is_empty() {265None266} else {267Some(ArrowDataType::Struct(fields))268}269}270271/// Converts a parquet group type to an arrow [`ArrowDataType::Struct`].272/// Returns [`None`] if all its fields are empty273fn to_map(fields: &[ParquetType], options: &SchemaInferenceOptions) -> Option<ArrowDataType> {274let inner = to_field(&fields[0], options)?;275Some(ArrowDataType::Map(Box::new(inner), false))276}277278/// Entry point for converting parquet group type.279///280/// This function takes care of logical type and repetition.281fn to_group_type(282field_info: &FieldInfo,283logical_type: &Option<GroupLogicalType>,284converted_type: &Option<GroupConvertedType>,285fields: &[ParquetType],286parent_name: &str,287options: &SchemaInferenceOptions,288) -> Option<ArrowDataType> {289debug_assert!(!fields.is_empty());290if field_info.repetition == Repetition::Repeated 
{291Some(ArrowDataType::LargeList(Box::new(Field::new(292field_info.name.clone(),293to_struct(fields, options)?,294is_nullable(field_info),295))))296} else {297non_repeated_group(logical_type, converted_type, fields, parent_name, options)298}299}300301/// Checks whether this schema is nullable.302pub(crate) fn is_nullable(field_info: &FieldInfo) -> bool {303match field_info.repetition {304Repetition::Optional => true,305Repetition::Repeated => true,306Repetition::Required => false,307}308}309310/// Converts parquet schema to arrow field.311/// Returns `None` iff the parquet type has no associated primitive types,312/// i.e. if it is a column-less group type.313fn to_field(type_: &ParquetType, options: &SchemaInferenceOptions) -> Option<Field> {314let field_info = type_.get_field_info();315316let metadata: Option<Arc<Metadata>> = field_info.id.map(|x: i32| {317Arc::new(318[(319PlSmallStr::from_static("PARQUET:field_id"),320format_pl_smallstr!("{x}"),321)]322.into(),323)324});325326let mut arrow_field = Field::new(327field_info.name.clone(),328to_dtype(type_, options)?,329is_nullable(type_.get_field_info()),330);331332arrow_field.metadata = metadata;333334Some(arrow_field)335}336337/// Converts a parquet list to arrow list.338///339/// To fully understand this algorithm, please refer to340/// [parquet doc](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md).341fn to_list(342fields: &[ParquetType],343parent_name: &str,344options: &SchemaInferenceOptions,345) -> Option<ArrowDataType> {346let item = fields.first().unwrap();347348let item_type = match item {349ParquetType::PrimitiveType(primitive) => Some(to_primitive_type_inner(primitive, options)),350ParquetType::GroupType { fields, .. 
} => {351if fields.len() == 1 && item.name() != "array" && {352// item.name() != format!("{parent_name}_tuple")353let cmp = [parent_name, "_tuple"];354let len_1 = parent_name.len();355let len = len_1 + "_tuple".len();356357item.name().len() != len || [&item.name()[..len_1], &item.name()[len_1..]] != cmp358} {359// extract the repetition field360let nested_item = fields.first().unwrap();361to_dtype(nested_item, options)362} else {363to_struct(fields, options)364}365},366}?;367368// Check that the name of the list child is "list", in which case we369// get the child nullability and name (normally "element") from the nested370// group type.371// Without this step, the child incorrectly inherits the parent's optionality372let (list_item_name, item_is_optional) = match item {373ParquetType::GroupType {374field_info, fields, ..375} if field_info.name.as_str() == "list" && fields.len() == 1 => {376let field = fields.first().unwrap();377(378field.get_field_info().name.clone(),379field.get_field_info().repetition == Repetition::Optional,380)381},382_ => (383item.get_field_info().name.clone(),384item.get_field_info().repetition == Repetition::Optional,385),386};387388Some(ArrowDataType::LargeList(Box::new(Field::new(389list_item_name,390item_type,391item_is_optional,392))))393}394395/// Converts parquet schema to arrow data type.396///397/// This function discards schema name.398///399/// If this schema is a primitive type and not included in the leaves, the result is400/// Ok(None).401///402/// If this schema is a group type and none of its children is reserved in the403/// conversion, the result is Ok(None).404pub(crate) fn to_dtype(405type_: &ParquetType,406options: &SchemaInferenceOptions,407) -> Option<ArrowDataType> {408match type_ {409ParquetType::PrimitiveType(primitive) => Some(to_primitive_type(primitive, options)),410ParquetType::GroupType {411field_info,412logical_type,413converted_type,414fields,415} => {416if fields.is_empty() {417None418} else 
{419to_group_type(420field_info,421logical_type,422converted_type,423fields,424field_info.name.as_str(),425options,426)427}428},429}430}431432#[cfg(test)]433mod tests {434use polars_error::*;435436use super::*;437use crate::parquet::metadata::SchemaDescriptor;438439#[test]440fn test_flat_primitives() -> PolarsResult<()> {441let message = "442message test_schema {443REQUIRED BOOLEAN boolean;444REQUIRED INT32 int8 (INT_8);445REQUIRED INT32 int16 (INT_16);446REQUIRED INT32 uint8 (INTEGER(8,false));447REQUIRED INT32 uint16 (INTEGER(16,false));448REQUIRED INT32 int32;449REQUIRED INT64 int64 ;450OPTIONAL DOUBLE double;451OPTIONAL FLOAT float;452OPTIONAL BINARY string (UTF8);453OPTIONAL BINARY string_2 (STRING);454}455";456let expected = &[457Field::new("boolean".into(), ArrowDataType::Boolean, false),458Field::new("int8".into(), ArrowDataType::Int8, false),459Field::new("int16".into(), ArrowDataType::Int16, false),460Field::new("uint8".into(), ArrowDataType::UInt8, false),461Field::new("uint16".into(), ArrowDataType::UInt16, false),462Field::new("int32".into(), ArrowDataType::Int32, false),463Field::new("int64".into(), ArrowDataType::Int64, false),464Field::new("double".into(), ArrowDataType::Float64, true),465Field::new("float".into(), ArrowDataType::Float32, true),466Field::new("string".into(), ArrowDataType::Utf8View, true),467Field::new("string_2".into(), ArrowDataType::Utf8View, true),468];469470let parquet_schema = SchemaDescriptor::try_from_message(message)?;471let fields = parquet_to_arrow_schema(parquet_schema.fields());472let fields = fields.iter_values().cloned().collect::<Vec<_>>();473474assert_eq!(fields, expected);475Ok(())476}477478#[test]479fn test_byte_array_fields() -> PolarsResult<()> {480let message = "481message test_schema {482REQUIRED BYTE_ARRAY binary;483REQUIRED FIXED_LEN_BYTE_ARRAY (20) fixed_binary;484}485";486let expected = vec![487Field::new("binary".into(), ArrowDataType::BinaryView, 
false),488Field::new(489"fixed_binary".into(),490ArrowDataType::FixedSizeBinary(20),491false,492),493];494495let parquet_schema = SchemaDescriptor::try_from_message(message)?;496let fields = parquet_to_arrow_schema(parquet_schema.fields());497let fields = fields.iter_values().cloned().collect::<Vec<_>>();498499assert_eq!(fields, expected);500Ok(())501}502503#[test]504fn test_duplicate_fields() -> PolarsResult<()> {505let message = "506message test_schema {507REQUIRED BOOLEAN boolean;508REQUIRED INT32 int8 (INT_8);509}510";511let expected = &[512Field::new("boolean".into(), ArrowDataType::Boolean, false),513Field::new("int8".into(), ArrowDataType::Int8, false),514];515516let parquet_schema = SchemaDescriptor::try_from_message(message)?;517let fields = parquet_to_arrow_schema(parquet_schema.fields());518let fields = fields.iter_values().cloned().collect::<Vec<_>>();519520assert_eq!(fields, expected);521Ok(())522}523524#[ignore]525#[test]526fn test_parquet_lists() -> PolarsResult<()> {527let mut arrow_fields = Vec::new();528529// LIST encoding example taken from parquet-format/LogicalTypes.md530let message_type = "531message test_schema {532REQUIRED GROUP my_list (LIST) {533REPEATED GROUP list {534OPTIONAL BINARY element (UTF8);535}536}537OPTIONAL GROUP my_list (LIST) {538REPEATED GROUP list {539REQUIRED BINARY element (UTF8);540}541}542OPTIONAL GROUP array_of_arrays (LIST) {543REPEATED GROUP list {544REQUIRED GROUP element (LIST) {545REPEATED GROUP list {546REQUIRED INT32 element;547}548}549}550}551OPTIONAL GROUP my_list (LIST) {552REPEATED GROUP element {553REQUIRED BINARY str (UTF8);554}555}556OPTIONAL GROUP my_list (LIST) {557REPEATED INT32 element;558}559OPTIONAL GROUP my_list (LIST) {560REPEATED GROUP element {561REQUIRED BINARY str (UTF8);562REQUIRED INT32 num;563}564}565OPTIONAL GROUP my_list (LIST) {566REPEATED GROUP array {567REQUIRED BINARY str (UTF8);568}569570}571OPTIONAL GROUP my_list (LIST) {572REPEATED GROUP my_list_tuple {573REQUIRED BINARY str 
(UTF8);574}575}576REPEATED INT32 name;577}578";579580// // List<String> (list non-null, elements nullable)581// required group my_list (LIST) {582// repeated group list {583// optional binary element (UTF8);584// }585// }586{587arrow_fields.push(Field::new(588"my_list".into(),589ArrowDataType::LargeList(Box::new(Field::new(590"element".into(),591ArrowDataType::Utf8,592true,593))),594false,595));596}597598// // List<String> (list nullable, elements non-null)599// optional group my_list (LIST) {600// repeated group list {601// required binary element (UTF8);602// }603// }604{605arrow_fields.push(Field::new(606"my_list".into(),607ArrowDataType::LargeList(Box::new(Field::new(608"element".into(),609ArrowDataType::Utf8,610false,611))),612true,613));614}615616// Element types can be nested structures. For example, a list of lists:617//618// // List<List<Integer>>619// optional group array_of_arrays (LIST) {620// repeated group list {621// required group element (LIST) {622// repeated group list {623// required int32 element;624// }625// }626// }627// }628{629let arrow_inner_list = ArrowDataType::LargeList(Box::new(Field::new(630"element".into(),631ArrowDataType::Int32,632false,633)));634arrow_fields.push(Field::new(635"array_of_arrays".into(),636ArrowDataType::LargeList(Box::new(Field::new(637PlSmallStr::from_static("element"),638arrow_inner_list,639false,640))),641true,642));643}644645// // List<String> (list nullable, elements non-null)646// optional group my_list (LIST) {647// repeated group element {648// required binary str (UTF8);649// };650// }651{652arrow_fields.push(Field::new(653"my_list".into(),654ArrowDataType::LargeList(Box::new(Field::new(655"element".into(),656ArrowDataType::Utf8,657false,658))),659true,660));661}662663// // List<Integer> (nullable list, non-null elements)664// optional group my_list (LIST) {665// repeated int32 element;666// 
}667{668arrow_fields.push(Field::new(669"my_list".into(),670ArrowDataType::LargeList(Box::new(Field::new(671"element".into(),672ArrowDataType::Int32,673false,674))),675true,676));677}678679// // List<Tuple<String, Integer>> (nullable list, non-null elements)680// optional group my_list (LIST) {681// repeated group element {682// required binary str (UTF8);683// required int32 num;684// };685// }686{687let arrow_struct = ArrowDataType::Struct(vec![688Field::new("str".into(), ArrowDataType::Utf8, false),689Field::new("num".into(), ArrowDataType::Int32, false),690]);691arrow_fields.push(Field::new(692"my_list".into(),693ArrowDataType::LargeList(Box::new(Field::new(694"element".into(),695arrow_struct,696false,697))),698true,699));700}701702// // List<OneTuple<String>> (nullable list, non-null elements)703// optional group my_list (LIST) {704// repeated group array {705// required binary str (UTF8);706// };707// }708// Special case: group is named array709{710let arrow_struct =711ArrowDataType::Struct(vec![Field::new("str".into(), ArrowDataType::Utf8, false)]);712arrow_fields.push(Field::new(713"my_list".into(),714ArrowDataType::LargeList(Box::new(Field::new("array".into(), arrow_struct, false))),715true,716));717}718719// // List<OneTuple<String>> (nullable list, non-null elements)720// optional group my_list (LIST) {721// repeated group my_list_tuple {722// required binary str (UTF8);723// };724// }725// Special case: group named ends in _tuple726{727let arrow_struct =728ArrowDataType::Struct(vec![Field::new("str".into(), ArrowDataType::Utf8, false)]);729arrow_fields.push(Field::new(730"my_list".into(),731ArrowDataType::LargeList(Box::new(Field::new(732"my_list_tuple".into(),733arrow_struct,734false,735))),736true,737));738}739740// One-level encoding: Only allows required lists with required cells741// repeated value_type 
name742{743arrow_fields.push(Field::new(744"name".into(),745ArrowDataType::LargeList(Box::new(Field::new(746"name".into(),747ArrowDataType::Int32,748false,749))),750false,751));752}753754let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;755let fields = parquet_to_arrow_schema(parquet_schema.fields());756let fields = fields.iter_values().cloned().collect::<Vec<_>>();757758assert_eq!(arrow_fields, fields);759Ok(())760}761762#[test]763fn test_parquet_list_with_struct() -> PolarsResult<()> {764let mut arrow_fields = Vec::new();765766let message_type = "767message eventlog {768REQUIRED group events (LIST) {769REPEATED group array {770REQUIRED BYTE_ARRAY event_name (STRING);771REQUIRED INT64 event_time (TIMESTAMP(MILLIS,true));772}773}774}775";776777{778let struct_fields = vec![779Field::new("event_name".into(), ArrowDataType::Utf8View, false),780Field::new(781"event_time".into(),782ArrowDataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),783false,784),785];786arrow_fields.push(Field::new(787"events".into(),788ArrowDataType::LargeList(Box::new(Field::new(789"array".into(),790ArrowDataType::Struct(struct_fields),791false,792))),793false,794));795}796797let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;798let fields = parquet_to_arrow_schema(parquet_schema.fields());799let fields = fields.iter_values().cloned().collect::<Vec<_>>();800801assert_eq!(arrow_fields, fields);802Ok(())803}804805#[test]806fn test_parquet_list_nullable() -> PolarsResult<()> {807let mut arrow_fields = Vec::new();808809let message_type = "810message test_schema {811REQUIRED GROUP my_list1 (LIST) {812REPEATED GROUP list {813OPTIONAL BINARY element (UTF8);814}815}816OPTIONAL GROUP my_list2 (LIST) {817REPEATED GROUP list {818REQUIRED BINARY element (UTF8);819}820}821REQUIRED GROUP my_list3 (LIST) {822REPEATED GROUP list {823REQUIRED BINARY element (UTF8);824}825}826}827";828829// // List<String> (list non-null, elements nullable)830// required 
group my_list1 (LIST) {831// repeated group list {832// optional binary element (UTF8);833// }834// }835{836arrow_fields.push(Field::new(837"my_list1".into(),838ArrowDataType::LargeList(Box::new(Field::new(839"element".into(),840ArrowDataType::Utf8View,841true,842))),843false,844));845}846847// // List<String> (list nullable, elements non-null)848// optional group my_list2 (LIST) {849// repeated group list {850// required binary element (UTF8);851// }852// }853{854arrow_fields.push(Field::new(855"my_list2".into(),856ArrowDataType::LargeList(Box::new(Field::new(857"element".into(),858ArrowDataType::Utf8View,859false,860))),861true,862));863}864865// // List<String> (list non-null, elements non-null)866// repeated group my_list3 (LIST) {867// repeated group list {868// required binary element (UTF8);869// }870// }871{872arrow_fields.push(Field::new(873"my_list3".into(),874ArrowDataType::LargeList(Box::new(Field::new(875"element".into(),876ArrowDataType::Utf8View,877false,878))),879false,880));881}882883let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;884let fields = parquet_to_arrow_schema(parquet_schema.fields());885let fields = fields.iter_values().cloned().collect::<Vec<_>>();886887assert_eq!(arrow_fields, fields);888Ok(())889}890891#[test]892fn test_nested_schema() -> PolarsResult<()> {893let mut arrow_fields = Vec::new();894{895let group1_fields = vec![896Field::new("leaf1".into(), ArrowDataType::Boolean, false),897Field::new("leaf2".into(), ArrowDataType::Int32, false),898];899let group1_struct =900Field::new("group1".into(), ArrowDataType::Struct(group1_fields), false);901arrow_fields.push(group1_struct);902903let leaf3_field = Field::new("leaf3".into(), ArrowDataType::Int64, false);904arrow_fields.push(leaf3_field);905}906907let message_type = "908message test_schema {909REQUIRED GROUP group1 {910REQUIRED BOOLEAN leaf1;911REQUIRED INT32 leaf2;912}913REQUIRED INT64 leaf3;914}915";916917let parquet_schema = 
SchemaDescriptor::try_from_message(message_type)?;918let fields = parquet_to_arrow_schema(parquet_schema.fields());919let fields = fields.iter_values().cloned().collect::<Vec<_>>();920921assert_eq!(arrow_fields, fields);922Ok(())923}924925#[ignore]926#[test]927fn test_repeated_nested_schema() -> PolarsResult<()> {928let mut arrow_fields = Vec::new();929{930arrow_fields.push(Field::new("leaf1".into(), ArrowDataType::Int32, true));931932let inner_group_list = Field::new(933"innerGroup".into(),934ArrowDataType::LargeList(Box::new(Field::new(935"innerGroup".into(),936ArrowDataType::Struct(vec![Field::new(937"leaf3".into(),938ArrowDataType::Int32,939true,940)]),941false,942))),943false,944);945946let outer_group_list = Field::new(947"outerGroup".into(),948ArrowDataType::LargeList(Box::new(Field::new(949"outerGroup".into(),950ArrowDataType::Struct(vec![951Field::new("leaf2".into(), ArrowDataType::Int32, true),952inner_group_list,953]),954false,955))),956false,957);958arrow_fields.push(outer_group_list);959}960961let message_type = "962message test_schema {963OPTIONAL INT32 leaf1;964REPEATED GROUP outerGroup {965OPTIONAL INT32 leaf2;966REPEATED GROUP innerGroup {967OPTIONAL INT32 leaf3;968}969}970}971";972973let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;974let fields = parquet_to_arrow_schema(parquet_schema.fields());975let fields = fields.iter_values().cloned().collect::<Vec<_>>();976977assert_eq!(arrow_fields, fields);978Ok(())979}980981#[ignore]982#[test]983fn test_column_desc_to_field() -> PolarsResult<()> {984let message_type = "985message test_schema {986REQUIRED BOOLEAN boolean;987REQUIRED INT32 int8 (INT_8);988REQUIRED INT32 uint8 (INTEGER(8,false));989REQUIRED INT32 int16 (INT_16);990REQUIRED INT32 uint16 (INTEGER(16,false));991REQUIRED INT32 int32;992REQUIRED INT64 int64;993OPTIONAL DOUBLE double;994OPTIONAL FLOAT float;995OPTIONAL BINARY string (UTF8);996REPEATED BOOLEAN bools;997OPTIONAL INT32 date (DATE);998OPTIONAL INT32 time_milli 
(TIME_MILLIS);999OPTIONAL INT64 time_micro (TIME_MICROS);1000OPTIONAL INT64 time_nano (TIME(NANOS,false));1001OPTIONAL INT64 ts_milli (TIMESTAMP_MILLIS);1002REQUIRED INT64 ts_micro (TIMESTAMP_MICROS);1003REQUIRED INT64 ts_nano (TIMESTAMP(NANOS,true));1004}1005";1006let arrow_fields = vec![1007Field::new("boolean".into(), ArrowDataType::Boolean, false),1008Field::new("int8".into(), ArrowDataType::Int8, false),1009Field::new("uint8".into(), ArrowDataType::UInt8, false),1010Field::new("int16".into(), ArrowDataType::Int16, false),1011Field::new("uint16".into(), ArrowDataType::UInt16, false),1012Field::new("int32".into(), ArrowDataType::Int32, false),1013Field::new("int64".into(), ArrowDataType::Int64, false),1014Field::new("double".into(), ArrowDataType::Float64, true),1015Field::new("float".into(), ArrowDataType::Float32, true),1016Field::new("string".into(), ArrowDataType::Utf8, true),1017Field::new(1018"bools".into(),1019ArrowDataType::LargeList(Box::new(Field::new(1020"bools".into(),1021ArrowDataType::Boolean,1022false,1023))),1024false,1025),1026Field::new("date".into(), ArrowDataType::Date32, true),1027Field::new(1028"time_milli".into(),1029ArrowDataType::Time32(TimeUnit::Millisecond),1030true,1031),1032Field::new(1033"time_micro".into(),1034ArrowDataType::Time64(TimeUnit::Microsecond),1035true,1036),1037Field::new(1038"time_nano".into(),1039ArrowDataType::Time64(TimeUnit::Nanosecond),1040true,1041),1042Field::new(1043"ts_milli".into(),1044ArrowDataType::Timestamp(TimeUnit::Millisecond, None),1045true,1046),1047Field::new(1048"ts_micro".into(),1049ArrowDataType::Timestamp(TimeUnit::Microsecond, None),1050false,1051),1052Field::new(1053"ts_nano".into(),1054ArrowDataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),1055false,1056),1057];10581059let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;1060let fields = parquet_to_arrow_schema(parquet_schema.fields());1061let fields = 
fields.iter_values().cloned().collect::<Vec<_>>();10621063assert_eq!(arrow_fields, fields);1064Ok(())1065}10661067#[test]1068fn test_field_to_column_desc() -> PolarsResult<()> {1069let message_type = "1070message arrow_schema {1071REQUIRED BOOLEAN boolean;1072REQUIRED INT32 int8 (INT_8);1073REQUIRED INT32 int16 (INTEGER(16,true));1074REQUIRED INT32 int32;1075REQUIRED INT64 int64;1076OPTIONAL DOUBLE double;1077OPTIONAL FLOAT float;1078OPTIONAL BINARY string (STRING);1079OPTIONAL GROUP bools (LIST) {1080REPEATED GROUP list {1081OPTIONAL BOOLEAN element;1082}1083}1084REQUIRED GROUP bools_non_null (LIST) {1085REPEATED GROUP list {1086REQUIRED BOOLEAN element;1087}1088}1089OPTIONAL INT32 date (DATE);1090OPTIONAL INT32 time_milli (TIME(MILLIS,false));1091OPTIONAL INT64 time_micro (TIME_MICROS);1092OPTIONAL INT64 ts_milli (TIMESTAMP_MILLIS);1093REQUIRED INT64 ts_micro (TIMESTAMP(MICROS,false));1094REQUIRED GROUP struct {1095REQUIRED BOOLEAN bools;1096REQUIRED INT32 uint32 (INTEGER(32,false));1097REQUIRED GROUP int32 (LIST) {1098REPEATED GROUP list {1099OPTIONAL INT32 element;1100}1101}1102}1103REQUIRED BINARY dictionary_strings (STRING);1104}1105";11061107let arrow_fields = vec![1108Field::new("boolean".into(), ArrowDataType::Boolean, false),1109Field::new("int8".into(), ArrowDataType::Int8, false),1110Field::new("int16".into(), ArrowDataType::Int16, false),1111Field::new("int32".into(), ArrowDataType::Int32, false),1112Field::new("int64".into(), ArrowDataType::Int64, false),1113Field::new("double".into(), ArrowDataType::Float64, true),1114Field::new("float".into(), ArrowDataType::Float32, true),1115Field::new("string".into(), ArrowDataType::Utf8View, 
true),1116Field::new(1117"bools".into(),1118ArrowDataType::LargeList(Box::new(Field::new(1119"element".into(),1120ArrowDataType::Boolean,1121true,1122))),1123true,1124),1125Field::new(1126"bools_non_null".into(),1127ArrowDataType::LargeList(Box::new(Field::new(1128"element".into(),1129ArrowDataType::Boolean,1130false,1131))),1132false,1133),1134Field::new("date".into(), ArrowDataType::Date32, true),1135Field::new(1136"time_milli".into(),1137ArrowDataType::Time32(TimeUnit::Millisecond),1138true,1139),1140Field::new(1141"time_micro".into(),1142ArrowDataType::Time64(TimeUnit::Microsecond),1143true,1144),1145Field::new(1146"ts_milli".into(),1147ArrowDataType::Timestamp(TimeUnit::Millisecond, None),1148true,1149),1150Field::new(1151"ts_micro".into(),1152ArrowDataType::Timestamp(TimeUnit::Microsecond, None),1153false,1154),1155Field::new(1156"struct".into(),1157ArrowDataType::Struct(vec![1158Field::new("bools".into(), ArrowDataType::Boolean, false),1159Field::new("uint32".into(), ArrowDataType::UInt32, false),1160Field::new(1161"int32".into(),1162ArrowDataType::LargeList(Box::new(Field::new(1163"element".into(),1164ArrowDataType::Int32,1165true,1166))),1167false,1168),1169]),1170false,1171),1172Field::new("dictionary_strings".into(), ArrowDataType::Utf8View, false),1173];11741175let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;1176let fields = parquet_to_arrow_schema(parquet_schema.fields());1177let fields = fields.iter_values().cloned().collect::<Vec<_>>();11781179assert_eq!(arrow_fields, fields);1180Ok(())1181}11821183#[test]1184fn test_int96_options() -> PolarsResult<()> {1185for tu in [1186TimeUnit::Second,1187TimeUnit::Microsecond,1188TimeUnit::Millisecond,1189TimeUnit::Nanosecond,1190] {1191let message_type = "1192message arrow_schema {1193REQUIRED INT96 int96_field;1194OPTIONAL GROUP int96_list (LIST) {1195REPEATED GROUP list {1196OPTIONAL INT96 element;1197}1198}1199REQUIRED GROUP int96_struct {1200REQUIRED INT96 
int96_field;1201}1202}1203";1204let coerced_to = ArrowDataType::Timestamp(tu, None);1205let arrow_fields = vec![1206Field::new("int96_field".into(), coerced_to.clone(), false),1207Field::new(1208"int96_list".into(),1209ArrowDataType::LargeList(Box::new(Field::new(1210"element".into(),1211coerced_to.clone(),1212true,1213))),1214true,1215),1216Field::new(1217"int96_struct".into(),1218ArrowDataType::Struct(vec![Field::new(1219"int96_field".into(),1220coerced_to.clone(),1221false,1222)]),1223false,1224),1225];12261227let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;1228let fields = parquet_to_arrow_schema_with_options(1229parquet_schema.fields(),1230&Some(SchemaInferenceOptions {1231int96_coerce_to_timeunit: tu,1232}),1233);1234let fields = fields.iter_values().cloned().collect::<Vec<_>>();1235assert_eq!(arrow_fields, fields);1236}1237Ok(())1238}1239}124012411242