Path: blob/main/crates/polars-parquet/src/arrow/read/deserialize/nested.rs
6940 views
use arrow::array::StructArray;1use arrow::datatypes::{2DTYPE_CATEGORICAL_LEGACY, DTYPE_CATEGORICAL_NEW, DTYPE_ENUM_VALUES_LEGACY,3DTYPE_ENUM_VALUES_NEW, IntegerType,4};5use polars_compute::cast::CastOptionsImpl;67use self::categorical::CategoricalDecoder;8use self::nested::deserialize::utils::freeze_validity;9use self::nested_utils::NestedContent;10use self::utils::PageDecoder;11use super::*;12use crate::parquet::error::ParquetResult;1314pub fn columns_to_iter_recursive(15mut columns: Vec<BasicDecompressor>,16mut types: Vec<&PrimitiveType>,17field: Field,18mut init: Vec<InitNested>,19filter: Option<Filter>,20) -> ParquetResult<(NestedState, Box<dyn Array>, Bitmap)> {21if !field.dtype().is_nested() {22let pages = columns.pop().unwrap();23init.push(InitNested::Primitive(field.is_nullable));24let type_ = types.pop().unwrap();25let (nested, arr, pdm) = page_iter_to_array(pages, type_, field, filter, Some(init))?;26Ok((nested.unwrap(), arr, pdm))27} else {28match field.dtype() {29ArrowDataType::List(inner) | ArrowDataType::LargeList(inner) => {30init.push(InitNested::List(field.is_nullable));31let (mut nested, array, ptm) = columns_to_iter_recursive(32columns,33types,34inner.as_ref().clone(),35init,36filter,37)?;38let array = create_list(field.dtype().clone(), &mut nested, array);39Ok((nested, array, ptm))40},41ArrowDataType::FixedSizeList(inner, width) => {42init.push(InitNested::FixedSizeList(field.is_nullable, *width));43let (mut nested, array, ptm) = columns_to_iter_recursive(44columns,45types,46inner.as_ref().clone(),47init,48filter,49)?;50let array = create_list(field.dtype().clone(), &mut nested, array);51Ok((nested, array, ptm))52},53ArrowDataType::Struct(fields) => {54// This definitely does not support Filter predicate yet.55assert!(!matches!(&filter, Some(Filter::Predicate(_))));5657// @NOTE:58// We go back to front here, because we constantly split off the end of the array59// to grab the relevant columns and types.60//61// Is this inefficient? Yes. Is this how we are going to do it for now? Yes.6263let Some(last_field) = fields.last() else {64return Err(ParquetError::not_supported("Struct has zero fields"));65};6667let field_to_nested_array =68|mut init: Vec<InitNested>,69columns: &mut Vec<BasicDecompressor>,70types: &mut Vec<&PrimitiveType>,71struct_field: &Field| {72init.push(InitNested::Struct(field.is_nullable));73let n = n_columns(&struct_field.dtype);74let columns = columns.split_off(columns.len() - n);75let types = types.split_off(types.len() - n);7677columns_to_iter_recursive(78columns,79types,80struct_field.clone(),81init,82filter.clone(),83)84};8586let (mut nested, last_array, _) =87field_to_nested_array(init.clone(), &mut columns, &mut types, last_field)?;88debug_assert!(matches!(nested.last().unwrap(), NestedContent::Struct));89let (length, _, struct_validity) = nested.pop().unwrap();9091let mut field_arrays = Vec::<Box<dyn Array>>::with_capacity(fields.len());92field_arrays.push(last_array);9394for field in fields.iter().rev().skip(1) {95let (mut _nested, array, _) =96field_to_nested_array(init.clone(), &mut columns, &mut types, field)?;9798#[cfg(debug_assertions)]99{100debug_assert!(matches!(_nested.last().unwrap(), NestedContent::Struct));101debug_assert_eq!(102_nested.pop().unwrap().2.and_then(freeze_validity),103struct_validity.clone().and_then(freeze_validity),104);105}106107field_arrays.push(array);108}109110field_arrays.reverse();111let struct_validity = struct_validity.and_then(freeze_validity);112113Ok((114nested,115StructArray::new(116ArrowDataType::Struct(fields.clone()),117length,118field_arrays,119struct_validity,120)121.to_boxed(),122Bitmap::new(),123))124},125ArrowDataType::Map(inner, _) => {126init.push(InitNested::List(field.is_nullable));127let (mut nested, array, ptm) = columns_to_iter_recursive(128columns,129types,130inner.as_ref().clone(),131init,132filter,133)?;134let array = create_map(field.dtype().clone(), &mut nested, array);135Ok((nested, array, ptm))136},137138ArrowDataType::Dictionary(key_type, value_type, _) => {139// @note: this should only hit in two cases:140// - polars enum's and categorical's141// - int -> string which can be turned into categoricals142assert!(matches!(value_type.as_ref(), ArrowDataType::Utf8View));143144init.push(InitNested::Primitive(field.is_nullable));145146if field.metadata.as_ref().is_none_or(|md| {147!md.contains_key(DTYPE_ENUM_VALUES_LEGACY)148&& !md.contains_key(DTYPE_ENUM_VALUES_NEW)149&& !md.contains_key(DTYPE_CATEGORICAL_NEW)150&& !md.contains_key(DTYPE_CATEGORICAL_LEGACY)151}) {152let (nested, arr, ptm) = PageDecoder::new(153&field.name,154columns.pop().unwrap(),155ArrowDataType::Utf8View,156binview::BinViewDecoder::new_string(),157Some(init),158)?159.collect_nested(filter)?;160161let arr = polars_compute::cast::cast(162arr.as_ref(),163field.dtype(),164CastOptionsImpl::default(),165)166.unwrap();167168Ok((nested, arr, ptm))169} else {170let (nested, arr, ptm) = match key_type {171IntegerType::UInt8 => PageDecoder::new(172&field.name,173columns.pop().unwrap(),174field.dtype().clone(),175CategoricalDecoder::<u8>::new(),176Some(init),177)?178.collect_boxed(filter)?,179IntegerType::UInt16 => PageDecoder::new(180&field.name,181columns.pop().unwrap(),182field.dtype().clone(),183CategoricalDecoder::<u16>::new(),184Some(init),185)?186.collect_boxed(filter)?,187IntegerType::UInt32 => PageDecoder::new(188&field.name,189columns.pop().unwrap(),190field.dtype().clone(),191CategoricalDecoder::<u32>::new(),192Some(init),193)?194.collect_boxed(filter)?,195_ => unimplemented!(),196};197198Ok((nested.unwrap(), arr, ptm))199}200},201other => Err(ParquetError::not_supported(format!(202"Deserializing type {other:?} from parquet"203))),204}205}206}207208209