Path: blob/main/crates/polars-parquet/src/arrow/read/schema/metadata.rs
6940 views
use arrow::datatypes::{1ArrowDataType, ArrowSchema, DTYPE_CATEGORICAL_LEGACY, DTYPE_CATEGORICAL_NEW,2DTYPE_ENUM_VALUES_LEGACY, DTYPE_ENUM_VALUES_NEW, Field, IntegerType, Metadata,3};4use arrow::io::ipc::read::deserialize_schema;5use base64::Engine as _;6use base64::engine::general_purpose;7use polars_error::{PolarsResult, polars_bail};8use polars_utils::pl_str::PlSmallStr;910use super::super::super::ARROW_SCHEMA_META_KEY;11pub use crate::parquet::metadata::KeyValue;1213/// Reads custom key value metadata from a Parquet's key value file metadata.14pub fn read_custom_key_value_metadata(key_value_metadata: &Option<Vec<KeyValue>>) -> Metadata {15parse_key_value_metadata(key_value_metadata)16}1718/// Reads an arrow schema from Parquet's file metadata. Returns `None` if no schema was found.19/// # Errors20/// Errors iff the schema cannot be correctly parsed.21pub fn read_schema_from_metadata(metadata: &mut Metadata) -> PolarsResult<Option<ArrowSchema>> {22metadata23.remove(ARROW_SCHEMA_META_KEY)24.map(|encoded| get_arrow_schema_from_metadata(&encoded))25.transpose()26}2728fn convert_field(field: &mut Field) {29// @NOTE: We cast non-Polars dictionaries to normal values because Polars does not have a30// generic dictionary type.31field.dtype = match std::mem::take(&mut field.dtype) {32ArrowDataType::Dictionary(key_type, value_type, sorted) => {33let is_pl_enum_or_categorical =34field.metadata.as_ref().is_some_and(|md| {35md.contains_key(DTYPE_ENUM_VALUES_LEGACY)36|| md.contains_key(DTYPE_ENUM_VALUES_NEW)37|| md.contains_key(DTYPE_CATEGORICAL_NEW)38|| md.contains_key(DTYPE_CATEGORICAL_LEGACY)39}) && matches!(40key_type,41IntegerType::UInt8 | IntegerType::UInt16 | IntegerType::UInt3242) && matches!(value_type.as_ref(), ArrowDataType::Utf8View);43let is_int_to_str = matches!(44value_type.as_ref(),45ArrowDataType::Utf8View | ArrowDataType::Utf8 | ArrowDataType::LargeUtf846);4748if is_pl_enum_or_categorical || is_int_to_str {49convert_dtype(ArrowDataType::Dictionary(key_type, value_type, sorted))50} else {51convert_dtype(*value_type)52}53},54dt => convert_dtype(dt),55};56}5758fn convert_dtype(mut dtype: ArrowDataType) -> ArrowDataType {59use ArrowDataType::*;60match dtype {61List(mut field) => {62convert_field(field.as_mut());63dtype = LargeList(field);64},65LargeList(ref mut field) | FixedSizeList(ref mut field, _) => convert_field(field.as_mut()),66Struct(ref mut fields) => {67for field in fields {68convert_field(field);69}70},71Float16 => dtype = Float32,72Binary | LargeBinary => dtype = BinaryView,73Utf8 | LargeUtf8 => dtype = Utf8View,74Dictionary(_, ref mut dtype, _) => {75let dtype = dtype.as_mut();76*dtype = convert_dtype(std::mem::take(dtype));77},78Extension(ref mut ext) => {79ext.inner = convert_dtype(std::mem::take(&mut ext.inner));80},81Map(mut field, _ordered) => {82// Polars doesn't support Map.83// A map is physically a `List<Struct<K, V>>`84// So we read as list.85convert_field(field.as_mut());86dtype = LargeList(field);87},88_ => {},89}9091dtype92}9394/// Try to convert Arrow schema metadata into a schema95fn get_arrow_schema_from_metadata(encoded_meta: &str) -> PolarsResult<ArrowSchema> {96let decoded = general_purpose::STANDARD.decode(encoded_meta);97match decoded {98Ok(bytes) => {99let slice = if bytes[0..4] == [255u8; 4] {100&bytes[8..]101} else {102bytes.as_slice()103};104let mut schema = deserialize_schema(slice).map(|x| x.0)?;105// Convert the data types to the data types we support.106for field in schema.iter_values_mut() {107convert_field(field);108}109Ok(schema)110},111Err(err) => {112// The C++ implementation returns an error if the schema can't be parsed.113polars_bail!(InvalidOperation:114"unable to decode the encoded schema stored in {ARROW_SCHEMA_META_KEY}, {err:?}"115)116},117}118}119120pub(super) fn parse_key_value_metadata(key_value_metadata: &Option<Vec<KeyValue>>) -> Metadata {121key_value_metadata122.as_ref()123.map(|key_values| {124key_values125.iter()126.filter_map(|kv| {127kv.value.as_ref().map(|value| {128(129PlSmallStr::from_str(kv.key.as_str()),130PlSmallStr::from_str(value.as_str()),131)132})133})134.collect()135})136.unwrap_or_default()137}138139140