// Path: blob/main/crates/polars-parquet/src/arrow/read/schema/metadata.rs
// 8512 views
use arrow::datatypes::{1ArrowDataType, ArrowSchema, DTYPE_CATEGORICAL_LEGACY, DTYPE_CATEGORICAL_NEW,2DTYPE_ENUM_VALUES_LEGACY, DTYPE_ENUM_VALUES_NEW, Field, IntegerType, MAINTAIN_PL_TYPE,3Metadata, PL_KEY,4};5use arrow::io::ipc::read::deserialize_schema;6use base64::Engine as _;7use base64::engine::general_purpose;8use polars_error::{PolarsResult, polars_bail};9use polars_utils::pl_str::PlSmallStr;1011use super::super::super::ARROW_SCHEMA_META_KEY;12pub use crate::parquet::metadata::KeyValue;1314/// Reads custom key value metadata from a Parquet's key value file metadata.15pub fn read_custom_key_value_metadata(key_value_metadata: &Option<Vec<KeyValue>>) -> Metadata {16parse_key_value_metadata(key_value_metadata)17}1819/// Reads an arrow schema from Parquet's file metadata. Returns `None` if no schema was found.20/// # Errors21/// Errors iff the schema cannot be correctly parsed.22pub fn read_schema_from_metadata(metadata: &mut Metadata) -> PolarsResult<Option<ArrowSchema>> {23metadata24.remove(ARROW_SCHEMA_META_KEY)25.map(|encoded| get_arrow_schema_from_metadata(&encoded))26.transpose()27}2829fn convert_field(field: &mut Field) {30// @NOTE: We cast non-Polars dictionaries to normal values because Polars does not have a31// generic dictionary type.32field.dtype = match std::mem::take(&mut field.dtype) {33ArrowDataType::Dictionary(key_type, value_type, sorted) => {34let is_pl_enum_or_categorical =35field.metadata.as_ref().is_some_and(|md| {36md.contains_key(DTYPE_ENUM_VALUES_LEGACY)37|| md.contains_key(DTYPE_ENUM_VALUES_NEW)38|| md.contains_key(DTYPE_CATEGORICAL_NEW)39|| md.contains_key(DTYPE_CATEGORICAL_LEGACY)40}) && matches!(41key_type,42IntegerType::UInt8 | IntegerType::UInt16 | IntegerType::UInt3243) && matches!(value_type.as_ref(), ArrowDataType::Utf8View);44let is_int_to_str = matches!(45value_type.as_ref(),46ArrowDataType::Utf8View | ArrowDataType::Utf8 | ArrowDataType::LargeUtf847);4849if is_pl_enum_or_categorical || is_int_to_str 
{50convert_dtype(ArrowDataType::Dictionary(key_type, value_type, sorted))51} else {52convert_dtype(*value_type)53}54},55ArrowDataType::LargeBinary56if field57.metadata58.as_ref()59.is_some_and(|md| md.get(PL_KEY).map(|s| s.as_str()) == Some(MAINTAIN_PL_TYPE)) =>60{61ArrowDataType::LargeBinary62},63dt => convert_dtype(dt),64};65}6667fn convert_dtype(mut dtype: ArrowDataType) -> ArrowDataType {68use ArrowDataType::*;69match dtype {70List(mut field) => {71convert_field(field.as_mut());72dtype = LargeList(field);73},74LargeList(ref mut field) | FixedSizeList(ref mut field, _) => convert_field(field.as_mut()),75Struct(ref mut fields) => {76for field in fields {77convert_field(field);78}79},80Float16 => dtype = Float16,81Binary | LargeBinary => dtype = BinaryView,82Utf8 | LargeUtf8 => dtype = Utf8View,83Dictionary(_, ref mut dtype, _) => {84let dtype = dtype.as_mut();85*dtype = convert_dtype(std::mem::take(dtype));86},87Extension(ref mut ext) => {88ext.inner = convert_dtype(std::mem::take(&mut ext.inner));89},90Map(mut field, _ordered) => {91// Polars doesn't support Map.92// A map is physically a `List<Struct<K, V>>`93// So we read as list.94convert_field(field.as_mut());95dtype = LargeList(field);96},97_ => {},98}99100dtype101}102103/// Try to convert Arrow schema metadata into a schema104fn get_arrow_schema_from_metadata(encoded_meta: &str) -> PolarsResult<ArrowSchema> {105let decoded = general_purpose::STANDARD.decode(encoded_meta);106match decoded {107Ok(bytes) => {108let slice = if bytes[0..4] == [255u8; 4] {109&bytes[8..]110} else {111bytes.as_slice()112};113let mut schema = deserialize_schema(slice).map(|x| x.0)?;114// Convert the data types to the data types we support.115for field in schema.iter_values_mut() {116convert_field(field);117}118Ok(schema)119},120Err(err) => {121// The C++ implementation returns an error if the schema can't be parsed.122polars_bail!(InvalidOperation:123"unable to decode the encoded schema stored in {ARROW_SCHEMA_META_KEY}, 
{err:?}"124)125},126}127}128129pub(super) fn parse_key_value_metadata(key_value_metadata: &Option<Vec<KeyValue>>) -> Metadata {130key_value_metadata131.as_ref()132.map(|key_values| {133key_values134.iter()135.filter_map(|kv| {136kv.value.as_ref().map(|value| {137(138PlSmallStr::from_str(kv.key.as_str()),139PlSmallStr::from_str(value.as_str()),140)141})142})143.collect()144})145.unwrap_or_default()146}147148149