Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/arrow/read/schema/metadata.rs
8512 views
1
use arrow::datatypes::{
2
ArrowDataType, ArrowSchema, DTYPE_CATEGORICAL_LEGACY, DTYPE_CATEGORICAL_NEW,
3
DTYPE_ENUM_VALUES_LEGACY, DTYPE_ENUM_VALUES_NEW, Field, IntegerType, MAINTAIN_PL_TYPE,
4
Metadata, PL_KEY,
5
};
6
use arrow::io::ipc::read::deserialize_schema;
7
use base64::Engine as _;
8
use base64::engine::general_purpose;
9
use polars_error::{PolarsResult, polars_bail};
10
use polars_utils::pl_str::PlSmallStr;
11
12
use super::super::super::ARROW_SCHEMA_META_KEY;
13
pub use crate::parquet::metadata::KeyValue;
14
15
/// Reads custom key value metadata from a Parquet's key value file metadata.
16
pub fn read_custom_key_value_metadata(key_value_metadata: &Option<Vec<KeyValue>>) -> Metadata {
17
parse_key_value_metadata(key_value_metadata)
18
}
19
20
/// Reads an arrow schema from Parquet's file metadata. Returns `None` if no schema was found.
21
/// # Errors
22
/// Errors iff the schema cannot be correctly parsed.
23
pub fn read_schema_from_metadata(metadata: &mut Metadata) -> PolarsResult<Option<ArrowSchema>> {
24
metadata
25
.remove(ARROW_SCHEMA_META_KEY)
26
.map(|encoded| get_arrow_schema_from_metadata(&encoded))
27
.transpose()
28
}
29
30
fn convert_field(field: &mut Field) {
31
// @NOTE: We cast non-Polars dictionaries to normal values because Polars does not have a
32
// generic dictionary type.
33
field.dtype = match std::mem::take(&mut field.dtype) {
34
ArrowDataType::Dictionary(key_type, value_type, sorted) => {
35
let is_pl_enum_or_categorical =
36
field.metadata.as_ref().is_some_and(|md| {
37
md.contains_key(DTYPE_ENUM_VALUES_LEGACY)
38
|| md.contains_key(DTYPE_ENUM_VALUES_NEW)
39
|| md.contains_key(DTYPE_CATEGORICAL_NEW)
40
|| md.contains_key(DTYPE_CATEGORICAL_LEGACY)
41
}) && matches!(
42
key_type,
43
IntegerType::UInt8 | IntegerType::UInt16 | IntegerType::UInt32
44
) && matches!(value_type.as_ref(), ArrowDataType::Utf8View);
45
let is_int_to_str = matches!(
46
value_type.as_ref(),
47
ArrowDataType::Utf8View | ArrowDataType::Utf8 | ArrowDataType::LargeUtf8
48
);
49
50
if is_pl_enum_or_categorical || is_int_to_str {
51
convert_dtype(ArrowDataType::Dictionary(key_type, value_type, sorted))
52
} else {
53
convert_dtype(*value_type)
54
}
55
},
56
ArrowDataType::LargeBinary
57
if field
58
.metadata
59
.as_ref()
60
.is_some_and(|md| md.get(PL_KEY).map(|s| s.as_str()) == Some(MAINTAIN_PL_TYPE)) =>
61
{
62
ArrowDataType::LargeBinary
63
},
64
dt => convert_dtype(dt),
65
};
66
}
67
68
fn convert_dtype(mut dtype: ArrowDataType) -> ArrowDataType {
69
use ArrowDataType::*;
70
match dtype {
71
List(mut field) => {
72
convert_field(field.as_mut());
73
dtype = LargeList(field);
74
},
75
LargeList(ref mut field) | FixedSizeList(ref mut field, _) => convert_field(field.as_mut()),
76
Struct(ref mut fields) => {
77
for field in fields {
78
convert_field(field);
79
}
80
},
81
Float16 => dtype = Float16,
82
Binary | LargeBinary => dtype = BinaryView,
83
Utf8 | LargeUtf8 => dtype = Utf8View,
84
Dictionary(_, ref mut dtype, _) => {
85
let dtype = dtype.as_mut();
86
*dtype = convert_dtype(std::mem::take(dtype));
87
},
88
Extension(ref mut ext) => {
89
ext.inner = convert_dtype(std::mem::take(&mut ext.inner));
90
},
91
Map(mut field, _ordered) => {
92
// Polars doesn't support Map.
93
// A map is physically a `List<Struct<K, V>>`
94
// So we read as list.
95
convert_field(field.as_mut());
96
dtype = LargeList(field);
97
},
98
_ => {},
99
}
100
101
dtype
102
}
103
104
/// Try to convert Arrow schema metadata into a schema
105
fn get_arrow_schema_from_metadata(encoded_meta: &str) -> PolarsResult<ArrowSchema> {
106
let decoded = general_purpose::STANDARD.decode(encoded_meta);
107
match decoded {
108
Ok(bytes) => {
109
let slice = if bytes[0..4] == [255u8; 4] {
110
&bytes[8..]
111
} else {
112
bytes.as_slice()
113
};
114
let mut schema = deserialize_schema(slice).map(|x| x.0)?;
115
// Convert the data types to the data types we support.
116
for field in schema.iter_values_mut() {
117
convert_field(field);
118
}
119
Ok(schema)
120
},
121
Err(err) => {
122
// The C++ implementation returns an error if the schema can't be parsed.
123
polars_bail!(InvalidOperation:
124
"unable to decode the encoded schema stored in {ARROW_SCHEMA_META_KEY}, {err:?}"
125
)
126
},
127
}
128
}
129
130
pub(super) fn parse_key_value_metadata(key_value_metadata: &Option<Vec<KeyValue>>) -> Metadata {
131
key_value_metadata
132
.as_ref()
133
.map(|key_values| {
134
key_values
135
.iter()
136
.filter_map(|kv| {
137
kv.value.as_ref().map(|value| {
138
(
139
PlSmallStr::from_str(kv.key.as_str()),
140
PlSmallStr::from_str(value.as_str()),
141
)
142
})
143
})
144
.collect()
145
})
146
.unwrap_or_default()
147
}
148
149