Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/arrow/read/schema/metadata.rs
6940 views
1
use arrow::datatypes::{
2
ArrowDataType, ArrowSchema, DTYPE_CATEGORICAL_LEGACY, DTYPE_CATEGORICAL_NEW,
3
DTYPE_ENUM_VALUES_LEGACY, DTYPE_ENUM_VALUES_NEW, Field, IntegerType, Metadata,
4
};
5
use arrow::io::ipc::read::deserialize_schema;
6
use base64::Engine as _;
7
use base64::engine::general_purpose;
8
use polars_error::{PolarsResult, polars_bail};
9
use polars_utils::pl_str::PlSmallStr;
10
11
use super::super::super::ARROW_SCHEMA_META_KEY;
12
pub use crate::parquet::metadata::KeyValue;
13
14
/// Reads custom key value metadata from a Parquet's key value file metadata.
15
pub fn read_custom_key_value_metadata(key_value_metadata: &Option<Vec<KeyValue>>) -> Metadata {
16
parse_key_value_metadata(key_value_metadata)
17
}
18
19
/// Reads an arrow schema from Parquet's file metadata. Returns `None` if no schema was found.
20
/// # Errors
21
/// Errors iff the schema cannot be correctly parsed.
22
pub fn read_schema_from_metadata(metadata: &mut Metadata) -> PolarsResult<Option<ArrowSchema>> {
23
metadata
24
.remove(ARROW_SCHEMA_META_KEY)
25
.map(|encoded| get_arrow_schema_from_metadata(&encoded))
26
.transpose()
27
}
28
29
fn convert_field(field: &mut Field) {
30
// @NOTE: We cast non-Polars dictionaries to normal values because Polars does not have a
31
// generic dictionary type.
32
field.dtype = match std::mem::take(&mut field.dtype) {
33
ArrowDataType::Dictionary(key_type, value_type, sorted) => {
34
let is_pl_enum_or_categorical =
35
field.metadata.as_ref().is_some_and(|md| {
36
md.contains_key(DTYPE_ENUM_VALUES_LEGACY)
37
|| md.contains_key(DTYPE_ENUM_VALUES_NEW)
38
|| md.contains_key(DTYPE_CATEGORICAL_NEW)
39
|| md.contains_key(DTYPE_CATEGORICAL_LEGACY)
40
}) && matches!(
41
key_type,
42
IntegerType::UInt8 | IntegerType::UInt16 | IntegerType::UInt32
43
) && matches!(value_type.as_ref(), ArrowDataType::Utf8View);
44
let is_int_to_str = matches!(
45
value_type.as_ref(),
46
ArrowDataType::Utf8View | ArrowDataType::Utf8 | ArrowDataType::LargeUtf8
47
);
48
49
if is_pl_enum_or_categorical || is_int_to_str {
50
convert_dtype(ArrowDataType::Dictionary(key_type, value_type, sorted))
51
} else {
52
convert_dtype(*value_type)
53
}
54
},
55
dt => convert_dtype(dt),
56
};
57
}
58
59
fn convert_dtype(mut dtype: ArrowDataType) -> ArrowDataType {
60
use ArrowDataType::*;
61
match dtype {
62
List(mut field) => {
63
convert_field(field.as_mut());
64
dtype = LargeList(field);
65
},
66
LargeList(ref mut field) | FixedSizeList(ref mut field, _) => convert_field(field.as_mut()),
67
Struct(ref mut fields) => {
68
for field in fields {
69
convert_field(field);
70
}
71
},
72
Float16 => dtype = Float32,
73
Binary | LargeBinary => dtype = BinaryView,
74
Utf8 | LargeUtf8 => dtype = Utf8View,
75
Dictionary(_, ref mut dtype, _) => {
76
let dtype = dtype.as_mut();
77
*dtype = convert_dtype(std::mem::take(dtype));
78
},
79
Extension(ref mut ext) => {
80
ext.inner = convert_dtype(std::mem::take(&mut ext.inner));
81
},
82
Map(mut field, _ordered) => {
83
// Polars doesn't support Map.
84
// A map is physically a `List<Struct<K, V>>`
85
// So we read as list.
86
convert_field(field.as_mut());
87
dtype = LargeList(field);
88
},
89
_ => {},
90
}
91
92
dtype
93
}
94
95
/// Try to convert Arrow schema metadata into a schema
96
fn get_arrow_schema_from_metadata(encoded_meta: &str) -> PolarsResult<ArrowSchema> {
97
let decoded = general_purpose::STANDARD.decode(encoded_meta);
98
match decoded {
99
Ok(bytes) => {
100
let slice = if bytes[0..4] == [255u8; 4] {
101
&bytes[8..]
102
} else {
103
bytes.as_slice()
104
};
105
let mut schema = deserialize_schema(slice).map(|x| x.0)?;
106
// Convert the data types to the data types we support.
107
for field in schema.iter_values_mut() {
108
convert_field(field);
109
}
110
Ok(schema)
111
},
112
Err(err) => {
113
// The C++ implementation returns an error if the schema can't be parsed.
114
polars_bail!(InvalidOperation:
115
"unable to decode the encoded schema stored in {ARROW_SCHEMA_META_KEY}, {err:?}"
116
)
117
},
118
}
119
}
120
121
pub(super) fn parse_key_value_metadata(key_value_metadata: &Option<Vec<KeyValue>>) -> Metadata {
122
key_value_metadata
123
.as_ref()
124
.map(|key_values| {
125
key_values
126
.iter()
127
.filter_map(|kv| {
128
kv.value.as_ref().map(|value| {
129
(
130
PlSmallStr::from_str(kv.key.as_str()),
131
PlSmallStr::from_str(value.as_str()),
132
)
133
})
134
})
135
.collect()
136
})
137
.unwrap_or_default()
138
}
139
140