Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/arrow/read/schema/mod.rs
6940 views
1
//! APIs to handle Parquet <-> Arrow schemas.
2
use arrow::datatypes::{ArrowSchema, TimeUnit};
3
4
mod convert;
5
mod metadata;
6
7
pub(crate) use convert::*;
8
pub use convert::{parquet_to_arrow_schema, parquet_to_arrow_schema_with_options};
9
pub use metadata::{read_custom_key_value_metadata, read_schema_from_metadata};
10
use polars_error::PolarsResult;
11
12
use self::metadata::parse_key_value_metadata;
13
pub use crate::parquet::metadata::{FileMetadata, KeyValue, SchemaDescriptor};
14
pub use crate::parquet::schema::types::ParquetType;
15
16
/// Options when inferring schemas from Parquet
17
pub struct SchemaInferenceOptions {
18
/// When inferring schemas from the Parquet INT96 timestamp type, this is the corresponding TimeUnit
19
/// in the inferred Arrow Timestamp type.
20
///
21
/// This defaults to `TimeUnit::Nanosecond`, but INT96 timestamps outside of the range of years 1678-2262,
22
/// will overflow when parsed as `Timestamp(TimeUnit::Nanosecond)`. Setting this to a lower resolution
23
/// (e.g. TimeUnit::Milliseconds) will result in loss of precision, but support a larger range of dates
24
/// without overflowing when parsing the data.
25
pub int96_coerce_to_timeunit: TimeUnit,
26
}
27
28
impl Default for SchemaInferenceOptions {
29
fn default() -> Self {
30
SchemaInferenceOptions {
31
int96_coerce_to_timeunit: TimeUnit::Nanosecond,
32
}
33
}
34
}
35
36
/// Infers a [`ArrowSchema`] from parquet's [`FileMetadata`].
37
///
38
/// This first looks for the metadata key `"ARROW:schema"`; if it does not exist, it converts the
39
/// Parquet types declared in the file's Parquet schema to Arrow's equivalent.
40
///
41
/// # Error
42
/// This function errors iff the key `"ARROW:schema"` exists but is not correctly encoded,
43
/// indicating that the file's arrow metadata was incorrectly written.
44
pub fn infer_schema(file_metadata: &FileMetadata) -> PolarsResult<ArrowSchema> {
45
infer_schema_with_options(file_metadata, &None)
46
}
47
48
/// Like [`infer_schema`] but with configurable options which affects the behavior of inference
49
pub fn infer_schema_with_options(
50
file_metadata: &FileMetadata,
51
options: &Option<SchemaInferenceOptions>,
52
) -> PolarsResult<ArrowSchema> {
53
let mut metadata = parse_key_value_metadata(file_metadata.key_value_metadata());
54
55
let schema = read_schema_from_metadata(&mut metadata)?;
56
Ok(schema.unwrap_or_else(|| {
57
parquet_to_arrow_schema_with_options(file_metadata.schema().fields(), options)
58
}))
59
}
60
61