Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/parquet/metadata/sort.rs
6940 views
1
#[cfg(feature = "serde")]
2
use serde::{Deserialize, Serialize};
3
4
use crate::parquet::schema::types::{
5
IntegerType, PhysicalType, PrimitiveConvertedType, PrimitiveLogicalType,
6
};
7
8
/// How values are ordered when page and column statistics are built and compared.
///
/// Every type maps to one sort order. Column statistics are aggregated under that
/// order, so the same order has to be applied whenever a value is compared against
/// a statistics min/max.
///
/// Reference:
/// <https://github.com/apache/parquet-cpp/blob/master/src/parquet/types.h>
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub enum SortOrder {
    /// Signed comparison (of the value, or legacy byte-wise).
    Signed,
    /// Unsigned comparison (of the value or byte-wise, depending on the physical type).
    Unsigned,
    /// No comparison is defined.
    Undefined,
}
26
27
/// Returns sort order for a physical/logical type.
28
pub fn get_sort_order(
29
logical_type: &Option<PrimitiveLogicalType>,
30
converted_type: &Option<PrimitiveConvertedType>,
31
physical_type: &PhysicalType,
32
) -> SortOrder {
33
if let Some(logical_type) = logical_type {
34
return get_logical_sort_order(logical_type);
35
};
36
if let Some(converted_type) = converted_type {
37
return get_converted_sort_order(converted_type);
38
};
39
get_physical_sort_order(physical_type)
40
}
41
42
fn get_logical_sort_order(logical_type: &PrimitiveLogicalType) -> SortOrder {
43
// TODO: Should this take converted and logical type, for compatibility?
44
use PrimitiveLogicalType::*;
45
match logical_type {
46
String | Enum | Json | Bson => SortOrder::Unsigned,
47
Integer(t) => match t {
48
IntegerType::Int8 | IntegerType::Int16 | IntegerType::Int32 | IntegerType::Int64 => {
49
SortOrder::Signed
50
},
51
_ => SortOrder::Unsigned,
52
},
53
Decimal(_, _) => SortOrder::Signed,
54
Date => SortOrder::Signed,
55
Time { .. } => SortOrder::Signed,
56
Timestamp { .. } => SortOrder::Signed,
57
Unknown => SortOrder::Undefined,
58
Uuid => SortOrder::Unsigned,
59
Float16 => SortOrder::Unsigned,
60
}
61
}
62
63
fn get_converted_sort_order(converted_type: &PrimitiveConvertedType) -> SortOrder {
64
use PrimitiveConvertedType::*;
65
match converted_type {
66
// Unsigned byte-wise comparison.
67
Utf8 | Json | Bson | Enum => SortOrder::Unsigned,
68
Int8 | Int16 | Int32 | Int64 => SortOrder::Signed,
69
Uint8 | Uint16 | Uint32 | Uint64 => SortOrder::Unsigned,
70
// Signed comparison of the represented value.
71
Decimal(_, _) => SortOrder::Signed,
72
Date => SortOrder::Signed,
73
TimeMillis | TimeMicros | TimestampMillis | TimestampMicros => SortOrder::Signed,
74
Interval => SortOrder::Undefined,
75
}
76
}
77
78
fn get_physical_sort_order(physical_type: &PhysicalType) -> SortOrder {
79
use PhysicalType::*;
80
match physical_type {
81
// Order: false, true
82
Boolean => SortOrder::Unsigned,
83
Int32 | Int64 => SortOrder::Signed,
84
Int96 => SortOrder::Undefined,
85
// Notes to remember when comparing float/double values:
86
// If the min is a NaN, it should be ignored.
87
// If the max is a NaN, it should be ignored.
88
// If the min is +0, the row group may contain -0 values as well.
89
// If the max is -0, the row group may contain +0 values as well.
90
// When looking for NaN values, min and max should be ignored.
91
Float | Double => SortOrder::Signed,
92
// Unsigned byte-wise comparison
93
ByteArray | FixedLenByteArray(_) => SortOrder::Unsigned,
94
}
95
}
96
97