Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/parquet/statistics/mod.rs
6940 views
1
mod binary;
2
mod boolean;
3
mod fixed_len_binary;
4
mod primitive;
5
6
pub use binary::BinaryStatistics;
7
pub use boolean::BooleanStatistics;
8
pub use fixed_len_binary::FixedLenStatistics;
9
pub use primitive::PrimitiveStatistics;
10
11
use crate::parquet::error::ParquetResult;
12
use crate::parquet::schema::types::{PhysicalType, PrimitiveType};
13
pub use crate::parquet::thrift_format::Statistics as ParquetStatistics;
14
15
#[derive(Debug, PartialEq)]
16
pub enum Statistics {
17
Binary(BinaryStatistics),
18
Boolean(BooleanStatistics),
19
FixedLen(FixedLenStatistics),
20
Int32(PrimitiveStatistics<i32>),
21
Int64(PrimitiveStatistics<i64>),
22
Int96(PrimitiveStatistics<[u32; 3]>),
23
Float(PrimitiveStatistics<f32>),
24
Double(PrimitiveStatistics<f64>),
25
}
26
27
impl Statistics {
28
#[inline]
29
pub const fn physical_type(&self) -> &PhysicalType {
30
use Statistics as S;
31
32
match self {
33
S::Binary(_) => &PhysicalType::ByteArray,
34
S::Boolean(_) => &PhysicalType::Boolean,
35
S::FixedLen(s) => &s.primitive_type.physical_type,
36
S::Int32(_) => &PhysicalType::Int32,
37
S::Int64(_) => &PhysicalType::Int64,
38
S::Int96(_) => &PhysicalType::Int96,
39
S::Float(_) => &PhysicalType::Float,
40
S::Double(_) => &PhysicalType::Double,
41
}
42
}
43
44
pub fn clear_min(&mut self) {
45
use Statistics as S;
46
match self {
47
S::Binary(s) => _ = s.min_value.take(),
48
S::Boolean(s) => _ = s.min_value.take(),
49
S::FixedLen(s) => _ = s.min_value.take(),
50
S::Int32(s) => _ = s.min_value.take(),
51
S::Int64(s) => _ = s.min_value.take(),
52
S::Int96(s) => _ = s.min_value.take(),
53
S::Float(s) => _ = s.min_value.take(),
54
S::Double(s) => _ = s.min_value.take(),
55
};
56
}
57
58
pub fn clear_max(&mut self) {
59
use Statistics as S;
60
match self {
61
S::Binary(s) => _ = s.max_value.take(),
62
S::Boolean(s) => _ = s.max_value.take(),
63
S::FixedLen(s) => _ = s.max_value.take(),
64
S::Int32(s) => _ = s.max_value.take(),
65
S::Int64(s) => _ = s.max_value.take(),
66
S::Int96(s) => _ = s.max_value.take(),
67
S::Float(s) => _ = s.max_value.take(),
68
S::Double(s) => _ = s.max_value.take(),
69
};
70
}
71
72
/// Deserializes a raw parquet statistics into [`Statistics`].
73
/// # Error
74
/// This function errors if it is not possible to read the statistics to the
75
/// corresponding `physical_type`.
76
#[inline]
77
pub fn deserialize(
78
statistics: &ParquetStatistics,
79
primitive_type: PrimitiveType,
80
) -> ParquetResult<Self> {
81
use {PhysicalType as T, PrimitiveStatistics as PrimStat};
82
let mut stats: Self = match primitive_type.physical_type {
83
T::ByteArray => BinaryStatistics::deserialize(statistics, primitive_type)?.into(),
84
T::Boolean => BooleanStatistics::deserialize(statistics)?.into(),
85
T::Int32 => PrimStat::<i32>::deserialize(statistics, primitive_type)?.into(),
86
T::Int64 => PrimStat::<i64>::deserialize(statistics, primitive_type)?.into(),
87
T::Int96 => PrimStat::<[u32; 3]>::deserialize(statistics, primitive_type)?.into(),
88
T::Float => PrimStat::<f32>::deserialize(statistics, primitive_type)?.into(),
89
T::Double => PrimStat::<f64>::deserialize(statistics, primitive_type)?.into(),
90
T::FixedLenByteArray(size) => {
91
FixedLenStatistics::deserialize(statistics, size, primitive_type)?.into()
92
},
93
};
94
95
if statistics.is_min_value_exact.is_some_and(|v| !v) {
96
stats.clear_min();
97
}
98
if statistics.is_max_value_exact.is_some_and(|v| !v) {
99
stats.clear_max();
100
}
101
102
// Parquet Format:
103
// > - If the min is a NaN, it should be ignored.
104
// > - If the max is a NaN, it should be ignored.
105
match &mut stats {
106
Statistics::Float(stats) => {
107
stats.min_value.take_if(|v| v.is_nan());
108
stats.max_value.take_if(|v| v.is_nan());
109
},
110
Statistics::Double(stats) => {
111
stats.min_value.take_if(|v| v.is_nan());
112
stats.max_value.take_if(|v| v.is_nan());
113
},
114
_ => {},
115
}
116
117
Ok(stats)
118
}
119
}
120
121
// Generates, for every `Variant(Struct) => (as, into, expect_as, expect)` row:
// a `From<Struct>` impl for `Statistics`, the shared `null_count`, `serialize`
// and `variant_str` methods, and the four per-variant accessors named in the row.
macro_rules! statistics_from_as {
    ($($var:ident($inner:ty) => ($as_fn:ident, $into_fn:ident, $expect_ref_fn:ident, $expect_owned_fn:ident),)+) => {
        $(
            impl From<$inner> for Statistics {
                #[inline]
                fn from(value: $inner) -> Self {
                    Self::$var(value)
                }
            }
        )+

        impl Statistics {
            /// Returns the `null_count` field of the wrapped statistics.
            #[inline]
            pub const fn null_count(&self) -> Option<i64> {
                match self {
                    $(Self::$var(inner) => inner.null_count,)+
                }
            }

            /// Converts these [`Statistics`] back into raw parquet statistics.
            #[inline]
            pub fn serialize(&self) -> ParquetStatistics {
                match self {
                    $(Self::$var(inner) => inner.serialize(),)+
                }
            }

            // Type name of the wrapped statistics struct; used by the panic
            // messages of the `expect_*` accessors.
            const fn variant_str(&self) -> &'static str {
                match self {
                    $(Self::$var(_) => stringify!($inner),)+
                }
            }

            $(
                #[doc = concat!("Borrow the [`Statistics`] as [`", stringify!($inner), "`], if it is that variant")]
                #[inline]
                pub fn $as_fn(&self) -> Option<&$inner> {
                    if let Self::$var(inner) = self {
                        Some(inner)
                    } else {
                        None
                    }
                }

                #[doc = concat!("Convert the [`Statistics`] into [`", stringify!($inner), "`], if it is that variant")]
                #[inline]
                pub fn $into_fn(self) -> Option<$inner> {
                    if let Self::$var(inner) = self {
                        Some(inner)
                    } else {
                        None
                    }
                }

                #[doc = concat!("Borrow the [`Statistics`] as [`", stringify!($inner), "`]")]
                ///
                /// # Panics
                /// Panics if it is not the correct variant.
                #[track_caller]
                #[inline]
                pub fn $expect_ref_fn(&self) -> &$inner {
                    match self {
                        Self::$var(inner) => inner,
                        other => panic!(
                            "Expected Statistics to be {}, found {} instead",
                            stringify!($inner),
                            other.variant_str()
                        ),
                    }
                }

                #[doc = concat!("Convert the [`Statistics`] into [`", stringify!($inner), "`]")]
                ///
                /// # Panics
                /// Panics if it is not the correct variant.
                #[track_caller]
                #[inline]
                pub fn $expect_owned_fn(self) -> $inner {
                    match self {
                        Self::$var(inner) => inner,
                        other => panic!(
                            "Expected Statistics to be {}, found {} instead",
                            stringify!($inner),
                            other.variant_str()
                        ),
                    }
                }
            )+
        }
    };
}
203
204
// One row per `Statistics` variant:
// `Variant(Struct) => (borrowing accessor, consuming accessor,
//                      panicking borrowing accessor, panicking consuming accessor)`.
statistics_from_as! {
    Binary(BinaryStatistics) => (as_binary, into_binary, expect_as_binary, expect_binary),
    Boolean(BooleanStatistics) => (as_boolean, into_boolean, expect_as_boolean, expect_boolean),
    FixedLen(FixedLenStatistics) => (as_fixedlen, into_fixedlen, expect_as_fixedlen, expect_fixedlen),
    Int32(PrimitiveStatistics<i32>) => (as_int32, into_int32, expect_as_int32, expect_int32),
    Int64(PrimitiveStatistics<i64>) => (as_int64, into_int64, expect_as_int64, expect_int64),
    Int96(PrimitiveStatistics<[u32; 3]>) => (as_int96, into_int96, expect_as_int96, expect_int96),
    Float(PrimitiveStatistics<f32>) => (as_float, into_float, expect_as_float, expect_float),
    Double(PrimitiveStatistics<f64>) => (as_double, into_double, expect_as_double, expect_double),
}
214
215