Path: blob/main/crates/polars-parquet/src/parquet/statistics/mod.rs
6940 views
mod binary;1mod boolean;2mod fixed_len_binary;3mod primitive;45pub use binary::BinaryStatistics;6pub use boolean::BooleanStatistics;7pub use fixed_len_binary::FixedLenStatistics;8pub use primitive::PrimitiveStatistics;910use crate::parquet::error::ParquetResult;11use crate::parquet::schema::types::{PhysicalType, PrimitiveType};12pub use crate::parquet::thrift_format::Statistics as ParquetStatistics;1314#[derive(Debug, PartialEq)]15pub enum Statistics {16Binary(BinaryStatistics),17Boolean(BooleanStatistics),18FixedLen(FixedLenStatistics),19Int32(PrimitiveStatistics<i32>),20Int64(PrimitiveStatistics<i64>),21Int96(PrimitiveStatistics<[u32; 3]>),22Float(PrimitiveStatistics<f32>),23Double(PrimitiveStatistics<f64>),24}2526impl Statistics {27#[inline]28pub const fn physical_type(&self) -> &PhysicalType {29use Statistics as S;3031match self {32S::Binary(_) => &PhysicalType::ByteArray,33S::Boolean(_) => &PhysicalType::Boolean,34S::FixedLen(s) => &s.primitive_type.physical_type,35S::Int32(_) => &PhysicalType::Int32,36S::Int64(_) => &PhysicalType::Int64,37S::Int96(_) => &PhysicalType::Int96,38S::Float(_) => &PhysicalType::Float,39S::Double(_) => &PhysicalType::Double,40}41}4243pub fn clear_min(&mut self) {44use Statistics as S;45match self {46S::Binary(s) => _ = s.min_value.take(),47S::Boolean(s) => _ = s.min_value.take(),48S::FixedLen(s) => _ = s.min_value.take(),49S::Int32(s) => _ = s.min_value.take(),50S::Int64(s) => _ = s.min_value.take(),51S::Int96(s) => _ = s.min_value.take(),52S::Float(s) => _ = s.min_value.take(),53S::Double(s) => _ = s.min_value.take(),54};55}5657pub fn clear_max(&mut self) {58use Statistics as S;59match self {60S::Binary(s) => _ = s.max_value.take(),61S::Boolean(s) => _ = s.max_value.take(),62S::FixedLen(s) => _ = s.max_value.take(),63S::Int32(s) => _ = s.max_value.take(),64S::Int64(s) => _ = s.max_value.take(),65S::Int96(s) => _ = s.max_value.take(),66S::Float(s) => _ = s.max_value.take(),67S::Double(s) => _ = s.max_value.take(),68};69}7071/// Deserializes a raw parquet statistics into [`Statistics`].72/// # Error73/// This function errors if it is not possible to read the statistics to the74/// corresponding `physical_type`.75#[inline]76pub fn deserialize(77statistics: &ParquetStatistics,78primitive_type: PrimitiveType,79) -> ParquetResult<Self> {80use {PhysicalType as T, PrimitiveStatistics as PrimStat};81let mut stats: Self = match primitive_type.physical_type {82T::ByteArray => BinaryStatistics::deserialize(statistics, primitive_type)?.into(),83T::Boolean => BooleanStatistics::deserialize(statistics)?.into(),84T::Int32 => PrimStat::<i32>::deserialize(statistics, primitive_type)?.into(),85T::Int64 => PrimStat::<i64>::deserialize(statistics, primitive_type)?.into(),86T::Int96 => PrimStat::<[u32; 3]>::deserialize(statistics, primitive_type)?.into(),87T::Float => PrimStat::<f32>::deserialize(statistics, primitive_type)?.into(),88T::Double => PrimStat::<f64>::deserialize(statistics, primitive_type)?.into(),89T::FixedLenByteArray(size) => {90FixedLenStatistics::deserialize(statistics, size, primitive_type)?.into()91},92};9394if statistics.is_min_value_exact.is_some_and(|v| !v) {95stats.clear_min();96}97if statistics.is_max_value_exact.is_some_and(|v| !v) {98stats.clear_max();99}100101// Parquet Format:102// > - If the min is a NaN, it should be ignored.103// > - If the max is a NaN, it should be ignored.104match &mut stats {105Statistics::Float(stats) => {106stats.min_value.take_if(|v| v.is_nan());107stats.max_value.take_if(|v| v.is_nan());108},109Statistics::Double(stats) => {110stats.min_value.take_if(|v| v.is_nan());111stats.max_value.take_if(|v| v.is_nan());112},113_ => {},114}115116Ok(stats)117}118}119120macro_rules! statistics_from_as {121($($variant:ident($struct:ty) => ($as_ident:ident, $into_ident:ident, $expect_ident:ident, $owned_expect_ident:ident),)+) => {122$(123impl From<$struct> for Statistics {124#[inline]125fn from(stats: $struct) -> Self {126Self::$variant(stats)127}128}129)+130131impl Statistics {132#[inline]133pub const fn null_count(&self) -> Option<i64> {134match self {135$(Self::$variant(s) => s.null_count,)+136}137}138139/// Serializes [`Statistics`] into a raw parquet statistics.140#[inline]141pub fn serialize(&self) -> ParquetStatistics {142match self {143$(Self::$variant(s) => s.serialize(),)+144}145}146147const fn variant_str(&self) -> &'static str {148match self {149$(Self::$variant(_) => stringify!($struct),)+150}151}152153$(154#[doc = concat!("Try to take [`Statistics`] as [`", stringify!($struct), "`]")]155#[inline]156pub fn $as_ident(&self) -> Option<&$struct> {157match self {158Self::$variant(s) => Some(s),159_ => None,160}161}162163#[doc = concat!("Try to take [`Statistics`] as [`", stringify!($struct), "`]")]164#[inline]165pub fn $into_ident(self) -> Option<$struct> {166match self {167Self::$variant(s) => Some(s),168_ => None,169}170}171172#[doc = concat!("Interpret [`Statistics`] to be [`", stringify!($struct), "`]")]173///174/// Panics if it is not the correct variant.175#[track_caller]176#[inline]177pub fn $expect_ident(&self) -> &$struct {178let Self::$variant(s) = self else {179panic!("Expected Statistics to be {}, found {} instead", stringify!($struct), self.variant_str());180};181182s183}184185#[doc = concat!("Interpret [`Statistics`] to be [`", stringify!($struct), "`]")]186///187/// Panics if it is not the correct variant.188#[track_caller]189#[inline]190pub fn $owned_expect_ident(self) -> $struct {191let Self::$variant(s) = self else {192panic!("Expected Statistics to be {}, found {} instead", stringify!($struct), self.variant_str());193};194195s196}197)+198199}200};201}202203statistics_from_as! {204Binary (BinaryStatistics ) => (as_binary, into_binary, expect_as_binary, expect_binary ),205Boolean (BooleanStatistics ) => (as_boolean, into_boolean, expect_as_boolean, expect_boolean ),206FixedLen (FixedLenStatistics ) => (as_fixedlen, into_fixedlen, expect_as_fixedlen, expect_fixedlen),207Int32 (PrimitiveStatistics<i32> ) => (as_int32, into_int32, expect_as_int32, expect_int32 ),208Int64 (PrimitiveStatistics<i64> ) => (as_int64, into_int64, expect_as_int64, expect_int64 ),209Int96 (PrimitiveStatistics<[u32; 3]>) => (as_int96, into_int96, expect_as_int96, expect_int96 ),210Float (PrimitiveStatistics<f32> ) => (as_float, into_float, expect_as_float, expect_float ),211Double (PrimitiveStatistics<f64> ) => (as_double, into_double, expect_as_double, expect_double ),212}213214215