Path: blob/main/crates/polars-parquet/src/parquet/metadata/sort.rs
6940 views
#[cfg(feature = "serde")]1use serde::{Deserialize, Serialize};23use crate::parquet::schema::types::{4IntegerType, PhysicalType, PrimitiveConvertedType, PrimitiveLogicalType,5};67/// Sort order for page and column statistics.8///9/// Types are associated with sort orders and column stats are aggregated using a sort10/// order, and a sort order should be considered when comparing values with statistics11/// min/max.12///13/// See reference in14/// <https://github.com/apache/parquet-cpp/blob/master/src/parquet/types.h>15#[derive(Debug, Clone, Copy, PartialEq, Eq)]16#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]17pub enum SortOrder {18/// Signed (either value or legacy byte-wise) comparison.19Signed,20/// Unsigned (depending on physical type either value or byte-wise) comparison.21Unsigned,22/// Comparison is undefined.23Undefined,24}2526/// Returns sort order for a physical/logical type.27pub fn get_sort_order(28logical_type: &Option<PrimitiveLogicalType>,29converted_type: &Option<PrimitiveConvertedType>,30physical_type: &PhysicalType,31) -> SortOrder {32if let Some(logical_type) = logical_type {33return get_logical_sort_order(logical_type);34};35if let Some(converted_type) = converted_type {36return get_converted_sort_order(converted_type);37};38get_physical_sort_order(physical_type)39}4041fn get_logical_sort_order(logical_type: &PrimitiveLogicalType) -> SortOrder {42// TODO: Should this take converted and logical type, for compatibility?43use PrimitiveLogicalType::*;44match logical_type {45String | Enum | Json | Bson => SortOrder::Unsigned,46Integer(t) => match t {47IntegerType::Int8 | IntegerType::Int16 | IntegerType::Int32 | IntegerType::Int64 => {48SortOrder::Signed49},50_ => SortOrder::Unsigned,51},52Decimal(_, _) => SortOrder::Signed,53Date => SortOrder::Signed,54Time { .. } => SortOrder::Signed,55Timestamp { .. } => SortOrder::Signed,56Unknown => SortOrder::Undefined,57Uuid => SortOrder::Unsigned,58Float16 => SortOrder::Unsigned,59}60}6162fn get_converted_sort_order(converted_type: &PrimitiveConvertedType) -> SortOrder {63use PrimitiveConvertedType::*;64match converted_type {65// Unsigned byte-wise comparison.66Utf8 | Json | Bson | Enum => SortOrder::Unsigned,67Int8 | Int16 | Int32 | Int64 => SortOrder::Signed,68Uint8 | Uint16 | Uint32 | Uint64 => SortOrder::Unsigned,69// Signed comparison of the represented value.70Decimal(_, _) => SortOrder::Signed,71Date => SortOrder::Signed,72TimeMillis | TimeMicros | TimestampMillis | TimestampMicros => SortOrder::Signed,73Interval => SortOrder::Undefined,74}75}7677fn get_physical_sort_order(physical_type: &PhysicalType) -> SortOrder {78use PhysicalType::*;79match physical_type {80// Order: false, true81Boolean => SortOrder::Unsigned,82Int32 | Int64 => SortOrder::Signed,83Int96 => SortOrder::Undefined,84// Notes to remember when comparing float/double values:85// If the min is a NaN, it should be ignored.86// If the max is a NaN, it should be ignored.87// If the min is +0, the row group may contain -0 values as well.88// If the max is -0, the row group may contain +0 values as well.89// When looking for NaN values, min and max should be ignored.90Float | Double => SortOrder::Signed,91// Unsigned byte-wise comparison92ByteArray | FixedLenByteArray(_) => SortOrder::Unsigned,93}94}959697