Path: blob/main/crates/polars-arrow/src/array/dictionary/mod.rs
8396 views
use std::hash::Hash;1use std::hint::unreachable_unchecked;23use crate::bitmap::Bitmap;4use crate::bitmap::utils::{BitmapIter, ZipValidity};5use crate::datatypes::{ArrowDataType, IntegerType};6use crate::scalar::{Scalar, new_scalar};7use crate::trusted_len::TrustedLen;8use crate::types::NativeType;910mod ffi;11pub(super) mod fmt;12mod iterator;13mod mutable;14use crate::array::specification::check_indexes_unchecked;15mod typed_iterator;16mod value_map;1718pub use iterator::*;19pub use mutable::*;20use polars_error::{PolarsResult, polars_bail};2122use super::primitive::PrimitiveArray;23use super::specification::check_indexes;24use super::{Array, Splitable, new_empty_array, new_null_array};25use crate::array::dictionary::typed_iterator::{26DictValue, DictionaryIterTyped, DictionaryValuesIterTyped,27};2829/// Trait denoting [`NativeType`]s that can be used as keys of a dictionary.30/// # Safety31///32/// Any implementation of this trait must ensure that `always_fits_usize` only33/// returns `true` if all values succeeds on `value::try_into::<usize>().unwrap()`.34pub unsafe trait DictionaryKey: NativeType + TryInto<usize> + TryFrom<usize> + Hash {35/// The corresponding [`IntegerType`] of this key36const KEY_TYPE: IntegerType;37const MAX_USIZE_VALUE: usize;3839/// Represents this key as a `usize`.40///41/// # Safety42/// The caller _must_ have checked that the value can be cast to `usize`.43#[inline]44unsafe fn as_usize(self) -> usize {45match self.try_into() {46Ok(v) => v,47Err(_) => unreachable_unchecked(),48}49}5051/// Create a key from a `usize` without checking bounds.52///53/// # Safety54/// The caller _must_ have checked that the value can be created from a `usize`.55#[inline]56unsafe fn from_usize_unchecked(x: usize) -> Self {57debug_assert!(Self::try_from(x).is_ok());58unsafe { Self::try_from(x).unwrap_unchecked() }59}6061/// If the key type always can be converted to `usize`.62fn always_fits_usize() -> bool {63false64}65}6667unsafe impl DictionaryKey for i8 {68const KEY_TYPE: IntegerType = IntegerType::Int8;69const MAX_USIZE_VALUE: usize = i8::MAX as usize;70}71unsafe impl DictionaryKey for i16 {72const KEY_TYPE: IntegerType = IntegerType::Int16;73const MAX_USIZE_VALUE: usize = i16::MAX as usize;74}75unsafe impl DictionaryKey for i32 {76const KEY_TYPE: IntegerType = IntegerType::Int32;77const MAX_USIZE_VALUE: usize = i32::MAX as usize;78}79unsafe impl DictionaryKey for i64 {80const KEY_TYPE: IntegerType = IntegerType::Int64;81const MAX_USIZE_VALUE: usize = i64::MAX as usize;82}83unsafe impl DictionaryKey for i128 {84const KEY_TYPE: IntegerType = IntegerType::Int128;85const MAX_USIZE_VALUE: usize = i128::MAX as usize;86}87unsafe impl DictionaryKey for u8 {88const KEY_TYPE: IntegerType = IntegerType::UInt8;89const MAX_USIZE_VALUE: usize = u8::MAX as usize;9091fn always_fits_usize() -> bool {92true93}94}95unsafe impl DictionaryKey for u16 {96const KEY_TYPE: IntegerType = IntegerType::UInt16;97const MAX_USIZE_VALUE: usize = u16::MAX as usize;9899fn always_fits_usize() -> bool {100true101}102}103unsafe impl DictionaryKey for u32 {104const KEY_TYPE: IntegerType = IntegerType::UInt32;105const MAX_USIZE_VALUE: usize = u32::MAX as usize;106107fn always_fits_usize() -> bool {108true109}110}111unsafe impl DictionaryKey for u64 {112const KEY_TYPE: IntegerType = IntegerType::UInt64;113const MAX_USIZE_VALUE: usize = u64::MAX as usize;114115#[cfg(target_pointer_width = "64")]116fn always_fits_usize() -> bool {117true118}119}120unsafe impl DictionaryKey for u128 {121const KEY_TYPE: IntegerType = IntegerType::UInt128;122const MAX_USIZE_VALUE: usize = u128::MAX as usize;123}124125/// An [`Array`] whose values are stored as indices. This [`Array`] is useful when the cardinality of126/// values is low compared to the length of the [`Array`].127///128/// # Safety129/// This struct guarantees that each item of [`DictionaryArray::keys`] is castable to `usize` and130/// its value is smaller than [`DictionaryArray::values`]`.len()`. In other words, you can safely131/// use `unchecked` calls to retrieve the values132#[derive(Clone)]133pub struct DictionaryArray<K: DictionaryKey> {134dtype: ArrowDataType,135keys: PrimitiveArray<K>,136values: Box<dyn Array>,137}138139fn check_dtype(140key_type: IntegerType,141dtype: &ArrowDataType,142values_dtype: &ArrowDataType,143) -> PolarsResult<()> {144if let ArrowDataType::Dictionary(key, value, _) = dtype.to_storage() {145if *key != key_type {146polars_bail!(ComputeError: "DictionaryArray must be initialized with a DataType::Dictionary whose integer is compatible to its keys")147}148if value.as_ref().to_storage() != values_dtype.to_storage() {149polars_bail!(ComputeError: "DictionaryArray must be initialized with a DataType::Dictionary whose value is equal to its values")150}151} else {152polars_bail!(ComputeError: "DictionaryArray must be initialized with logical DataType::Dictionary")153}154Ok(())155}156157impl<K: DictionaryKey> DictionaryArray<K> {158/// Returns a new [`DictionaryArray`].159/// # Implementation160/// This function is `O(N)` where `N` is the length of keys161/// # Errors162/// This function errors iff163/// * the `dtype`'s logical type is not a `DictionaryArray`164/// * the `dtype`'s keys is not compatible with `keys`165/// * the `dtype`'s values's dtype is not equal with `values.dtype()`166/// * any of the keys's values is not represented in `usize` or is `>= values.len()`167pub fn try_new(168dtype: ArrowDataType,169keys: PrimitiveArray<K>,170values: Box<dyn Array>,171) -> PolarsResult<Self> {172check_dtype(K::KEY_TYPE, &dtype, values.dtype())?;173174if keys.null_count() != keys.len() {175if K::always_fits_usize() {176// SAFETY: we just checked that conversion to `usize` always177// succeeds178unsafe { check_indexes_unchecked(keys.values(), values.len()) }?;179} else {180check_indexes(keys.values(), values.len())?;181}182}183184Ok(Self {185dtype,186keys,187values,188})189}190191/// Returns a new [`DictionaryArray`].192/// # Implementation193/// This function is `O(N)` where `N` is the length of keys194/// # Errors195/// This function errors iff196/// * any of the keys's values is not represented in `usize` or is `>= values.len()`197pub fn try_from_keys(keys: PrimitiveArray<K>, values: Box<dyn Array>) -> PolarsResult<Self> {198let dtype = Self::default_dtype(values.dtype().clone());199Self::try_new(dtype, keys, values)200}201202/// Returns a new [`DictionaryArray`].203/// # Errors204/// This function errors iff205/// * the `dtype`'s logical type is not a `DictionaryArray`206/// * the `dtype`'s keys is not compatible with `keys`207/// * the `dtype`'s values's dtype is not equal with `values.dtype()`208///209/// # Safety210/// The caller must ensure that every keys's values is represented in `usize` and is `< values.len()`211pub unsafe fn try_new_unchecked(212dtype: ArrowDataType,213keys: PrimitiveArray<K>,214values: Box<dyn Array>,215) -> PolarsResult<Self> {216check_dtype(K::KEY_TYPE, &dtype, values.dtype())?;217218Ok(Self {219dtype,220keys,221values,222})223}224225/// Returns a new empty [`DictionaryArray`].226pub fn new_empty(dtype: ArrowDataType) -> Self {227let values = Self::try_get_child(&dtype).unwrap();228let values = new_empty_array(values.clone());229Self::try_new(230dtype,231PrimitiveArray::<K>::new_empty(K::PRIMITIVE.into()),232values,233)234.unwrap()235}236237/// Returns an [`DictionaryArray`] whose all elements are null238#[inline]239pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {240let values = Self::try_get_child(&dtype).unwrap();241let values = new_null_array(values.clone(), 1);242Self::try_new(243dtype,244PrimitiveArray::<K>::new_null(K::PRIMITIVE.into(), length),245values,246)247.unwrap()248}249250/// Returns an iterator of [`Option<Box<dyn Scalar>>`].251/// # Implementation252/// This function will allocate a new [`Scalar`] per item and is usually not performant.253/// Consider calling `keys_iter` and `values`, downcasting `values`, and iterating over that.254pub fn iter(255&self,256) -> ZipValidity<Box<dyn Scalar>, DictionaryValuesIter<'_, K>, BitmapIter<'_>> {257ZipValidity::new_with_validity(DictionaryValuesIter::new(self), self.keys.validity())258}259260/// Returns an iterator of [`Box<dyn Scalar>`]261/// # Implementation262/// This function will allocate a new [`Scalar`] per item and is usually not performant.263/// Consider calling `keys_iter` and `values`, downcasting `values`, and iterating over that.264pub fn values_iter(&self) -> DictionaryValuesIter<'_, K> {265DictionaryValuesIter::new(self)266}267268/// Returns an iterator over the values [`V::IterValue`].269///270/// # Panics271///272/// Panics if the keys of this [`DictionaryArray`] has any nulls.273/// If they do [`DictionaryArray::iter_typed`] should be used.274pub fn values_iter_typed<V: DictValue>(275&self,276) -> PolarsResult<DictionaryValuesIterTyped<'_, K, V>> {277let keys = &self.keys;278assert_eq!(keys.null_count(), 0);279let values = self.values.as_ref();280let values = V::downcast_values(values)?;281Ok(DictionaryValuesIterTyped::new(keys, values))282}283284/// Returns an iterator over the optional values of [`Option<V::IterValue>`].285pub fn iter_typed<V: DictValue>(&self) -> PolarsResult<DictionaryIterTyped<'_, K, V>> {286let keys = &self.keys;287let values = self.values.as_ref();288let values = V::downcast_values(values)?;289Ok(DictionaryIterTyped::new(keys, values))290}291292/// Returns the [`ArrowDataType`] of this [`DictionaryArray`]293#[inline]294pub fn dtype(&self) -> &ArrowDataType {295&self.dtype296}297298/// Returns whether the values of this [`DictionaryArray`] are ordered299#[inline]300pub fn is_ordered(&self) -> bool {301match self.dtype.to_storage() {302ArrowDataType::Dictionary(_, _, is_ordered) => *is_ordered,303_ => unreachable!(),304}305}306307pub(crate) fn default_dtype(values_datatype: ArrowDataType) -> ArrowDataType {308ArrowDataType::Dictionary(K::KEY_TYPE, Box::new(values_datatype), false)309}310311/// Slices this [`DictionaryArray`].312/// # Panics313/// iff `offset + length > self.len()`.314pub fn slice(&mut self, offset: usize, length: usize) {315self.keys.slice(offset, length);316}317318/// Slices this [`DictionaryArray`].319///320/// # Safety321/// Safe iff `offset + length <= self.len()`.322pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {323self.keys.slice_unchecked(offset, length);324}325326impl_sliced!();327328/// Returns this [`DictionaryArray`] with a new validity.329/// # Panic330/// This function panics iff `validity.len() != self.len()`.331#[must_use]332pub fn with_validity(mut self, validity: Option<Bitmap>) -> Self {333self.set_validity(validity);334self335}336337/// Sets the validity of the keys of this [`DictionaryArray`].338/// # Panics339/// This function panics iff `validity.len() != self.len()`.340pub fn set_validity(&mut self, validity: Option<Bitmap>) {341self.keys.set_validity(validity);342}343344impl_into_array!();345346/// Returns the length of this array347#[inline]348pub fn len(&self) -> usize {349self.keys.len()350}351352/// The optional validity. Equivalent to `self.keys().validity()`.353#[inline]354pub fn validity(&self) -> Option<&Bitmap> {355self.keys.validity()356}357358/// Returns the keys of the [`DictionaryArray`]. These keys can be used to fetch values359/// from `values`.360#[inline]361pub fn keys(&self) -> &PrimitiveArray<K> {362&self.keys363}364365/// Returns an iterator of the keys' values of the [`DictionaryArray`] as `usize`366#[inline]367pub fn keys_values_iter(&self) -> impl TrustedLen<Item = usize> + Clone + '_ {368// SAFETY: invariant of the struct369self.keys.values_iter().map(|x| unsafe { x.as_usize() })370}371372/// Returns an iterator of the keys' of the [`DictionaryArray`] as `usize`373#[inline]374pub fn keys_iter(&self) -> impl TrustedLen<Item = Option<usize>> + Clone + '_ {375// SAFETY: invariant of the struct376self.keys.iter().map(|x| x.map(|x| unsafe { x.as_usize() }))377}378379/// Returns the keys' value of the [`DictionaryArray`] as `usize`380/// # Panics381/// This function panics iff `index >= self.len()`382#[inline]383pub fn key_value(&self, index: usize) -> usize {384// SAFETY: invariant of the struct385unsafe { self.keys.values()[index].as_usize() }386}387388/// Returns the values of the [`DictionaryArray`].389#[inline]390pub fn values(&self) -> &Box<dyn Array> {391&self.values392}393394/// Returns the value of the [`DictionaryArray`] at position `i`.395/// # Implementation396/// This function will allocate a new [`Scalar`] and is usually not performant.397/// Consider calling `keys` and `values`, downcasting `values`, and iterating over that.398/// # Panic399/// This function panics iff `index >= self.len()`400#[inline]401pub fn value(&self, index: usize) -> Box<dyn Scalar> {402// SAFETY: invariant of this struct403let index = unsafe { self.keys.value(index).as_usize() };404new_scalar(self.values.as_ref(), index)405}406407pub(crate) fn try_get_child(dtype: &ArrowDataType) -> PolarsResult<&ArrowDataType> {408Ok(match dtype.to_storage() {409ArrowDataType::Dictionary(_, values, _) => values.as_ref(),410_ => {411polars_bail!(ComputeError: "Dictionaries must be initialized with DataType::Dictionary")412},413})414}415416pub fn take(self) -> (ArrowDataType, PrimitiveArray<K>, Box<dyn Array>) {417(self.dtype, self.keys, self.values)418}419}420421impl<K: DictionaryKey> Array for DictionaryArray<K> {422impl_common_array!();423424fn validity(&self) -> Option<&Bitmap> {425self.keys.validity()426}427428#[inline]429fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {430Box::new(self.clone().with_validity(validity))431}432}433434impl<K: DictionaryKey> Splitable for DictionaryArray<K> {435fn check_bound(&self, offset: usize) -> bool {436offset < self.len()437}438439unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {440let (lhs_keys, rhs_keys) = unsafe { Splitable::split_at_unchecked(&self.keys, offset) };441442(443Self {444dtype: self.dtype.clone(),445keys: lhs_keys,446values: self.values.clone(),447},448Self {449dtype: self.dtype.clone(),450keys: rhs_keys,451values: self.values.clone(),452},453)454}455}456457458