Path: blob/main/crates/polars-arrow/src/array/dictionary/value_map.rs
6939 views
use std::borrow::Borrow;1use std::fmt::{self, Debug};2use std::hash::{BuildHasher, Hash};34use hashbrown::HashTable;5use hashbrown::hash_table::Entry;6use polars_error::{PolarsResult, polars_bail, polars_err};7use polars_utils::aliases::PlRandomState;89use super::DictionaryKey;10use crate::array::indexable::{AsIndexed, Indexable};11use crate::array::{Array, MutableArray};12use crate::datatypes::ArrowDataType;1314#[derive(Clone)]15pub struct ValueMap<K: DictionaryKey, M: MutableArray> {16pub values: M,17pub map: HashTable<(u64, K)>,18random_state: PlRandomState,19}2021impl<K: DictionaryKey, M: MutableArray> ValueMap<K, M> {22pub fn try_empty(values: M) -> PolarsResult<Self> {23if !values.is_empty() {24polars_bail!(ComputeError: "initializing value map with non-empty values array")25}26Ok(Self {27values,28map: HashTable::default(),29random_state: PlRandomState::default(),30})31}3233pub fn from_values(values: M) -> PolarsResult<Self>34where35M: Indexable,36M::Type: Eq + Hash,37{38let mut map: HashTable<(u64, K)> = HashTable::with_capacity(values.len());39let random_state = PlRandomState::default();40for index in 0..values.len() {41let key = K::try_from(index).map_err(|_| polars_err!(ComputeError: "overflow"))?;42// SAFETY: we only iterate within bounds43let value = unsafe { values.value_unchecked_at(index) };44let hash = random_state.hash_one(value.borrow());4546let entry = map.entry(47hash,48|(_h, key)| {49// SAFETY: invariant of the struct, it's always in bounds.50let stored_value = unsafe { values.value_unchecked_at(key.as_usize()) };51stored_value.borrow() == value.borrow()52},53|(h, _key)| *h,54);55match entry {56Entry::Occupied(_) => {57polars_bail!(InvalidOperation: "duplicate value in dictionary values array")58},59Entry::Vacant(entry) => {60entry.insert((hash, key));61},62}63}64Ok(Self {65values,66map,67random_state,68})69}7071pub fn dtype(&self) -> &ArrowDataType {72self.values.dtype()73}7475pub fn into_values(self) -> M {76self.values77}7879pub fn take_into(&mut self) -> Box<dyn Array> {80let arr = self.values.as_box();81self.map.clear();82arr83}8485#[inline]86pub fn values(&self) -> &M {87&self.values88}8990/// Try to insert a value and return its index (it may or may not get inserted).91pub fn try_push_valid<V>(92&mut self,93value: V,94mut push: impl FnMut(&mut M, V) -> PolarsResult<()>,95) -> PolarsResult<K>96where97M: Indexable,98V: AsIndexed<M>,99M::Type: Eq + Hash,100{101let hash = self.random_state.hash_one(value.as_indexed());102let entry = self.map.entry(103hash,104|(_h, key)| {105// SAFETY: invariant of the struct, it's always in bounds.106let stored_value = unsafe { self.values.value_unchecked_at(key.as_usize()) };107stored_value.borrow() == value.as_indexed()108},109|(h, _key)| *h,110);111let out = match entry {112Entry::Occupied(entry) => entry.get().1,113Entry::Vacant(entry) => {114let index = self.values.len();115let key = K::try_from(index).map_err(|_| polars_err!(ComputeError: "overflow"))?;116entry.insert((hash, key));117push(&mut self.values, value)?;118debug_assert_eq!(self.values.len(), index + 1);119key120},121};122Ok(out)123}124125pub fn shrink_to_fit(&mut self) {126self.values.shrink_to_fit();127}128}129130impl<K: DictionaryKey, M: MutableArray> Debug for ValueMap<K, M> {131fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {132self.values.fmt(f)133}134}135136137