Path: blob/main/crates/polars-compute/src/binview_index_map.rs
6939 views
use arrow::array::View;1use hashbrown::hash_table::{2Entry as TEntry, HashTable, OccupiedEntry as TOccupiedEntry, VacantEntry as TVacantEntry,3};4use polars_utils::IdxSize;56const BASE_KEY_BUFFER_CAPACITY: usize = 1024;78struct Key {9hash: u64,10view: View,11}1213/// An IndexMap where the keys are [u8] slices or `View`s which are pre-hashed.14/// Does not support deletion.15pub struct BinaryViewIndexMap<V> {16table: HashTable<IdxSize>,17tuples: Vec<(Key, V)>,18buffers: Vec<Vec<u8>>,1920// Internal random seed used to keep hash iteration order decorrelated.21// We simply store a random odd number and multiply the canonical hash by it.22seed: u64,23}2425impl<V> Default for BinaryViewIndexMap<V> {26fn default() -> Self {27Self {28table: HashTable::new(),29tuples: Vec::new(),30buffers: vec![],31seed: rand::random::<u64>() | 1,32}33}34}3536impl<V> BinaryViewIndexMap<V> {37pub fn new() -> Self {38Self::default()39}4041pub fn reserve(&mut self, additional: usize) {42self.table.reserve(additional, |i| unsafe {43let tuple = self.tuples.get_unchecked(*i as usize);44tuple.0.hash.wrapping_mul(self.seed)45});46self.tuples.reserve(additional);47}4849pub fn len(&self) -> IdxSize {50self.tuples.len() as IdxSize51}5253pub fn is_empty(&self) -> bool {54self.tuples.is_empty()55}5657pub fn buffers(&self) -> &[Vec<u8>] {58&self.buffers59}6061#[inline]62pub fn get(&self, hash: u64, key: &[u8]) -> Option<&V> {63unsafe {64if key.len() <= View::MAX_INLINE_SIZE as usize {65self.get_inline_view(hash, &View::new_inline_unchecked(key))66} else {67self.get_long_key(hash, key)68}69}70}7172/// # Safety73/// The view must be valid in combination with the given buffers.74#[inline]75pub unsafe fn get_view<B: AsRef<[u8]>>(76&self,77hash: u64,78key: &View,79buffers: &[B],80) -> Option<&V> {81unsafe {82if key.length <= View::MAX_INLINE_SIZE {83self.get_inline_view(hash, key)84} else {85self.get_long_key(hash, key.get_external_slice_unchecked(buffers))86}87}88}8990/// # Safety91/// The view must be inlined.92pub unsafe fn get_inline_view(&self, hash: u64, key: &View) -> Option<&V> {93unsafe {94debug_assert!(key.length <= View::MAX_INLINE_SIZE);95let idx = self.table.find(hash.wrapping_mul(self.seed), |i| {96let t = self.tuples.get_unchecked(*i as usize);97*key == t.0.view98})?;99Some(&self.tuples.get_unchecked(*idx as usize).1)100}101}102103/// # Safety104/// key.len() > View::MAX_INLINE_SIZE105pub unsafe fn get_long_key(&self, hash: u64, key: &[u8]) -> Option<&V> {106unsafe {107debug_assert!(key.len() > View::MAX_INLINE_SIZE as usize);108let idx = self.table.find(hash.wrapping_mul(self.seed), |i| {109let t = self.tuples.get_unchecked(*i as usize);110hash == t.0.hash111&& key.len() == t.0.view.length as usize112&& key == t.0.view.get_external_slice_unchecked(&self.buffers)113})?;114Some(&self.tuples.get_unchecked(*idx as usize).1)115}116}117118#[inline]119pub fn entry<'k>(&mut self, hash: u64, key: &'k [u8]) -> Entry<'_, 'k, V> {120unsafe {121if key.len() <= View::MAX_INLINE_SIZE as usize {122self.entry_inline_view(hash, View::new_inline_unchecked(key))123} else {124self.entry_long_key(hash, key)125}126}127}128129/// # Safety130/// The view must be valid in combination with the given buffers.131#[inline]132pub unsafe fn entry_view<'k, B: AsRef<[u8]>>(133&mut self,134hash: u64,135key: View,136buffers: &'k [B],137) -> Entry<'_, 'k, V> {138unsafe {139if key.length <= View::MAX_INLINE_SIZE {140self.entry_inline_view(hash, key)141} else {142self.entry_long_key(hash, key.get_external_slice_unchecked(buffers))143}144}145}146147/// # Safety148/// The view must be inlined.149pub unsafe fn entry_inline_view<'k>(&mut self, hash: u64, key: View) -> Entry<'_, 'k, V> {150debug_assert!(key.length <= View::MAX_INLINE_SIZE);151let entry = self.table.entry(152hash.wrapping_mul(self.seed),153|i| unsafe {154let t = self.tuples.get_unchecked(*i as usize);155key == t.0.view156},157|i| unsafe {158let t = self.tuples.get_unchecked(*i as usize);159t.0.hash.wrapping_mul(self.seed)160},161);162163match entry {164TEntry::Occupied(o) => Entry::Occupied(OccupiedEntry {165entry: o,166tuples: &mut self.tuples,167}),168TEntry::Vacant(v) => Entry::Vacant(VacantEntry {169view: key,170external: None,171hash,172entry: v,173tuples: &mut self.tuples,174buffers: &mut self.buffers,175}),176}177}178179/// # Safety180/// key.len() > View::MAX_INLINE_SIZE181pub unsafe fn entry_long_key<'k>(&mut self, hash: u64, key: &'k [u8]) -> Entry<'_, 'k, V> {182debug_assert!(key.len() > View::MAX_INLINE_SIZE as usize);183let entry = self.table.entry(184hash.wrapping_mul(self.seed),185|i| unsafe {186let t = self.tuples.get_unchecked(*i as usize);187hash == t.0.hash188&& key.len() == t.0.view.length as usize189&& key == t.0.view.get_external_slice_unchecked(&self.buffers)190},191|i| unsafe {192let t = self.tuples.get_unchecked(*i as usize);193t.0.hash.wrapping_mul(self.seed)194},195);196197match entry {198TEntry::Occupied(o) => Entry::Occupied(OccupiedEntry {199entry: o,200tuples: &mut self.tuples,201}),202TEntry::Vacant(v) => Entry::Vacant(VacantEntry {203view: View::default(),204external: Some(key),205hash,206entry: v,207tuples: &mut self.tuples,208buffers: &mut self.buffers,209}),210}211}212213/// Insert an empty entry which will never be mapped to. Returns the index of the entry.214///215/// This is useful for entries which are handled externally.216pub fn push_unmapped_empty_entry(&mut self, value: V) -> IdxSize {217let ret = self.tuples.len() as IdxSize;218let key = Key {219hash: 0,220view: View::default(),221};222self.tuples.push((key, value));223ret224}225226/// Gets the hash, key and value at the given index by insertion order.227#[inline(always)]228pub fn get_index(&self, idx: IdxSize) -> Option<(u64, &[u8], &V)> {229let t = self.tuples.get(idx as usize)?;230Some((231t.0.hash,232unsafe { t.0.view.get_slice_unchecked(&self.buffers) },233&t.1,234))235}236237/// Gets the hash, key and value at the given index by insertion order.238///239/// # Safety240/// The index must be less than len().241#[inline(always)]242pub unsafe fn get_index_unchecked(&self, idx: IdxSize) -> (u64, &[u8], &V) {243let t = unsafe { self.tuples.get_unchecked(idx as usize) };244unsafe { (t.0.hash, t.0.view.get_slice_unchecked(&self.buffers), &t.1) }245}246247/// Gets the hash, view and value at the given index by insertion order.248///249/// # Safety250/// The index must be less than len().251#[inline(always)]252pub unsafe fn get_index_view_unchecked(&self, idx: IdxSize) -> (u64, View, &V) {253let t = unsafe { self.tuples.get_unchecked(idx as usize) };254(t.0.hash, t.0.view, &t.1)255}256257/// Iterates over the (hash, key) pairs in insertion order.258pub fn iter_hash_keys(&self) -> impl Iterator<Item = (u64, &[u8])> {259self.tuples260.iter()261.map(|t| unsafe { (t.0.hash, t.0.view.get_slice_unchecked(&self.buffers)) })262}263264/// Iterates over the (hash, key_view) pairs in insertion order.265pub fn iter_hash_views(&self) -> impl Iterator<Item = (u64, View)> {266self.tuples.iter().map(|t| (t.0.hash, t.0.view))267}268269/// Iterates over the values in insertion order.270pub fn iter_values(&self) -> impl Iterator<Item = &V> {271self.tuples.iter().map(|t| &t.1)272}273}274275pub enum Entry<'a, 'k, V> {276Occupied(OccupiedEntry<'a, V>),277Vacant(VacantEntry<'a, 'k, V>),278}279280pub struct OccupiedEntry<'a, V> {281entry: TOccupiedEntry<'a, IdxSize>,282tuples: &'a mut Vec<(Key, V)>,283}284285impl<'a, V> OccupiedEntry<'a, V> {286#[inline]287pub fn index(&self) -> IdxSize {288*self.entry.get()289}290291#[inline]292pub fn into_mut(self) -> &'a mut V {293let idx = self.index();294unsafe { &mut self.tuples.get_unchecked_mut(idx as usize).1 }295}296}297298pub struct VacantEntry<'a, 'k, V> {299hash: u64,300view: View, // Empty when key is not inlined.301external: Option<&'k [u8]>, // Only set when not inlined.302entry: TVacantEntry<'a, IdxSize>,303tuples: &'a mut Vec<(Key, V)>,304buffers: &'a mut Vec<Vec<u8>>,305}306307#[allow(clippy::needless_lifetimes)]308impl<'a, 'k, V> VacantEntry<'a, 'k, V> {309#[inline]310pub fn index(&self) -> IdxSize {311self.tuples.len() as IdxSize312}313314#[inline]315pub fn insert(self, value: V) -> &'a mut V {316unsafe {317let tuple_idx: IdxSize = self.tuples.len().try_into().unwrap();318let view = if let Some(key) = self.external {319if self320.buffers321.last()322.is_none_or(|buf| buf.len() + key.len() > buf.capacity())323{324let ideal_next_cap = BASE_KEY_BUFFER_CAPACITY325.checked_shl(self.buffers.len() as u32)326.unwrap();327let next_capacity = std::cmp::max(ideal_next_cap, key.len());328self.buffers.push(Vec::with_capacity(next_capacity));329}330let buffer_idx = (self.buffers.len() - 1) as u32;331let active_buf = self.buffers.last_mut().unwrap_unchecked();332let offset = active_buf.len() as u32;333active_buf.extend_from_slice(key);334View::new_from_bytes(key, buffer_idx, offset)335} else {336self.view337};338let tuple_key = Key {339hash: self.hash,340view,341};342self.tuples.push((tuple_key, value));343self.entry.insert(tuple_idx);344&mut self.tuples.last_mut().unwrap_unchecked().1345}346}347}348349350