Path: blob/main/crates/polars-utils/src/idx_map/bytes_idx_map.rs
6940 views
use hashbrown::hash_table::{1Entry as TEntry, HashTable, OccupiedEntry as TOccupiedEntry, VacantEntry as TVacantEntry,2};34use crate::IdxSize;56const BASE_KEY_DATA_CAPACITY: usize = 1024;78struct Key {9key_hash: u64,10key_buffer: u32,11key_offset: usize,12key_length: u32,13}1415impl Key {16unsafe fn get<'k>(&self, key_data: &'k [Vec<u8>]) -> &'k [u8] {17let buf = unsafe { key_data.get_unchecked(self.key_buffer as usize) };18unsafe { buf.get_unchecked(self.key_offset..self.key_offset + self.key_length as usize) }19}20}2122/// An IndexMap where the keys are always [u8] slices which are pre-hashed.23pub struct BytesIndexMap<V> {24table: HashTable<IdxSize>,25tuples: Vec<(Key, V)>,26key_data: Vec<Vec<u8>>,2728// Internal random seed used to keep hash iteration order decorrelated.29// We simply store a random odd number and multiply the canonical hash by it.30seed: u64,31}3233impl<V> Default for BytesIndexMap<V> {34fn default() -> Self {35Self {36table: HashTable::new(),37tuples: Vec::new(),38key_data: vec![Vec::with_capacity(BASE_KEY_DATA_CAPACITY)],39seed: rand::random::<u64>() | 1,40}41}42}4344impl<V> BytesIndexMap<V> {45pub fn new() -> Self {46Self::default()47}4849pub fn reserve(&mut self, additional: usize) {50self.table.reserve(additional, |i| unsafe {51let tuple = self.tuples.get_unchecked(*i as usize);52tuple.0.key_hash.wrapping_mul(self.seed)53});54self.tuples.reserve(additional);55}5657pub fn len(&self) -> IdxSize {58self.tuples.len() as IdxSize59}6061pub fn is_empty(&self) -> bool {62self.tuples.is_empty()63}6465pub fn get(&self, hash: u64, key: &[u8]) -> Option<&V> {66let idx = self.table.find(hash.wrapping_mul(self.seed), |i| unsafe {67let t = self.tuples.get_unchecked(*i as usize);68hash == t.0.key_hash && key == t.0.get(&self.key_data)69})?;70unsafe { Some(&self.tuples.get_unchecked(*idx as usize).1) }71}7273pub fn contains_key(&self, hash: u64, key: &[u8]) -> bool {74self.table75.find(hash.wrapping_mul(self.seed), |i| unsafe {76let t = self.tuples.get_unchecked(*i as usize);77hash == t.0.key_hash && key == t.0.get(&self.key_data)78})79.is_some()80}8182pub fn entry<'k>(&mut self, hash: u64, key: &'k [u8]) -> Entry<'_, 'k, V> {83let entry = self.table.entry(84hash.wrapping_mul(self.seed),85|i| unsafe {86let t = self.tuples.get_unchecked(*i as usize);87hash == t.0.key_hash && key == t.0.get(&self.key_data)88},89|i| unsafe {90let t = self.tuples.get_unchecked(*i as usize);91t.0.key_hash.wrapping_mul(self.seed)92},93);9495match entry {96TEntry::Occupied(o) => Entry::Occupied(OccupiedEntry {97entry: o,98tuples: &mut self.tuples,99}),100TEntry::Vacant(v) => Entry::Vacant(VacantEntry {101key,102hash,103entry: v,104tuples: &mut self.tuples,105key_data: &mut self.key_data,106}),107}108}109110/// Gets the hash, key and value at the given index by insertion order.111#[inline(always)]112pub fn get_index(&self, idx: IdxSize) -> Option<(u64, &[u8], &V)> {113let t = self.tuples.get(idx as usize)?;114Some((t.0.key_hash, unsafe { t.0.get(&self.key_data) }, &t.1))115}116117/// Gets the hash, key and value at the given index by insertion order.118///119/// # Safety120/// The index must be less than len().121#[inline(always)]122pub unsafe fn get_index_unchecked(&self, idx: IdxSize) -> (u64, &[u8], &V) {123let t = unsafe { self.tuples.get_unchecked(idx as usize) };124unsafe { (t.0.key_hash, t.0.get(&self.key_data), &t.1) }125}126127/// Iterates over the (hash, key) pairs in insertion order.128pub fn iter_hash_keys(&self) -> impl Iterator<Item = (u64, &[u8])> {129self.tuples130.iter()131.map(|t| unsafe { (t.0.key_hash, t.0.get(&self.key_data)) })132}133134/// Iterates over the values in insertion order.135pub fn iter_values(&self) -> impl Iterator<Item = &V> {136self.tuples.iter().map(|t| &t.1)137}138}139140pub enum Entry<'a, 'k, V> {141Occupied(OccupiedEntry<'a, V>),142Vacant(VacantEntry<'a, 'k, V>),143}144145pub struct OccupiedEntry<'a, V> {146entry: TOccupiedEntry<'a, IdxSize>,147tuples: &'a mut Vec<(Key, V)>,148}149150impl<'a, V> OccupiedEntry<'a, V> {151pub fn index(&self) -> IdxSize {152*self.entry.get()153}154155pub fn into_mut(self) -> &'a mut V {156let idx = self.index();157unsafe { &mut self.tuples.get_unchecked_mut(idx as usize).1 }158}159}160161pub struct VacantEntry<'a, 'k, V> {162hash: u64,163key: &'k [u8],164entry: TVacantEntry<'a, IdxSize>,165tuples: &'a mut Vec<(Key, V)>,166key_data: &'a mut Vec<Vec<u8>>,167}168169#[allow(clippy::needless_lifetimes)]170impl<'a, 'k, V> VacantEntry<'a, 'k, V> {171pub fn index(&self) -> IdxSize {172self.tuples.len() as IdxSize173}174175pub fn insert(self, value: V) -> &'a mut V {176unsafe {177let tuple_idx: IdxSize = self.tuples.len().try_into().unwrap();178179let mut num_buffers = self.key_data.len() as u32;180let mut active_buf = self.key_data.last_mut().unwrap_unchecked();181let key_len = self.key.len();182if active_buf.len() + key_len > active_buf.capacity() {183let ideal_next_cap = BASE_KEY_DATA_CAPACITY.checked_shl(num_buffers).unwrap();184let next_capacity = std::cmp::max(ideal_next_cap, key_len);185self.key_data.push(Vec::with_capacity(next_capacity));186active_buf = self.key_data.last_mut().unwrap_unchecked();187num_buffers += 1;188}189190let tuple_key = Key {191key_hash: self.hash,192key_buffer: num_buffers - 1,193key_offset: active_buf.len(),194key_length: self.key.len().try_into().unwrap(),195};196self.tuples.push((tuple_key, value));197active_buf.extend_from_slice(self.key);198self.entry.insert(tuple_idx);199&mut self.tuples.last_mut().unwrap_unchecked().1200}201}202}203204205