// GitHub Repository: pola-rs/polars
// Path: crates/polars-utils/src/idx_map/bytes_idx_map.rs

use hashbrown::hash_table::{
    Entry as TEntry, HashTable, OccupiedEntry as TOccupiedEntry, VacantEntry as TVacantEntry,
};

use crate::IdxSize;

const BASE_KEY_DATA_CAPACITY: usize = 1024;

struct Key {
    key_hash: u64,
    key_buffer: u32,   // Index of the buffer in key_data that holds this key.
    key_offset: usize, // Byte offset of the key within that buffer.
    key_length: u32,   // Length of the key in bytes.
}

impl Key {
    /// # Safety
    /// `key_buffer` must be a valid index into `key_data`, and
    /// `key_offset..key_offset + key_length` must be in-bounds for that buffer.
    unsafe fn get<'k>(&self, key_data: &'k [Vec<u8>]) -> &'k [u8] {
        let buf = unsafe { key_data.get_unchecked(self.key_buffer as usize) };
        unsafe { buf.get_unchecked(self.key_offset..self.key_offset + self.key_length as usize) }
    }
}

/// An IndexMap where the keys are always [u8] slices which are pre-hashed.
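///
/// A hypothetical usage sketch (not from the original source; the caller
/// supplies the hash value):
/// ```ignore
/// let mut map: BytesIndexMap<u64> = BytesIndexMap::new();
/// let (hash, key) = (0x517cc1b727220a95_u64, b"key".as_slice());
/// match map.entry(hash, key) {
///     Entry::Vacant(v) => { v.insert(42); },
///     Entry::Occupied(o) => { *o.into_mut() += 1; },
/// }
/// assert_eq!(map.get(hash, key), Some(&42));
/// ```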
pub struct BytesIndexMap<V> {
    table: HashTable<IdxSize>, // Maps the multiplied hash to an index into tuples.
    tuples: Vec<(Key, V)>,     // Key/value pairs in insertion order.
    key_data: Vec<Vec<u8>>,    // Buffers holding the raw key bytes.

    // Internal random seed used to keep the hash iteration order decorrelated.
    // We simply store a random odd number and multiply the canonical hash by it;
    // an odd multiplier is invertible modulo 2^64, so distinct hashes stay distinct.
    seed: u64,
}

impl<V> Default for BytesIndexMap<V> {
    fn default() -> Self {
        Self {
            table: HashTable::new(),
            tuples: Vec::new(),
            key_data: vec![Vec::with_capacity(BASE_KEY_DATA_CAPACITY)],
            seed: rand::random::<u64>() | 1, // `| 1` ensures the seed is odd.
        }
    }
}

impl<V> BytesIndexMap<V> {
    pub fn new() -> Self {
        Self::default()
    }

    /// Reserves capacity for at least `additional` more entries.
    pub fn reserve(&mut self, additional: usize) {
        self.table.reserve(additional, |i| unsafe {
            let tuple = self.tuples.get_unchecked(*i as usize);
            tuple.0.key_hash.wrapping_mul(self.seed)
        });
        self.tuples.reserve(additional);
    }

    pub fn len(&self) -> IdxSize {
        self.tuples.len() as IdxSize
    }

    pub fn is_empty(&self) -> bool {
        self.tuples.is_empty()
    }

    /// Returns a reference to the value for the given pre-hashed key, if present.
    pub fn get(&self, hash: u64, key: &[u8]) -> Option<&V> {
        let idx = self.table.find(hash.wrapping_mul(self.seed), |i| unsafe {
            let t = self.tuples.get_unchecked(*i as usize);
            hash == t.0.key_hash && key == t.0.get(&self.key_data)
        })?;
        unsafe { Some(&self.tuples.get_unchecked(*idx as usize).1) }
    }

    /// Returns true if the map contains the given pre-hashed key.
    pub fn contains_key(&self, hash: u64, key: &[u8]) -> bool {
        self.table
            .find(hash.wrapping_mul(self.seed), |i| unsafe {
                let t = self.tuples.get_unchecked(*i as usize);
                hash == t.0.key_hash && key == t.0.get(&self.key_data)
            })
            .is_some()
    }

    /// Gets the entry for the given pre-hashed key, allowing in-place
    /// insertion or mutation.
    pub fn entry<'k>(&mut self, hash: u64, key: &'k [u8]) -> Entry<'_, 'k, V> {
        let entry = self.table.entry(
            hash.wrapping_mul(self.seed),
            // Equality check: compare both the canonical hash and the key bytes.
            |i| unsafe {
                let t = self.tuples.get_unchecked(*i as usize);
                hash == t.0.key_hash && key == t.0.get(&self.key_data)
            },
            // Hasher for existing entries, used when the table needs to grow.
            |i| unsafe {
                let t = self.tuples.get_unchecked(*i as usize);
                t.0.key_hash.wrapping_mul(self.seed)
            },
        );

        match entry {
            TEntry::Occupied(o) => Entry::Occupied(OccupiedEntry {
                entry: o,
                tuples: &mut self.tuples,
            }),
            TEntry::Vacant(v) => Entry::Vacant(VacantEntry {
                key,
                hash,
                entry: v,
                tuples: &mut self.tuples,
                key_data: &mut self.key_data,
            }),
        }
    }

    /// Gets the hash, key and value at the given index by insertion order.
    #[inline(always)]
    pub fn get_index(&self, idx: IdxSize) -> Option<(u64, &[u8], &V)> {
        let t = self.tuples.get(idx as usize)?;
        Some((t.0.key_hash, unsafe { t.0.get(&self.key_data) }, &t.1))
    }

    /// Gets the hash, key and value at the given index by insertion order.
    ///
    /// # Safety
    /// The index must be less than len().
    #[inline(always)]
    pub unsafe fn get_index_unchecked(&self, idx: IdxSize) -> (u64, &[u8], &V) {
        let t = unsafe { self.tuples.get_unchecked(idx as usize) };
        unsafe { (t.0.key_hash, t.0.get(&self.key_data), &t.1) }
    }

    /// Iterates over the (hash, key) pairs in insertion order.
    pub fn iter_hash_keys(&self) -> impl Iterator<Item = (u64, &[u8])> {
        self.tuples
            .iter()
            .map(|t| unsafe { (t.0.key_hash, t.0.get(&self.key_data)) })
    }

    /// Iterates over the values in insertion order.
    pub fn iter_values(&self) -> impl Iterator<Item = &V> {
        self.tuples.iter().map(|t| &t.1)
    }
}

pub enum Entry<'a, 'k, V> {
    Occupied(OccupiedEntry<'a, V>),
    Vacant(VacantEntry<'a, 'k, V>),
}

pub struct OccupiedEntry<'a, V> {
    entry: TOccupiedEntry<'a, IdxSize>,
    tuples: &'a mut Vec<(Key, V)>,
}

impl<'a, V> OccupiedEntry<'a, V> {
    /// The insertion-order index of this entry.
    pub fn index(&self) -> IdxSize {
        *self.entry.get()
    }

    pub fn into_mut(self) -> &'a mut V {
        let idx = self.index();
        unsafe { &mut self.tuples.get_unchecked_mut(idx as usize).1 }
    }
}

pub struct VacantEntry<'a, 'k, V> {
    hash: u64,
    key: &'k [u8],
    entry: TVacantEntry<'a, IdxSize>,
    tuples: &'a mut Vec<(Key, V)>,
    key_data: &'a mut Vec<Vec<u8>>,
}

#[allow(clippy::needless_lifetimes)]
impl<'a, 'k, V> VacantEntry<'a, 'k, V> {
    /// The insertion-order index this entry will receive once inserted.
    pub fn index(&self) -> IdxSize {
        self.tuples.len() as IdxSize
    }

    pub fn insert(self, value: V) -> &'a mut V {
        unsafe {
            let tuple_idx: IdxSize = self.tuples.len().try_into().unwrap();

            let mut num_buffers = self.key_data.len() as u32;
            let mut active_buf = self.key_data.last_mut().unwrap_unchecked();
            let key_len = self.key.len();
            if active_buf.len() + key_len > active_buf.capacity() {
                // Don't grow the active buffer; start a fresh one with double
                // the previous ideal capacity (or the key length, if larger),
                // so the bytes of previously stored keys are never moved.
                let ideal_next_cap = BASE_KEY_DATA_CAPACITY.checked_shl(num_buffers).unwrap();
                let next_capacity = std::cmp::max(ideal_next_cap, key_len);
                self.key_data.push(Vec::with_capacity(next_capacity));
                active_buf = self.key_data.last_mut().unwrap_unchecked();
                num_buffers += 1;
            }

            let tuple_key = Key {
                key_hash: self.hash,
                key_buffer: num_buffers - 1,
                key_offset: active_buf.len(),
                key_length: self.key.len().try_into().unwrap(),
            };
            self.tuples.push((tuple_key, value));
            active_buf.extend_from_slice(self.key);
            self.entry.insert(tuple_idx);
            &mut self.tuples.last_mut().unwrap_unchecked().1
        }
    }
}

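// A minimal usage sketch, not part of the original file: it exercises the
// entry API with caller-supplied hashes. `toy_hash` is a hypothetical
// stand-in hasher; any u64 hash works, since keys are pre-hashed by the caller.
#[cfg(test)]
mod tests {
    use super::*;

    // FNV-1a, used here only to produce caller-side hashes for the sketch.
    fn toy_hash(key: &[u8]) -> u64 {
        key.iter()
            .fold(0xcbf29ce484222325, |h, b| (h ^ *b as u64).wrapping_mul(0x100000001b3))
    }

    #[test]
    fn insert_get_iterate() {
        let mut map: BytesIndexMap<u32> = BytesIndexMap::new();
        for (i, key) in [&b"foo"[..], &b"bar"[..], &b"baz"[..]].into_iter().enumerate() {
            match map.entry(toy_hash(key), key) {
                Entry::Vacant(v) => {
                    v.insert(i as u32);
                },
                Entry::Occupied(_) => unreachable!(),
            }
        }
        assert_eq!(map.len(), 3);
        assert_eq!(map.get(toy_hash(b"bar"), b"bar"), Some(&1));
        assert!(map.contains_key(toy_hash(b"baz"), b"baz"));
        // Iteration follows insertion order.
        let values: Vec<u32> = map.iter_values().copied().collect();
        assert_eq!(values, [0, 1, 2]);
    }
}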