GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-compute/src/binview_index_map.rs
use arrow::array::View;
use hashbrown::hash_table::{
    Entry as TEntry, HashTable, OccupiedEntry as TOccupiedEntry, VacantEntry as TVacantEntry,
};
use polars_utils::IdxSize;

const BASE_KEY_BUFFER_CAPACITY: usize = 1024;

struct Key {
    hash: u64,
    view: View,
}

/// An IndexMap where the keys are [u8] slices or `View`s which are pre-hashed.
/// Does not support deletion.
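///
/// A minimal usage sketch (illustrative only, not from the original file;
/// `RandomState` as the hasher is an assumption, any `BuildHasher` works as
/// long as the same hash is supplied consistently for a given key):
///
/// ```ignore
/// use std::hash::{BuildHasher, RandomState};
///
/// let hasher = RandomState::new();
/// let mut map: BinaryViewIndexMap<u32> = BinaryViewIndexMap::new();
/// let key = b"a key long enough to not be inlined in the view";
/// let hash = hasher.hash_one(key.as_slice());
/// if let Entry::Vacant(v) = map.entry(hash, key) {
///     v.insert(42);
/// }
/// assert_eq!(map.get(hash, key), Some(&42));
/// ```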
pub struct BinaryViewIndexMap<V> {
    table: HashTable<IdxSize>,
    tuples: Vec<(Key, V)>,
    buffers: Vec<Vec<u8>>,

    // Internal random seed used to keep hash iteration order decorrelated.
    // We simply store a random odd number and multiply the canonical hash by it.
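    // Since multiplication by an odd number is a bijection on u64, this
    // permutes the hash space without discarding entropy.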
    seed: u64,
}

impl<V> Default for BinaryViewIndexMap<V> {
    fn default() -> Self {
        Self {
            table: HashTable::new(),
            tuples: Vec::new(),
            buffers: vec![],
            seed: rand::random::<u64>() | 1,
        }
    }
}

impl<V> BinaryViewIndexMap<V> {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn reserve(&mut self, additional: usize) {
        self.table.reserve(additional, |i| unsafe {
            let tuple = self.tuples.get_unchecked(*i as usize);
            tuple.0.hash.wrapping_mul(self.seed)
        });
        self.tuples.reserve(additional);
    }

    pub fn len(&self) -> IdxSize {
        self.tuples.len() as IdxSize
    }

    pub fn is_empty(&self) -> bool {
        self.tuples.is_empty()
    }

    pub fn buffers(&self) -> &[Vec<u8>] {
        &self.buffers
    }
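
    /// Gets the value associated with `key`, if present.
    ///
    /// `hash` must be the canonical (unseeded) hash of `key`; the same hash
    /// has to be passed for a given key across all `get` and `entry` calls.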
    #[inline]
    pub fn get(&self, hash: u64, key: &[u8]) -> Option<&V> {
        unsafe {
            if key.len() <= View::MAX_INLINE_SIZE as usize {
                self.get_inline_view(hash, &View::new_inline_unchecked(key))
            } else {
                self.get_long_key(hash, key)
            }
        }
    }

    /// # Safety
    /// The view must be valid in combination with the given buffers.
    #[inline]
    pub unsafe fn get_view<B: AsRef<[u8]>>(
        &self,
        hash: u64,
        key: &View,
        buffers: &[B],
    ) -> Option<&V> {
        unsafe {
            if key.length <= View::MAX_INLINE_SIZE {
                self.get_inline_view(hash, key)
            } else {
                self.get_long_key(hash, key.get_external_slice_unchecked(buffers))
            }
        }
    }

    /// # Safety
    /// The view must be inlined.
    pub unsafe fn get_inline_view(&self, hash: u64, key: &View) -> Option<&V> {
        unsafe {
            debug_assert!(key.length <= View::MAX_INLINE_SIZE);
            let idx = self.table.find(hash.wrapping_mul(self.seed), |i| {
                let t = self.tuples.get_unchecked(*i as usize);
                *key == t.0.view
            })?;
            Some(&self.tuples.get_unchecked(*idx as usize).1)
        }
    }

    /// # Safety
    /// key.len() > View::MAX_INLINE_SIZE
    pub unsafe fn get_long_key(&self, hash: u64, key: &[u8]) -> Option<&V> {
        unsafe {
            debug_assert!(key.len() > View::MAX_INLINE_SIZE as usize);
            let idx = self.table.find(hash.wrapping_mul(self.seed), |i| {
                let t = self.tuples.get_unchecked(*i as usize);
                hash == t.0.hash
                    && key.len() == t.0.view.length as usize
                    && key == t.0.view.get_external_slice_unchecked(&self.buffers)
            })?;
            Some(&self.tuples.get_unchecked(*idx as usize).1)
        }
    }
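
    /// Gets the entry for `key`, allowing insertion if it is absent.
    ///
    /// A sketch of the usual get-or-insert counting pattern (illustrative
    /// only):
    ///
    /// ```ignore
    /// match map.entry(hash, key) {
    ///     Entry::Occupied(o) => *o.into_mut() += 1,
    ///     Entry::Vacant(v) => {
    ///         v.insert(1);
    ///     },
    /// }
    /// ```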
    #[inline]
    pub fn entry<'k>(&mut self, hash: u64, key: &'k [u8]) -> Entry<'_, 'k, V> {
        unsafe {
            if key.len() <= View::MAX_INLINE_SIZE as usize {
                self.entry_inline_view(hash, View::new_inline_unchecked(key))
            } else {
                self.entry_long_key(hash, key)
            }
        }
    }

    /// # Safety
    /// The view must be valid in combination with the given buffers.
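    ///
    /// A hedged sketch of bulk-inserting from an existing views column
    /// (`views`, `data_buffers`, and `hash_bytes` are assumed caller-side
    /// names, not part of this crate's API):
    ///
    /// ```ignore
    /// for view in views {
    ///     let hash = hash_bytes(unsafe { view.get_slice_unchecked(data_buffers) });
    ///     if let Entry::Vacant(e) = unsafe { map.entry_view(hash, *view, data_buffers) } {
    ///         e.insert(Default::default());
    ///     }
    /// }
    /// ```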
    #[inline]
    pub unsafe fn entry_view<'k, B: AsRef<[u8]>>(
        &mut self,
        hash: u64,
        key: View,
        buffers: &'k [B],
    ) -> Entry<'_, 'k, V> {
        unsafe {
            if key.length <= View::MAX_INLINE_SIZE {
                self.entry_inline_view(hash, key)
            } else {
                self.entry_long_key(hash, key.get_external_slice_unchecked(buffers))
            }
        }
    }

    /// # Safety
    /// The view must be inlined.
    pub unsafe fn entry_inline_view<'k>(&mut self, hash: u64, key: View) -> Entry<'_, 'k, V> {
        debug_assert!(key.length <= View::MAX_INLINE_SIZE);
        let entry = self.table.entry(
            hash.wrapping_mul(self.seed),
            |i| unsafe {
                let t = self.tuples.get_unchecked(*i as usize);
                key == t.0.view
            },
            |i| unsafe {
                let t = self.tuples.get_unchecked(*i as usize);
                t.0.hash.wrapping_mul(self.seed)
            },
        );

        match entry {
            TEntry::Occupied(o) => Entry::Occupied(OccupiedEntry {
                entry: o,
                tuples: &mut self.tuples,
            }),
            TEntry::Vacant(v) => Entry::Vacant(VacantEntry {
                view: key,
                external: None,
                hash,
                entry: v,
                tuples: &mut self.tuples,
                buffers: &mut self.buffers,
            }),
        }
    }

    /// # Safety
    /// key.len() > View::MAX_INLINE_SIZE
    pub unsafe fn entry_long_key<'k>(&mut self, hash: u64, key: &'k [u8]) -> Entry<'_, 'k, V> {
        debug_assert!(key.len() > View::MAX_INLINE_SIZE as usize);
        let entry = self.table.entry(
            hash.wrapping_mul(self.seed),
            |i| unsafe {
                let t = self.tuples.get_unchecked(*i as usize);
                hash == t.0.hash
                    && key.len() == t.0.view.length as usize
                    && key == t.0.view.get_external_slice_unchecked(&self.buffers)
            },
            |i| unsafe {
                let t = self.tuples.get_unchecked(*i as usize);
                t.0.hash.wrapping_mul(self.seed)
            },
        );

        match entry {
            TEntry::Occupied(o) => Entry::Occupied(OccupiedEntry {
                entry: o,
                tuples: &mut self.tuples,
            }),
            TEntry::Vacant(v) => Entry::Vacant(VacantEntry {
                view: View::default(),
                external: Some(key),
                hash,
                entry: v,
                tuples: &mut self.tuples,
                buffers: &mut self.buffers,
            }),
        }
    }

    /// Insert an empty entry which will never be mapped to. Returns the index of the entry.
    ///
    /// This is useful for entries which are handled externally.
    pub fn push_unmapped_empty_entry(&mut self, value: V) -> IdxSize {
        let ret = self.tuples.len() as IdxSize;
        let key = Key {
            hash: 0,
            view: View::default(),
        };
        self.tuples.push((key, value));
        ret
    }

    /// Gets the hash, key and value at the given index by insertion order.
    #[inline(always)]
    pub fn get_index(&self, idx: IdxSize) -> Option<(u64, &[u8], &V)> {
        let t = self.tuples.get(idx as usize)?;
        Some((
            t.0.hash,
            unsafe { t.0.view.get_slice_unchecked(&self.buffers) },
            &t.1,
        ))
    }

    /// Gets the hash, key and value at the given index by insertion order.
    ///
    /// # Safety
    /// The index must be less than len().
    #[inline(always)]
    pub unsafe fn get_index_unchecked(&self, idx: IdxSize) -> (u64, &[u8], &V) {
        let t = unsafe { self.tuples.get_unchecked(idx as usize) };
        unsafe { (t.0.hash, t.0.view.get_slice_unchecked(&self.buffers), &t.1) }
    }

    /// Gets the hash, view and value at the given index by insertion order.
    ///
    /// # Safety
    /// The index must be less than len().
    #[inline(always)]
    pub unsafe fn get_index_view_unchecked(&self, idx: IdxSize) -> (u64, View, &V) {
        let t = unsafe { self.tuples.get_unchecked(idx as usize) };
        (t.0.hash, t.0.view, &t.1)
    }

    /// Iterates over the (hash, key) pairs in insertion order.
    pub fn iter_hash_keys(&self) -> impl Iterator<Item = (u64, &[u8])> {
        self.tuples
            .iter()
            .map(|t| unsafe { (t.0.hash, t.0.view.get_slice_unchecked(&self.buffers)) })
    }

    /// Iterates over the (hash, key_view) pairs in insertion order.
    pub fn iter_hash_views(&self) -> impl Iterator<Item = (u64, View)> {
        self.tuples.iter().map(|t| (t.0.hash, t.0.view))
    }

    /// Iterates over the values in insertion order.
    pub fn iter_values(&self) -> impl Iterator<Item = &V> {
        self.tuples.iter().map(|t| &t.1)
    }
}

pub enum Entry<'a, 'k, V> {
    Occupied(OccupiedEntry<'a, V>),
    Vacant(VacantEntry<'a, 'k, V>),
}

pub struct OccupiedEntry<'a, V> {
    entry: TOccupiedEntry<'a, IdxSize>,
    tuples: &'a mut Vec<(Key, V)>,
}

impl<'a, V> OccupiedEntry<'a, V> {
    #[inline]
    pub fn index(&self) -> IdxSize {
        *self.entry.get()
    }

    #[inline]
    pub fn into_mut(self) -> &'a mut V {
        let idx = self.index();
        unsafe { &mut self.tuples.get_unchecked_mut(idx as usize).1 }
    }
}

pub struct VacantEntry<'a, 'k, V> {
    hash: u64,
    view: View,                 // Empty when key is not inlined.
    external: Option<&'k [u8]>, // Only set when not inlined.
    entry: TVacantEntry<'a, IdxSize>,
    tuples: &'a mut Vec<(Key, V)>,
    buffers: &'a mut Vec<Vec<u8>>,
}

#[allow(clippy::needless_lifetimes)]
impl<'a, 'k, V> VacantEntry<'a, 'k, V> {
    #[inline]
    pub fn index(&self) -> IdxSize {
        self.tuples.len() as IdxSize
    }

    #[inline]
    pub fn insert(self, value: V) -> &'a mut V {
        unsafe {
            let tuple_idx: IdxSize = self.tuples.len().try_into().unwrap();
            let view = if let Some(key) = self.external {
                // Long key: copy its bytes into the active buffer, starting a
                // new buffer if the key does not fit in the remaining capacity.
                if self
                    .buffers
                    .last()
                    .is_none_or(|buf| buf.len() + key.len() > buf.capacity())
                {
                    // Buffer capacities grow geometrically (1 KiB, 2 KiB, ...),
                    // but are always at least large enough for the current key.
                    let ideal_next_cap = BASE_KEY_BUFFER_CAPACITY
                        .checked_shl(self.buffers.len() as u32)
                        .unwrap();
                    let next_capacity = std::cmp::max(ideal_next_cap, key.len());
                    self.buffers.push(Vec::with_capacity(next_capacity));
                }
                let buffer_idx = (self.buffers.len() - 1) as u32;
                let active_buf = self.buffers.last_mut().unwrap_unchecked();
                let offset = active_buf.len() as u32;
                active_buf.extend_from_slice(key);
                View::new_from_bytes(key, buffer_idx, offset)
            } else {
                // Short key: the bytes already live inline in the view itself.
                self.view
            };
            let tuple_key = Key {
                hash: self.hash,
                view,
            };
            self.tuples.push((tuple_key, value));
            self.entry.insert(tuple_idx);
            &mut self.tuples.last_mut().unwrap_unchecked().1
        }
    }
}
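
// A minimal smoke-test sketch, not part of the original file. It assumes
// `std::hash::RandomState` as the hasher (any `BuildHasher` would do) and
// exercises both the inline-key path and the buffer-backed long-key path.
#[cfg(test)]
mod sketch_tests {
    use std::hash::{BuildHasher, RandomState};

    use super::*;

    #[test]
    fn insert_then_get_roundtrip() {
        let hasher = RandomState::new();
        let mut map: BinaryViewIndexMap<usize> = BinaryViewIndexMap::new();
        let keys: [&[u8]; 2] = [b"tiny", b"a key long enough to spill into the byte buffers"];
        for (i, key) in keys.iter().enumerate() {
            let hash = hasher.hash_one(key);
            match map.entry(hash, key) {
                Entry::Vacant(v) => {
                    v.insert(i);
                },
                Entry::Occupied(_) => unreachable!("keys are distinct"),
            }
        }
        for (i, key) in keys.iter().enumerate() {
            let hash = hasher.hash_one(key);
            assert_eq!(map.get(hash, key), Some(&i));
            // Entries are also retrievable by insertion order.
            let (h, k, v) = map.get_index(i as IdxSize).unwrap();
            assert_eq!((h, k, *v), (hash, *key, i));
        }
    }
}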