Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-expr/src/groups/binview.rs
8395 views
1
use arrow::array::{Array, BinaryViewArrayGeneric, View, ViewType};
2
use arrow::bitmap::{Bitmap, MutableBitmap};
3
use polars_buffer::Buffer;
4
use polars_compute::binview_index_map::{BinaryViewIndexMap, Entry};
5
6
use super::*;
7
use crate::hash_keys::HashKeys;
8
9
#[derive(Default)]
10
pub struct BinviewHashGrouper {
11
idx_map: BinaryViewIndexMap<()>,
12
null_idx: IdxSize,
13
}
14
15
impl BinviewHashGrouper {
    /// Creates an empty grouper with no groups and no null group
    /// (`null_idx` starts at the `IdxSize::MAX` sentinel).
    pub fn new() -> Self {
        Self {
            idx_map: BinaryViewIndexMap::default(),
            null_idx: IdxSize::MAX,
        }
    }

    /// Inserts a non-null key, returning its group index.
    ///
    /// An existing key returns its previously assigned index; a new key is
    /// assigned the next dense index by the underlying index map.
    ///
    /// # Safety
    /// The view must be valid for the given buffer set.
    #[inline(always)]
    unsafe fn insert_key(
        &mut self,
        hash: u64,
        view: View,
        buffers: &Buffer<Buffer<u8>>,
    ) -> IdxSize {
        unsafe {
            match self.idx_map.entry_view(hash, view, buffers) {
                Entry::Occupied(o) => o.index(),
                Entry::Vacant(v) => {
                    let index = v.index();
                    // The map only tracks indices; the value payload is ().
                    v.insert(());
                    index
                },
            }
        }
    }

    /// Returns the group index of the null group, lazily allocating it on
    /// first use. The null entry is pushed "unmapped" — presumably it is not
    /// reachable via hash lookups in `idx_map`, only via `null_idx`
    /// (TODO confirm against `push_unmapped_empty_entry`'s contract).
    #[inline(always)]
    fn insert_null(&mut self) -> IdxSize {
        if self.null_idx == IdxSize::MAX {
            self.null_idx = self.idx_map.push_unmapped_empty_entry(());
        }
        self.null_idx
    }

    /// Returns whether the given non-null key has been inserted.
    ///
    /// # Safety
    /// The view must be valid for the given buffer set.
    #[inline(always)]
    unsafe fn contains_key(&self, hash: u64, view: &View, buffers: &Buffer<Buffer<u8>>) -> bool {
        unsafe { self.idx_map.get_view(hash, view, buffers).is_some() }
    }

    /// Returns whether a null has been inserted (i.e. the null group exists).
    #[inline(always)]
    fn contains_null(&self) -> bool {
        // Any value below the IdxSize::MAX sentinel means the null group
        // was allocated by insert_null.
        self.null_idx < IdxSize::MAX
    }

    /// Assembles the collected keys into a single-column DataFrame.
    ///
    /// `V` selects the concrete view flavor: `str` for String keys,
    /// `[u8]` for Binary keys. The column name and dtype are taken from the
    /// first (and only) entry of `schema`.
    ///
    /// # Safety
    /// The views must be valid for the given buffers.
    unsafe fn finalize_keys<V: ViewType + ?Sized>(
        &self,
        schema: &Schema,
        views: Buffer<View>,
        buffers: Buffer<Buffer<u8>>,
        validity: Option<Bitmap>,
    ) -> DataFrame {
        let (name, dtype) = schema.get_at_index(0).unwrap();
        unsafe {
            let arrow_dtype = dtype.to_arrow(CompatLevel::newest());
            // Unchecked construction: view validity was the caller's
            // responsibility; total bytes / buffer metadata are recomputed
            // as "unknown" by the constructor.
            let keys = BinaryViewArrayGeneric::<V>::new_unchecked_unknown_md(
                arrow_dtype,
                views,
                buffers,
                validity,
                None,
            );
            let s =
                Series::from_chunks_and_dtype_unchecked(name.clone(), vec![Box::new(keys)], dtype);
            DataFrame::new_unchecked(s.len(), vec![Column::from(s)])
        }
    }
}
89
90
impl Grouper for BinviewHashGrouper {
    /// Returns a fresh, empty grouper of the same concrete type.
    fn new_empty(&self) -> Box<dyn Grouper> {
        Box::new(Self::new())
    }

    /// Reserves capacity for `additional` more distinct keys.
    fn reserve(&mut self, additional: usize) {
        self.idx_map.reserve(additional);
    }

    /// Number of groups seen so far, including the null group if allocated
    /// (the null entry lives inside `idx_map`, so `len()` covers it).
    fn num_groups(&self) -> IdxSize {
        self.idx_map.len()
    }

    /// Inserts the keys at positions `subset` into the grouper, optionally
    /// appending each key's group index to `group_idxs`.
    ///
    /// Three paths, chosen once up front to keep the per-row loop branch-free
    /// with respect to configuration:
    /// - validity present and nulls are valid keys: nulls map to the null group;
    /// - validity present and nulls are NOT valid: null rows are skipped
    ///   entirely (note `group_idxs` then receives fewer entries than
    ///   `subset.len()`);
    /// - no validity: every row is a non-null key.
    ///
    /// Safety relies on the `HashKeys` contract: `subset` indices are
    /// in-bounds for the views/hashes/validity of `hash_keys`.
    unsafe fn insert_keys_subset(
        &mut self,
        hash_keys: &HashKeys,
        subset: &[IdxSize],
        group_idxs: Option<&mut Vec<IdxSize>>,
    ) {
        // This grouper only handles binview keys; other variants are a
        // dispatch bug upstream.
        let HashKeys::Binview(hash_keys) = hash_keys else {
            unreachable!()
        };

        unsafe {
            let views = hash_keys.keys.views().as_slice();
            let buffers = hash_keys.keys.data_buffers();
            if let Some(validity) = hash_keys.keys.validity() {
                if hash_keys.null_is_valid {
                    // Nulls participate as their own group.
                    let groups = subset.iter().map(|idx| {
                        if validity.get_bit_unchecked(*idx as usize) {
                            let hash = hash_keys.hashes.value_unchecked(*idx as usize);
                            let view = views.get_unchecked(*idx as usize);
                            self.insert_key(hash, *view, buffers)
                        } else {
                            self.insert_null()
                        }
                    });
                    if let Some(group_idxs) = group_idxs {
                        group_idxs.reserve(subset.len());
                        group_idxs.extend(groups);
                    } else {
                        // No output requested; still drive the iterator for
                        // its insertion side effects.
                        groups.for_each(drop);
                    }
                } else {
                    // Nulls are dropped: they get no group and no output index.
                    let groups = subset.iter().filter_map(|idx| {
                        if validity.get_bit_unchecked(*idx as usize) {
                            let hash = hash_keys.hashes.value_unchecked(*idx as usize);
                            let view = views.get_unchecked(*idx as usize);
                            Some(self.insert_key(hash, *view, buffers))
                        } else {
                            None
                        }
                    });
                    if let Some(group_idxs) = group_idxs {
                        group_idxs.reserve(subset.len());
                        group_idxs.extend(groups);
                    } else {
                        groups.for_each(drop);
                    }
                }
            } else {
                // No validity bitmap: all keys are non-null.
                let groups = subset.iter().map(|idx| {
                    let hash = hash_keys.hashes.value_unchecked(*idx as usize);
                    let view = views.get_unchecked(*idx as usize);
                    self.insert_key(hash, *view, buffers)
                });
                if let Some(group_idxs) = group_idxs {
                    group_idxs.reserve(subset.len());
                    group_idxs.extend(groups);
                } else {
                    groups.for_each(drop);
                }
            }
        }
    }

    /// Materializes the distinct keys, in group-index order, as a one-column
    /// DataFrame described by `schema` (must be Binary or String).
    fn get_keys_in_group_order(&self, schema: &Schema) -> DataFrame {
        // Copy the index map's data buffers into owned Buffers; the views
        // collected below reference these by buffer index.
        let buffers = self
            .idx_map
            .buffers()
            .iter()
            .map(|b| Buffer::from(b.to_vec()))
            .collect();
        let views = self.idx_map.iter_hash_views().map(|(_h, v)| v).collect();
        let validity = if self.null_idx < IdxSize::MAX {
            // All groups valid except the single null-group slot.
            let mut validity = MutableBitmap::new();
            validity.extend_constant(self.idx_map.len() as usize, true);
            validity.set(self.null_idx as usize, false);
            Some(validity.freeze())
        } else {
            None
        };

        unsafe {
            let (_name, dt) = schema.get_at_index(0).unwrap();
            match dt {
                DataType::Binary => self.finalize_keys::<[u8]>(schema, views, buffers, validity),
                DataType::String => self.finalize_keys::<str>(schema, views, buffers, validity),
                // Grouper construction guarantees a binview dtype here.
                _ => unreachable!(),
            }
        }
    }

    /// For every key in `hash_keys`, pushes its row index onto `probe_matches`
    /// iff (key found in its hash partition's grouper) != `invert`.
    ///
    /// # Safety
    /// All groupers must be a BinviewHashGrouper.
    unsafe fn probe_partitioned_groupers(
        &self,
        groupers: &[Box<dyn Grouper>],
        hash_keys: &HashKeys,
        partitioner: &HashPartitioner,
        invert: bool,
        probe_matches: &mut Vec<IdxSize>,
    ) {
        let HashKeys::Binview(hash_keys) = hash_keys else {
            unreachable!()
        };
        assert!(partitioner.num_partitions() == groupers.len());

        unsafe {
            let null_p = partitioner.null_partition();
            let buffers = hash_keys.keys.data_buffers();
            let views = hash_keys.keys.views().as_slice();
            hash_keys.for_each_hash(|idx, opt_h| {
                let has_group = if let Some(h) = opt_h {
                    // Route the hash to its partition, then downcast that
                    // partition's grouper. SAFETY (caller contract): every
                    // element of `groupers` is a BinviewHashGrouper, so the
                    // raw-pointer cast from `dyn Grouper` is sound.
                    let p = partitioner.hash_to_partition(h);
                    let dyn_grouper: &dyn Grouper = &**groupers.get_unchecked(p);
                    let grouper =
                        &*(dyn_grouper as *const dyn Grouper as *const BinviewHashGrouper);
                    let view = views.get_unchecked(idx as usize);
                    grouper.contains_key(h, view, buffers)
                } else {
                    // No hash ⇒ null key; nulls all live in the designated
                    // null partition.
                    let dyn_grouper: &dyn Grouper = &**groupers.get_unchecked(null_p);
                    let grouper =
                        &*(dyn_grouper as *const dyn Grouper as *const BinviewHashGrouper);
                    grouper.contains_null()
                };

                // `!= invert` implements both "keep matches" (invert=false)
                // and "keep non-matches" (invert=true).
                if has_group != invert {
                    probe_matches.push(idx);
                }
            });
        }
    }

    /// Like `probe_partitioned_groupers`, but pushes one bit per input row
    /// into `contains_key` instead of collecting matching row indices.
    ///
    /// # Safety
    /// All groupers must be a BinviewHashGrouper.
    unsafe fn contains_key_partitioned_groupers(
        &self,
        groupers: &[Box<dyn Grouper>],
        hash_keys: &HashKeys,
        partitioner: &HashPartitioner,
        invert: bool,
        contains_key: &mut BitmapBuilder,
    ) {
        let HashKeys::Binview(hash_keys) = hash_keys else {
            unreachable!()
        };
        assert!(partitioner.num_partitions() == groupers.len());

        unsafe {
            let null_p = partitioner.null_partition();
            let buffers = hash_keys.keys.data_buffers();
            let views = hash_keys.keys.views().as_slice();
            hash_keys.for_each_hash(|idx, opt_h| {
                let has_group = if let Some(h) = opt_h {
                    // SAFETY (caller contract): all groupers are
                    // BinviewHashGrouper — see probe_partitioned_groupers.
                    let p = partitioner.hash_to_partition(h);
                    let dyn_grouper: &dyn Grouper = &**groupers.get_unchecked(p);
                    let grouper =
                        &*(dyn_grouper as *const dyn Grouper as *const BinviewHashGrouper);
                    let view = views.get_unchecked(idx as usize);
                    grouper.contains_key(h, view, buffers)
                } else {
                    let dyn_grouper: &dyn Grouper = &**groupers.get_unchecked(null_p);
                    let grouper =
                        &*(dyn_grouper as *const dyn Grouper as *const BinviewHashGrouper);
                    grouper.contains_null()
                };

                contains_key.push(has_group != invert);
            });
        }
    }

    /// Enables runtime downcasting back to the concrete grouper type.
    fn as_any(&self) -> &dyn Any {
        self
    }
}
277
278