// Source: GitHub repository pola-rs/polars
// Path: blob/main/crates/polars-expr/src/groups/mod.rs
1
use std::any::Any;
2
3
use arrow::bitmap::BitmapBuilder;
4
use polars_core::prelude::*;
5
#[cfg(feature = "dtype-categorical")]
6
use polars_core::with_match_categorical_physical_type;
7
use polars_core::with_match_physical_numeric_polars_type;
8
use polars_utils::IdxSize;
9
use polars_utils::hashing::HashPartitioner;
10
11
use crate::hash_keys::HashKeys;
12
13
mod binview;
14
mod row_encoded;
15
mod single_key;
16
17
/// A Grouper maps keys to groups, such that duplicate keys map to the same group.
18
pub trait Grouper: Any + Send + Sync {
19
/// Creates a new empty Grouper similar to this one.
20
fn new_empty(&self) -> Box<dyn Grouper>;
21
22
/// Reserves space for the given number additional groups.
23
fn reserve(&mut self, additional: usize);
24
25
/// Returns the number of groups in this Grouper.
26
fn num_groups(&self) -> IdxSize;
27
28
/// Inserts the given subset of keys into this Grouper. If groups_idxs is
29
/// passed it is extended such with the group index of keys[subset[i]].
30
///
31
/// # Safety
32
/// The subset indexes must be in-bounds.
33
unsafe fn insert_keys_subset(
34
&mut self,
35
keys: &HashKeys,
36
subset: &[IdxSize],
37
group_idxs: Option<&mut Vec<IdxSize>>,
38
);
39
40
/// Returns the keys in this Grouper in group order, that is the key for
41
/// group i is returned in row i.
42
fn get_keys_in_group_order(&self, schema: &Schema) -> DataFrame;
43
44
/// Returns the (indices of the) keys found in the groupers. If
45
/// invert is true it instead returns the keys not found in the groupers.
46
/// # Safety
47
/// All groupers must have the same schema.
48
unsafe fn probe_partitioned_groupers(
49
&self,
50
groupers: &[Box<dyn Grouper>],
51
keys: &HashKeys,
52
partitioner: &HashPartitioner,
53
invert: bool,
54
probe_matches: &mut Vec<IdxSize>,
55
);
56
57
/// Returns for each key if it is found in the groupers. If invert is true
58
/// it returns true if it isn't found.
59
/// # Safety
60
/// All groupers must have the same schema.
61
unsafe fn contains_key_partitioned_groupers(
62
&self,
63
groupers: &[Box<dyn Grouper>],
64
keys: &HashKeys,
65
partitioner: &HashPartitioner,
66
invert: bool,
67
contains_key: &mut BitmapBuilder,
68
);
69
70
fn as_any(&self) -> &dyn Any;
71
}
72
73
pub fn new_hash_grouper(key_schema: Arc<Schema>) -> Box<dyn Grouper> {
74
if key_schema.len() > 1 {
75
Box::new(row_encoded::RowEncodedHashGrouper::new())
76
} else {
77
let (_name, dt) = key_schema.get_at_index(0).unwrap();
78
match dt {
79
dt if dt.is_primitive_numeric() | dt.is_temporal() => {
80
with_match_physical_numeric_polars_type!(dt.to_physical(), |$T| {
81
Box::new(single_key::SingleKeyHashGrouper::<$T>::new())
82
})
83
},
84
85
#[cfg(feature = "dtype-decimal")]
86
DataType::Decimal(_, _) => {
87
Box::new(single_key::SingleKeyHashGrouper::<Int128Type>::new())
88
},
89
#[cfg(feature = "dtype-categorical")]
90
dt @ (DataType::Enum(_, _) | DataType::Categorical(_, _)) => {
91
with_match_categorical_physical_type!(dt.cat_physical().unwrap(), |$C| {
92
Box::new(single_key::SingleKeyHashGrouper::<<$C as PolarsCategoricalType>::PolarsPhysical>::new())
93
})
94
},
95
96
DataType::String | DataType::Binary => Box::new(binview::BinviewHashGrouper::new()),
97
98
_ => Box::new(row_encoded::RowEncodedHashGrouper::new()),
99
}
100
}
101
}
102
103