Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-expr/src/dispatch/cat.rs
7884 views
1
use std::sync::Arc;
2
3
use polars_core::error::PolarsResult;
4
use polars_core::prelude::{
5
ChunkTakeUnchecked, ChunkedArray, Column, FalseT, IDX_DTYPE, IntoColumn, PolarsPhysicalType,
6
StringChunked,
7
};
8
use polars_core::series::Series;
9
use polars_ops::prelude::BinaryNameSpaceImpl;
10
#[cfg(feature = "strings")]
11
use polars_ops::prelude::StringNameSpaceImpl;
12
use polars_plan::dsl::{ColumnsUdf, SpecialEq};
13
use polars_plan::plans::IRCategoricalFunction;
14
15
pub fn function_expr_to_udf(func: IRCategoricalFunction) -> SpecialEq<Arc<dyn ColumnsUdf>> {
16
use IRCategoricalFunction::*;
17
match func {
18
GetCategories => map!(get_categories),
19
#[cfg(feature = "strings")]
20
LenBytes => map!(len_bytes),
21
#[cfg(feature = "strings")]
22
LenChars => map!(len_chars),
23
#[cfg(feature = "strings")]
24
StartsWith(prefix) => map!(starts_with, prefix.as_str()),
25
#[cfg(feature = "strings")]
26
EndsWith(suffix) => map!(ends_with, suffix.as_str()),
27
#[cfg(feature = "strings")]
28
Slice(offset, length) => map!(slice, offset, length),
29
}
30
}
31
32
/// Returns the categories of a categorical column as a string column.
///
/// The output column carries the same name as `s`.
///
/// # Errors
///
/// Propagates the error from `cat_mapping()` when the dtype of `s` has no categorical mapping.
fn get_categories(s: &Column) -> PolarsResult<Column> {
    let categories = s.dtype().cat_mapping()?.to_arrow(true);
    // SAFETY: presumably `to_arrow(true)` yields an array whose layout matches `StringChunked`
    // chunks — TODO confirm against `CategoricalMapping::to_arrow`.
    let ca = unsafe { StringChunked::from_chunks(s.name().clone(), vec![categories]) };
    Ok(ca.into_column())
}
37
38
// Determine mapping between categories and underlying physical. For local, this is just 0..n.
39
// For global, this is the global indexes.
40
fn _get_cat_phys_map(col: &Column) -> (StringChunked, Series) {
41
let mapping = col.dtype().cat_mapping().unwrap();
42
let cats =
43
unsafe { StringChunked::from_chunks(col.name().clone(), vec![mapping.to_arrow(true)]) };
44
let mut phys = col.to_physical_repr();
45
if phys.dtype() != &IDX_DTYPE {
46
phys = phys.cast(&IDX_DTYPE).unwrap();
47
}
48
let phys = phys.as_materialized_series().clone();
49
(cats, phys)
50
}
51
52
/// Fast path: apply a string function to the categories of a categorical column and broadcast the
53
/// result back to the array.
54
fn apply_to_cats<F, T>(c: &Column, mut op: F) -> PolarsResult<Column>
55
where
56
F: FnMut(StringChunked) -> ChunkedArray<T>,
57
T: PolarsPhysicalType<HasViews = FalseT, IsStruct = FalseT, IsNested = FalseT>,
58
{
59
let (categories, phys) = _get_cat_phys_map(c);
60
let result = op(categories);
61
// SAFETY: physical idx array is valid.
62
let out = unsafe { result.take_unchecked(phys.idx().unwrap()) };
63
Ok(out.into_column())
64
}
65
66
/// Evaluates `str_len_bytes` on the categories of `c` and gathers the result back per row via
/// `apply_to_cats`.
#[cfg(feature = "strings")]
fn len_bytes(c: &Column) -> PolarsResult<Column> {
    apply_to_cats(c, |categories| categories.str_len_bytes())
}
70
71
/// Evaluates `str_len_chars` on the categories of `c` and gathers the result back per row via
/// `apply_to_cats`.
#[cfg(feature = "strings")]
fn len_chars(c: &Column) -> PolarsResult<Column> {
    apply_to_cats(c, |categories| categories.str_len_chars())
}
75
76
/// Checks, per row, whether the category value starts with `prefix`.
///
/// The comparison is done on the byte representation of the categories and of `prefix`, once
/// per category, and then gathered back per row via `apply_to_cats`.
#[cfg(feature = "strings")]
fn starts_with(c: &Column, prefix: &str) -> PolarsResult<Column> {
    let needle = prefix.as_bytes();
    apply_to_cats(c, |categories| categories.as_binary().starts_with(needle))
}
80
81
/// Checks, per row, whether the category value ends with `suffix`.
///
/// The comparison is done on the byte representation of the categories and of `suffix`, once
/// per category, and then gathered back per row via `apply_to_cats`.
#[cfg(feature = "strings")]
fn ends_with(c: &Column, suffix: &str) -> PolarsResult<Column> {
    let needle = suffix.as_bytes();
    apply_to_cats(c, |categories| categories.as_binary().ends_with(needle))
}
85
86
/// Takes a substring of every category value and gathers the result back per row.
///
/// `offset` and `length` are passed to `substring_ternary_offsets_value` for each category; a
/// `length` of `None` is replaced by `usize::MAX` before the call. The resulting start/end
/// offsets are applied to the string views through `update_view`.
///
/// # Panics
///
/// Panics under the same conditions as `_get_cat_phys_map`, or if the physical series cannot be
/// viewed as an index array.
#[cfg(feature = "strings")]
fn slice(c: &Column, offset: i64, length: Option<usize>) -> PolarsResult<Column> {
    use polars_ops::prelude::{substring_ternary_offsets_value, update_view};

    let length = length.unwrap_or(usize::MAX) as u64;
    let (categories, phys) = _get_cat_phys_map(c);

    // SAFETY: NOTE(review) — relies on `update_view` producing a view that is valid for `val`;
    // confirm against the contract of `apply_views`.
    let sliced = unsafe {
        categories.apply_views(|view, val| {
            let (start, end) = substring_ternary_offsets_value(val, offset, length);
            update_view(view, start, end, val)
        })
    };

    let indices = phys.idx().unwrap();
    // SAFETY: physical idx array is valid.
    let gathered = unsafe { sliced.take_unchecked(indices) };
    Ok(gathered.into_column())
}
102
103