Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/series/ops/strings.rs
7884 views
1
use std::borrow::Cow;
2
3
use arrow::array::builder::StaticArrayBuilder;
4
use arrow::array::{Array, Utf8ViewArrayBuilder};
5
use arrow::datatypes::ArrowDataType;
6
use polars_core::prelude::{Column, DataType, IntoColumn, StringChunked};
7
use polars_core::scalar::Scalar;
8
use polars_error::{PolarsResult, polars_ensure};
9
use polars_utils::pl_str::PlSmallStr;
10
11
/// Renders an optional string slice for insertion into a formatted value.
///
/// A present slice is returned unchanged; a missing one is rendered as the
/// literal text `"null"`.
#[inline(always)]
fn opt_str_to_string(s: Option<&str>) -> &str {
    match s {
        Some(text) => text,
        None => "null",
    }
}
15
16
pub fn str_format(cs: &mut [Column], format: &str, insertions: &[usize]) -> PolarsResult<Column> {
17
assert_eq!(cs.len(), insertions.len());
18
assert!(!cs.is_empty()); // Checked at IR construction
19
20
let output_name = cs[0].name().clone();
21
let mut output_length = 1;
22
for c in cs.iter() {
23
if c.len() != 1 {
24
polars_ensure!(
25
output_length == 1 || output_length == c.len(),
26
length_mismatch = "format",
27
output_length,
28
c.len()
29
);
30
output_length = c.len();
31
}
32
}
33
34
let mut validity = None;
35
let mut num_scalar_inputs = 0;
36
for c in cs.iter_mut() {
37
if let Some(c_validity) = c.rechunk_validity() {
38
// Column with only nulls means output is only nulls.
39
if c.null_count() == c.len() {
40
return Ok(Column::full_null(
41
output_name,
42
output_length,
43
&DataType::String,
44
));
45
}
46
47
match &mut validity {
48
v @ None => *v = Some(c_validity),
49
Some(v) => *v = arrow::bitmap::and(v, &c_validity),
50
}
51
}
52
53
*c = c.cast(&DataType::String)?;
54
num_scalar_inputs += usize::from(c.len() == 1);
55
}
56
57
let mut format = Cow::Borrowed(format);
58
let mut insertions = Cow::Borrowed(insertions);
59
60
// Fill in any constants into the format string.
61
if num_scalar_inputs > 0 {
62
let mut filled_format = String::new();
63
filled_format.push_str(&format[..*insertions.first().unwrap()]);
64
insertions = Cow::Owned(
65
cs.iter()
66
.enumerate()
67
.filter_map(|(i, c)| {
68
let v = if c.len() == 1 {
69
filled_format.push_str(opt_str_to_string(c.str().unwrap().get(0)));
70
None
71
} else {
72
Some(filled_format.len())
73
};
74
75
let s = if i == cs.len() - 1 {
76
&format[insertions[i]..]
77
} else {
78
&format[insertions[i]..insertions[i + 1]]
79
};
80
filled_format.push_str(s);
81
82
v
83
})
84
.collect(),
85
);
86
format = filled_format.into();
87
}
88
89
let format = format.as_ref();
90
let insertions = insertions.as_ref();
91
92
// If the format string is constant.
93
if num_scalar_inputs == cs.len() {
94
let sc = Scalar::from(PlSmallStr::from_str(format));
95
return Ok(Column::new_scalar(output_name, sc, output_length));
96
}
97
98
let mut builder = Utf8ViewArrayBuilder::new(ArrowDataType::Utf8View);
99
builder.reserve(output_length);
100
101
let mut arrays = cs
102
.iter()
103
.filter(|c| c.len() != 1)
104
.map(|c| {
105
let ca = c.str().unwrap();
106
let mut iter = ca.downcast_iter();
107
let arr = iter.next().unwrap();
108
(iter, arr, 0)
109
})
110
.collect::<Vec<_>>();
111
112
// @Performance. There is some smarter stuff that can be done with views and stuff. Don't think
113
// it is worth the complexity.
114
115
// Amortize the format string allocation.
116
let mut s = String::new();
117
for i in 0..output_length {
118
if validity
119
.as_ref()
120
.is_some_and(|v| !unsafe { v.get_bit_unchecked(i) })
121
{
122
unsafe { builder.push_inline_view_ignore_validity(Default::default()) };
123
124
for (iter, arr, elem_idx) in arrays.iter_mut() {
125
*elem_idx += 1;
126
if i + 1 != output_length && *elem_idx == arr.len() {
127
*arr = iter.next().unwrap();
128
*elem_idx = 0;
129
}
130
}
131
132
continue;
133
}
134
135
s.clear();
136
s.push_str(&format[..insertions[0]]);
137
138
for (j, (iter, arr, elem_idx)) in arrays.iter_mut().enumerate() {
139
s.push_str(opt_str_to_string(arr.get(*elem_idx)));
140
let start = insertions[j];
141
let end = insertions.get(j + 1).copied().unwrap_or(format.len());
142
s.push_str(&format[start..end]);
143
144
*elem_idx += 1;
145
if i + 1 != output_length && *elem_idx == arr.len() {
146
*arr = iter.next().unwrap();
147
*elem_idx = 0;
148
}
149
}
150
151
builder.push_value_ignore_validity(&s);
152
}
153
154
let array = builder.freeze().with_validity(validity).to_boxed();
155
Ok(unsafe { StringChunked::from_chunks(output_name, vec![array]) }.into_column())
156
}
157
158