Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/chunked_array/strings/concat.rs
6939 views
1
use arrow::array::{Utf8Array, ValueSize};
2
use polars_compute::cast::utf8_to_utf8view;
3
use polars_core::prelude::arity::unary_elementwise;
4
use polars_core::prelude::*;
5
6
// Vertically concatenate all strings in a StringChunked.
7
pub fn str_join(ca: &StringChunked, delimiter: &str, ignore_nulls: bool) -> StringChunked {
8
if ca.is_empty() {
9
return StringChunked::new(ca.name().clone(), &[""]);
10
}
11
12
// Propagate null value.
13
if !ignore_nulls && ca.null_count() != 0 {
14
return StringChunked::full_null(ca.name().clone(), 1);
15
}
16
17
// Fast path for all nulls.
18
if ignore_nulls && ca.null_count() == ca.len() {
19
return StringChunked::new(ca.name().clone(), &[""]);
20
}
21
22
if ca.len() == 1 {
23
return ca.clone();
24
}
25
26
// Calculate capacity.
27
let capacity = ca.get_values_size() + delimiter.len() * (ca.len() - 1);
28
29
let mut buf = String::with_capacity(capacity);
30
let mut first = true;
31
ca.for_each(|val| {
32
if let Some(val) = val {
33
if !first {
34
buf.push_str(delimiter);
35
}
36
buf.push_str(val);
37
first = false;
38
}
39
});
40
41
let buf = buf.into_bytes();
42
assert!(capacity >= buf.len());
43
let offsets = vec![0, buf.len() as i64];
44
let arr = unsafe { Utf8Array::from_data_unchecked_default(offsets.into(), buf.into(), None) };
45
// conversion is cheap with one value.
46
let arr = utf8_to_utf8view(&arr);
47
StringChunked::with_chunk(ca.name().clone(), arr)
48
}
49
50
/// Source of per-row values for one input column of a horizontal concatenation.
enum ColumnIter<I, T> {
    /// A column that yields a fresh value from its iterator for every row.
    Iter(I),
    /// A column whose single value is repeated for every row.
    Broadcast(T),
}
54
55
/// Horizontally concatenate all strings.
56
///
57
/// Each array should have length 1 or a length equal to the maximum length.
58
pub fn hor_str_concat(
59
cas: &[&StringChunked],
60
delimiter: &str,
61
ignore_nulls: bool,
62
) -> PolarsResult<StringChunked> {
63
if cas.is_empty() {
64
return Ok(StringChunked::full_null(PlSmallStr::EMPTY, 0));
65
}
66
if cas.len() == 1 {
67
let ca = cas[0];
68
return if !ignore_nulls || ca.null_count() == 0 {
69
Ok(ca.clone())
70
} else {
71
Ok(unary_elementwise(ca, |val| Some(val.unwrap_or(""))))
72
};
73
}
74
75
// Calculate the post-broadcast length and ensure everything is consistent.
76
let len = cas
77
.iter()
78
.map(|ca| ca.len())
79
.filter(|l| *l != 1)
80
.max()
81
.unwrap_or(1);
82
polars_ensure!(
83
cas.iter().all(|ca| ca.len() == 1 || ca.len() == len),
84
ShapeMismatch: "all series in `hor_str_concat` should have equal or unit length"
85
);
86
87
let mut builder = StringChunkedBuilder::new(cas[0].name().clone(), len);
88
89
// Broadcast if appropriate.
90
let mut cols: Vec<_> = cas
91
.iter()
92
.map(|ca| match ca.len() {
93
0 => ColumnIter::Broadcast(None),
94
1 => ColumnIter::Broadcast(ca.get(0)),
95
_ => ColumnIter::Iter(ca.iter()),
96
})
97
.collect();
98
99
// Build concatenated string.
100
let mut buf = String::with_capacity(1024);
101
for _row in 0..len {
102
let mut has_null = false;
103
let mut found_not_null_value = false;
104
for col in cols.iter_mut() {
105
let val = match col {
106
ColumnIter::Iter(i) => i.next().unwrap(),
107
ColumnIter::Broadcast(s) => *s,
108
};
109
110
if has_null && !ignore_nulls {
111
// We know that the result must be null, but we can't just break out of the loop,
112
// because all cols iterator has to be moved correctly.
113
continue;
114
}
115
116
if let Some(s) = val {
117
if found_not_null_value {
118
buf.push_str(delimiter);
119
}
120
buf.push_str(s);
121
found_not_null_value = true;
122
} else {
123
has_null = true;
124
}
125
}
126
127
if !ignore_nulls && has_null {
128
builder.append_null();
129
} else {
130
builder.append_value(&buf)
131
}
132
buf.clear();
133
}
134
135
Ok(builder.finish())
136
}
137
138
#[cfg(test)]
mod test {
    use super::*;

    /// Joining vertically with `ignore_nulls = true` skips the null entry.
    #[test]
    fn test_str_concat() {
        let ca = Int32Chunked::new("foo".into(), &[Some(1), None, Some(3)]);
        let ca_str = ca.cast(&DataType::String).unwrap();
        let out = str_join(ca_str.str().unwrap(), "-", true);

        let out = out.get(0);
        assert_eq!(out, Some("1-3"));
    }

    /// Joining horizontally works for equal-length columns and broadcasts a
    /// unit-length column across all rows.
    #[test]
    fn test_hor_str_concat() {
        let a = StringChunked::new("a".into(), &["foo", "bar"]);
        let b = StringChunked::new("b".into(), &["spam", "ham"]);

        let out = hor_str_concat(&[&a, &b], "_", true).unwrap();
        assert_eq!(Vec::from(&out), &[Some("foo_spam"), Some("bar_ham")]);

        // A single-element column is repeated for every row.
        let c = StringChunked::new("b".into(), &["literal"]);
        let out = hor_str_concat(&[&a, &b, &c], "_", true).unwrap();
        assert_eq!(
            Vec::from(&out),
            &[Some("foo_spam_literal"), Some("bar_ham_literal")]
        );
    }
}
168
169