Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/chunked_array/strings/split.rs
6939 views
1
use arrow::array::ValueSize;
2
#[cfg(feature = "dtype-struct")]
3
use arrow::array::{MutableArray, MutableUtf8Array};
4
use polars_core::chunked_array::ops::arity::binary_elementwise_for_each;
5
6
use super::*;
7
8
/// Iterator that hands out a string one character at a time, each character
/// as its own `&str` slice of the input.
pub struct SplitNChars<'a> {
    /// The part of the input that has not been emitted yet.
    s: &'a str,
    /// How many more items may still be produced.
    n: usize,
    /// When set, the final permitted item is the whole unsplit rest of `s`
    /// rather than a single character.
    keep_remainder: bool,
}

impl<'a> Iterator for SplitNChars<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        // Smallest budget at which a lone character is still emitted; when the
        // remainder is kept, the last slot is reserved for it.
        let threshold = match self.keep_remainder {
            true => 2,
            false => 1,
        };

        if self.n < threshold {
            // Only the reserved remainder slot can still be pending here.
            if self.n != 1 || self.s.is_empty() {
                return None;
            }
            self.n -= 1;
            return Some(self.s);
        }

        self.n -= 1;
        // An exhausted string ends the iteration via `?`.
        let width = self.s.chars().next()?.len_utf8();
        let (head, tail) = self.s.split_at(width);
        self.s = tail;
        Some(head)
    }
}
33
34
/// Builds an iterator over the leading characters of `s`, one `&str` per
/// character.
///
/// At most `n` items are produced. When `keep_remainder` is `true` the last
/// (`n`-th) item is the entire unsplit rest of the string; when it is `false`
/// the last item is just the `n`-th character.
#[cfg(feature = "dtype-struct")]
fn splitn_chars(s: &str, n: usize, keep_remainder: bool) -> SplitNChars<'_> {
    SplitNChars {
        keep_remainder,
        n,
        s,
    }
}
46
47
/// Splits a string into substrings consisting of single characters.
48
fn split_chars(s: &str) -> SplitNChars<'_> {
49
SplitNChars {
50
s,
51
n: usize::MAX,
52
keep_remainder: false,
53
}
54
}
55
56
/// Splits every string of `ca` into at most `n` pieces and stores the pieces as
/// the fields `field_0`, `field_1`, ... of a struct column named after `ca`.
///
/// `op` performs the split for a non-empty separator. An empty separator
/// splits the string into single characters instead, with `keep_remainder`
/// deciding whether the last field receives the unsplit rest of the string.
/// Rows that yield fewer than `n` pieces are padded with nulls; surplus pieces
/// are dropped. A null string or a null separator makes every field of that
/// row null.
///
/// A `by` of length 1 is applied to every row of `ca`; otherwise both inputs
/// are walked together through `binary_elementwise_for_each`.
#[cfg(feature = "dtype-struct")]
pub fn split_to_struct<'a, F, I>(
    ca: &'a StringChunked,
    by: &'a StringChunked,
    n: usize,
    op: F,
    keep_remainder: bool,
) -> PolarsResult<StructChunked>
where
    F: Fn(&'a str, &'a str) -> I,
    I: Iterator<Item = &'a str>,
{
    use polars_utils::format_pl_smallstr;

    /// Appends one null to every field column.
    fn push_null_row(columns: &mut [MutableUtf8Array<i64>]) {
        for column in columns {
            column.push_null()
        }
    }

    /// Appends one row: the pieces go into the leading columns, and the
    /// columns left over once the pieces run out receive a null.
    fn push_row<'s>(
        columns: &mut [MutableUtf8Array<i64>],
        pieces: impl Iterator<Item = &'s str>,
    ) {
        let mut slots = columns.iter_mut();
        // `pieces` has to stay on the left-hand side of the zip: it is polled
        // first, so no column is taken from `slots` without a piece for it.
        pieces
            .zip(&mut slots)
            .for_each(|(piece, column)| column.push(Some(piece)));
        for column in slots {
            column.push_null()
        }
    }

    let mut columns: Vec<_> = (0..n)
        .map(|_| MutableUtf8Array::<i64>::with_capacity(ca.len()))
        .collect();

    if by.len() != 1 {
        binary_elementwise_for_each(ca, by, |opt_s, opt_by| match (opt_s, opt_by) {
            (Some(s), Some(sep)) if sep.is_empty() => {
                push_row(&mut columns, splitn_chars(s, n, keep_remainder))
            },
            (Some(s), Some(sep)) => push_row(&mut columns, op(s, sep)),
            _ => push_null_row(&mut columns),
        });
    } else {
        match by.get(0) {
            // Only a single null row is pushed here, independent of `ca.len()`.
            // NOTE(review): presumably `StructChunked::from_series` broadcasts
            // length-1 fields to the requested length — confirm.
            None => push_null_row(&mut columns),
            Some(sep) if sep.is_empty() => ca.for_each(|opt_s| match opt_s {
                Some(s) => push_row(&mut columns, splitn_chars(s, n, keep_remainder)),
                None => push_null_row(&mut columns),
            }),
            Some(sep) => ca.for_each(|opt_s| match opt_s {
                Some(s) => push_row(&mut columns, op(s, sep)),
                None => push_null_row(&mut columns),
            }),
        }
    }

    let fields: Vec<_> = columns
        .into_iter()
        .enumerate()
        .map(|(i, mut column)| {
            Series::try_from((format_pl_smallstr!("field_{i}"), column.as_box())).unwrap()
        })
        .collect();

    StructChunked::from_series(ca.name().clone(), ca.len(), fields.iter())
}
154
155
pub fn split_helper<'a, F, I>(
156
ca: &'a StringChunked,
157
by: &'a StringChunked,
158
op: F,
159
) -> PolarsResult<ListChunked>
160
where
161
F: Fn(&'a str, &'a str) -> I,
162
I: Iterator<Item = &'a str>,
163
{
164
Ok(match (ca.len(), by.len()) {
165
(a, b) if a == b => {
166
let mut builder =
167
ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.get_values_size());
168
169
binary_elementwise_for_each(ca, by, |opt_s, opt_by| match (opt_s, opt_by) {
170
(Some(s), Some(by)) => {
171
if by.is_empty() {
172
builder.append_values_iter(split_chars(s))
173
} else {
174
builder.append_values_iter(op(s, by))
175
}
176
},
177
_ => builder.append_null(),
178
});
179
180
builder.finish()
181
},
182
(1, _) => {
183
if let Some(s) = ca.get(0) {
184
let mut builder = ListStringChunkedBuilder::new(
185
by.name().clone(),
186
by.len(),
187
by.get_values_size(),
188
);
189
190
by.for_each(|opt_by| match opt_by {
191
Some(by) => builder.append_values_iter(op(s, by)),
192
_ => builder.append_null(),
193
});
194
builder.finish()
195
} else {
196
ListChunked::full_null_with_dtype(ca.name().clone(), ca.len(), &DataType::String)
197
}
198
},
199
(_, 1) => {
200
if let Some(by) = by.get(0) {
201
let mut builder = ListStringChunkedBuilder::new(
202
ca.name().clone(),
203
ca.len(),
204
ca.get_values_size(),
205
);
206
207
if by.is_empty() {
208
ca.for_each(|opt_s| match opt_s {
209
Some(s) => builder.append_values_iter(split_chars(s)),
210
_ => builder.append_null(),
211
});
212
} else {
213
ca.for_each(|opt_s| match opt_s {
214
Some(s) => builder.append_values_iter(op(s, by)),
215
_ => builder.append_null(),
216
});
217
}
218
builder.finish()
219
} else {
220
ListChunked::full_null_with_dtype(ca.name().clone(), ca.len(), &DataType::String)
221
}
222
},
223
_ => polars_bail!(length_mismatch = "str.split", ca.len(), by.len()),
224
})
225
}
226
227