Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/chunked_array/strings/split.rs
8362 views
1
use arrow::array::ValueSize;
2
#[cfg(feature = "dtype-struct")]
3
use arrow::array::{MutableArray, MutableUtf8Array};
4
use polars_core::chunked_array::ops::arity::binary_elementwise_for_each;
5
use polars_core::prelude::*;
6
use polars_utils::regex_cache::compile_regex;
7
use regex::Regex;
8
9
/// Iterator that hands out a string one character at a time, for at most `n` items.
///
/// When `keep_remainder` is set, the last permitted item is everything that is
/// still left of the string rather than a single character.
pub struct SplitNChars<'a> {
    /// Part of the input that has not been yielded yet.
    s: &'a str,
    /// Number of items that may still be produced.
    n: usize,
    /// Whether the final permitted item is the whole remaining string.
    keep_remainder: bool,
}

impl<'a> Iterator for SplitNChars<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        // Budget that has to stay in reserve for the trailing "remainder" item:
        // 1 when the remainder is kept, 0 otherwise.
        let reserved = usize::from(self.keep_remainder);

        if self.n > reserved {
            // Ordinary item: peel exactly one character off the front.
            self.n -= 1;
            let width = self.s.chars().next()?.len_utf8();
            let (head, tail) = self.s.split_at(width);
            self.s = tail;
            return Some(head);
        }

        if self.n == 1 && !self.s.is_empty() {
            // Final item in `keep_remainder` mode: everything that is left.
            self.n = 0;
            return Some(self.s);
        }

        None
    }
}
34
35
/// Breaks `s` up into single-character substrings, producing at most `n` of them.
///
/// With `keep_remainder` set to `true` the last of the `n` items is the whole
/// rest of the string; with `false` it is just the `n`-th character.
#[cfg(feature = "dtype-struct")]
fn splitn_chars(s: &str, n: usize, keep_remainder: bool) -> SplitNChars<'_> {
    SplitNChars { s, n, keep_remainder }
}
47
48
/// Splits a string into substrings consisting of single characters.
49
fn split_chars(s: &str) -> SplitNChars<'_> {
50
SplitNChars {
51
s,
52
n: usize::MAX,
53
keep_remainder: false,
54
}
55
}
56
57
/// Splits every string of `ca` into at most `n` parts and returns them as a
/// struct column whose fields are named `field_0`, `field_1`, ... (`n` fields).
///
/// * `ca` - strings to split.
/// * `by` - separators. If it has length 1 the single separator is applied to
///   every row of `ca`; otherwise `ca` and `by` are walked pairwise.
/// * `n` - number of struct fields (one output column per field).
/// * `op` - splitter invoked as `op(s, by)` for non-empty separators; only its
///   first `n` items are stored.
/// * `keep_remainder` - forwarded to the per-character splitter used for empty
///   separators; for non-empty separators the behaviour is whatever `op` does.
///
/// Rows where the string or the separator is null become null in every field.
/// Rows that produce fewer than `n` parts have their trailing fields set to null.
///
/// # Panics
///
/// Panics if one of the built arrays cannot be converted into a `Series`.
#[cfg(feature = "dtype-struct")]
pub fn split_to_struct<'a, F, I>(
    ca: &'a StringChunked,
    by: &'a StringChunked,
    n: usize,
    op: F,
    keep_remainder: bool,
) -> PolarsResult<StructChunked>
where
    F: Fn(&'a str, &'a str) -> I,
    I: Iterator<Item = &'a str>,
{
    use polars_utils::format_pl_smallstr;

    /// Appends one row that is null in every field.
    fn push_null_row(arrs: &mut [MutableUtf8Array<i64>]) {
        for arr in arrs {
            arr.push_null()
        }
    }

    /// Appends one row: the i-th part goes into the i-th field, surplus parts
    /// are ignored and fields without a part are filled with null.
    fn push_split_row<'s, P>(arrs: &mut [MutableUtf8Array<i64>], parts: P)
    where
        P: Iterator<Item = &'s str>,
    {
        let mut arr_iter = arrs.iter_mut();
        parts
            .zip(&mut arr_iter)
            .for_each(|(part, arr)| arr.push(Some(part)));
        // fill the remaining with null
        for arr in arr_iter {
            arr.push_null()
        }
    }

    let mut arrs = (0..n)
        .map(|_| MutableUtf8Array::<i64>::with_capacity(ca.len()))
        .collect::<Vec<_>>();

    if by.len() == 1 {
        match by.get(0) {
            // Null separator: only a single null row is written here.
            // NOTE(review): presumably `StructChunked::from_series` below
            // broadcasts these length-1 fields to `ca.len()` - confirm.
            None => push_null_row(&mut arrs),
            // An empty separator means "split into characters".
            Some(by) if by.is_empty() => ca.for_each(|opt_s| match opt_s {
                None => push_null_row(&mut arrs),
                Some(s) => push_split_row(&mut arrs, splitn_chars(s, n, keep_remainder)),
            }),
            Some(by) => ca.for_each(|opt_s| match opt_s {
                None => push_null_row(&mut arrs),
                Some(s) => push_split_row(&mut arrs, op(s, by)),
            }),
        }
    } else {
        binary_elementwise_for_each(ca, by, |opt_s, opt_by| match (opt_s, opt_by) {
            (Some(s), Some(by)) if by.is_empty() => {
                push_split_row(&mut arrs, splitn_chars(s, n, keep_remainder))
            },
            (Some(s), Some(by)) => push_split_row(&mut arrs, op(s, by)),
            _ => push_null_row(&mut arrs),
        })
    }

    let fields = arrs
        .into_iter()
        .enumerate()
        .map(|(i, mut arr)| {
            Series::try_from((format_pl_smallstr!("field_{i}"), arr.as_box())).unwrap()
        })
        .collect::<Vec<_>>();

    StructChunked::from_series(ca.name().clone(), ca.len(), fields.iter())
}
155
156
pub fn split_helper<'a, F, I>(
157
ca: &'a StringChunked,
158
by: &'a StringChunked,
159
op: F,
160
) -> PolarsResult<ListChunked>
161
where
162
F: Fn(&'a str, &'a str) -> I,
163
I: Iterator<Item = &'a str>,
164
{
165
Ok(match (ca.len(), by.len()) {
166
(a, b) if a == b => {
167
let mut builder =
168
ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.get_values_size());
169
170
binary_elementwise_for_each(ca, by, |opt_s, opt_by| match (opt_s, opt_by) {
171
(Some(s), Some(by)) => {
172
if by.is_empty() {
173
builder.append_values_iter(split_chars(s))
174
} else {
175
builder.append_values_iter(op(s, by))
176
}
177
},
178
_ => builder.append_null(),
179
});
180
181
builder.finish()
182
},
183
(1, _) => {
184
if let Some(s) = ca.get(0) {
185
let mut builder = ListStringChunkedBuilder::new(
186
by.name().clone(),
187
by.len(),
188
by.get_values_size(),
189
);
190
191
by.for_each(|opt_by| match opt_by {
192
Some(by) => builder.append_values_iter(op(s, by)),
193
_ => builder.append_null(),
194
});
195
builder.finish()
196
} else {
197
ListChunked::full_null_with_dtype(ca.name().clone(), ca.len(), &DataType::String)
198
}
199
},
200
(_, 1) => {
201
if let Some(by) = by.get(0) {
202
let mut builder = ListStringChunkedBuilder::new(
203
ca.name().clone(),
204
ca.len(),
205
ca.get_values_size(),
206
);
207
208
if by.is_empty() {
209
ca.for_each(|opt_s| match opt_s {
210
Some(s) => builder.append_values_iter(split_chars(s)),
211
_ => builder.append_null(),
212
});
213
} else {
214
ca.for_each(|opt_s| match opt_s {
215
Some(s) => builder.append_values_iter(op(s, by)),
216
_ => builder.append_null(),
217
});
218
}
219
builder.finish()
220
} else {
221
ListChunked::full_null_with_dtype(ca.name().clone(), ca.len(), &DataType::String)
222
}
223
},
224
_ => polars_bail!(length_mismatch = "str.split", ca.len(), by.len()),
225
})
226
}
227
228
#[inline]
229
fn split_inclusive<'a>(re: &'a Regex, s: &'a str) -> impl Iterator<Item = &'a str> + 'a {
230
let mut it = re.find_iter(s);
231
let mut last_end: usize = 0;
232
let mut yielded_any = false;
233
let mut done_tail = false;
234
235
std::iter::from_fn(move || {
236
if let Some(m) = it.next() {
237
let end = m.end();
238
let out = &s[last_end..end];
239
last_end = end;
240
yielded_any = true;
241
return Some(out);
242
}
243
244
if done_tail {
245
return None;
246
}
247
done_tail = true;
248
249
if last_end < s.len() {
250
Some(&s[last_end..])
251
} else if !yielded_any {
252
Some(s)
253
} else {
254
None
255
}
256
})
257
}
258
259
#[inline]
260
fn invalid_regex_err(pat: &str) -> PolarsError {
261
polars_err!(ComputeError: "invalid regex pattern in str.split_regex: {}", pat)
262
}
263
264
#[inline]
265
fn append_split_compiled(
266
builder: &mut ListStringChunkedBuilder,
267
s: &str,
268
re: &Regex,
269
inclusive: bool,
270
) {
271
if inclusive {
272
builder.append_values_iter(split_inclusive(re, s));
273
} else {
274
builder.append_values_iter(re.split(s));
275
}
276
}
277
278
#[inline]
279
fn append_split(
280
builder: &mut ListStringChunkedBuilder,
281
s: &str,
282
pat: &str,
283
inclusive: bool,
284
strict: bool,
285
) -> PolarsResult<()> {
286
if pat.is_empty() {
287
builder.append_values_iter(split_chars(s));
288
return Ok(());
289
}
290
291
match compile_regex(pat) {
292
Ok(re) => {
293
append_split_compiled(builder, s, &re, inclusive);
294
Ok(())
295
},
296
Err(_) if strict => Err(invalid_regex_err(pat)),
297
Err(_) => {
298
builder.append_null();
299
Ok(())
300
},
301
}
302
}
303
304
pub fn split_regex_helper(
305
ca: &StringChunked,
306
by: &StringChunked,
307
inclusive: bool,
308
strict: bool,
309
) -> PolarsResult<ListChunked> {
310
use polars_utils::regex_cache::compile_regex;
311
312
Ok(match (ca.len(), by.len()) {
313
// elementwise: string[i] with pattern[i]
314
(a, b) if a == b => {
315
let mut builder =
316
ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.get_values_size());
317
318
for (opt_s, opt_pat) in ca.into_iter().zip(by.into_iter()) {
319
match (opt_s, opt_pat) {
320
(Some(s), Some(pat)) => append_split(&mut builder, s, pat, inclusive, strict)?,
321
_ => builder.append_null(),
322
}
323
}
324
325
builder.finish()
326
},
327
328
// scalar string with per-row patterns
329
(1, _) => {
330
if let Some(s0) = ca.get(0) {
331
let mut builder = ListStringChunkedBuilder::new(
332
by.name().clone(),
333
by.len(),
334
by.get_values_size(),
335
);
336
337
for opt_pat in by.into_iter() {
338
match opt_pat {
339
Some(pat) => append_split(&mut builder, s0, pat, inclusive, strict)?,
340
None => builder.append_null(),
341
}
342
}
343
344
builder.finish()
345
} else {
346
ListChunked::full_null_with_dtype(ca.name().clone(), by.len(), &DataType::String)
347
}
348
},
349
350
// per-row strings with scalar pattern
351
(_, 1) => {
352
if let Some(pat0) = by.get(0) {
353
let mut builder = ListStringChunkedBuilder::new(
354
ca.name().clone(),
355
ca.len(),
356
ca.get_values_size(),
357
);
358
359
if pat0.is_empty() {
360
ca.for_each(|opt_s| match opt_s {
361
Some(s) => builder.append_values_iter(split_chars(s)),
362
None => builder.append_null(),
363
});
364
builder.finish()
365
} else {
366
let re = match compile_regex(pat0) {
367
Ok(re) => re,
368
Err(_) if strict => return Err(invalid_regex_err(pat0)),
369
Err(_) => {
370
return Ok(ListChunked::full_null_with_dtype(
371
ca.name().clone(),
372
ca.len(),
373
&DataType::String,
374
));
375
},
376
};
377
378
ca.for_each(|opt_s| match opt_s {
379
Some(s) => append_split_compiled(&mut builder, s, &re, inclusive),
380
None => builder.append_null(),
381
});
382
383
builder.finish()
384
}
385
} else {
386
ListChunked::full_null_with_dtype(ca.name().clone(), ca.len(), &DataType::String)
387
}
388
},
389
390
_ => polars_bail!(length_mismatch = "str.split_regex", ca.len(), by.len()),
391
})
392
}
393
394