Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-expr/src/dispatch/strings.rs
8362 views
1
use std::borrow::Cow;
2
use std::sync::Arc;
3
4
use polars_core::prelude::*;
5
use polars_core::utils::{CustomIterTools, handle_casting_failures};
6
#[cfg(feature = "regex")]
7
use polars_ops::chunked_array::strings::split_regex_helper;
8
use polars_ops::prelude::{BinaryNameSpaceImpl, StringNameSpaceImpl};
9
#[cfg(feature = "temporal")]
10
use polars_plan::dsl::StrptimeOptions;
11
use polars_plan::dsl::{ColumnsUdf, SpecialEq};
12
use polars_plan::plans::IRStringFunction;
13
use polars_time::prelude::StringMethods;
14
#[cfg(feature = "regex")]
15
use regex::{NoExpand, escape};
16
17
use super::*;
18
19
pub fn function_expr_to_udf(func: IRStringFunction) -> SpecialEq<Arc<dyn ColumnsUdf>> {
20
use IRStringFunction::*;
21
match func {
22
Format { format, insertions } => {
23
map_as_slice!(strings::format, format.as_str(), insertions.as_ref())
24
},
25
#[cfg(feature = "regex")]
26
Contains { literal, strict } => map_as_slice!(strings::contains, literal, strict),
27
CountMatches(literal) => {
28
map_as_slice!(strings::count_matches, literal)
29
},
30
EndsWith => map_as_slice!(strings::ends_with),
31
StartsWith => map_as_slice!(strings::starts_with),
32
Extract(group_index) => map_as_slice!(strings::extract, group_index),
33
ExtractAll => {
34
map_as_slice!(strings::extract_all)
35
},
36
#[cfg(feature = "extract_groups")]
37
ExtractGroups { pat, dtype } => {
38
map!(strings::extract_groups, &pat, &dtype)
39
},
40
#[cfg(feature = "regex")]
41
Find { literal, strict } => map_as_slice!(strings::find, literal, strict),
42
LenBytes => map!(strings::len_bytes),
43
LenChars => map!(strings::len_chars),
44
#[cfg(feature = "string_pad")]
45
PadEnd { fill_char } => {
46
map_as_slice!(strings::pad_end, fill_char)
47
},
48
#[cfg(feature = "string_pad")]
49
PadStart { fill_char } => {
50
map_as_slice!(strings::pad_start, fill_char)
51
},
52
#[cfg(feature = "string_pad")]
53
ZFill => {
54
map_as_slice!(strings::zfill)
55
},
56
#[cfg(feature = "temporal")]
57
Strptime(dtype, options) => {
58
map_as_slice!(strings::strptime, dtype.clone(), &options)
59
},
60
Split(inclusive) => {
61
map_as_slice!(strings::split, inclusive)
62
},
63
#[cfg(feature = "regex")]
64
SplitRegex { inclusive, strict } => {
65
map_as_slice!(strings::split_regex, inclusive, strict)
66
},
67
#[cfg(feature = "dtype-struct")]
68
SplitExact { n, inclusive } => map_as_slice!(strings::split_exact, n, inclusive),
69
#[cfg(feature = "dtype-struct")]
70
SplitN(n) => map_as_slice!(strings::splitn, n),
71
#[cfg(feature = "concat_str")]
72
ConcatVertical {
73
delimiter,
74
ignore_nulls,
75
} => map!(strings::join, &delimiter, ignore_nulls),
76
#[cfg(feature = "concat_str")]
77
ConcatHorizontal {
78
delimiter,
79
ignore_nulls,
80
} => map_as_slice!(strings::concat_hor, &delimiter, ignore_nulls),
81
#[cfg(feature = "regex")]
82
Replace { n, literal } => map_as_slice!(strings::replace, literal, n),
83
#[cfg(feature = "string_normalize")]
84
Normalize { form } => map!(strings::normalize, form.clone()),
85
#[cfg(feature = "string_reverse")]
86
Reverse => map!(strings::reverse),
87
Uppercase => map!(uppercase),
88
Lowercase => map!(lowercase),
89
#[cfg(feature = "nightly")]
90
Titlecase => map!(strings::titlecase),
91
StripChars => map_as_slice!(strings::strip_chars),
92
StripCharsStart => map_as_slice!(strings::strip_chars_start),
93
StripCharsEnd => map_as_slice!(strings::strip_chars_end),
94
StripPrefix => map_as_slice!(strings::strip_prefix),
95
StripSuffix => map_as_slice!(strings::strip_suffix),
96
#[cfg(feature = "string_to_integer")]
97
ToInteger { dtype, strict } => {
98
map_as_slice!(strings::to_integer, dtype.clone(), strict)
99
},
100
Slice => map_as_slice!(strings::str_slice),
101
Head => map_as_slice!(strings::str_head),
102
Tail => map_as_slice!(strings::str_tail),
103
#[cfg(feature = "string_encoding")]
104
HexEncode => map!(strings::hex_encode),
105
#[cfg(feature = "binary_encoding")]
106
HexDecode(strict) => map!(strings::hex_decode, strict),
107
#[cfg(feature = "string_encoding")]
108
Base64Encode => map!(strings::base64_encode),
109
#[cfg(feature = "binary_encoding")]
110
Base64Decode(strict) => map!(strings::base64_decode, strict),
111
#[cfg(feature = "dtype-decimal")]
112
ToDecimal { scale } => map!(strings::to_decimal, scale),
113
#[cfg(feature = "extract_jsonpath")]
114
JsonDecode(dtype) => map!(strings::json_decode, dtype.clone()),
115
#[cfg(feature = "extract_jsonpath")]
116
JsonPathMatch => map_as_slice!(strings::json_path_match),
117
#[cfg(feature = "find_many")]
118
ContainsAny {
119
ascii_case_insensitive,
120
} => {
121
map_as_slice!(contains_any, ascii_case_insensitive)
122
},
123
#[cfg(feature = "find_many")]
124
ReplaceMany {
125
ascii_case_insensitive,
126
leftmost,
127
} => {
128
map_as_slice!(replace_many, ascii_case_insensitive, leftmost)
129
},
130
#[cfg(feature = "find_many")]
131
ExtractMany {
132
ascii_case_insensitive,
133
overlapping,
134
leftmost,
135
} => {
136
map_as_slice!(extract_many, ascii_case_insensitive, overlapping, leftmost)
137
},
138
#[cfg(feature = "find_many")]
139
FindMany {
140
ascii_case_insensitive,
141
overlapping,
142
leftmost,
143
} => {
144
map_as_slice!(find_many, ascii_case_insensitive, overlapping, leftmost)
145
},
146
#[cfg(feature = "regex")]
147
EscapeRegex => map!(escape_regex),
148
}
149
}
150
151
/// Calls `polars_ops`' `contains_any` with `s[0]` as the string column and
/// `s[1]` as the list column of patterns.
///
/// Returns an error if `s[0]` is not a string column or `s[1]` is not a list column.
#[cfg(feature = "find_many")]
fn contains_any(s: &[Column], ascii_case_insensitive: bool) -> PolarsResult<Column> {
    let haystack = s[0].str()?;
    let needles = s[1].list()?;
    let mask = polars_ops::chunked_array::strings::contains_any(
        haystack,
        needles,
        ascii_case_insensitive,
    )?;
    Ok(mask.into_column())
}
158
159
/// Calls `polars_ops`' `replace_all` with `s[0]` as the string column, `s[1]` as
/// the list column of patterns and `s[2]` as the list column of replacements.
#[cfg(feature = "find_many")]
fn replace_many(
    s: &[Column],
    ascii_case_insensitive: bool,
    leftmost: bool,
) -> PolarsResult<Column> {
    let haystack = s[0].str()?;
    let needles = s[1].list()?;
    let replacements = s[2].list()?;
    let replaced = polars_ops::chunked_array::strings::replace_all(
        haystack,
        needles,
        replacements,
        ascii_case_insensitive,
        leftmost,
    )?;
    Ok(replaced.into_column())
}
177
178
/// Calls `polars_ops`' `extract_many` with `s[0]` as the string column and
/// `s[1]` as the list column of patterns, forwarding the three flags unchanged.
#[cfg(feature = "find_many")]
fn extract_many(
    s: &[Column],
    ascii_case_insensitive: bool,
    overlapping: bool,
    leftmost: bool,
) -> PolarsResult<Column> {
    let haystack = s[0].str()?;
    let needles = s[1].list()?;
    let extracted = polars_ops::chunked_array::strings::extract_many(
        haystack,
        needles,
        ascii_case_insensitive,
        overlapping,
        leftmost,
    )?;
    Ok(extracted.into_column())
}
197
198
/// Calls `polars_ops`' `find_many` with `s[0]` as the string column and `s[1]`
/// as the list column of patterns, forwarding the three flags unchanged.
#[cfg(feature = "find_many")]
fn find_many(
    s: &[Column],
    ascii_case_insensitive: bool,
    overlapping: bool,
    leftmost: bool,
) -> PolarsResult<Column> {
    let haystack = s[0].str()?;
    let needles = s[1].list()?;
    let found = polars_ops::chunked_array::strings::find_many(
        haystack,
        needles,
        ascii_case_insensitive,
        overlapping,
        leftmost,
    )?;
    Ok(found.into_column())
}
217
218
/// Applies `to_uppercase` to a string column; errors if `s` is not a string column.
fn uppercase(s: &Column) -> PolarsResult<Column> {
    let strings = s.str()?;
    let upper = strings.to_uppercase();
    Ok(upper.into_column())
}
222
223
/// Applies `to_lowercase` to a string column; errors if `s` is not a string column.
fn lowercase(s: &Column) -> PolarsResult<Column> {
    let strings = s.str()?;
    let lower = strings.to_lowercase();
    Ok(lower.into_column())
}
227
228
/// Applies `to_titlecase` to a string column; errors if `s` is not a string column.
#[cfg(feature = "nightly")]
pub(super) fn titlecase(s: &Column) -> PolarsResult<Column> {
    let strings = s.str()?;
    let titled = strings.to_titlecase();
    Ok(titled.into_column())
}
233
234
/// Returns the result of `str_len_chars` on a string column.
pub(super) fn len_chars(s: &Column) -> PolarsResult<Column> {
    let strings = s.str()?;
    let lengths = strings.str_len_chars();
    Ok(lengths.into_column())
}
238
239
/// Returns the result of `str_len_bytes` on a string column.
pub(super) fn len_bytes(s: &Column) -> PolarsResult<Column> {
    let strings = s.str()?;
    let lengths = strings.str_len_bytes();
    Ok(lengths.into_column())
}
243
244
/// Tests each string of `s[0]` against the pattern column `s[1]` via
/// `contains_chunked`.
///
/// The inputs must have equal or unit length, otherwise a `ShapeMismatch`
/// error is returned before any column is inspected.
#[cfg(feature = "regex")]
pub(super) fn contains(s: &[Column], literal: bool, strict: bool) -> PolarsResult<Column> {
    _check_same_length(s, "contains")?;
    let haystack = s[0].str()?;
    let pattern = s[1].str()?;
    let mask = haystack.contains_chunked(pattern, literal, strict)?;
    Ok(mask.into_column())
}
252
253
/// Searches each string of `s[0]` for the pattern column `s[1]` via
/// `find_chunked`.
///
/// The inputs must have equal or unit length, otherwise a `ShapeMismatch`
/// error is returned before any column is inspected.
#[cfg(feature = "regex")]
pub(super) fn find(s: &[Column], literal: bool, strict: bool) -> PolarsResult<Column> {
    _check_same_length(s, "find")?;
    let haystack = s[0].str()?;
    let pattern = s[1].str()?;
    let positions = haystack.find_chunked(pattern, literal, strict)?;
    Ok(positions.into_column())
}
261
262
/// Checks whether each string of `s[0]` ends with the corresponding value of `s[1]`.
///
/// Both columns are viewed as binary and compared with `ends_with_chunked`.
/// The inputs must have equal or unit length.
pub(super) fn ends_with(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "ends_with")?;
    let haystack = s[0].str()?.as_binary();
    let suffix = s[1].str()?.as_binary();
    let mask = haystack.ends_with_chunked(&suffix)?;
    Ok(mask.into_column())
}
269
270
/// Checks whether each string of `s[0]` starts with the corresponding value of `s[1]`.
///
/// Both columns are viewed as binary and compared with `starts_with_chunked`.
/// The inputs must have equal or unit length.
pub(super) fn starts_with(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "starts_with")?;
    let haystack = s[0].str()?.as_binary();
    let prefix = s[1].str()?.as_binary();
    let mask = haystack.starts_with_chunked(&prefix)?;
    Ok(mask.into_column())
}
276
277
/// Extract a regex pattern from the a string value.
278
pub(super) fn extract(s: &[Column], group_index: usize) -> PolarsResult<Column> {
279
let ca = s[0].str()?;
280
let pat = s[1].str()?;
281
ca.extract(pat, group_index).map(|ca| ca.into_column())
282
}
283
284
/// Extract all capture groups of the regex `pat` from a string column.
///
/// `dtype` is forwarded unchanged to the underlying `extract_groups` call.
#[cfg(feature = "extract_groups")]
pub(super) fn extract_groups(s: &Column, pat: &str, dtype: &DataType) -> PolarsResult<Column> {
    let strings = s.str()?;
    let groups = strings.extract_groups(pat, dtype)?;
    Ok(Column::from(groups))
}
290
291
/// Pads the strings in `s[0]` at the start with `fill_char`, using the `u64`
/// column `s[1]` as the length argument.
///
/// The two inputs must have the same length unless one of them has length 1;
/// otherwise a `ShapeMismatch` error is returned.
#[cfg(feature = "string_pad")]
pub(super) fn pad_start(s: &[Column], fill_char: char) -> PolarsResult<Column> {
    let strings = s[0].as_materialized_series();
    let widths = &s[1];
    let (n_strings, n_widths) = (strings.len(), widths.len());
    polars_ensure!(
        n_strings == 1 || n_widths == 1 || n_strings == n_widths,
        ShapeMismatch: "cannot pad_start with 'length' array of length {}", n_widths
    );
    let widths = widths.as_materialized_series().u64()?;
    let padded = strings.str()?.pad_start(widths, fill_char);
    Ok(padded.into_column())
}
303
304
/// Pads the strings in `s[0]` at the end with `fill_char`, using the `u64`
/// column `s[1]` as the length argument.
///
/// The two inputs must have the same length unless one of them has length 1;
/// otherwise a `ShapeMismatch` error is returned.
#[cfg(feature = "string_pad")]
pub(super) fn pad_end(s: &[Column], fill_char: char) -> PolarsResult<Column> {
    let strings = s[0].as_materialized_series();
    let widths = &s[1];
    let (n_strings, n_widths) = (strings.len(), widths.len());
    polars_ensure!(
        n_strings == 1 || n_widths == 1 || n_strings == n_widths,
        ShapeMismatch: "cannot pad_end with 'length' array of length {}", n_widths
    );
    let widths = widths.as_materialized_series().u64()?;
    let padded = strings.str()?.pad_end(widths, fill_char);
    Ok(padded.into_column())
}
316
317
/// Applies `zfill` to the strings in `s[0]`, using the `u64` column `s[1]` as
/// the length argument.
///
/// The two inputs must have the same length unless one of them has length 1;
/// otherwise a `ShapeMismatch` error is returned.
#[cfg(feature = "string_pad")]
pub(super) fn zfill(s: &[Column]) -> PolarsResult<Column> {
    let strings = s[0].as_materialized_series();
    let widths = &s[1];
    let (n_strings, n_widths) = (strings.len(), widths.len());
    polars_ensure!(
        n_strings == 1 || n_widths == 1 || n_strings == n_widths,
        ShapeMismatch: "cannot zfill with 'length' array of length {}", n_widths
    );
    let widths = widths.as_materialized_series().u64()?;
    let filled = strings.str()?.zfill(widths);
    Ok(filled.into_column())
}
329
330
/// Applies `strip_chars` to `s[0]`, passing the column `s[1]` as the argument.
///
/// The inputs must have equal or unit length.
pub(super) fn strip_chars(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "strip_chars")?;
    let strings = s[0].str()?;
    let stripped = strings.strip_chars(&s[1])?;
    Ok(stripped.into_column())
}
336
337
/// Applies `strip_chars_start` to `s[0]`, passing the column `s[1]` as the argument.
///
/// The inputs must have equal or unit length.
pub(super) fn strip_chars_start(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "strip_chars_start")?;
    let strings = s[0].str()?;
    let stripped = strings.strip_chars_start(&s[1])?;
    Ok(stripped.into_column())
}
343
344
/// Applies `strip_chars_end` to `s[0]`, passing the column `s[1]` as the argument.
///
/// The inputs must have equal or unit length.
pub(super) fn strip_chars_end(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "strip_chars_end")?;
    let strings = s[0].str()?;
    let stripped = strings.strip_chars_end(&s[1])?;
    Ok(stripped.into_column())
}
350
351
/// Applies `strip_prefix` to `s[0]` with the prefix column `s[1]`.
///
/// The inputs must have equal or unit length.
pub(super) fn strip_prefix(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "strip_prefix")?;
    let strings = s[0].str()?;
    let prefixes = s[1].str()?;
    let stripped = strings.strip_prefix(prefixes);
    Ok(stripped.into_column())
}
357
358
/// Applies `strip_suffix` to `s[0]` with the suffix column `s[1]`.
///
/// The inputs must have equal or unit length.
pub(super) fn strip_suffix(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "strip_suffix")?;
    let strings = s[0].str()?;
    let suffixes = s[1].str()?;
    let stripped = strings.strip_suffix(suffixes);
    Ok(stripped.into_column())
}
364
365
pub(super) fn extract_all(args: &[Column]) -> PolarsResult<Column> {
366
let s = &args[0];
367
let pat = &args[1];
368
369
let ca = s.str()?;
370
let pat = pat.str()?;
371
372
if pat.len() == 1 {
373
if let Some(pat) = pat.get(0) {
374
ca.extract_all(pat).map(|ca| ca.into_column())
375
} else {
376
Ok(Column::full_null(
377
ca.name().clone(),
378
ca.len(),
379
&DataType::List(Box::new(DataType::String)),
380
))
381
}
382
} else {
383
ca.extract_all_many(pat).map(|ca| ca.into_column())
384
}
385
}
386
387
pub(super) fn count_matches(args: &[Column], literal: bool) -> PolarsResult<Column> {
388
let s = &args[0];
389
let pat = &args[1];
390
391
let ca = s.str()?;
392
let pat = pat.str()?;
393
if pat.len() == 1 {
394
if let Some(pat) = pat.get(0) {
395
ca.count_matches(pat, literal).map(|ca| ca.into_column())
396
} else {
397
Ok(Column::full_null(
398
ca.name().clone(),
399
ca.len(),
400
&DataType::UInt32,
401
))
402
}
403
} else {
404
ca.count_matches_many(pat, literal)
405
.map(|ca| ca.into_column())
406
}
407
}
408
409
/// Parses strings into the temporal `dtype`, dispatching to `to_date`,
/// `to_datetime` or `to_time`.
///
/// `Date` and `Time` only use `s[0]`; `Datetime` receives the whole slice.
/// Any other dtype yields a `ComputeError`.
#[cfg(feature = "temporal")]
pub(super) fn strptime(
    s: &[Column],
    dtype: DataType,
    options: &StrptimeOptions,
) -> PolarsResult<Column> {
    match dtype {
        #[cfg(feature = "dtype-date")]
        DataType::Date => to_date(&s[0], options),
        #[cfg(feature = "dtype-datetime")]
        DataType::Datetime(unit, zone) => to_datetime(s, &unit, zone.as_ref(), options),
        #[cfg(feature = "dtype-time")]
        DataType::Time => to_time(&s[0], options),
        unsupported => {
            polars_bail!(ComputeError: "not implemented for dtype {}", unsupported)
        },
    }
}
427
428
/// Splits the strings in `s[0]` by the separator column `s[1]`, forwarding `n`.
///
/// Uses `split_exact_inclusive` when `inclusive` is set, `split_exact` otherwise.
#[cfg(feature = "dtype-struct")]
pub(super) fn split_exact(s: &[Column], n: usize, inclusive: bool) -> PolarsResult<Column> {
    let strings = s[0].str()?;
    let separator = s[1].str()?;

    let column = if inclusive {
        strings.split_exact_inclusive(separator, n)?.into_column()
    } else {
        strings.split_exact(separator, n)?.into_column()
    };
    Ok(column)
}
439
440
/// Applies `splitn` to the strings in `s[0]` with the separator column `s[1]`,
/// forwarding `n`.
#[cfg(feature = "dtype-struct")]
pub(super) fn splitn(s: &[Column], n: usize) -> PolarsResult<Column> {
    let strings = s[0].str()?;
    let separator = s[1].str()?;

    let parts = strings.splitn(separator, n)?;
    Ok(parts.into_column())
}
447
448
pub(super) fn split(s: &[Column], inclusive: bool) -> PolarsResult<Column> {
449
let ca = s[0].str()?;
450
let by = s[1].str()?;
451
452
if inclusive {
453
Ok(ca.split_inclusive(by)?.into_column())
454
} else {
455
Ok(ca.split(by)?.into_column())
456
}
457
}
458
459
/// Splits the strings in `s[0]` by the pattern column `s[1]` through
/// `split_regex_helper`, forwarding `inclusive` and `strict`.
#[cfg(feature = "regex")]
pub(super) fn split_regex(s: &[Column], inclusive: bool, strict: bool) -> PolarsResult<Column> {
    let strings = s[0].str()?;
    let pattern = s[1].str()?;

    split_regex_helper(strings, pattern, inclusive, strict).map(|parts| parts.into_column())
}
467
468
/// Parses a string column into dates.
///
/// `options.exact` selects `as_date` (which also receives `options.cache`)
/// over `as_date_not_exact`. With `options.strict`, a change in null count
/// between input and output is handed to `handle_casting_failures`.
#[cfg(feature = "dtype-date")]
fn to_date(s: &Column, options: &StrptimeOptions) -> PolarsResult<Column> {
    let ca = s.str()?;
    let fmt = options.format.as_deref();

    let out = if options.exact {
        ca.as_date(fmt, options.cache)?.into_column()
    } else {
        ca.as_date_not_exact(fmt)?.into_column()
    };

    // A differing null count means at least one non-null string failed to parse.
    let has_failures = ca.null_count() != out.null_count();
    if options.strict && has_failures {
        handle_casting_failures(s.as_materialized_series(), out.as_materialized_series())?;
    }
    Ok(out)
}
486
487
/// Parses the string column `s[0]` into datetimes, with `s[1]` as the
/// `ambiguous` string column.
///
/// The two columns must have the same length unless one of them has length 1.
/// `options.exact` selects `as_datetime` over `as_datetime_not_exact`. With
/// `options.strict`, a change in null count between input and output is
/// handed to `handle_casting_failures`.
#[cfg(feature = "dtype-datetime")]
fn to_datetime(
    s: &[Column],
    time_unit: &TimeUnit,
    time_zone: Option<&TimeZone>,
    options: &StrptimeOptions,
) -> PolarsResult<Column> {
    let strings = &s[0].str()?;
    let ambiguous = &s[1].str()?;

    polars_ensure!(
        strings.len() == ambiguous.len()
            || strings.len() == 1
            || ambiguous.len() == 1,
        length_mismatch = "str.strptime",
        strings.len(),
        ambiguous.len()
    );

    // Only evaluated against the format when both `regex` and `timezones`
    // are enabled; otherwise the format is treated as not tz-aware.
    let tz_aware = match &options.format {
        #[cfg(all(feature = "regex", feature = "timezones"))]
        Some(format) => polars_plan::plans::TZ_AWARE_RE.is_match(format),
        _ => false,
    };

    let fmt = options.format.as_deref();
    let out = if options.exact {
        strings
            .as_datetime(
                fmt,
                *time_unit,
                options.cache,
                tz_aware,
                time_zone,
                ambiguous,
            )?
            .into_column()
    } else {
        // NOTE(review): the meaning of the trailing `true` is defined by
        // `as_datetime_not_exact` and is not visible from this file.
        strings
            .as_datetime_not_exact(fmt, *time_unit, tz_aware, time_zone, ambiguous, true)?
            .into_column()
    };

    // A differing null count means at least one non-null string failed to parse.
    let has_failures = strings.null_count() != out.null_count();
    if options.strict && has_failures {
        handle_casting_failures(s[0].as_materialized_series(), out.as_materialized_series())?;
    }
    Ok(out)
}
541
542
/// Parses a string column into times via `as_time`.
///
/// Only exact parsing is available: `options.exact == false` yields a
/// `ComputeError`. With `options.strict`, a change in null count between
/// input and output is handed to `handle_casting_failures`.
#[cfg(feature = "dtype-time")]
fn to_time(s: &Column, options: &StrptimeOptions) -> PolarsResult<Column> {
    if !options.exact {
        polars_bail!(ComputeError: "non-exact not implemented for Time data type");
    }

    let ca = s.str()?;
    let parsed = ca.as_time(options.format.as_deref(), options.cache)?;
    let out = parsed.into_column();

    // A differing null count means at least one non-null string failed to parse.
    let has_failures = ca.null_count() != out.null_count();
    if options.strict && has_failures {
        handle_casting_failures(s.as_materialized_series(), out.as_materialized_series())?;
    }
    Ok(out)
}
558
559
/// Casts the column to `String` and joins it with `str_join`, forwarding
/// `delimiter` and `ignore_nulls`.
#[cfg(feature = "concat_str")]
pub(super) fn join(s: &Column, delimiter: &str, ignore_nulls: bool) -> PolarsResult<Column> {
    let as_string = s.cast(&DataType::String)?;
    let values = as_string.str()?;
    Ok(polars_ops::chunked_array::str_join(values, delimiter, ignore_nulls).into_column())
}
565
566
/// Casts every input column to `String` and concatenates them with
/// `hor_str_concat`, forwarding `delimiter` and `ignore_nulls`.
///
/// The first failing cast aborts the operation with its error.
#[cfg(feature = "concat_str")]
pub(super) fn concat_hor(
    series: &[Column],
    delimiter: &str,
    ignore_nulls: bool,
) -> PolarsResult<Column> {
    let mut as_strings = Vec::with_capacity(series.len());
    for column in series {
        as_strings.push(column.cast(&DataType::String)?);
    }

    // Every column was just cast to `String`, so `str()` cannot fail here.
    let cas: Vec<_> = as_strings
        .iter()
        .map(|column| column.str().unwrap())
        .collect();

    let out = polars_ops::chunked_array::hor_str_concat(&cas, delimiter, ignore_nulls)?;
    Ok(out.into_column())
}
579
580
/// Returns the first value of the pattern column, or a `ComputeError` if
/// `pat.get(0)` yields nothing.
#[cfg(feature = "regex")]
fn get_pat(pat: &StringChunked) -> PolarsResult<&str> {
    match pat.get(0) {
        Some(pattern) => Ok(pattern),
        None => polars_bail!(ComputeError: "pattern cannot be 'null' in 'replace' expression"),
    }
}
586
587
// used only if feature="regex"
588
#[allow(dead_code)]
589
fn iter_and_replace<'a, F>(ca: &'a StringChunked, val: &'a StringChunked, f: F) -> StringChunked
590
where
591
F: Fn(&'a str, &'a str) -> Cow<'a, str>,
592
{
593
let mut out: StringChunked = ca
594
.into_iter()
595
.zip(val)
596
.map(|(opt_src, opt_val)| match (opt_src, opt_val) {
597
(Some(src), Some(val)) => Some(f(src, val)),
598
(Some(src), None) => Some(Cow::from(src)),
599
_ => None,
600
})
601
.collect_trusted();
602
603
out.rename(ca.name().clone());
604
out
605
}
606
607
#[cfg(feature = "regex")]
608
/// Returns `true` when `pat` contains no ASCII punctuation character.
///
/// The callers in this file use this to treat such a pattern as a plain
/// literal. An empty pattern counts as literal.
fn is_literal_pat(pat: &str) -> bool {
    !pat.chars().any(|ch| ch.is_ascii_punctuation())
}
611
612
/// Replaces up to `n` matches of a single pattern in every string of `ca`.
///
/// The shape of `(pat, val)` selects the strategy:
///
/// * `(1, 1)`: one pattern and one replacement value. A null value returns
///   `ca` unchanged. A literal pattern (requested via `literal`, or detected
///   by [`is_literal_pat`]) is handled by `replace_literal`, which receives
///   `n`. A regex pattern supports `n == 0` (no-op) and `n == 1`.
/// * `(1, len)`: one pattern and one replacement value per row. `len` must
///   equal `ca.len()`, and only `n == 0` (no-op) and `n == 1` are supported.
///
/// # Errors
///
/// Returns a `ComputeError` when the pattern is null, when `n > 1` is
/// requested for a regex or per-row replacement, when the replacement column
/// length does not match `ca`, or when `pat` holds more than one pattern.
/// Regex compilation errors are propagated.
#[cfg(feature = "regex")]
fn replace_n<'a>(
    ca: &'a StringChunked,
    pat: &'a StringChunked,
    val: &'a StringChunked,
    literal: bool,
    n: usize,
) -> PolarsResult<StringChunked> {
    match (pat.len(), val.len()) {
        (1, 1) => {
            let pat = get_pat(pat)?;
            let Some(val) = val.get(0) else {
                // A null replacement value leaves the column untouched.
                return Ok(ca.clone());
            };

            if literal || is_literal_pat(pat) {
                ca.replace_literal(pat, val, n)
            } else {
                match n {
                    // `ca.replace` is not given `n`, so a request for zero
                    // replacements has to short-circuit here, mirroring the
                    // per-row branch below.
                    0 => Ok(ca.clone()),
                    1 => ca.replace(pat, val),
                    _ => polars_bail!(
                        ComputeError: "regex replacement with 'n > 1' not yet supported"
                    ),
                }
            }
        },
        (1, len_val) => {
            if n > 1 {
                polars_bail!(ComputeError: "multivalue replacement with 'n > 1' not yet supported")
            }

            if n == 0 {
                return Ok(ca.clone());
            };

            // from here on, we know that n == 1
            let mut pat = get_pat(pat)?.to_string();
            polars_ensure!(
                len_val == ca.len(),
                ComputeError:
                "replacement value length ({}) does not match string column length ({})",
                len_val, ca.len(),
            );

            // A literal pattern is escaped so it can still go through the regex engine.
            if literal || is_literal_pat(&pat) {
                pat = escape(&pat)
            }

            let reg = polars_utils::regex_cache::compile_regex(&pat)?;

            // Only an explicit `literal` request wraps the value in `NoExpand`;
            // a pattern that merely looks literal does not.
            let f = |s: &'a str, val: &'a str| {
                if literal {
                    reg.replace(s, NoExpand(val))
                } else {
                    reg.replace(s, val)
                }
            };

            Ok(iter_and_replace(ca, val, f))
        },
        _ => polars_bail!(
            ComputeError: "dynamic pattern length in 'str.replace' expressions is not supported yet"
        ),
    }
}
679
680
/// Replaces every match of a single pattern in every string of `ca`.
///
/// The shape of `(pat, val)` selects the strategy:
///
/// * `(1, 1)`: one pattern and one replacement value; both must be non-null.
///   A literal pattern (requested via `literal`, or detected by
///   [`is_literal_pat`]) goes through `replace_literal_all`, anything else
///   through the regex `replace_all`.
/// * `(1, len)`: one pattern and one replacement value per row; `len` must
///   equal `ca.len()`.
///
/// Any other shape yields a `ComputeError`.
#[cfg(feature = "regex")]
fn replace_all<'a>(
    ca: &'a StringChunked,
    pat: &'a StringChunked,
    val: &'a StringChunked,
    literal: bool,
) -> PolarsResult<StringChunked> {
    match (pat.len(), val.len()) {
        (1, 1) => {
            let pattern = get_pat(pat)?;
            let Some(value) = val.get(0) else {
                polars_bail!(ComputeError: "value cannot be 'null' in 'replace' expression")
            };

            if literal || is_literal_pat(pattern) {
                ca.replace_literal_all(pattern, value)
            } else {
                ca.replace_all(pattern, value)
            }
        },
        (1, len_val) => {
            let mut pattern = get_pat(pat)?.to_string();
            polars_ensure!(
                len_val == ca.len(),
                ComputeError:
                "replacement value length ({}) does not match string column length ({})",
                len_val, ca.len(),
            );

            // A literal pattern is escaped so it can still go through the regex engine.
            let treat_as_literal = literal || is_literal_pat(&pattern);
            if treat_as_literal {
                pattern = escape(&pattern)
            }

            let reg = polars_utils::regex_cache::compile_regex(&pattern)?;

            // According to the docs for replace_all
            // when literal = True then capture groups are ignored.
            let per_row = |s: &'a str, val: &'a str| {
                if literal {
                    reg.replace_all(s, NoExpand(val))
                } else {
                    reg.replace_all(s, val)
                }
            };

            Ok(iter_and_replace(ca, val, per_row))
        },
        _ => polars_bail!(
            ComputeError: "dynamic pattern length in 'str.replace' expressions is not supported yet"
        ),
    }
}
734
735
pub(super) fn format(s: &mut [Column], format: &str, insertions: &[usize]) -> PolarsResult<Column> {
736
polars_ops::series::str_format(s, format, insertions)
737
}
738
739
/// Entry point for `str.replace`: `s[0]` is the string column, `s[1]` the
/// pattern column and `s[2]` the replacement column.
///
/// A negative `n` replaces all matches ([`replace_all`]); otherwise `n` is
/// passed to [`replace_n`].
#[cfg(feature = "regex")]
pub(super) fn replace(s: &[Column], literal: bool, n: i64) -> PolarsResult<Column> {
    let strings = s[0].str()?;
    let pattern = s[1].str()?;
    let value = s[2].str()?;

    let replaced = if n < 0 {
        replace_all(strings, pattern, value, literal)?
    } else {
        // `n` is non-negative in this branch.
        replace_n(strings, pattern, value, literal, n as usize)?
    };
    Ok(replaced.into_column())
}
757
758
/// Applies `str_normalize` with the given Unicode form to a string column.
#[cfg(feature = "string_normalize")]
pub(super) fn normalize(
    s: &Column,
    form: polars_ops::prelude::UnicodeForm,
) -> PolarsResult<Column> {
    let strings = s.str()?;
    let normalized = strings.str_normalize(form);
    Ok(normalized.into_column())
}
766
767
/// Applies `str_reverse` to a string column.
#[cfg(feature = "string_reverse")]
pub(super) fn reverse(s: &Column) -> PolarsResult<Column> {
    let strings = s.str()?;
    let reversed = strings.str_reverse();
    Ok(reversed.into_column())
}
772
773
/// Parses the strings in `s[0]` as integers.
///
/// `s[1]` is strictly cast to `UInt32` and passed as the base argument;
/// `dtype` and `strict` are forwarded to the underlying `to_integer` call.
#[cfg(feature = "string_to_integer")]
pub(super) fn to_integer(
    s: &[Column],
    dtype: Option<DataType>,
    strict: bool,
) -> PolarsResult<Column> {
    let digits = s[0].str()?;
    let base = s[1].strict_cast(&DataType::UInt32)?;
    let parsed = digits.to_integer(base.u32()?, dtype, strict)?;
    Ok(parsed.into_column())
}
784
785
fn _ensure_lengths(s: &[Column]) -> bool {
786
// Calculate the post-broadcast length and ensure everything is consistent.
787
let len = s
788
.iter()
789
.map(|series| series.len())
790
.filter(|l| *l != 1)
791
.max()
792
.unwrap_or(1);
793
s.iter()
794
.all(|series| series.len() == 1 || series.len() == len)
795
}
796
797
fn _check_same_length(s: &[Column], fn_name: &str) -> Result<(), PolarsError> {
798
polars_ensure!(
799
_ensure_lengths(s),
800
ShapeMismatch: "all series in `str.{}()` should have equal or unit length",
801
fn_name
802
);
803
Ok(())
804
}
805
806
/// Applies `str_slice` to `s[0]`, with `s[1]` as the offset column and `s[2]`
/// as the length column.
///
/// The inputs must have equal or unit length.
pub(super) fn str_slice(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "slice")?;
    let strings = s[0].str()?;
    let sliced = strings.str_slice(&s[1], &s[2])?;
    Ok(sliced.into_column())
}
813
814
/// Applies `str_head` to `s[0]`, with `s[1]` as the `n` column.
///
/// The inputs must have equal or unit length.
pub(super) fn str_head(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "head")?;
    let strings = s[0].str()?;
    let head = strings.str_head(&s[1])?;
    Ok(head.into_column())
}
820
821
/// Applies `str_tail` to `s[0]`, with `s[1]` as the `n` column.
///
/// The inputs must have equal or unit length.
pub(super) fn str_tail(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "tail")?;
    let strings = s[0].str()?;
    let tail = strings.str_tail(&s[1])?;
    Ok(tail.into_column())
}
827
828
/// Applies `hex_encode` to a string column.
#[cfg(feature = "string_encoding")]
pub(super) fn hex_encode(s: &Column) -> PolarsResult<Column> {
    let strings = s.str()?;
    let encoded = strings.hex_encode();
    Ok(encoded.into_column())
}
832
833
/// Applies `hex_decode` to a string column, forwarding `strict`.
#[cfg(feature = "binary_encoding")]
pub(super) fn hex_decode(s: &Column, strict: bool) -> PolarsResult<Column> {
    let strings = s.str()?;
    let decoded = strings.hex_decode(strict)?;
    Ok(decoded.into_column())
}
837
838
/// Applies `base64_encode` to a string column.
#[cfg(feature = "string_encoding")]
pub(super) fn base64_encode(s: &Column) -> PolarsResult<Column> {
    let strings = s.str()?;
    let encoded = strings.base64_encode();
    Ok(encoded.into_column())
}
842
843
/// Applies `base64_decode` to a string column, forwarding `strict`.
#[cfg(feature = "binary_encoding")]
pub(super) fn base64_decode(s: &Column, strict: bool) -> PolarsResult<Column> {
    let strings = s.str()?;
    let decoded = strings.base64_decode(strict)?;
    Ok(decoded.into_column())
}
847
848
/// Parses a string column into decimals with the given `scale`, passing
/// `DEC128_MAX_PREC` as the precision argument.
#[cfg(feature = "dtype-decimal")]
pub(super) fn to_decimal(s: &Column, scale: usize) -> PolarsResult<Column> {
    let strings = s.str()?;
    let decimals = strings.to_decimal(polars_compute::decimal::DEC128_MAX_PREC, scale)?;
    Ok(Column::from(decimals))
}
854
855
/// Applies `json_decode` to a string column, passing `Some(dtype)` as the
/// first argument and `None` as the second.
#[cfg(feature = "extract_jsonpath")]
pub(super) fn json_decode(s: &Column, dtype: DataType) -> PolarsResult<Column> {
    use polars_ops::prelude::Utf8JsonPathImpl;

    let strings = s.str()?;
    let decoded = strings.json_decode(Some(dtype), None)?;
    Ok(Column::from(decoded))
}
862
863
/// Applies `json_path_match` to the string column `s[0]` with the path
/// column `s[1]`.
///
/// The inputs must have equal or unit length.
#[cfg(feature = "extract_jsonpath")]
pub(super) fn json_path_match(s: &[Column]) -> PolarsResult<Column> {
    use polars_ops::prelude::Utf8JsonPathImpl;

    _check_same_length(s, "json_path_match")?;
    let documents = s[0].str()?;
    let paths = s[1].str()?;
    let matched = documents.json_path_match(paths)?;
    Ok(matched.into_column())
}
872
873
/// Applies `str_escape_regex` to a string column.
#[cfg(feature = "regex")]
pub(super) fn escape_regex(s: &Column) -> PolarsResult<Column> {
    let strings = s.str()?;
    let escaped = strings.str_escape_regex();
    Ok(escaped.into_column())
}
878
879