Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-expr/src/dispatch/strings.rs
7884 views
1
use std::borrow::Cow;
2
use std::sync::Arc;
3
4
use polars_core::prelude::*;
5
use polars_core::utils::{CustomIterTools, handle_casting_failures};
6
use polars_ops::prelude::{BinaryNameSpaceImpl, StringNameSpaceImpl};
7
#[cfg(feature = "temporal")]
8
use polars_plan::dsl::StrptimeOptions;
9
use polars_plan::dsl::{ColumnsUdf, SpecialEq};
10
use polars_plan::plans::IRStringFunction;
11
use polars_time::prelude::StringMethods;
12
#[cfg(feature = "regex")]
13
use regex::{NoExpand, escape};
14
15
use super::*;
16
17
pub fn function_expr_to_udf(func: IRStringFunction) -> SpecialEq<Arc<dyn ColumnsUdf>> {
18
use IRStringFunction::*;
19
match func {
20
Format { format, insertions } => {
21
map_as_slice!(strings::format, format.as_str(), insertions.as_ref())
22
},
23
#[cfg(feature = "regex")]
24
Contains { literal, strict } => map_as_slice!(strings::contains, literal, strict),
25
CountMatches(literal) => {
26
map_as_slice!(strings::count_matches, literal)
27
},
28
EndsWith => map_as_slice!(strings::ends_with),
29
StartsWith => map_as_slice!(strings::starts_with),
30
Extract(group_index) => map_as_slice!(strings::extract, group_index),
31
ExtractAll => {
32
map_as_slice!(strings::extract_all)
33
},
34
#[cfg(feature = "extract_groups")]
35
ExtractGroups { pat, dtype } => {
36
map!(strings::extract_groups, &pat, &dtype)
37
},
38
#[cfg(feature = "regex")]
39
Find { literal, strict } => map_as_slice!(strings::find, literal, strict),
40
LenBytes => map!(strings::len_bytes),
41
LenChars => map!(strings::len_chars),
42
#[cfg(feature = "string_pad")]
43
PadEnd { fill_char } => {
44
map_as_slice!(strings::pad_end, fill_char)
45
},
46
#[cfg(feature = "string_pad")]
47
PadStart { fill_char } => {
48
map_as_slice!(strings::pad_start, fill_char)
49
},
50
#[cfg(feature = "string_pad")]
51
ZFill => {
52
map_as_slice!(strings::zfill)
53
},
54
#[cfg(feature = "temporal")]
55
Strptime(dtype, options) => {
56
map_as_slice!(strings::strptime, dtype.clone(), &options)
57
},
58
Split(inclusive) => {
59
map_as_slice!(strings::split, inclusive)
60
},
61
#[cfg(feature = "dtype-struct")]
62
SplitExact { n, inclusive } => map_as_slice!(strings::split_exact, n, inclusive),
63
#[cfg(feature = "dtype-struct")]
64
SplitN(n) => map_as_slice!(strings::splitn, n),
65
#[cfg(feature = "concat_str")]
66
ConcatVertical {
67
delimiter,
68
ignore_nulls,
69
} => map!(strings::join, &delimiter, ignore_nulls),
70
#[cfg(feature = "concat_str")]
71
ConcatHorizontal {
72
delimiter,
73
ignore_nulls,
74
} => map_as_slice!(strings::concat_hor, &delimiter, ignore_nulls),
75
#[cfg(feature = "regex")]
76
Replace { n, literal } => map_as_slice!(strings::replace, literal, n),
77
#[cfg(feature = "string_normalize")]
78
Normalize { form } => map!(strings::normalize, form.clone()),
79
#[cfg(feature = "string_reverse")]
80
Reverse => map!(strings::reverse),
81
Uppercase => map!(uppercase),
82
Lowercase => map!(lowercase),
83
#[cfg(feature = "nightly")]
84
Titlecase => map!(strings::titlecase),
85
StripChars => map_as_slice!(strings::strip_chars),
86
StripCharsStart => map_as_slice!(strings::strip_chars_start),
87
StripCharsEnd => map_as_slice!(strings::strip_chars_end),
88
StripPrefix => map_as_slice!(strings::strip_prefix),
89
StripSuffix => map_as_slice!(strings::strip_suffix),
90
#[cfg(feature = "string_to_integer")]
91
ToInteger { dtype, strict } => {
92
map_as_slice!(strings::to_integer, dtype.clone(), strict)
93
},
94
Slice => map_as_slice!(strings::str_slice),
95
Head => map_as_slice!(strings::str_head),
96
Tail => map_as_slice!(strings::str_tail),
97
#[cfg(feature = "string_encoding")]
98
HexEncode => map!(strings::hex_encode),
99
#[cfg(feature = "binary_encoding")]
100
HexDecode(strict) => map!(strings::hex_decode, strict),
101
#[cfg(feature = "string_encoding")]
102
Base64Encode => map!(strings::base64_encode),
103
#[cfg(feature = "binary_encoding")]
104
Base64Decode(strict) => map!(strings::base64_decode, strict),
105
#[cfg(feature = "dtype-decimal")]
106
ToDecimal { scale } => map!(strings::to_decimal, scale),
107
#[cfg(feature = "extract_jsonpath")]
108
JsonDecode(dtype) => map!(strings::json_decode, dtype.clone()),
109
#[cfg(feature = "extract_jsonpath")]
110
JsonPathMatch => map_as_slice!(strings::json_path_match),
111
#[cfg(feature = "find_many")]
112
ContainsAny {
113
ascii_case_insensitive,
114
} => {
115
map_as_slice!(contains_any, ascii_case_insensitive)
116
},
117
#[cfg(feature = "find_many")]
118
ReplaceMany {
119
ascii_case_insensitive,
120
leftmost,
121
} => {
122
map_as_slice!(replace_many, ascii_case_insensitive, leftmost)
123
},
124
#[cfg(feature = "find_many")]
125
ExtractMany {
126
ascii_case_insensitive,
127
overlapping,
128
leftmost,
129
} => {
130
map_as_slice!(extract_many, ascii_case_insensitive, overlapping, leftmost)
131
},
132
#[cfg(feature = "find_many")]
133
FindMany {
134
ascii_case_insensitive,
135
overlapping,
136
leftmost,
137
} => {
138
map_as_slice!(find_many, ascii_case_insensitive, overlapping, leftmost)
139
},
140
#[cfg(feature = "regex")]
141
EscapeRegex => map!(escape_regex),
142
}
143
}
144
145
/// Forwards to `polars_ops`' `contains_any`, using `s[0]` as the string column
/// and `s[1]` as the list column of patterns.
///
/// Fails if `s[0]` is not a string column or `s[1]` is not a list column.
#[cfg(feature = "find_many")]
fn contains_any(s: &[Column], ascii_case_insensitive: bool) -> PolarsResult<Column> {
    let haystack = s[0].str()?;
    let needles = s[1].list()?;
    let found = polars_ops::chunked_array::strings::contains_any(
        haystack,
        needles,
        ascii_case_insensitive,
    )?;
    Ok(found.into_column())
}
152
153
/// Forwards to `polars_ops`' `replace_all`, using `s[0]` as the string column,
/// `s[1]` as the list column of patterns and `s[2]` as the list column of
/// replacements.
#[cfg(feature = "find_many")]
fn replace_many(
    s: &[Column],
    ascii_case_insensitive: bool,
    leftmost: bool,
) -> PolarsResult<Column> {
    let haystack = s[0].str()?;
    let needles = s[1].list()?;
    let replacements = s[2].list()?;
    let replaced = polars_ops::chunked_array::strings::replace_all(
        haystack,
        needles,
        replacements,
        ascii_case_insensitive,
        leftmost,
    )?;
    Ok(replaced.into_column())
}
171
172
/// Forwards to `polars_ops`' `extract_many`, using `s[0]` as the string column
/// and `s[1]` as the list column of patterns.
#[cfg(feature = "find_many")]
fn extract_many(
    s: &[Column],
    ascii_case_insensitive: bool,
    overlapping: bool,
    leftmost: bool,
) -> PolarsResult<Column> {
    let haystack = s[0].str()?;
    let needles = s[1].list()?;
    let extracted = polars_ops::chunked_array::strings::extract_many(
        haystack,
        needles,
        ascii_case_insensitive,
        overlapping,
        leftmost,
    )?;
    Ok(extracted.into_column())
}
191
192
/// Forwards to `polars_ops`' `find_many`, using `s[0]` as the string column
/// and `s[1]` as the list column of patterns.
#[cfg(feature = "find_many")]
fn find_many(
    s: &[Column],
    ascii_case_insensitive: bool,
    overlapping: bool,
    leftmost: bool,
) -> PolarsResult<Column> {
    let haystack = s[0].str()?;
    let needles = s[1].list()?;
    let positions = polars_ops::chunked_array::strings::find_many(
        haystack,
        needles,
        ascii_case_insensitive,
        overlapping,
        leftmost,
    )?;
    Ok(positions.into_column())
}
211
212
/// Applies `to_uppercase` to a string column; errors if `s` is not a string column.
fn uppercase(s: &Column) -> PolarsResult<Column> {
    s.str().map(|strings| strings.to_uppercase().into_column())
}
216
217
/// Applies `to_lowercase` to a string column; errors if `s` is not a string column.
fn lowercase(s: &Column) -> PolarsResult<Column> {
    s.str().map(|strings| strings.to_lowercase().into_column())
}
221
222
/// Applies `to_titlecase` to a string column; errors if `s` is not a string column.
#[cfg(feature = "nightly")]
pub(super) fn titlecase(s: &Column) -> PolarsResult<Column> {
    s.str().map(|strings| strings.to_titlecase().into_column())
}
227
228
/// Applies `str_len_chars` to a string column; errors if `s` is not a string column.
pub(super) fn len_chars(s: &Column) -> PolarsResult<Column> {
    s.str().map(|strings| strings.str_len_chars().into_column())
}
232
233
/// Applies `str_len_bytes` to a string column; errors if `s` is not a string column.
pub(super) fn len_bytes(s: &Column) -> PolarsResult<Column> {
    s.str().map(|strings| strings.str_len_bytes().into_column())
}
237
238
/// Runs `contains_chunked` with `s[0]` as the string column and `s[1]` as the
/// pattern column.
///
/// Both inputs must have the same length, or one of them must have length 1.
#[cfg(feature = "regex")]
pub(super) fn contains(s: &[Column], literal: bool, strict: bool) -> PolarsResult<Column> {
    _check_same_length(s, "contains")?;
    let haystack = s[0].str()?;
    let pattern = s[1].str()?;
    let hits = haystack.contains_chunked(pattern, literal, strict)?;
    Ok(hits.into_column())
}
246
247
/// Runs `find_chunked` with `s[0]` as the string column and `s[1]` as the
/// pattern column.
///
/// Both inputs must have the same length, or one of them must have length 1.
#[cfg(feature = "regex")]
pub(super) fn find(s: &[Column], literal: bool, strict: bool) -> PolarsResult<Column> {
    _check_same_length(s, "find")?;
    let haystack = s[0].str()?;
    let pattern = s[1].str()?;
    let positions = haystack.find_chunked(pattern, literal, strict)?;
    Ok(positions.into_column())
}
255
256
/// Runs `ends_with_chunked` on the binary views of `s[0]` (strings) and `s[1]`
/// (suffixes).
///
/// Both inputs must have the same length, or one of them must have length 1.
pub(super) fn ends_with(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "ends_with")?;
    let haystack = s[0].str()?.as_binary();
    let suffix = s[1].str()?.as_binary();
    haystack
        .ends_with_chunked(&suffix)
        .map(|matches| matches.into_column())
}
263
264
/// Runs `starts_with_chunked` on the binary views of `s[0]` (strings) and
/// `s[1]` (prefixes).
///
/// Both inputs must have the same length, or one of them must have length 1.
pub(super) fn starts_with(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "starts_with")?;
    let haystack = s[0].str()?.as_binary();
    let prefix = s[1].str()?.as_binary();
    haystack
        .starts_with_chunked(&prefix)
        .map(|matches| matches.into_column())
}
270
271
/// Extract a regex pattern from the a string value.
272
pub(super) fn extract(s: &[Column], group_index: usize) -> PolarsResult<Column> {
273
let ca = s[0].str()?;
274
let pat = s[1].str()?;
275
ca.extract(pat, group_index).map(|ca| ca.into_column())
276
}
277
278
/// Extract all capture groups from a regex pattern as a struct.
///
/// `dtype` is passed through to `extract_groups` unchanged.
#[cfg(feature = "extract_groups")]
pub(super) fn extract_groups(s: &Column, pat: &str, dtype: &DataType) -> PolarsResult<Column> {
    let groups = s.str()?.extract_groups(pat, dtype)?;
    Ok(Column::from(groups))
}
284
285
/// Calls `pad_start` on the string column `s[0]` with the `u64` target lengths
/// in `s[1]` and the given `fill_char`.
///
/// The two inputs must have equal length, or one of them must have length 1;
/// otherwise a `ShapeMismatch` error is returned.
#[cfg(feature = "string_pad")]
pub(super) fn pad_start(s: &[Column], fill_char: char) -> PolarsResult<Column> {
    let strings = s[0].as_materialized_series();
    let length_col = &s[1];
    let (n_strings, n_lengths) = (strings.len(), length_col.len());
    polars_ensure!(
        n_strings == 1 || n_lengths == 1 || n_strings == n_lengths,
        ShapeMismatch: "cannot pad_start with 'length' array of length {}", n_lengths
    );
    let lengths = length_col.as_materialized_series().u64()?;
    let padded = strings.str()?.pad_start(lengths, fill_char);
    Ok(padded.into_column())
}
297
298
/// Calls `pad_end` on the string column `s[0]` with the `u64` target lengths in
/// `s[1]` and the given `fill_char`.
///
/// The two inputs must have equal length, or one of them must have length 1;
/// otherwise a `ShapeMismatch` error is returned.
#[cfg(feature = "string_pad")]
pub(super) fn pad_end(s: &[Column], fill_char: char) -> PolarsResult<Column> {
    let strings = s[0].as_materialized_series();
    let length_col = &s[1];
    let (n_strings, n_lengths) = (strings.len(), length_col.len());
    polars_ensure!(
        n_strings == 1 || n_lengths == 1 || n_strings == n_lengths,
        ShapeMismatch: "cannot pad_end with 'length' array of length {}", n_lengths
    );
    let lengths = length_col.as_materialized_series().u64()?;
    let padded = strings.str()?.pad_end(lengths, fill_char);
    Ok(padded.into_column())
}
310
311
/// Calls `zfill` on the string column `s[0]` with the `u64` target lengths in
/// `s[1]`.
///
/// The two inputs must have equal length, or one of them must have length 1;
/// otherwise a `ShapeMismatch` error is returned.
#[cfg(feature = "string_pad")]
pub(super) fn zfill(s: &[Column]) -> PolarsResult<Column> {
    let strings = s[0].as_materialized_series();
    let length_col = &s[1];
    let (n_strings, n_lengths) = (strings.len(), length_col.len());
    polars_ensure!(
        n_strings == 1 || n_lengths == 1 || n_strings == n_lengths,
        ShapeMismatch: "cannot zfill with 'length' array of length {}", n_lengths
    );
    let lengths = length_col.as_materialized_series().u64()?;
    let filled = strings.str()?.zfill(lengths);
    Ok(filled.into_column())
}
323
324
/// Calls `strip_chars` on the string column `s[0]`, passing the column `s[1]`
/// as the pattern argument.
///
/// Both inputs must have the same length, or one of them must have length 1.
pub(super) fn strip_chars(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "strip_chars")?;
    let stripped = s[0].str()?.strip_chars(&s[1])?;
    Ok(stripped.into_column())
}
330
331
/// Calls `strip_chars_start` on the string column `s[0]`, passing the column
/// `s[1]` as the pattern argument.
///
/// Both inputs must have the same length, or one of them must have length 1.
pub(super) fn strip_chars_start(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "strip_chars_start")?;
    let stripped = s[0].str()?.strip_chars_start(&s[1])?;
    Ok(stripped.into_column())
}
337
338
/// Calls `strip_chars_end` on the string column `s[0]`, passing the column
/// `s[1]` as the pattern argument.
///
/// Both inputs must have the same length, or one of them must have length 1.
pub(super) fn strip_chars_end(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "strip_chars_end")?;
    let stripped = s[0].str()?.strip_chars_end(&s[1])?;
    Ok(stripped.into_column())
}
344
345
/// Calls `strip_prefix` on the string column `s[0]` with the prefixes in `s[1]`.
///
/// Both inputs must have the same length, or one of them must have length 1.
pub(super) fn strip_prefix(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "strip_prefix")?;
    let strings = s[0].str()?;
    let prefixes = s[1].str()?;
    let stripped = strings.strip_prefix(prefixes);
    Ok(stripped.into_column())
}
351
352
/// Calls `strip_suffix` on the string column `s[0]` with the suffixes in `s[1]`.
///
/// Both inputs must have the same length, or one of them must have length 1.
pub(super) fn strip_suffix(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "strip_suffix")?;
    let strings = s[0].str()?;
    let suffixes = s[1].str()?;
    let stripped = strings.strip_suffix(suffixes);
    Ok(stripped.into_column())
}
358
359
pub(super) fn extract_all(args: &[Column]) -> PolarsResult<Column> {
360
let s = &args[0];
361
let pat = &args[1];
362
363
let ca = s.str()?;
364
let pat = pat.str()?;
365
366
if pat.len() == 1 {
367
if let Some(pat) = pat.get(0) {
368
ca.extract_all(pat).map(|ca| ca.into_column())
369
} else {
370
Ok(Column::full_null(
371
ca.name().clone(),
372
ca.len(),
373
&DataType::List(Box::new(DataType::String)),
374
))
375
}
376
} else {
377
ca.extract_all_many(pat).map(|ca| ca.into_column())
378
}
379
}
380
381
pub(super) fn count_matches(args: &[Column], literal: bool) -> PolarsResult<Column> {
382
let s = &args[0];
383
let pat = &args[1];
384
385
let ca = s.str()?;
386
let pat = pat.str()?;
387
if pat.len() == 1 {
388
if let Some(pat) = pat.get(0) {
389
ca.count_matches(pat, literal).map(|ca| ca.into_column())
390
} else {
391
Ok(Column::full_null(
392
ca.name().clone(),
393
ca.len(),
394
&DataType::UInt32,
395
))
396
}
397
} else {
398
ca.count_matches_many(pat, literal)
399
.map(|ca| ca.into_column())
400
}
401
}
402
403
/// Parses the string column `s[0]` into the temporal `dtype`.
///
/// Dispatches to `to_date`, `to_datetime` or `to_time` (each only compiled in
/// with its `dtype-*` feature). `to_datetime` receives the whole slice because
/// it also reads `s[1]`. Any other dtype yields a `ComputeError`.
#[cfg(feature = "temporal")]
pub(super) fn strptime(
    s: &[Column],
    dtype: DataType,
    options: &StrptimeOptions,
) -> PolarsResult<Column> {
    match dtype {
        #[cfg(feature = "dtype-date")]
        DataType::Date => to_date(&s[0], options),
        #[cfg(feature = "dtype-datetime")]
        DataType::Datetime(unit, zone) => to_datetime(s, &unit, zone.as_ref(), options),
        #[cfg(feature = "dtype-time")]
        DataType::Time => to_time(&s[0], options),
        unsupported => {
            polars_bail!(ComputeError: "not implemented for dtype {}", unsupported)
        },
    }
}
421
422
/// Splits the string column `s[0]` by the separators in `s[1]`, forwarding `n`
/// to `split_exact_inclusive` when `inclusive` is set and to `split_exact`
/// otherwise.
#[cfg(feature = "dtype-struct")]
pub(super) fn split_exact(s: &[Column], n: usize, inclusive: bool) -> PolarsResult<Column> {
    let strings = s[0].str()?;
    let separator = s[1].str()?;

    match inclusive {
        true => strings
            .split_exact_inclusive(separator, n)
            .map(|parts| parts.into_column()),
        false => strings
            .split_exact(separator, n)
            .map(|parts| parts.into_column()),
    }
}
433
434
/// Calls `splitn` on the string column `s[0]` with the separators in `s[1]`
/// and the count `n`.
#[cfg(feature = "dtype-struct")]
pub(super) fn splitn(s: &[Column], n: usize) -> PolarsResult<Column> {
    let strings = s[0].str()?;
    let separator = s[1].str()?;
    let parts = strings.splitn(separator, n)?;
    Ok(parts.into_column())
}
441
442
pub(super) fn split(s: &[Column], inclusive: bool) -> PolarsResult<Column> {
443
let ca = s[0].str()?;
444
let by = s[1].str()?;
445
446
if inclusive {
447
Ok(ca.split_inclusive(by)?.into_column())
448
} else {
449
Ok(ca.split(by)?.into_column())
450
}
451
}
452
453
/// Parses a string column into dates according to `options`.
///
/// Uses `as_date` when `options.exact` is set and `as_date_not_exact`
/// otherwise. With `options.strict`, a difference in null count between input
/// and output is reported through `handle_casting_failures`.
#[cfg(feature = "dtype-date")]
fn to_date(s: &Column, options: &StrptimeOptions) -> PolarsResult<Column> {
    let strings = s.str()?;
    let fmt = options.format.as_deref();

    let parsed = match options.exact {
        true => strings.as_date(fmt, options.cache)?.into_column(),
        false => strings.as_date_not_exact(fmt)?.into_column(),
    };

    // Parsing produced nulls that were not in the input: surface them in strict mode.
    let gained_nulls = strings.null_count() != parsed.null_count();
    if options.strict && gained_nulls {
        handle_casting_failures(s.as_materialized_series(), parsed.as_materialized_series())?;
    }
    Ok(parsed)
}
471
472
/// Parses the string column `s[0]` into datetimes according to `options`.
///
/// `s[1]` is a string column that is passed to the parser as its `ambiguous`
/// argument; it must have the same length as `s[0]`, or one of the two must
/// have length 1.
///
/// The format is checked against `TZ_AWARE_RE` only when both the `regex` and
/// `timezones` features are enabled; otherwise `tz_aware` is always `false`.
/// With `options.strict`, a difference in null count between input and output
/// is reported through `handle_casting_failures`.
#[cfg(feature = "dtype-datetime")]
fn to_datetime(
    s: &[Column],
    time_unit: &TimeUnit,
    time_zone: Option<&TimeZone>,
    options: &StrptimeOptions,
) -> PolarsResult<Column> {
    let strings = s[0].str()?;
    let ambiguous = s[1].str()?;

    polars_ensure!(
        strings.len() == ambiguous.len()
            || strings.len() == 1
            || ambiguous.len() == 1,
        length_mismatch = "str.strptime",
        strings.len(),
        ambiguous.len()
    );

    let tz_aware = match &options.format {
        #[cfg(all(feature = "regex", feature = "timezones"))]
        Some(format) => polars_plan::plans::TZ_AWARE_RE.is_match(format),
        _ => false,
    };

    let fmt = options.format.as_deref();
    let parsed = match options.exact {
        true => strings
            .as_datetime(
                fmt,
                *time_unit,
                options.cache,
                tz_aware,
                time_zone,
                ambiguous,
            )?
            .into_column(),
        false => strings
            .as_datetime_not_exact(fmt, *time_unit, tz_aware, time_zone, ambiguous, true)?
            .into_column(),
    };

    // Parsing produced nulls that were not in the input: surface them in strict mode.
    let gained_nulls = strings.null_count() != parsed.null_count();
    if options.strict && gained_nulls {
        handle_casting_failures(
            s[0].as_materialized_series(),
            parsed.as_materialized_series(),
        )?;
    }
    Ok(parsed)
}
526
527
/// Parses a string column into times according to `options`.
///
/// Only exact parsing is supported: `options.exact == false` yields a
/// `ComputeError`. With `options.strict`, a difference in null count between
/// input and output is reported through `handle_casting_failures`.
#[cfg(feature = "dtype-time")]
fn to_time(s: &Column, options: &StrptimeOptions) -> PolarsResult<Column> {
    polars_ensure!(
        options.exact, ComputeError: "non-exact not implemented for Time data type"
    );

    let strings = s.str()?;
    let fmt = options.format.as_deref();
    let parsed = strings.as_time(fmt, options.cache)?.into_column();

    // Parsing produced nulls that were not in the input: surface them in strict mode.
    let gained_nulls = strings.null_count() != parsed.null_count();
    if options.strict && gained_nulls {
        handle_casting_failures(s.as_materialized_series(), parsed.as_materialized_series())?;
    }
    Ok(parsed)
}
543
544
/// Casts `s` to a string column and passes it to `polars_ops`' `str_join`
/// together with `delimiter` and `ignore_nulls`.
#[cfg(feature = "concat_str")]
pub(super) fn join(s: &Column, delimiter: &str, ignore_nulls: bool) -> PolarsResult<Column> {
    let as_string = s.cast(&DataType::String)?;
    let strings = as_string.str()?;
    Ok(polars_ops::chunked_array::str_join(strings, delimiter, ignore_nulls).into_column())
}
550
551
/// Casts every input column to a string column and passes them to
/// `polars_ops`' `hor_str_concat` together with `delimiter` and `ignore_nulls`.
///
/// # Errors
///
/// Returns an error if a column cannot be cast to `String`, if a cast result
/// cannot be viewed as a string column, or if `hor_str_concat` fails.
#[cfg(feature = "concat_str")]
pub(super) fn concat_hor(
    series: &[Column],
    delimiter: &str,
    ignore_nulls: bool,
) -> PolarsResult<Column> {
    let str_series = series
        .iter()
        .map(|s| s.cast(&DataType::String))
        .collect::<PolarsResult<Vec<_>>>()?;

    // Propagate a failed downcast as an error instead of panicking via `unwrap`.
    let cas = str_series
        .iter()
        .map(|s| s.str())
        .collect::<PolarsResult<Vec<_>>>()?;

    let concatenated = polars_ops::chunked_array::hor_str_concat(&cas, delimiter, ignore_nulls)?;
    Ok(concatenated.into_column())
}
564
565
/// Returns the first value of `pat`, or a `ComputeError` if `pat.get(0)`
/// yields `None`.
#[cfg(feature = "regex")]
fn get_pat(pat: &StringChunked) -> PolarsResult<&str> {
    match pat.get(0) {
        Some(first) => Ok(first),
        None => Err(polars_err!(ComputeError: "pattern cannot be 'null' in 'replace' expression")),
    }
}
571
572
// used only if feature="regex"
573
#[allow(dead_code)]
574
fn iter_and_replace<'a, F>(ca: &'a StringChunked, val: &'a StringChunked, f: F) -> StringChunked
575
where
576
F: Fn(&'a str, &'a str) -> Cow<'a, str>,
577
{
578
let mut out: StringChunked = ca
579
.into_iter()
580
.zip(val)
581
.map(|(opt_src, opt_val)| match (opt_src, opt_val) {
582
(Some(src), Some(val)) => Some(f(src, val)),
583
(Some(src), None) => Some(Cow::from(src)),
584
_ => None,
585
})
586
.collect_trusted();
587
588
out.rename(ca.name().clone());
589
out
590
}
591
592
#[cfg(feature = "regex")]
593
/// Returns `true` when `pat` contains no ASCII punctuation character (this
/// includes the empty string).
fn is_literal_pat(pat: &str) -> bool {
    !pat.chars().any(|c| c.is_ascii_punctuation())
}
596
597
/// Replaces at most `n` matches of a single pattern in every string of `ca`.
///
/// `pat` must have length 1 and hold a non-null pattern. `val` either has
/// length 1 (one replacement for every row) or one entry per row of `ca`.
///
/// * `literal` — treat the pattern as a literal string. A pattern without any
///   ASCII punctuation is also handled as a literal.
/// * `n` — maximum number of replacements per string. `n == 0` returns `ca`
///   unchanged on every non-literal path; the literal scalar path passes `n`
///   on to `replace_literal`.
///
/// # Errors
///
/// Returns a `ComputeError` when
/// * the pattern is null,
/// * `n > 1` is combined with a regex pattern or with per-row values,
/// * `val` has a length other than 1 or `ca.len()`,
/// * `pat` has a length other than 1.
#[cfg(feature = "regex")]
fn replace_n<'a>(
    ca: &'a StringChunked,
    pat: &'a StringChunked,
    val: &'a StringChunked,
    literal: bool,
    n: usize,
) -> PolarsResult<StringChunked> {
    match (pat.len(), val.len()) {
        (1, 1) => {
            let pat = get_pat(pat)?;
            // A null replacement value leaves the input untouched.
            let Some(val) = val.get(0) else {
                return Ok(ca.clone());
            };
            let literal = literal || is_literal_pat(pat);

            match literal {
                true => ca.replace_literal(pat, val, n),
                false => {
                    if n > 1 {
                        polars_bail!(ComputeError: "regex replacement with 'n > 1' not yet supported")
                    }

                    // `ca.replace` always substitutes the first match, so a request
                    // for zero replacements must short-circuit here, exactly like the
                    // per-row branch below does.
                    if n == 0 {
                        return Ok(ca.clone());
                    }

                    ca.replace(pat, val)
                },
            }
        },
        (1, len_val) => {
            if n > 1 {
                polars_bail!(ComputeError: "multivalue replacement with 'n > 1' not yet supported")
            }

            if n == 0 {
                return Ok(ca.clone());
            };

            // from here on, we know that n == 1
            let mut pat = get_pat(pat)?.to_string();
            polars_ensure!(
                len_val == ca.len(),
                ComputeError:
                "replacement value length ({}) does not match string column length ({})",
                len_val, ca.len(),
            );
            let lit = is_literal_pat(&pat);
            let literal_pat = literal || lit;

            // A literal pattern is escaped so it can still go through the regex engine.
            if literal_pat {
                pat = escape(&pat)
            }

            let reg = polars_utils::regex_cache::compile_regex(&pat)?;

            let f = |s: &'a str, val: &'a str| {
                // Only an explicit `literal` request disables `$group` expansion
                // in the replacement value.
                if literal {
                    reg.replace(s, NoExpand(val))
                } else {
                    reg.replace(s, val)
                }
            };

            Ok(iter_and_replace(ca, val, f))
        },
        _ => polars_bail!(
            ComputeError: "dynamic pattern length in 'str.replace' expressions is not supported yet"
        ),
    }
}
664
665
/// Replaces every match of a single pattern in every string of `ca`.
///
/// `pat` must have length 1 and hold a non-null pattern. `val` either has
/// length 1 (one non-null replacement for every row) or one entry per row of
/// `ca`. A pattern is handled as a literal when `literal` is set or when it
/// contains no ASCII punctuation.
///
/// Returns a `ComputeError` for a null pattern, a null scalar value, a `val`
/// length other than 1 or `ca.len()`, or a `pat` length other than 1.
#[cfg(feature = "regex")]
fn replace_all<'a>(
    ca: &'a StringChunked,
    pat: &'a StringChunked,
    val: &'a StringChunked,
    literal: bool,
) -> PolarsResult<StringChunked> {
    match (pat.len(), val.len()) {
        (1, 1) => {
            let pattern = get_pat(pat)?;
            let Some(replacement) = val.get(0) else {
                polars_bail!(ComputeError: "value cannot be 'null' in 'replace' expression")
            };

            if literal || is_literal_pat(pattern) {
                ca.replace_literal_all(pattern, replacement)
            } else {
                ca.replace_all(pattern, replacement)
            }
        },
        (1, len_val) => {
            let raw_pattern = get_pat(pat)?;
            polars_ensure!(
                len_val == ca.len(),
                ComputeError:
                "replacement value length ({}) does not match string column length ({})",
                len_val, ca.len(),
            );

            // A literal pattern is escaped so it can still go through the regex engine.
            let pattern = if literal || is_literal_pat(raw_pattern) {
                escape(raw_pattern)
            } else {
                raw_pattern.to_string()
            };

            let reg = polars_utils::regex_cache::compile_regex(&pattern)?;

            // According to the docs for replace_all
            // when literal = True then capture groups are ignored.
            let substitute = |s: &'a str, val: &'a str| match literal {
                true => reg.replace_all(s, NoExpand(val)),
                false => reg.replace_all(s, val),
            };

            Ok(iter_and_replace(ca, val, substitute))
        },
        _ => polars_bail!(
            ComputeError: "dynamic pattern length in 'str.replace' expressions is not supported yet"
        ),
    }
}
719
720
pub(super) fn format(s: &mut [Column], format: &str, insertions: &[usize]) -> PolarsResult<Column> {
721
polars_ops::series::str_format(s, format, insertions)
722
}
723
724
/// Entry point for `str.replace`: `s[0]` is the string column, `s[1]` the
/// pattern column and `s[2]` the replacement-value column.
///
/// A negative `n` replaces all matches (`replace_all`); otherwise at most `n`
/// matches are replaced (`replace_n`).
#[cfg(feature = "regex")]
pub(super) fn replace(s: &[Column], literal: bool, n: i64) -> PolarsResult<Column> {
    let (source, pattern, value) = (&s[0], &s[1], &s[2]);

    let source = source.str()?;
    let pattern = pattern.str()?;
    let value = value.str()?;

    let replaced = if n < 0 {
        replace_all(source, pattern, value, literal)?
    } else {
        // `n` is non-negative on this branch.
        replace_n(source, pattern, value, literal, n as usize)?
    };
    Ok(replaced.into_column())
}
742
743
/// Applies `str_normalize` with the given Unicode `form` to a string column.
#[cfg(feature = "string_normalize")]
pub(super) fn normalize(
    s: &Column,
    form: polars_ops::prelude::UnicodeForm,
) -> PolarsResult<Column> {
    s.str()
        .map(|strings| strings.str_normalize(form).into_column())
}
751
752
/// Applies `str_reverse` to a string column; errors if `s` is not a string column.
#[cfg(feature = "string_reverse")]
pub(super) fn reverse(s: &Column) -> PolarsResult<Column> {
    s.str().map(|strings| strings.str_reverse().into_column())
}
757
758
/// Calls `to_integer` on the string column `s[0]`, using `s[1]` (strictly cast
/// to `UInt32`) as the base column; `dtype` and `strict` are passed through.
#[cfg(feature = "string_to_integer")]
pub(super) fn to_integer(
    s: &[Column],
    dtype: Option<DataType>,
    strict: bool,
) -> PolarsResult<Column> {
    let strings = s[0].str()?;
    let base_col = s[1].strict_cast(&DataType::UInt32)?;
    let base = base_col.u32()?;
    let parsed = strings.to_integer(base, dtype, strict)?;
    Ok(parsed.into_column())
}
769
770
fn _ensure_lengths(s: &[Column]) -> bool {
771
// Calculate the post-broadcast length and ensure everything is consistent.
772
let len = s
773
.iter()
774
.map(|series| series.len())
775
.filter(|l| *l != 1)
776
.max()
777
.unwrap_or(1);
778
s.iter()
779
.all(|series| series.len() == 1 || series.len() == len)
780
}
781
782
fn _check_same_length(s: &[Column], fn_name: &str) -> Result<(), PolarsError> {
783
polars_ensure!(
784
_ensure_lengths(s),
785
ShapeMismatch: "all series in `str.{}()` should have equal or unit length",
786
fn_name
787
);
788
Ok(())
789
}
790
791
/// Calls `str_slice` on the string column `s[0]` with the offset column `s[1]`
/// and the length column `s[2]`.
///
/// All inputs must have the same length or length 1.
pub(super) fn str_slice(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "slice")?;
    s[0].str()?
        .str_slice(&s[1], &s[2])
        .map(|sliced| sliced.into_column())
}
798
799
/// Calls `str_head` on the string column `s[0]` with the column `s[1]` as `n`.
///
/// Both inputs must have the same length, or one of them must have length 1.
pub(super) fn str_head(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "head")?;
    s[0].str()?.str_head(&s[1]).map(|head| head.into_column())
}
805
806
/// Calls `str_tail` on the string column `s[0]` with the column `s[1]` as `n`.
///
/// Both inputs must have the same length, or one of them must have length 1.
pub(super) fn str_tail(s: &[Column]) -> PolarsResult<Column> {
    _check_same_length(s, "tail")?;
    s[0].str()?.str_tail(&s[1]).map(|tail| tail.into_column())
}
812
813
/// Applies `hex_encode` to a string column; errors if `s` is not a string column.
#[cfg(feature = "string_encoding")]
pub(super) fn hex_encode(s: &Column) -> PolarsResult<Column> {
    let strings = s.str()?;
    let encoded = strings.hex_encode();
    Ok(encoded.into_column())
}
817
818
/// Applies `hex_decode` to a string column, passing `strict` through.
#[cfg(feature = "binary_encoding")]
pub(super) fn hex_decode(s: &Column, strict: bool) -> PolarsResult<Column> {
    let strings = s.str()?;
    let decoded = strings.hex_decode(strict)?;
    Ok(decoded.into_column())
}
822
823
/// Applies `base64_encode` to a string column; errors if `s` is not a string column.
#[cfg(feature = "string_encoding")]
pub(super) fn base64_encode(s: &Column) -> PolarsResult<Column> {
    let strings = s.str()?;
    let encoded = strings.base64_encode();
    Ok(encoded.into_column())
}
827
828
/// Applies `base64_decode` to a string column, passing `strict` through.
#[cfg(feature = "binary_encoding")]
pub(super) fn base64_decode(s: &Column, strict: bool) -> PolarsResult<Column> {
    let strings = s.str()?;
    let decoded = strings.base64_decode(strict)?;
    Ok(decoded.into_column())
}
832
833
/// Calls `to_decimal` on a string column with precision `DEC128_MAX_PREC` and
/// the given `scale`.
#[cfg(feature = "dtype-decimal")]
pub(super) fn to_decimal(s: &Column, scale: usize) -> PolarsResult<Column> {
    let strings = s.str()?;
    let decimals = strings.to_decimal(polars_compute::decimal::DEC128_MAX_PREC, scale)?;
    Ok(Column::from(decimals))
}
839
840
/// Calls `json_decode` on a string column with `Some(dtype)` as the first
/// argument and `None` as the second.
#[cfg(feature = "extract_jsonpath")]
pub(super) fn json_decode(s: &Column, dtype: DataType) -> PolarsResult<Column> {
    use polars_ops::prelude::Utf8JsonPathImpl;

    let strings = s.str()?;
    let decoded = strings.json_decode(Some(dtype), None)?;
    Ok(Column::from(decoded))
}
847
848
/// Calls `json_path_match` on the string column `s[0]` with the string column
/// `s[1]` as the path argument.
///
/// Both inputs must have the same length, or one of them must have length 1.
#[cfg(feature = "extract_jsonpath")]
pub(super) fn json_path_match(s: &[Column]) -> PolarsResult<Column> {
    use polars_ops::prelude::Utf8JsonPathImpl;

    _check_same_length(s, "json_path_match")?;
    let documents = s[0].str()?;
    let paths = s[1].str()?;
    documents
        .json_path_match(paths)
        .map(|matched| matched.into_column())
}
857
858
/// Applies `str_escape_regex` to a string column; errors if `s` is not a string column.
#[cfg(feature = "regex")]
pub(super) fn escape_regex(s: &Column) -> PolarsResult<Column> {
    s.str()
        .map(|strings| strings.str_escape_regex().into_column())
}
863
864