Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-plan/src/dsl/string.rs
8421 views
1
use super::*;
2
/// Specialized expressions for [`Series`] of [`DataType::String`].
3
///
/// Newtype wrapper around an [`Expr`]. The methods in the accompanying `impl`
/// block take the wrapper by value and build a new expression from the wrapped
/// one, so the namespace is consumed by each call.
pub struct StringNameSpace(pub(crate) Expr);
4
5
impl StringNameSpace {
6
/// Check if a string value contains a literal substring.
7
#[cfg(feature = "regex")]
8
pub fn contains_literal(self, pat: Expr) -> Expr {
9
self.0.map_binary(
10
StringFunction::Contains {
11
literal: true,
12
strict: false,
13
},
14
pat,
15
)
16
}
17
18
/// Check if this column of strings contains a Regex. If `strict` is `true`, then it is an error if any `pat` is
19
/// an invalid regex, whereas if `strict` is `false`, an invalid regex will simply evaluate to `false`.
20
#[cfg(feature = "regex")]
21
pub fn contains(self, pat: Expr, strict: bool) -> Expr {
22
self.0.map_binary(
23
StringFunction::Contains {
24
literal: false,
25
strict,
26
},
27
pat,
28
)
29
}
30
31
/// Uses aho-corasick to find many patterns.
32
///
33
/// # Arguments
34
/// - `patterns`: an expression that evaluates to a String column
35
/// - `ascii_case_insensitive`: Enable ASCII-aware case insensitive matching.
36
/// When this option is enabled, searching will be performed without respect to case for
37
/// ASCII letters (a-z and A-Z) only.
38
#[cfg(feature = "find_many")]
39
pub fn contains_any(self, patterns: Expr, ascii_case_insensitive: bool) -> Expr {
40
self.0.map_binary(
41
StringFunction::ContainsAny {
42
ascii_case_insensitive,
43
},
44
patterns,
45
)
46
}
47
48
/// Uses aho-corasick to replace many patterns.
49
/// # Arguments
50
/// - `patterns`: an expression that evaluates to a String column
51
/// - `replace_with`: an expression that evaluates to a String column
52
/// - `ascii_case_insensitive`: Enable ASCII-aware case-insensitive matching.
53
/// When this option is enabled, searching will be performed without respect to case for
54
/// ASCII letters (a-z and A-Z) only.
55
#[cfg(feature = "find_many")]
56
pub fn replace_many(
57
self,
58
patterns: Expr,
59
replace_with: Expr,
60
ascii_case_insensitive: bool,
61
leftmost: bool,
62
) -> Expr {
63
self.0.map_ternary(
64
StringFunction::ReplaceMany {
65
ascii_case_insensitive,
66
leftmost,
67
},
68
patterns,
69
replace_with,
70
)
71
}
72
73
/// Uses aho-corasick to replace many patterns.
74
/// # Arguments
75
/// - `patterns`: an expression that evaluates to a String column
76
/// - `ascii_case_insensitive`: Enable ASCII-aware case-insensitive matching.
77
/// When this option is enabled, searching will be performed without respect to case for
78
/// ASCII letters (a-z and A-Z) only.
79
/// - `overlapping`: Whether matches may overlap.
80
#[cfg(feature = "find_many")]
81
pub fn extract_many(
82
self,
83
patterns: Expr,
84
ascii_case_insensitive: bool,
85
overlapping: bool,
86
leftmost: bool,
87
) -> Expr {
88
self.0.map_binary(
89
StringFunction::ExtractMany {
90
ascii_case_insensitive,
91
overlapping,
92
leftmost,
93
},
94
patterns,
95
)
96
}
97
98
/// Uses aho-corasick to find many patterns.
99
/// # Arguments
100
/// - `patterns`: an expression that evaluates to a String column
101
/// - `ascii_case_insensitive`: Enable ASCII-aware case-insensitive matching.
102
/// When this option is enabled, searching will be performed without respect to case for
103
/// ASCII letters (a-z and A-Z) only.
104
/// - `overlapping`: Whether matches may overlap.
105
#[cfg(feature = "find_many")]
106
pub fn find_many(
107
self,
108
patterns: Expr,
109
ascii_case_insensitive: bool,
110
overlapping: bool,
111
leftmost: bool,
112
) -> Expr {
113
self.0.map_binary(
114
StringFunction::FindMany {
115
ascii_case_insensitive,
116
overlapping,
117
leftmost,
118
},
119
patterns,
120
)
121
}
122
123
/// Check if a string value ends with the `sub` string.
124
pub fn ends_with(self, sub: Expr) -> Expr {
125
self.0.map_binary(StringFunction::EndsWith, sub)
126
}
127
128
/// Check if a string value starts with the `sub` string.
129
pub fn starts_with(self, sub: Expr) -> Expr {
130
self.0.map_binary(StringFunction::StartsWith, sub)
131
}
132
133
#[cfg(feature = "string_encoding")]
134
pub fn hex_encode(self) -> Expr {
135
self.0.map_unary(StringFunction::HexEncode)
136
}
137
138
#[cfg(feature = "binary_encoding")]
139
pub fn hex_decode(self, strict: bool) -> Expr {
140
self.0.map_unary(StringFunction::HexDecode(strict))
141
}
142
143
#[cfg(feature = "string_encoding")]
144
pub fn base64_encode(self) -> Expr {
145
self.0.map_unary(StringFunction::Base64Encode)
146
}
147
148
#[cfg(feature = "binary_encoding")]
149
pub fn base64_decode(self, strict: bool) -> Expr {
150
self.0.map_unary(StringFunction::Base64Decode(strict))
151
}
152
153
/// Extract a regex pattern from the a string value. If `group_index` is out of bounds, null is returned.
154
pub fn extract(self, pat: Expr, group_index: usize) -> Expr {
155
self.0.map_binary(StringFunction::Extract(group_index), pat)
156
}
157
158
#[cfg(feature = "extract_groups")]
159
// Extract all captures groups from a regex pattern as a struct
160
pub fn extract_groups(self, pat: &str) -> PolarsResult<Expr> {
161
// regex will be compiled twice, because it doesn't support serde
162
// and we need to compile it here to determine the output datatype
163
164
use polars_utils::format_pl_smallstr;
165
let reg = polars_utils::regex_cache::compile_regex(pat)?;
166
let names = reg
167
.capture_names()
168
.enumerate()
169
.skip(1)
170
.map(|(idx, opt_name)| {
171
opt_name
172
.map(PlSmallStr::from_str)
173
.unwrap_or_else(|| format_pl_smallstr!("{idx}"))
174
})
175
.collect::<Vec<_>>();
176
177
let dtype = DataType::Struct(
178
names
179
.iter()
180
.map(|name| Field::new(name.clone(), DataType::String))
181
.collect(),
182
);
183
184
Ok(self.0.map_unary(StringFunction::ExtractGroups {
185
dtype,
186
pat: pat.into(),
187
}))
188
}
189
190
/// Pad the start of the string until it reaches the given length.
191
///
192
/// Padding is done using the specified `fill_char`.
193
/// Strings with length equal to or greater than the given length are
194
/// returned as-is.
195
#[cfg(feature = "string_pad")]
196
pub fn pad_start(self, length: Expr, fill_char: char) -> Expr {
197
self.0
198
.map_binary(StringFunction::PadStart { fill_char }, length)
199
}
200
201
/// Pad the end of the string until it reaches the given length.
202
///
203
/// Padding is done using the specified `fill_char`.
204
/// Strings with length equal to or greater than the given length are
205
/// returned as-is.
206
#[cfg(feature = "string_pad")]
207
pub fn pad_end(self, length: Expr, fill_char: char) -> Expr {
208
self.0
209
.map_binary(StringFunction::PadEnd { fill_char }, length)
210
}
211
212
/// Pad the start of the string with zeros until it reaches the given length.
213
///
214
/// A sign prefix (`-`) is handled by inserting the padding after the sign
215
/// character rather than before.
216
/// Strings with length equal to or greater than the given length are
217
/// returned as-is.
218
#[cfg(feature = "string_pad")]
219
pub fn zfill(self, length: Expr) -> Expr {
220
self.0.map_binary(StringFunction::ZFill, length)
221
}
222
223
/// Find the index of a literal substring within another string value.
224
#[cfg(feature = "regex")]
225
pub fn find_literal(self, pat: Expr) -> Expr {
226
self.0.map_binary(
227
StringFunction::Find {
228
literal: true,
229
strict: false,
230
},
231
pat,
232
)
233
}
234
235
/// Find the index of a substring defined by a regular expressions within another string value.
236
#[cfg(feature = "regex")]
237
pub fn find(self, pat: Expr, strict: bool) -> Expr {
238
self.0.map_binary(
239
StringFunction::Find {
240
literal: false,
241
strict,
242
},
243
pat,
244
)
245
}
246
247
/// Extract each successive non-overlapping match in an individual string as an array
248
pub fn extract_all(self, pat: Expr) -> Expr {
249
self.0.map_binary(StringFunction::ExtractAll, pat)
250
}
251
252
/// Count all successive non-overlapping regex matches.
253
pub fn count_matches(self, pat: Expr, literal: bool) -> Expr {
254
self.0
255
.map_binary(StringFunction::CountMatches(literal), pat)
256
}
257
258
/// Convert a String column into a Date/Datetime/Time column.
259
#[cfg(feature = "temporal")]
260
pub fn strptime(
261
self,
262
dtype: impl Into<DataTypeExpr>,
263
options: StrptimeOptions,
264
ambiguous: Expr,
265
) -> Expr {
266
// Only elementwise if the format is explicitly set, or we're constant.
267
self.0
268
.map_binary(StringFunction::Strptime(dtype.into(), options), ambiguous)
269
}
270
271
/// Convert a String column into a Date column.
272
#[cfg(feature = "dtype-date")]
273
pub fn to_date(self, options: StrptimeOptions) -> Expr {
274
self.strptime(DataType::Date, options, lit("raise"))
275
}
276
277
/// Convert a String column into a Datetime column.
278
#[cfg(feature = "dtype-datetime")]
279
pub fn to_datetime(
280
self,
281
time_unit: Option<TimeUnit>,
282
time_zone: Option<TimeZone>,
283
options: StrptimeOptions,
284
ambiguous: Expr,
285
) -> Expr {
286
// If time_unit is None, try to infer it from the format or set a default
287
let time_unit = match (&options.format, time_unit) {
288
(_, Some(time_unit)) => time_unit,
289
(Some(format), None) => {
290
if format.contains("%.9f") || format.contains("%9f") {
291
TimeUnit::Nanoseconds
292
} else if format.contains("%.3f") || format.contains("%3f") {
293
TimeUnit::Milliseconds
294
} else {
295
TimeUnit::Microseconds
296
}
297
},
298
(None, None) => TimeUnit::Microseconds,
299
};
300
301
self.strptime(DataType::Datetime(time_unit, time_zone), options, ambiguous)
302
}
303
304
/// Convert a String column into a Time column.
305
#[cfg(feature = "dtype-time")]
306
pub fn to_time(self, options: StrptimeOptions) -> Expr {
307
self.strptime(DataType::Time, options, lit("raise"))
308
}
309
310
/// Convert a String column into a Decimal column.
311
#[cfg(feature = "dtype-decimal")]
312
pub fn to_decimal(self, scale: usize) -> Expr {
313
self.0.map_unary(StringFunction::ToDecimal { scale })
314
}
315
316
/// Concat the values into a string array.
317
/// # Arguments
318
///
319
/// * `delimiter` - A string that will act as delimiter between values.
320
#[cfg(feature = "concat_str")]
321
pub fn join(self, delimiter: &str, ignore_nulls: bool) -> Expr {
322
self.0.map_unary(StringFunction::ConcatVertical {
323
delimiter: delimiter.into(),
324
ignore_nulls,
325
})
326
}
327
328
/// Split the string by a substring. The resulting dtype is `List<String>`.
329
pub fn split(self, by: Expr) -> Expr {
330
self.0.map_binary(StringFunction::Split(false), by)
331
}
332
333
/// Split the string by a substring and keep the substring. The resulting dtype is `List<String>`.
334
pub fn split_inclusive(self, by: Expr) -> Expr {
335
self.0.map_binary(StringFunction::Split(true), by)
336
}
337
338
#[cfg(feature = "dtype-struct")]
339
/// Split exactly `n` times by a given substring. The resulting dtype is [`DataType::Struct`].
340
pub fn split_exact(self, by: Expr, n: usize) -> Expr {
341
self.0.map_binary(
342
StringFunction::SplitExact {
343
n,
344
inclusive: false,
345
},
346
by,
347
)
348
}
349
350
#[cfg(feature = "dtype-struct")]
351
/// Split exactly `n` times by a given substring and keep the substring.
352
/// The resulting dtype is [`DataType::Struct`].
353
pub fn split_exact_inclusive(self, by: Expr, n: usize) -> Expr {
354
self.0
355
.map_binary(StringFunction::SplitExact { n, inclusive: true }, by)
356
}
357
358
#[cfg(feature = "dtype-struct")]
359
/// Split by a given substring, returning exactly `n` items. If there are more possible splits,
360
/// keeps the remainder of the string intact. The resulting dtype is [`DataType::Struct`].
361
pub fn splitn(self, by: Expr, n: usize) -> Expr {
362
self.0.map_binary(StringFunction::SplitN(n), by)
363
}
364
365
#[cfg(feature = "regex")]
366
/// Split the string by a regex pattern. The resulting dtype is `List<String>`.
367
pub fn split_regex(self, pat: Expr, strict: bool) -> Expr {
368
self.0.map_binary(
369
StringFunction::SplitRegex {
370
inclusive: false,
371
strict,
372
},
373
pat,
374
)
375
}
376
377
#[cfg(feature = "regex")]
378
/// Split the string by a regex pattern and keep the matched substrings.
379
/// The resulting dtype is `List<String>`.
380
pub fn split_regex_inclusive(self, pat: Expr, strict: bool) -> Expr {
381
self.0.map_binary(
382
StringFunction::SplitRegex {
383
inclusive: true,
384
strict,
385
},
386
pat,
387
)
388
}
389
390
#[cfg(feature = "regex")]
391
/// Replace values that match a regex `pat` with a `value`.
392
pub fn replace(self, pat: Expr, value: Expr, literal: bool) -> Expr {
393
self.0
394
.map_ternary(StringFunction::Replace { n: 1, literal }, pat, value)
395
}
396
397
#[cfg(feature = "regex")]
398
/// Replace values that match a regex `pat` with a `value`.
399
pub fn replace_n(self, pat: Expr, value: Expr, literal: bool, n: i64) -> Expr {
400
self.0
401
.map_ternary(StringFunction::Replace { n, literal }, pat, value)
402
}
403
404
#[cfg(feature = "regex")]
405
/// Replace all values that match a regex `pat` with a `value`.
406
pub fn replace_all(self, pat: Expr, value: Expr, literal: bool) -> Expr {
407
self.0
408
.map_ternary(StringFunction::Replace { n: -1, literal }, pat, value)
409
}
410
411
#[cfg(feature = "string_normalize")]
412
/// Normalize each string
413
pub fn normalize(self, form: UnicodeForm) -> Expr {
414
self.0.map_unary(StringFunction::Normalize { form })
415
}
416
417
#[cfg(feature = "string_reverse")]
418
/// Reverse each string
419
pub fn reverse(self) -> Expr {
420
self.0.map_unary(StringFunction::Reverse)
421
}
422
423
/// Remove leading and trailing characters, or whitespace if matches is None.
424
pub fn strip_chars(self, matches: Expr) -> Expr {
425
self.0.map_binary(StringFunction::StripChars, matches)
426
}
427
428
/// Remove leading characters, or whitespace if matches is None.
429
pub fn strip_chars_start(self, matches: Expr) -> Expr {
430
self.0.map_binary(StringFunction::StripCharsStart, matches)
431
}
432
433
/// Remove trailing characters, or whitespace if matches is None.
434
pub fn strip_chars_end(self, matches: Expr) -> Expr {
435
self.0.map_binary(StringFunction::StripCharsEnd, matches)
436
}
437
438
/// Remove prefix.
439
pub fn strip_prefix(self, prefix: Expr) -> Expr {
440
self.0.map_binary(StringFunction::StripPrefix, prefix)
441
}
442
443
/// Remove suffix.
444
pub fn strip_suffix(self, suffix: Expr) -> Expr {
445
self.0.map_binary(StringFunction::StripSuffix, suffix)
446
}
447
448
/// Convert all characters to lowercase.
449
pub fn to_lowercase(self) -> Expr {
450
self.0.map_unary(StringFunction::Lowercase)
451
}
452
453
/// Convert all characters to uppercase.
454
pub fn to_uppercase(self) -> Expr {
455
self.0.map_unary(StringFunction::Uppercase)
456
}
457
458
/// Convert all characters to titlecase.
459
#[cfg(feature = "nightly")]
460
pub fn to_titlecase(self) -> Expr {
461
self.0.map_unary(StringFunction::Titlecase)
462
}
463
464
#[cfg(feature = "string_to_integer")]
465
/// Parse string in base radix into decimal.
466
/// The resulting dtype is `dtype`
467
pub fn to_integer(self, base: Expr, dtype: Option<DataType>, strict: bool) -> Expr {
468
self.0
469
.map_binary(StringFunction::ToInteger { dtype, strict }, base)
470
}
471
472
/// Return the length of each string as the number of bytes.
473
///
474
/// When working with non-ASCII text, the length in bytes is not the same
475
/// as the length in characters. You may want to use
476
/// [`len_chars`] instead. Note that `len_bytes` is much more
477
/// performant (_O(1)_) than [`len_chars`] (_O(n)_).
478
///
479
/// [`len_chars`]: StringNameSpace::len_chars
480
pub fn len_bytes(self) -> Expr {
481
self.0.map_unary(StringFunction::LenBytes)
482
}
483
484
/// Return the length of each string as the number of characters.
485
///
486
/// When working with ASCII text, use [`len_bytes`] instead to achieve
487
/// equivalent output with much better performance:
488
/// [`len_bytes`] runs in _O(1)_, while `len_chars` runs in _O(n)_.
489
///
490
/// [`len_bytes`]: StringNameSpace::len_bytes
491
pub fn len_chars(self) -> Expr {
492
self.0.map_unary(StringFunction::LenChars)
493
}
494
495
/// Slice the string values.
496
pub fn slice(self, offset: Expr, length: Expr) -> Expr {
497
self.0.map_ternary(StringFunction::Slice, offset, length)
498
}
499
500
/// Take the first `n` characters of the string values.
501
pub fn head(self, n: Expr) -> Expr {
502
self.0.map_binary(StringFunction::Head, n)
503
}
504
505
/// Take the last `n` characters of the string values.
506
pub fn tail(self, n: Expr) -> Expr {
507
self.0.map_binary(StringFunction::Tail, n)
508
}
509
510
#[cfg(feature = "extract_jsonpath")]
511
pub fn json_decode(self, dtype: impl Into<DataTypeExpr>) -> Expr {
512
self.0.map_unary(StringFunction::JsonDecode(dtype.into()))
513
}
514
515
#[cfg(feature = "extract_jsonpath")]
516
pub fn json_path_match(self, pat: Expr) -> Expr {
517
self.0.map_binary(StringFunction::JsonPathMatch, pat)
518
}
519
520
#[cfg(feature = "regex")]
521
pub fn escape_regex(self) -> Expr {
522
self.0.map_unary(StringFunction::EscapeRegex)
523
}
524
}
525
526