// Source: GitHub repository pola-rs/polars
// Path: crates/polars-plan/src/dsl/string.rs (branch `main`)
use super::*;
2
/// Specialized expressions for [`Series`] of [`DataType::String`].
3
pub struct StringNameSpace(pub(crate) Expr);
4
5
impl StringNameSpace {
6
/// Check if a string value contains a literal substring.
7
#[cfg(feature = "regex")]
8
pub fn contains_literal(self, pat: Expr) -> Expr {
9
self.0.map_binary(
10
StringFunction::Contains {
11
literal: true,
12
strict: false,
13
},
14
pat,
15
)
16
}
17
18
/// Check if this column of strings contains a Regex. If `strict` is `true`, then it is an error if any `pat` is
19
/// an invalid regex, whereas if `strict` is `false`, an invalid regex will simply evaluate to `false`.
20
#[cfg(feature = "regex")]
21
pub fn contains(self, pat: Expr, strict: bool) -> Expr {
22
self.0.map_binary(
23
StringFunction::Contains {
24
literal: false,
25
strict,
26
},
27
pat,
28
)
29
}
30
31
/// Uses aho-corasick to find many patterns.
32
///
33
/// # Arguments
34
/// - `patterns`: an expression that evaluates to a String column
35
/// - `ascii_case_insensitive`: Enable ASCII-aware case insensitive matching.
36
/// When this option is enabled, searching will be performed without respect to case for
37
/// ASCII letters (a-z and A-Z) only.
38
#[cfg(feature = "find_many")]
39
pub fn contains_any(self, patterns: Expr, ascii_case_insensitive: bool) -> Expr {
40
self.0.map_binary(
41
StringFunction::ContainsAny {
42
ascii_case_insensitive,
43
},
44
patterns,
45
)
46
}
47
48
/// Uses aho-corasick to replace many patterns.
49
/// # Arguments
50
/// - `patterns`: an expression that evaluates to a String column
51
/// - `replace_with`: an expression that evaluates to a String column
52
/// - `ascii_case_insensitive`: Enable ASCII-aware case-insensitive matching.
53
/// When this option is enabled, searching will be performed without respect to case for
54
/// ASCII letters (a-z and A-Z) only.
55
#[cfg(feature = "find_many")]
56
pub fn replace_many(
57
self,
58
patterns: Expr,
59
replace_with: Expr,
60
ascii_case_insensitive: bool,
61
) -> Expr {
62
self.0.map_ternary(
63
StringFunction::ReplaceMany {
64
ascii_case_insensitive,
65
},
66
patterns,
67
replace_with,
68
)
69
}
70
71
/// Uses aho-corasick to replace many patterns.
72
/// # Arguments
73
/// - `patterns`: an expression that evaluates to a String column
74
/// - `ascii_case_insensitive`: Enable ASCII-aware case-insensitive matching.
75
/// When this option is enabled, searching will be performed without respect to case for
76
/// ASCII letters (a-z and A-Z) only.
77
/// - `overlapping`: Whether matches may overlap.
78
#[cfg(feature = "find_many")]
79
pub fn extract_many(
80
self,
81
patterns: Expr,
82
ascii_case_insensitive: bool,
83
overlapping: bool,
84
) -> Expr {
85
self.0.map_binary(
86
StringFunction::ExtractMany {
87
ascii_case_insensitive,
88
overlapping,
89
},
90
patterns,
91
)
92
}
93
94
/// Uses aho-corasick to find many patterns.
95
/// # Arguments
96
/// - `patterns`: an expression that evaluates to a String column
97
/// - `ascii_case_insensitive`: Enable ASCII-aware case-insensitive matching.
98
/// When this option is enabled, searching will be performed without respect to case for
99
/// ASCII letters (a-z and A-Z) only.
100
/// - `overlapping`: Whether matches may overlap.
101
#[cfg(feature = "find_many")]
102
pub fn find_many(
103
self,
104
patterns: Expr,
105
ascii_case_insensitive: bool,
106
overlapping: bool,
107
) -> Expr {
108
self.0.map_binary(
109
StringFunction::FindMany {
110
ascii_case_insensitive,
111
overlapping,
112
},
113
patterns,
114
)
115
}
116
117
/// Check if a string value ends with the `sub` string.
118
pub fn ends_with(self, sub: Expr) -> Expr {
119
self.0.map_binary(StringFunction::EndsWith, sub)
120
}
121
122
/// Check if a string value starts with the `sub` string.
123
pub fn starts_with(self, sub: Expr) -> Expr {
124
self.0.map_binary(StringFunction::StartsWith, sub)
125
}
126
127
#[cfg(feature = "string_encoding")]
128
pub fn hex_encode(self) -> Expr {
129
self.0.map_unary(StringFunction::HexEncode)
130
}
131
132
#[cfg(feature = "binary_encoding")]
133
pub fn hex_decode(self, strict: bool) -> Expr {
134
self.0.map_unary(StringFunction::HexDecode(strict))
135
}
136
137
#[cfg(feature = "string_encoding")]
138
pub fn base64_encode(self) -> Expr {
139
self.0.map_unary(StringFunction::Base64Encode)
140
}
141
142
#[cfg(feature = "binary_encoding")]
143
pub fn base64_decode(self, strict: bool) -> Expr {
144
self.0.map_unary(StringFunction::Base64Decode(strict))
145
}
146
147
/// Extract a regex pattern from the a string value. If `group_index` is out of bounds, null is returned.
148
pub fn extract(self, pat: Expr, group_index: usize) -> Expr {
149
self.0.map_binary(StringFunction::Extract(group_index), pat)
150
}
151
152
#[cfg(feature = "extract_groups")]
153
// Extract all captures groups from a regex pattern as a struct
154
pub fn extract_groups(self, pat: &str) -> PolarsResult<Expr> {
155
// regex will be compiled twice, because it doesn't support serde
156
// and we need to compile it here to determine the output datatype
157
158
use polars_utils::format_pl_smallstr;
159
let reg = polars_utils::regex_cache::compile_regex(pat)?;
160
let names = reg
161
.capture_names()
162
.enumerate()
163
.skip(1)
164
.map(|(idx, opt_name)| {
165
opt_name
166
.map(PlSmallStr::from_str)
167
.unwrap_or_else(|| format_pl_smallstr!("{idx}"))
168
})
169
.collect::<Vec<_>>();
170
171
let dtype = DataType::Struct(
172
names
173
.iter()
174
.map(|name| Field::new(name.clone(), DataType::String))
175
.collect(),
176
);
177
178
Ok(self.0.map_unary(StringFunction::ExtractGroups {
179
dtype,
180
pat: pat.into(),
181
}))
182
}
183
184
/// Pad the start of the string until it reaches the given length.
185
///
186
/// Padding is done using the specified `fill_char`.
187
/// Strings with length equal to or greater than the given length are
188
/// returned as-is.
189
#[cfg(feature = "string_pad")]
190
pub fn pad_start(self, length: Expr, fill_char: char) -> Expr {
191
self.0
192
.map_binary(StringFunction::PadStart { fill_char }, length)
193
}
194
195
/// Pad the end of the string until it reaches the given length.
196
///
197
/// Padding is done using the specified `fill_char`.
198
/// Strings with length equal to or greater than the given length are
199
/// returned as-is.
200
#[cfg(feature = "string_pad")]
201
pub fn pad_end(self, length: Expr, fill_char: char) -> Expr {
202
self.0
203
.map_binary(StringFunction::PadEnd { fill_char }, length)
204
}
205
206
/// Pad the start of the string with zeros until it reaches the given length.
207
///
208
/// A sign prefix (`-`) is handled by inserting the padding after the sign
209
/// character rather than before.
210
/// Strings with length equal to or greater than the given length are
211
/// returned as-is.
212
#[cfg(feature = "string_pad")]
213
pub fn zfill(self, length: Expr) -> Expr {
214
self.0.map_binary(StringFunction::ZFill, length)
215
}
216
217
/// Find the index of a literal substring within another string value.
218
#[cfg(feature = "regex")]
219
pub fn find_literal(self, pat: Expr) -> Expr {
220
self.0.map_binary(
221
StringFunction::Find {
222
literal: true,
223
strict: false,
224
},
225
pat,
226
)
227
}
228
229
/// Find the index of a substring defined by a regular expressions within another string value.
230
#[cfg(feature = "regex")]
231
pub fn find(self, pat: Expr, strict: bool) -> Expr {
232
self.0.map_binary(
233
StringFunction::Find {
234
literal: false,
235
strict,
236
},
237
pat,
238
)
239
}
240
241
/// Extract each successive non-overlapping match in an individual string as an array
242
pub fn extract_all(self, pat: Expr) -> Expr {
243
self.0.map_binary(StringFunction::ExtractAll, pat)
244
}
245
246
/// Count all successive non-overlapping regex matches.
247
pub fn count_matches(self, pat: Expr, literal: bool) -> Expr {
248
self.0
249
.map_binary(StringFunction::CountMatches(literal), pat)
250
}
251
252
/// Convert a String column into a Date/Datetime/Time column.
253
#[cfg(feature = "temporal")]
254
pub fn strptime(
255
self,
256
dtype: impl Into<DataTypeExpr>,
257
options: StrptimeOptions,
258
ambiguous: Expr,
259
) -> Expr {
260
// Only elementwise if the format is explicitly set, or we're constant.
261
self.0
262
.map_binary(StringFunction::Strptime(dtype.into(), options), ambiguous)
263
}
264
265
/// Convert a String column into a Date column.
266
#[cfg(feature = "dtype-date")]
267
pub fn to_date(self, options: StrptimeOptions) -> Expr {
268
self.strptime(DataType::Date, options, lit("raise"))
269
}
270
271
/// Convert a String column into a Datetime column.
272
#[cfg(feature = "dtype-datetime")]
273
pub fn to_datetime(
274
self,
275
time_unit: Option<TimeUnit>,
276
time_zone: Option<TimeZone>,
277
options: StrptimeOptions,
278
ambiguous: Expr,
279
) -> Expr {
280
// If time_unit is None, try to infer it from the format or set a default
281
let time_unit = match (&options.format, time_unit) {
282
(_, Some(time_unit)) => time_unit,
283
(Some(format), None) => {
284
if format.contains("%.9f") || format.contains("%9f") {
285
TimeUnit::Nanoseconds
286
} else if format.contains("%.3f") || format.contains("%3f") {
287
TimeUnit::Milliseconds
288
} else {
289
TimeUnit::Microseconds
290
}
291
},
292
(None, None) => TimeUnit::Microseconds,
293
};
294
295
self.strptime(DataType::Datetime(time_unit, time_zone), options, ambiguous)
296
}
297
298
/// Convert a String column into a Time column.
299
#[cfg(feature = "dtype-time")]
300
pub fn to_time(self, options: StrptimeOptions) -> Expr {
301
self.strptime(DataType::Time, options, lit("raise"))
302
}
303
304
/// Convert a String column into a Decimal column.
305
#[cfg(feature = "dtype-decimal")]
306
pub fn to_decimal(self, scale: usize) -> Expr {
307
self.0.map_unary(StringFunction::ToDecimal { scale })
308
}
309
310
/// Concat the values into a string array.
311
/// # Arguments
312
///
313
/// * `delimiter` - A string that will act as delimiter between values.
314
#[cfg(feature = "concat_str")]
315
pub fn join(self, delimiter: &str, ignore_nulls: bool) -> Expr {
316
self.0.map_unary(StringFunction::ConcatVertical {
317
delimiter: delimiter.into(),
318
ignore_nulls,
319
})
320
}
321
322
/// Split the string by a substring. The resulting dtype is `List<String>`.
323
pub fn split(self, by: Expr) -> Expr {
324
self.0.map_binary(StringFunction::Split(false), by)
325
}
326
327
/// Split the string by a substring and keep the substring. The resulting dtype is `List<String>`.
328
pub fn split_inclusive(self, by: Expr) -> Expr {
329
self.0.map_binary(StringFunction::Split(true), by)
330
}
331
332
#[cfg(feature = "dtype-struct")]
333
/// Split exactly `n` times by a given substring. The resulting dtype is [`DataType::Struct`].
334
pub fn split_exact(self, by: Expr, n: usize) -> Expr {
335
self.0.map_binary(
336
StringFunction::SplitExact {
337
n,
338
inclusive: false,
339
},
340
by,
341
)
342
}
343
344
#[cfg(feature = "dtype-struct")]
345
/// Split exactly `n` times by a given substring and keep the substring.
346
/// The resulting dtype is [`DataType::Struct`].
347
pub fn split_exact_inclusive(self, by: Expr, n: usize) -> Expr {
348
self.0
349
.map_binary(StringFunction::SplitExact { n, inclusive: true }, by)
350
}
351
352
#[cfg(feature = "dtype-struct")]
353
/// Split by a given substring, returning exactly `n` items. If there are more possible splits,
354
/// keeps the remainder of the string intact. The resulting dtype is [`DataType::Struct`].
355
pub fn splitn(self, by: Expr, n: usize) -> Expr {
356
self.0.map_binary(StringFunction::SplitN(n), by)
357
}
358
359
#[cfg(feature = "regex")]
360
/// Replace values that match a regex `pat` with a `value`.
361
pub fn replace(self, pat: Expr, value: Expr, literal: bool) -> Expr {
362
self.0
363
.map_ternary(StringFunction::Replace { n: 1, literal }, pat, value)
364
}
365
366
#[cfg(feature = "regex")]
367
/// Replace values that match a regex `pat` with a `value`.
368
pub fn replace_n(self, pat: Expr, value: Expr, literal: bool, n: i64) -> Expr {
369
self.0
370
.map_ternary(StringFunction::Replace { n, literal }, pat, value)
371
}
372
373
#[cfg(feature = "regex")]
374
/// Replace all values that match a regex `pat` with a `value`.
375
pub fn replace_all(self, pat: Expr, value: Expr, literal: bool) -> Expr {
376
self.0
377
.map_ternary(StringFunction::Replace { n: -1, literal }, pat, value)
378
}
379
380
#[cfg(feature = "string_normalize")]
381
/// Normalize each string
382
pub fn normalize(self, form: UnicodeForm) -> Expr {
383
self.0.map_unary(StringFunction::Normalize { form })
384
}
385
386
#[cfg(feature = "string_reverse")]
387
/// Reverse each string
388
pub fn reverse(self) -> Expr {
389
self.0.map_unary(StringFunction::Reverse)
390
}
391
392
/// Remove leading and trailing characters, or whitespace if matches is None.
393
pub fn strip_chars(self, matches: Expr) -> Expr {
394
self.0.map_binary(StringFunction::StripChars, matches)
395
}
396
397
/// Remove leading characters, or whitespace if matches is None.
398
pub fn strip_chars_start(self, matches: Expr) -> Expr {
399
self.0.map_binary(StringFunction::StripCharsStart, matches)
400
}
401
402
/// Remove trailing characters, or whitespace if matches is None.
403
pub fn strip_chars_end(self, matches: Expr) -> Expr {
404
self.0.map_binary(StringFunction::StripCharsEnd, matches)
405
}
406
407
/// Remove prefix.
408
pub fn strip_prefix(self, prefix: Expr) -> Expr {
409
self.0.map_binary(StringFunction::StripPrefix, prefix)
410
}
411
412
/// Remove suffix.
413
pub fn strip_suffix(self, suffix: Expr) -> Expr {
414
self.0.map_binary(StringFunction::StripSuffix, suffix)
415
}
416
417
/// Convert all characters to lowercase.
418
pub fn to_lowercase(self) -> Expr {
419
self.0.map_unary(StringFunction::Lowercase)
420
}
421
422
/// Convert all characters to uppercase.
423
pub fn to_uppercase(self) -> Expr {
424
self.0.map_unary(StringFunction::Uppercase)
425
}
426
427
/// Convert all characters to titlecase.
428
#[cfg(feature = "nightly")]
429
pub fn to_titlecase(self) -> Expr {
430
self.0.map_unary(StringFunction::Titlecase)
431
}
432
433
#[cfg(feature = "string_to_integer")]
434
/// Parse string in base radix into decimal.
435
/// The resulting dtype is `dtype`
436
pub fn to_integer(self, base: Expr, dtype: Option<DataType>, strict: bool) -> Expr {
437
self.0
438
.map_binary(StringFunction::ToInteger { dtype, strict }, base)
439
}
440
441
/// Return the length of each string as the number of bytes.
442
///
443
/// When working with non-ASCII text, the length in bytes is not the same
444
/// as the length in characters. You may want to use
445
/// [`len_chars`] instead. Note that `len_bytes` is much more
446
/// performant (_O(1)_) than [`len_chars`] (_O(n)_).
447
///
448
/// [`len_chars`]: StringNameSpace::len_chars
449
pub fn len_bytes(self) -> Expr {
450
self.0.map_unary(StringFunction::LenBytes)
451
}
452
453
/// Return the length of each string as the number of characters.
454
///
455
/// When working with ASCII text, use [`len_bytes`] instead to achieve
456
/// equivalent output with much better performance:
457
/// [`len_bytes`] runs in _O(1)_, while `len_chars` runs in _O(n)_.
458
///
459
/// [`len_bytes`]: StringNameSpace::len_bytes
460
pub fn len_chars(self) -> Expr {
461
self.0.map_unary(StringFunction::LenChars)
462
}
463
464
/// Slice the string values.
465
pub fn slice(self, offset: Expr, length: Expr) -> Expr {
466
self.0.map_ternary(StringFunction::Slice, offset, length)
467
}
468
469
/// Take the first `n` characters of the string values.
470
pub fn head(self, n: Expr) -> Expr {
471
self.0.map_binary(StringFunction::Head, n)
472
}
473
474
/// Take the last `n` characters of the string values.
475
pub fn tail(self, n: Expr) -> Expr {
476
self.0.map_binary(StringFunction::Tail, n)
477
}
478
479
#[cfg(feature = "extract_jsonpath")]
480
pub fn json_decode(self, dtype: impl Into<DataTypeExpr>) -> Expr {
481
self.0.map_unary(StringFunction::JsonDecode(dtype.into()))
482
}
483
484
#[cfg(feature = "extract_jsonpath")]
485
pub fn json_path_match(self, pat: Expr) -> Expr {
486
self.0.map_binary(StringFunction::JsonPathMatch, pat)
487
}
488
489
#[cfg(feature = "regex")]
490
pub fn escape_regex(self) -> Expr {
491
self.0.map_unary(StringFunction::EscapeRegex)
492
}
493
}
494
495