Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-plan/src/plans/aexpr/function_expr/strings.rs
8393 views
1
#[cfg(feature = "dtype-decimal")]
2
use polars_compute::decimal::DEC128_MAX_PREC;
3
#[cfg(feature = "dtype-struct")]
4
use polars_utils::format_pl_smallstr;
5
6
use super::*;
7
8
// Matches a strptime format string that contains one of the offset specifiers
// `%z`, `%:z`, `%::z`, `%:::z` or `%#z`, or that consists solely of `%+`.
// `IRStringFunction::get_field` uses a match to default a timezone-less
// `Datetime` output dtype to UTC.
#[cfg(all(feature = "regex", feature = "timezones"))]
9
polars_utils::regex_cache::cached_regex! {
10
pub static TZ_AWARE_RE = r"(%z)|(%:z)|(%::z)|(%:::z)|(%#z)|(^%\+$)";
11
}
12
13
#[cfg_attr(feature = "ir_serde", derive(serde::Serialize, serde::Deserialize))]
14
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
15
pub enum IRStringFunction {
16
Format {
17
format: PlSmallStr,
18
insertions: Arc<[usize]>,
19
},
20
#[cfg(feature = "concat_str")]
21
ConcatHorizontal {
22
delimiter: PlSmallStr,
23
ignore_nulls: bool,
24
},
25
#[cfg(feature = "concat_str")]
26
ConcatVertical {
27
delimiter: PlSmallStr,
28
ignore_nulls: bool,
29
},
30
#[cfg(feature = "regex")]
31
Contains {
32
literal: bool,
33
strict: bool,
34
},
35
CountMatches(bool),
36
EndsWith,
37
Extract(usize),
38
ExtractAll,
39
#[cfg(feature = "extract_groups")]
40
ExtractGroups {
41
dtype: DataType,
42
pat: PlSmallStr,
43
},
44
#[cfg(feature = "regex")]
45
Find {
46
literal: bool,
47
strict: bool,
48
},
49
#[cfg(feature = "string_to_integer")]
50
ToInteger {
51
dtype: Option<DataType>,
52
strict: bool,
53
},
54
LenBytes,
55
LenChars,
56
Lowercase,
57
#[cfg(feature = "extract_jsonpath")]
58
JsonDecode(DataType),
59
#[cfg(feature = "extract_jsonpath")]
60
JsonPathMatch,
61
#[cfg(feature = "regex")]
62
Replace {
63
// negative is replace all
64
// how many matches to replace
65
n: i64,
66
literal: bool,
67
},
68
#[cfg(feature = "string_normalize")]
69
Normalize {
70
form: UnicodeForm,
71
},
72
#[cfg(feature = "string_reverse")]
73
Reverse,
74
#[cfg(feature = "string_pad")]
75
PadStart {
76
fill_char: char,
77
},
78
#[cfg(feature = "string_pad")]
79
PadEnd {
80
fill_char: char,
81
},
82
Slice,
83
Head,
84
Tail,
85
#[cfg(feature = "string_encoding")]
86
HexEncode,
87
#[cfg(feature = "binary_encoding")]
88
HexDecode(bool),
89
#[cfg(feature = "string_encoding")]
90
Base64Encode,
91
#[cfg(feature = "binary_encoding")]
92
Base64Decode(bool),
93
StartsWith,
94
StripChars,
95
StripCharsStart,
96
StripCharsEnd,
97
StripPrefix,
98
StripSuffix,
99
#[cfg(feature = "dtype-struct")]
100
SplitExact {
101
n: usize,
102
inclusive: bool,
103
},
104
#[cfg(feature = "dtype-struct")]
105
SplitN(usize),
106
#[cfg(feature = "temporal")]
107
// DataType can only be Date/Datetime/Time
108
Strptime(DataType, StrptimeOptions),
109
Split(bool),
110
#[cfg(feature = "regex")]
111
SplitRegex {
112
inclusive: bool,
113
strict: bool,
114
},
115
#[cfg(feature = "dtype-decimal")]
116
ToDecimal {
117
scale: usize,
118
},
119
#[cfg(feature = "nightly")]
120
Titlecase,
121
Uppercase,
122
#[cfg(feature = "string_pad")]
123
ZFill,
124
#[cfg(feature = "find_many")]
125
ContainsAny {
126
ascii_case_insensitive: bool,
127
},
128
#[cfg(feature = "find_many")]
129
ReplaceMany {
130
ascii_case_insensitive: bool,
131
leftmost: bool,
132
},
133
#[cfg(feature = "find_many")]
134
ExtractMany {
135
ascii_case_insensitive: bool,
136
overlapping: bool,
137
leftmost: bool,
138
},
139
#[cfg(feature = "find_many")]
140
FindMany {
141
ascii_case_insensitive: bool,
142
overlapping: bool,
143
leftmost: bool,
144
},
145
#[cfg(feature = "regex")]
146
EscapeRegex,
147
}
148
149
impl IRStringFunction {
150
pub(super) fn get_field(&self, mapper: FieldsMapper) -> PolarsResult<Field> {
151
use IRStringFunction::*;
152
match self {
153
Format { .. } => mapper.with_dtype(DataType::String),
154
#[cfg(feature = "concat_str")]
155
ConcatVertical { .. } | ConcatHorizontal { .. } => mapper.with_dtype(DataType::String),
156
#[cfg(feature = "regex")]
157
Contains { .. } => mapper.with_dtype(DataType::Boolean),
158
CountMatches(_) => mapper.with_dtype(DataType::UInt32),
159
EndsWith | StartsWith => mapper.with_dtype(DataType::Boolean),
160
Extract(_) => mapper.with_same_dtype(),
161
ExtractAll => mapper.with_dtype(DataType::List(Box::new(DataType::String))),
162
#[cfg(feature = "extract_groups")]
163
ExtractGroups { dtype, .. } => mapper.with_dtype(dtype.clone()),
164
#[cfg(feature = "string_to_integer")]
165
ToInteger { dtype, .. } => mapper.with_dtype(dtype.clone().unwrap_or(DataType::Int64)),
166
#[cfg(feature = "regex")]
167
Find { .. } => mapper.with_dtype(DataType::UInt32),
168
#[cfg(feature = "extract_jsonpath")]
169
JsonDecode(dtype) => mapper.with_dtype(dtype.clone()),
170
#[cfg(feature = "extract_jsonpath")]
171
JsonPathMatch => mapper.with_dtype(DataType::String),
172
LenBytes => mapper.with_dtype(DataType::UInt32),
173
LenChars => mapper.with_dtype(DataType::UInt32),
174
#[cfg(feature = "regex")]
175
Replace { .. } => mapper.with_same_dtype(),
176
#[cfg(feature = "string_normalize")]
177
Normalize { .. } => mapper.with_same_dtype(),
178
#[cfg(feature = "string_reverse")]
179
Reverse => mapper.with_same_dtype(),
180
#[cfg(feature = "temporal")]
181
Strptime(dtype, options) => match dtype {
182
#[cfg(feature = "dtype-datetime")]
183
DataType::Datetime(time_unit, time_zone) => {
184
let mut time_zone = time_zone.clone();
185
#[cfg(all(feature = "regex", feature = "timezones"))]
186
if options
187
.format
188
.as_ref()
189
.is_some_and(|format| TZ_AWARE_RE.is_match(format.as_str()))
190
&& time_zone.is_none()
191
{
192
time_zone = Some(time_zone.unwrap_or(TimeZone::UTC));
193
}
194
mapper.with_dtype(DataType::Datetime(*time_unit, time_zone))
195
},
196
_ => mapper.with_dtype(dtype.clone()),
197
},
198
Split(_) => mapper.with_dtype(DataType::List(DataType::String.into())),
199
#[cfg(feature = "regex")]
200
SplitRegex { .. } => mapper.with_dtype(DataType::List(DataType::String.into())),
201
#[cfg(feature = "nightly")]
202
Titlecase => mapper.with_same_dtype(),
203
#[cfg(feature = "dtype-decimal")]
204
ToDecimal { scale } => mapper.with_dtype(DataType::Decimal(DEC128_MAX_PREC, *scale)),
205
#[cfg(feature = "string_encoding")]
206
HexEncode => mapper.with_same_dtype(),
207
#[cfg(feature = "binary_encoding")]
208
HexDecode(_) => mapper.with_dtype(DataType::Binary),
209
#[cfg(feature = "string_encoding")]
210
Base64Encode => mapper.with_same_dtype(),
211
#[cfg(feature = "binary_encoding")]
212
Base64Decode(_) => mapper.with_dtype(DataType::Binary),
213
Uppercase | Lowercase | StripChars | StripCharsStart | StripCharsEnd | StripPrefix
214
| StripSuffix | Slice | Head | Tail => mapper.with_same_dtype(),
215
#[cfg(feature = "string_pad")]
216
PadStart { .. } | PadEnd { .. } | ZFill => mapper.with_same_dtype(),
217
#[cfg(feature = "dtype-struct")]
218
SplitExact { n, .. } => mapper.with_dtype(DataType::Struct(
219
(0..n + 1)
220
.map(|i| Field::new(format_pl_smallstr!("field_{i}"), DataType::String))
221
.collect(),
222
)),
223
#[cfg(feature = "dtype-struct")]
224
SplitN(n) => mapper.with_dtype(DataType::Struct(
225
(0..*n)
226
.map(|i| Field::new(format_pl_smallstr!("field_{i}"), DataType::String))
227
.collect(),
228
)),
229
#[cfg(feature = "find_many")]
230
ContainsAny { .. } => mapper.with_dtype(DataType::Boolean),
231
#[cfg(feature = "find_many")]
232
ReplaceMany { .. } => mapper.with_same_dtype(),
233
#[cfg(feature = "find_many")]
234
ExtractMany { .. } => mapper.with_dtype(DataType::List(Box::new(DataType::String))),
235
#[cfg(feature = "find_many")]
236
FindMany { .. } => mapper.with_dtype(DataType::List(Box::new(DataType::UInt32))),
237
#[cfg(feature = "regex")]
238
EscapeRegex => mapper.with_same_dtype(),
239
}
240
}
241
242
pub fn function_options(&self) -> FunctionOptions {
243
use IRStringFunction as S;
244
match self {
245
S::Format { .. } => FunctionOptions::elementwise(),
246
#[cfg(feature = "concat_str")]
247
S::ConcatHorizontal { .. } => FunctionOptions::elementwise()
248
.with_flags(|f| f | FunctionFlags::INPUT_WILDCARD_EXPANSION),
249
#[cfg(feature = "concat_str")]
250
S::ConcatVertical { .. } => FunctionOptions::aggregation(),
251
#[cfg(feature = "regex")]
252
S::Contains { .. } => {
253
FunctionOptions::elementwise().with_supertyping(Default::default())
254
},
255
S::CountMatches(_) => FunctionOptions::elementwise(),
256
S::EndsWith | S::StartsWith | S::Extract(_) => {
257
FunctionOptions::elementwise().with_supertyping(Default::default())
258
},
259
S::ExtractAll => FunctionOptions::elementwise(),
260
#[cfg(feature = "extract_groups")]
261
S::ExtractGroups { .. } => FunctionOptions::elementwise(),
262
#[cfg(feature = "string_to_integer")]
263
S::ToInteger { .. } => FunctionOptions::elementwise(),
264
#[cfg(feature = "regex")]
265
S::Find { .. } => FunctionOptions::elementwise().with_supertyping(Default::default()),
266
#[cfg(feature = "extract_jsonpath")]
267
S::JsonDecode { .. } => FunctionOptions::elementwise(),
268
#[cfg(feature = "extract_jsonpath")]
269
S::JsonPathMatch => FunctionOptions::elementwise(),
270
S::LenBytes | S::LenChars => FunctionOptions::elementwise(),
271
#[cfg(feature = "regex")]
272
S::Replace { .. } => {
273
FunctionOptions::elementwise().with_supertyping(Default::default())
274
},
275
#[cfg(feature = "string_normalize")]
276
S::Normalize { .. } => FunctionOptions::elementwise(),
277
#[cfg(feature = "string_reverse")]
278
S::Reverse => FunctionOptions::elementwise(),
279
#[cfg(feature = "temporal")]
280
S::Strptime(_, options) if options.format.is_some() => FunctionOptions::elementwise(),
281
#[cfg(feature = "temporal")]
282
S::Strptime(_, _) => FunctionOptions::elementwise_with_infer(),
283
S::Split(_) => FunctionOptions::elementwise(),
284
#[cfg(feature = "nightly")]
285
S::Titlecase => FunctionOptions::elementwise(),
286
#[cfg(feature = "dtype-decimal")]
287
S::ToDecimal { .. } => FunctionOptions::elementwise(),
288
#[cfg(feature = "string_encoding")]
289
S::HexEncode | S::Base64Encode => FunctionOptions::elementwise(),
290
#[cfg(feature = "binary_encoding")]
291
S::HexDecode(_) | S::Base64Decode(_) => FunctionOptions::elementwise(),
292
S::Uppercase | S::Lowercase => FunctionOptions::elementwise(),
293
S::StripChars
294
| S::StripCharsStart
295
| S::StripCharsEnd
296
| S::StripPrefix
297
| S::StripSuffix
298
| S::Head
299
| S::Tail => FunctionOptions::elementwise(),
300
S::Slice => FunctionOptions::elementwise(),
301
#[cfg(feature = "string_pad")]
302
S::PadStart { .. } | S::PadEnd { .. } | S::ZFill => FunctionOptions::elementwise(),
303
#[cfg(feature = "dtype-struct")]
304
S::SplitExact { .. } => FunctionOptions::elementwise(),
305
#[cfg(feature = "dtype-struct")]
306
S::SplitN(_) => FunctionOptions::elementwise(),
307
#[cfg(feature = "regex")]
308
S::SplitRegex { .. } => FunctionOptions::elementwise(),
309
#[cfg(feature = "find_many")]
310
S::ContainsAny { .. } => FunctionOptions::elementwise(),
311
#[cfg(feature = "find_many")]
312
S::ReplaceMany { .. } => FunctionOptions::elementwise(),
313
#[cfg(feature = "find_many")]
314
S::ExtractMany { .. } => FunctionOptions::elementwise(),
315
#[cfg(feature = "find_many")]
316
S::FindMany { .. } => FunctionOptions::elementwise(),
317
#[cfg(feature = "regex")]
318
S::EscapeRegex => FunctionOptions::elementwise(),
319
}
320
}
321
}
322
323
impl Display for IRStringFunction {
324
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
325
use IRStringFunction::*;
326
let s = match self {
327
Format { .. } => "format",
328
#[cfg(feature = "regex")]
329
Contains { .. } => "contains",
330
CountMatches(_) => "count_matches",
331
EndsWith => "ends_with",
332
Extract(_) => "extract",
333
#[cfg(feature = "concat_str")]
334
ConcatHorizontal { .. } => "concat_horizontal",
335
#[cfg(feature = "concat_str")]
336
ConcatVertical { .. } => "concat_vertical",
337
ExtractAll => "extract_all",
338
#[cfg(feature = "extract_groups")]
339
ExtractGroups { .. } => "extract_groups",
340
#[cfg(feature = "string_to_integer")]
341
ToInteger { .. } => "to_integer",
342
#[cfg(feature = "regex")]
343
Find { .. } => "find",
344
Head => "head",
345
Tail => "tail",
346
#[cfg(feature = "extract_jsonpath")]
347
JsonDecode(..) => "json_decode",
348
#[cfg(feature = "extract_jsonpath")]
349
JsonPathMatch => "json_path_match",
350
LenBytes => "len_bytes",
351
Lowercase => "to_lowercase",
352
LenChars => "len_chars",
353
#[cfg(feature = "string_pad")]
354
PadEnd { .. } => "pad_end",
355
#[cfg(feature = "string_pad")]
356
PadStart { .. } => "pad_start",
357
#[cfg(feature = "regex")]
358
Replace { .. } => "replace",
359
#[cfg(feature = "string_normalize")]
360
Normalize { .. } => "normalize",
361
#[cfg(feature = "string_reverse")]
362
Reverse => "reverse",
363
#[cfg(feature = "string_encoding")]
364
HexEncode => "hex_encode",
365
#[cfg(feature = "binary_encoding")]
366
HexDecode(_) => "hex_decode",
367
#[cfg(feature = "string_encoding")]
368
Base64Encode => "base64_encode",
369
#[cfg(feature = "binary_encoding")]
370
Base64Decode(_) => "base64_decode",
371
Slice => "slice",
372
StartsWith => "starts_with",
373
StripChars => "strip_chars",
374
StripCharsStart => "strip_chars_start",
375
StripCharsEnd => "strip_chars_end",
376
StripPrefix => "strip_prefix",
377
StripSuffix => "strip_suffix",
378
#[cfg(feature = "dtype-struct")]
379
SplitExact { inclusive, .. } => {
380
if *inclusive {
381
"split_exact_inclusive"
382
} else {
383
"split_exact"
384
}
385
},
386
#[cfg(feature = "dtype-struct")]
387
SplitN(_) => "splitn",
388
#[cfg(feature = "temporal")]
389
Strptime(_, _) => "strptime",
390
Split(inclusive) => {
391
if *inclusive {
392
"split_inclusive"
393
} else {
394
"split"
395
}
396
},
397
#[cfg(feature = "regex")]
398
SplitRegex { inclusive, .. } => {
399
if *inclusive {
400
"split_regex_inclusive"
401
} else {
402
"split_regex"
403
}
404
},
405
#[cfg(feature = "nightly")]
406
Titlecase => "to_titlecase",
407
#[cfg(feature = "dtype-decimal")]
408
ToDecimal { .. } => "to_decimal",
409
Uppercase => "to_uppercase",
410
#[cfg(feature = "string_pad")]
411
ZFill => "zfill",
412
#[cfg(feature = "find_many")]
413
ContainsAny { .. } => "contains_any",
414
#[cfg(feature = "find_many")]
415
ReplaceMany { .. } => "replace_many",
416
#[cfg(feature = "find_many")]
417
ExtractMany { .. } => "extract_many",
418
#[cfg(feature = "find_many")]
419
FindMany { .. } => "extract_many",
420
#[cfg(feature = "regex")]
421
EscapeRegex => "escape_regex",
422
};
423
write!(f, "str.{s}")
424
}
425
}
426
427
/// Wraps a string function in the `StringExpr` variant of `IRFunctionExpr`.
impl From<IRStringFunction> for IRFunctionExpr {
    fn from(string_function: IRStringFunction) -> Self {
        Self::StringExpr(string_function)
    }
}
432
433