# Source: pola-rs/polars — py-polars/tests/unit/operations/namespaces/string/test_string.py
1
from __future__ import annotations

from typing import Any

import pytest

import polars as pl
import polars.selectors as cs
from polars.exceptions import (
    ColumnNotFoundError,
    ComputeError,
    InvalidOperationError,
    PolarsInefficientMapWarning,
    ShapeError,
)
from polars.testing import assert_frame_equal, assert_series_equal


def test_str_slice() -> None:
    """Slicing with a negative offset and with an (offset, length) pair."""
    df = pl.DataFrame({"a": ["foobar", "barfoo"]})
    assert df["a"].str.slice(-3).to_list() == ["bar", "foo"]
    assert df.select([pl.col("a").str.slice(2, 4)])["a"].to_list() == ["obar", "rfoo"]


def test_str_slice_expr() -> None:
    """Offset/length may each be a column expression, a literal, or None."""
    df = pl.DataFrame(
        {
            "a": ["foobar", None, "barfoo", "abcd", ""],
            "offset": [1, 3, None, -3, 2],
            "length": [3, 4, 2, None, 2],
        }
    )
    out = df.select(
        all_expr=pl.col("a").str.slice("offset", "length"),
        offset_expr=pl.col("a").str.slice("offset", 2),
        length_expr=pl.col("a").str.slice(0, "length"),
        length_none=pl.col("a").str.slice("offset", None),
        offset_length_lit=pl.col("a").str.slice(-3, 3),
        str_lit=pl.lit("qwert").str.slice("offset", "length"),
    )
    expected = pl.DataFrame(
        {
            "all_expr": ["oob", None, None, "bcd", ""],
            "offset_expr": ["oo", None, None, "bc", ""],
            "length_expr": ["foo", None, "ba", "abcd", ""],
            "length_none": ["oobar", None, None, "bcd", ""],
            "offset_length_lit": ["bar", None, "foo", "bcd", ""],
            "str_lit": ["wer", "rt", None, "ert", "er"],
        }
    )
    assert_frame_equal(out, expected)

    # negative length is not allowed
    with pytest.raises(InvalidOperationError):
        df.select(pl.col("a").str.slice(0, -1))


def test_str_slice_wrong_length() -> None:
    # offset Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.slice(pl.Series([1, 2])))


@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        (["012345", "", None], 0, ["", "", None]),
        (["012345", "", None], 2, ["01", "", None]),
        (["012345", "", None], -2, ["0123", "", None]),
        (["012345", "", None], 100, ["012345", "", None]),
        (["012345", "", None], -100, ["", "", None]),
    ],
)
def test_str_head(input: list[str], n: int, output: list[str]) -> None:
    """head(n): positive keeps first n chars, negative drops last |n|."""
    assert pl.Series(input).str.head(n).to_list() == output


@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        ("你好世界", 0, ""),
        ("你好世界", 2, "你好"),
        ("你好世界", 999, "你好世界"),
        ("你好世界", -1, "你好世"),
        ("你好世界", -2, "你好"),
        ("你好世界", -999, ""),
    ],
)
def test_str_head_codepoints(input: str, n: int, output: str) -> None:
    # head counts Unicode codepoints, not bytes
    assert pl.Series([input]).str.head(n).to_list() == [output]


def test_str_head_expr() -> None:
    """n may be a column, literal expression, or plain int; nulls propagate."""
    s = "012345"
    df = pl.DataFrame(
        {"a": [s, s, s, s, s, s, "", None], "n": [0, 2, -2, 100, -100, None, 3, -2]}
    )
    out = df.select(
        n_expr=pl.col("a").str.head("n"),
        n_pos2=pl.col("a").str.head(2),
        n_neg2=pl.col("a").str.head(-2),
        n_pos100=pl.col("a").str.head(100),
        n_pos_neg100=pl.col("a").str.head(-100),
        n_pos_0=pl.col("a").str.head(0),
        str_lit=pl.col("a").str.head(pl.lit(2)),
        lit_expr=pl.lit(s).str.head("n"),
        lit_n=pl.lit(s).str.head(2),
    )
    expected = pl.DataFrame(
        {
            "n_expr": ["", "01", "0123", "012345", "", None, "", None],
            "n_pos2": ["01", "01", "01", "01", "01", "01", "", None],
            "n_neg2": ["0123", "0123", "0123", "0123", "0123", "0123", "", None],
            "n_pos100": [s, s, s, s, s, s, "", None],
            "n_pos_neg100": ["", "", "", "", "", "", "", None],
            "n_pos_0": ["", "", "", "", "", "", "", None],
            "str_lit": ["01", "01", "01", "01", "01", "01", "", None],
            "lit_expr": ["", "01", "0123", "012345", "", None, "012", "0123"],
            "lit_n": ["01", "01", "01", "01", "01", "01", "01", "01"],
        }
    )
    assert_frame_equal(out, expected)


def test_str_head_wrong_length() -> None:
    # n Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.head(pl.Series([1, 2])))


@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        (["012345", "", None], 0, ["", "", None]),
        (["012345", "", None], 2, ["45", "", None]),
        (["012345", "", None], -2, ["2345", "", None]),
        (["012345", "", None], 100, ["012345", "", None]),
        (["012345", "", None], -100, ["", "", None]),
    ],
)
def test_str_tail(input: list[str], n: int, output: list[str]) -> None:
    """tail(n): positive keeps last n chars, negative drops first |n|."""
    assert pl.Series(input).str.tail(n).to_list() == output


@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        ("你好世界", 0, ""),
        ("你好世界", 2, "世界"),
        ("你好世界", 999, "你好世界"),
        ("你好世界", -1, "好世界"),
        ("你好世界", -2, "世界"),
        ("你好世界", -999, ""),
    ],
)
def test_str_tail_codepoints(input: str, n: int, output: str) -> None:
    # tail counts Unicode codepoints, not bytes
    assert pl.Series([input]).str.tail(n).to_list() == [output]


def test_str_tail_expr() -> None:
    """n may be a column, literal expression, or plain int; nulls propagate."""
    s = "012345"
    df = pl.DataFrame(
        {"a": [s, s, s, s, s, s, "", None], "n": [0, 2, -2, 100, -100, None, 3, -2]}
    )
    out = df.select(
        n_expr=pl.col("a").str.tail("n"),
        n_pos2=pl.col("a").str.tail(2),
        n_neg2=pl.col("a").str.tail(-2),
        n_pos100=pl.col("a").str.tail(100),
        n_pos_neg100=pl.col("a").str.tail(-100),
        n_pos_0=pl.col("a").str.tail(0),
        str_lit=pl.col("a").str.tail(pl.lit(2)),
        lit_expr=pl.lit(s).str.tail("n"),
        lit_n=pl.lit(s).str.tail(2),
    )
    expected = pl.DataFrame(
        {
            "n_expr": ["", "45", "2345", "012345", "", None, "", None],
            "n_pos2": ["45", "45", "45", "45", "45", "45", "", None],
            "n_neg2": ["2345", "2345", "2345", "2345", "2345", "2345", "", None],
            "n_pos100": [s, s, s, s, s, s, "", None],
            "n_pos_neg100": ["", "", "", "", "", "", "", None],
            "n_pos_0": ["", "", "", "", "", "", "", None],
            "str_lit": ["45", "45", "45", "45", "45", "45", "", None],
            "lit_expr": ["", "45", "2345", "012345", "", None, "345", "2345"],
            "lit_n": ["45", "45", "45", "45", "45", "45", "45", "45"],
        }
    )
    assert_frame_equal(out, expected)


def test_str_tail_wrong_length() -> None:
    # n Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.tail(pl.Series([1, 2])))


def test_str_slice_multibyte() -> None:
    """Exhaustively check slice offsets/lengths against Python slicing semantics."""
    ref = "你好世界"
    s = pl.Series([ref])

    # Pad the string to simplify (negative) offsets starting before/after the string.
    npad = 20
    padref = "_" * npad + ref + "_" * npad
    for start in range(-5, 6):
        for length in range(6):
            offset = npad + start if start >= 0 else npad + start + len(ref)
            correct = padref[offset : offset + length].strip("_")
            result = s.str.slice(start, length)
            expected = pl.Series([correct])
            assert_series_equal(result, expected)


def test_str_len_bytes() -> None:
    # length in UTF-8 bytes; nulls propagate; dtype is UInt32
    s = pl.Series(["Café", None, "345", "東京"])
    result = s.str.len_bytes()
    expected = pl.Series([5, None, 3, 6], dtype=pl.UInt32)
    assert_series_equal(result, expected)


def test_str_len_chars() -> None:
    # length in Unicode codepoints; nulls propagate; dtype is UInt32
    s = pl.Series(["Café", None, "345", "東京"])
    result = s.str.len_chars()
    expected = pl.Series([4, None, 3, 2], dtype=pl.UInt32)
    assert_series_equal(result, expected)


def test_str_contains() -> None:
    s = pl.Series(["messi", "ronaldo", "ibrahimovic"])
    expected = pl.Series([True, False, False])
    assert_series_equal(s.str.contains("mes"), expected)


def test_str_contains_wrong_length() -> None:
    # pattern Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.contains(pl.Series(["a", "b"])))  # type: ignore[arg-type]


def test_count_match_literal() -> None:
    """literal=True treats the pattern as plain text, not a regex."""
    s = pl.Series(["12 dbc 3xy", "cat\\w", "1zy3\\d\\d", None])
    out = s.str.count_matches(r"\d", literal=True)
    expected = pl.Series([0, 0, 2, None], dtype=pl.UInt32)
    assert_series_equal(out, expected)

    # per-row patterns via a Series argument
    out = s.str.count_matches(pl.Series([r"\w", r"\w", r"\d", r"\d"]), literal=True)
    expected = pl.Series([0, 1, 2, None], dtype=pl.UInt32)
    assert_series_equal(out, expected)


def test_str_encode() -> None:
    """hex/base64 encoding; unsupported encodings raise ValueError."""
    s = pl.Series(["foo", "bar", None])
    hex_encoded = pl.Series(["666f6f", "626172", None])
    base64_encoded = pl.Series(["Zm9v", "YmFy", None])

    assert_series_equal(s.str.encode("hex"), hex_encoded)
    assert_series_equal(s.str.encode("base64"), base64_encoded)
    with pytest.raises(ValueError):
        s.str.encode("utf8")  # type: ignore[arg-type]


def test_str_decode() -> None:
    # decoding yields Binary values
    hex_encoded = pl.Series(["666f6f", "626172", None])
    base64_encoded = pl.Series(["Zm9v", "YmFy", None])
    expected = pl.Series([b"foo", b"bar", None])

    assert_series_equal(hex_encoded.str.decode("hex"), expected)
    assert_series_equal(base64_encoded.str.decode("base64"), expected)


def test_str_decode_exception() -> None:
    # invalid payloads raise ComputeError; unsupported encodings raise ValueError
    s = pl.Series(["not a valid", "626172", None])
    with pytest.raises(ComputeError):
        s.str.decode(encoding="hex")
    with pytest.raises(ComputeError):
        s.str.decode(encoding="base64")
    with pytest.raises(ValueError):
        s.str.decode("utf8")  # type: ignore[arg-type]


@pytest.mark.parametrize("strict", [True, False])
def test_str_find(strict: bool) -> None:
    """str.find with regex/literal patterns, column patterns, and strict mode."""
    df = pl.DataFrame(
        data=[
            ("Dubai", 3564931, "b[ai]", "ai"),
            ("Abu Dhabi", 1807000, "b[ai]", " "),
            ("Sharjah", 1405000, "[ai]n", "s"),
            ("Al Ain", 846747, "[ai]n", ""),
            ("Ajman", 490035, "[ai]n", "ma"),
            ("Ras Al Khaimah", 191753, "a.+a", "Kha"),
            ("Fujairah", 118933, "a.+a", None),
            ("Umm Al Quwain", 59098, "a.+a", "wa"),
            (None, None, None, "n/a"),
        ],
        schema={
            "city": pl.String,
            "population": pl.Int32,
            "pat": pl.String,
            "lit": pl.String,
        },
        orient="row",
    )
    city, pop, pat, lit = (pl.col(c) for c in ("city", "population", "pat", "lit"))

    # the 'literal' patterns below contain no regex metachars, so the result
    # must be the same whether or not they are matched literally
    for match_lit in (True, False):
        res = df.select(
            find_a_regex=city.str.find("(?i)a", strict=strict),
            find_a_lit=city.str.find("a", literal=match_lit),
            find_00_lit=pop.cast(pl.String).str.find("00", literal=match_lit),
            find_col_lit=city.str.find(lit, strict=strict, literal=match_lit),
            find_col_pat=city.str.find(pat, strict=strict),
        )
        assert res.to_dict(as_series=False) == {
            "find_a_regex": [3, 0, 2, 0, 0, 1, 3, 4, None],
            "find_a_lit": [3, 6, 2, None, 3, 1, 3, 10, None],
            "find_00_lit": [None, 4, 4, None, 2, None, None, None, None],
            "find_col_lit": [3, 3, None, 0, 2, 7, None, 9, None],
            "find_col_pat": [2, 7, None, 4, 3, 1, 3, None, None],
        }


def test_str_find_invalid_regex() -> None:
    # test behaviour of 'strict' with invalid regular expressions
    df = pl.DataFrame({"txt": ["AbCdEfG"]})
    rx_invalid = "(?i)AB.))"

    with pytest.raises(ComputeError):
        df.with_columns(pl.col("txt").str.find(rx_invalid, strict=True))

    # non-strict: invalid pattern yields null instead of raising
    res = df.with_columns(pl.col("txt").str.find(rx_invalid, strict=False))
    assert res.item() is None


def test_str_find_escaped_chars() -> None:
    # test behaviour of 'literal=True' with special chars
    df = pl.DataFrame({"txt": ["123.*465", "x(x?)x"]})

    res = df.with_columns(
        x1=pl.col("txt").str.find("(x?)", literal=True),
        x2=pl.col("txt").str.find(".*4", literal=True),
        x3=pl.col("txt").str.find("(x?)"),
        x4=pl.col("txt").str.find(".*4"),
    )
    # ┌──────────┬──────┬──────┬─────┬──────┐
    # │ txt      ┆ x1   ┆ x2   ┆ x3  ┆ x4   │
    # │ ---      ┆ ---  ┆ ---  ┆ --- ┆ ---  │
    # │ str      ┆ u32  ┆ u32  ┆ u32 ┆ u32  │
    # ╞══════════╪══════╪══════╪═════╪══════╡
    # │ 123.*465 ┆ null ┆ 3    ┆ 0   ┆ 0    │
    # │ x(x?)x   ┆ 1    ┆ null ┆ 0   ┆ null │
    # └──────────┴──────┴──────┴─────┴──────┘
    assert_frame_equal(
        pl.DataFrame(
            {
                "txt": ["123.*465", "x(x?)x"],
                "x1": [None, 1],
                "x2": [3, None],
                "x3": [0, 0],
                "x4": [0, None],
            }
        ).cast({cs.signed_integer(): pl.UInt32}),
        res,
    )


def test_str_find_wrong_length() -> None:
    # pattern Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.find(pl.Series(["a", "b"])))  # type: ignore[arg-type]


def test_hex_decode_return_dtype() -> None:
    # eager and lazy schemas agree: decode("hex") yields Binary
    data = {"a": ["68656c6c6f", "776f726c64"]}
    expr = pl.col("a").str.decode("hex")

    df = pl.DataFrame(data).select(expr)
    assert df.schema == {"a": pl.Binary}

    ldf = pl.LazyFrame(data).select(expr)
    assert ldf.collect_schema() == {"a": pl.Binary}


def test_base64_decode_return_dtype() -> None:
    # eager and lazy schemas agree: decode("base64") yields Binary
    data = {"a": ["Zm9v", "YmFy"]}
    expr = pl.col("a").str.decode("base64")

    df = pl.DataFrame(data).select(expr)
    assert df.schema == {"a": pl.Binary}

    ldf = pl.LazyFrame(data).select(expr)
    assert ldf.collect_schema() == {"a": pl.Binary}


def test_str_replace_str_replace_all() -> None:
    # replace: first occurrence only; replace_all: every occurrence
    s = pl.Series(["hello", "world", "test", "rooted"])
    expected = pl.Series(["hell0", "w0rld", "test", "r0oted"])
    assert_series_equal(s.str.replace("o", "0"), expected)

    expected = pl.Series(["hell0", "w0rld", "test", "r00ted"])
    assert_series_equal(s.str.replace_all("o", "0"), expected)


def test_str_replace_n_single() -> None:
    # n bounds the number of replacements per string
    s = pl.Series(["aba", "abaa"])

    assert s.str.replace("a", "b", n=1).to_list() == ["bba", "bbaa"]
    assert s.str.replace("a", "b", n=2).to_list() == ["bbb", "bbba"]
    assert s.str.replace("a", "b", n=3).to_list() == ["bbb", "bbbb"]


def test_str_replace_n_same_length() -> None:
    # pat and val have the same length
    # this triggers a fast path
    s = pl.Series(["abfeab", "foobarabfooabab"])
    assert s.str.replace("ab", "AB", n=1).to_list() == ["ABfeab", "foobarABfooabab"]
    assert s.str.replace("ab", "AB", n=2).to_list() == ["ABfeAB", "foobarABfooABab"]
    assert s.str.replace("ab", "AB", n=3).to_list() == ["ABfeAB", "foobarABfooABAB"]


def test_str_to_lowercase() -> None:
    s = pl.Series(["Hello", "WORLD"])
    expected = pl.Series(["hello", "world"])
    assert_series_equal(s.str.to_lowercase(), expected)


def test_str_to_uppercase() -> None:
    s = pl.Series(["Hello", "WORLD"])
    expected = pl.Series(["HELLO", "WORLD"])
    assert_series_equal(s.str.to_uppercase(), expected)


def test_str_case_cyrillic() -> None:
    # case mapping must agree with Python's str.lower/str.upper for non-ASCII
    vals = ["Biтpyк", "Iвaн"]
    s = pl.Series(vals)
    assert s.str.to_lowercase().to_list() == [a.lower() for a in vals]
    assert s.str.to_uppercase().to_list() == [a.upper() for a in vals]


def test_str_to_integer() -> None:
    """Parse strings as integers in a given base; strict mode raises on failure."""
    # renamed locals to avoid shadowing the `bin`/`hex` builtins
    binary = pl.Series(["110", "101", "010"])
    assert_series_equal(
        binary.str.to_integer(base=2), pl.Series([6, 5, 2]).cast(pl.Int64)
    )

    hexadecimal = pl.Series(["fa1e", "ff00", "cafe", "invalid", None])
    assert_series_equal(
        hexadecimal.str.to_integer(base=16, strict=False),
        pl.Series([64030, 65280, 51966, None, None]).cast(pl.Int64),
        check_exact=True,
    )

    with pytest.raises(ComputeError):
        hexadecimal.str.to_integer(base=16)


@pytest.mark.parametrize("strict", [False, True])
def test_str_to_integer_invalid_base(strict: bool) -> None:
    # an out-of-range base raises regardless of 'strict'
    numbers = pl.Series(["1", "ZZZ", "-ABCZZZ", None])
    with pytest.raises(ComputeError):
        numbers.str.to_integer(base=100, strict=strict)

    df = pl.DataFrame({"str": numbers, "base": [0, 1, 100, None]})
    with pytest.raises(ComputeError):
        df.select(pl.col("str").str.to_integer(base=pl.col("base"), strict=strict))


def test_str_to_integer_base_expr() -> None:
    """The base may be given per-row via a column expression."""
    df = pl.DataFrame(
        {"str": ["110", "ff00", "234", None, "130"], "base": [2, 16, 10, 8, None]}
    )
    out = df.select(base_expr=pl.col("str").str.to_integer(base="base"))
    expected = pl.DataFrame({"base_expr": [6, 65280, 234, None, None]})
    assert_frame_equal(out, expected)

    # test strict raise
    df = pl.DataFrame({"str": ["110", "ff00", "cafe", None], "base": [2, 10, 10, 8]})

    with pytest.raises(ComputeError):
        df.select(pl.col("str").str.to_integer(base="base"))


def test_str_to_integer_base_literal() -> None:
    """Literal base with strict=False yields nulls for unparseable values."""
    df = pl.DataFrame(
        {
            "bin": ["110", "101", "-010", "invalid", None],
            "hex": ["fa1e", "ff00", "cafe", "invalid", None],
        }
    )
    result = df.with_columns(
        pl.col("bin").str.to_integer(base=2, strict=False),
        pl.col("hex").str.to_integer(base=16, strict=False),
    )

    expected = pl.DataFrame(
        {
            "bin": [6, 5, -2, None, None],
            "hex": [64030, 65280, 51966, None, None],
        }
    )
    assert_frame_equal(result, expected)

    with pytest.raises(ComputeError):
        df.with_columns(
            pl.col("bin").str.to_integer(base=2),
            pl.col("hex").str.to_integer(base=16),
        )


def test_str_to_integer_dtype() -> None:
    """The 'dtype' parameter controls the output integer type (default Int64)."""
    lf = pl.LazyFrame(
        {
            "str": ["1111111", "7f", "127", None, "42"],
            "base": [2, 16, 10, 8, None],
        }
    )
    out = lf.select(
        i8=pl.col("str").str.to_integer(base="base", dtype=pl.Int8),
        i16=pl.col("str").str.to_integer(base="base", dtype=pl.Int16),
        i32=pl.col("str").str.to_integer(base="base", dtype=pl.Int32),
        i64=pl.col("str").str.to_integer(base="base", dtype=pl.Int64),
        u8=pl.col("str").str.to_integer(base="base", dtype=pl.UInt8),
        u16=pl.col("str").str.to_integer(base="base", dtype=pl.UInt16),
        u32=pl.col("str").str.to_integer(base="base", dtype=pl.UInt32),
        u64=pl.col("str").str.to_integer(base="base", dtype=pl.UInt64),
        default=pl.col("str").str.to_integer(base="base"),
    ).collect()

    expected = pl.DataFrame(
        {
            "i8": [127, 127, 127, None, None],
            "i16": [127, 127, 127, None, None],
            "i32": [127, 127, 127, None, None],
            "i64": [127, 127, 127, None, None],
            "u8": [127, 127, 127, None, None],
            "u16": [127, 127, 127, None, None],
            "u32": [127, 127, 127, None, None],
            "u64": [127, 127, 127, None, None],
            "default": [127, 127, 127, None, None],
        },
        schema={
            "i8": pl.Int8,
            "i16": pl.Int16,
            "i32": pl.Int32,
            "i64": pl.Int64,
            "u8": pl.UInt8,
            "u16": pl.UInt16,
            "u32": pl.UInt32,
            "u64": pl.UInt64,
            "default": pl.Int64,
        },
    )
    # the lazily-resolved schema must match the materialized one
    assert lf.collect_schema() == lf.collect().schema
    assert_frame_equal(out, expected)


def test_str_to_integer_large() -> None:
    """Int128 parsing of values beyond the Int64 range."""
    df = pl.DataFrame(
        {
            "str": [
                "-6129899454972456276923959272",
                "1A44E53BFEBA967E6682FBB0",
                "10100110111110110101110100000100110010101111000100011000000100010101010101101011111111101000",
                None,
                "7798994549724957734429272",
            ],
            "base": [10, 16, 2, 8, None],
        }
    )
    out = df.select(i128=pl.col("str").str.to_integer(base="base", dtype=pl.Int128))
    expected = pl.DataFrame(
        {
            "i128": [
                -6129899454972456276923959272,
                8129899739726392769273592752,
                3229899454972495776923959272,
                None,
                None,
            ]
        },
        schema={"i128": pl.Int128},
    )
    assert_frame_equal(out, expected)

    # test strict raise
    df = pl.DataFrame(
        {
            "i128": [
                "612989945497245627692395927261298994549724562769239592726129899454972456276923959272",
                "1A44E53BFEBA967E6682FBB0",
                "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
                "7798994549724957734429272",
                None,
                "7798994549724957734429272",
            ],
            "base": [10, 2, 16, 10, 8, None],
        }
    )

    with pytest.raises(ComputeError):
        df.select(pl.col("i128").str.to_integer(base="base", dtype=pl.Int128))


def test_str_strip_chars_expr() -> None:
    """strip_chars/_start/_end with per-row character sets and null patterns."""
    df = pl.DataFrame(
        {
            "s": [" hello ", "^^world^^", "&&hi&&", "  polars  ", None],
            "pat": [" ", "^", "&", None, "anything"],
        }
    )

    all_expr = df.select(
        pl.col("s").str.strip_chars(pl.col("pat")).alias("strip_chars"),
        pl.col("s").str.strip_chars_start(pl.col("pat")).alias("strip_chars_start"),
        pl.col("s").str.strip_chars_end(pl.col("pat")).alias("strip_chars_end"),
    )

    expected = pl.DataFrame(
        {
            "strip_chars": ["hello", "world", "hi", "polars", None],
            "strip_chars_start": ["hello ", "world^^", "hi&&", "polars  ", None],
            "strip_chars_end": [" hello", "^^world", "&&hi", "  polars", None],
        }
    )

    assert_frame_equal(all_expr, expected)

    strip_by_null = df.select(
        pl.col("s").str.strip_chars(None).alias("strip_chars"),
        pl.col("s").str.strip_chars_start(None).alias("strip_chars_start"),
        pl.col("s").str.strip_chars_end(None).alias("strip_chars_end"),
    )

    # with a null pattern, only whitespace is stripped
    expected = pl.DataFrame(
        {
            "strip_chars": ["hello", "^^world^^", "&&hi&&", "polars", None],
            "strip_chars_start": ["hello ", "^^world^^", "&&hi&&", "polars  ", None],
            "strip_chars_end": [" hello", "^^world^^", "&&hi&&", "  polars", None],
        }
    )
    assert_frame_equal(strip_by_null, expected)


def test_str_strip_chars() -> None:
    s = pl.Series([" hello ", "world\t "])
    expected = pl.Series(["hello", "world"])
    assert_series_equal(s.str.strip_chars(), expected)

    expected = pl.Series(["hell", "world"])
    assert_series_equal(s.str.strip_chars().str.strip_chars("o"), expected)

    expected = pl.Series(["ell", "rld\t"])
    assert_series_equal(s.str.strip_chars(" hwo"), expected)


def test_str_strip_chars_wrong_length() -> None:
    # pattern Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_chars(pl.Series(["a", "b"])))


def test_str_strip_chars_start() -> None:
    s = pl.Series([" hello ", "\t world"])
    expected = pl.Series(["hello ", "world"])
    assert_series_equal(s.str.strip_chars_start(), expected)

    expected = pl.Series(["ello ", "world"])
    assert_series_equal(s.str.strip_chars_start().str.strip_chars_start("h"), expected)

    expected = pl.Series(["ello ", "\t world"])
    assert_series_equal(s.str.strip_chars_start("hw "), expected)


def test_str_strip_chars_start_wrong_length() -> None:
    # pattern Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_chars_start(pl.Series(["a", "b"])))


def test_str_strip_chars_end() -> None:
    s = pl.Series([" hello ", "world\t "])
    expected = pl.Series([" hello", "world"])
    assert_series_equal(s.str.strip_chars_end(), expected)

    expected = pl.Series([" hell", "world"])
    assert_series_equal(s.str.strip_chars_end().str.strip_chars_end("o"), expected)

    expected = pl.Series([" he", "wor"])
    assert_series_equal(s.str.strip_chars_end("odl \t"), expected)


def test_str_strip_chars_end_wrong_length() -> None:
    # pattern Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_chars_end(pl.Series(["a", "b"])))


def test_str_strip_whitespace() -> None:
    s = pl.Series("a", ["trailing  ", "  leading", "  both  "])

    expected = pl.Series("a", ["trailing", "  leading", "  both"])
    assert_series_equal(s.str.strip_chars_end(), expected)

    expected = pl.Series("a", ["trailing  ", "leading", "both  "])
    assert_series_equal(s.str.strip_chars_start(), expected)

    expected = pl.Series("a", ["trailing", "leading", "both"])
    assert_series_equal(s.str.strip_chars(), expected)


def test_str_strip_prefix_literal() -> None:
    # a single leading occurrence of the prefix is removed
    s = pl.Series(["foo:bar", "foofoo:bar", "bar:bar", "foo", "", None])
    expected = pl.Series([":bar", "foo:bar", "bar:bar", "", "", None])
    assert_series_equal(s.str.strip_prefix("foo"), expected)
    # test null literal
    expected = pl.Series([None, None, None, None, None, None], dtype=pl.String)
    assert_series_equal(s.str.strip_prefix(pl.lit(None, dtype=pl.String)), expected)


def test_str_strip_prefix_suffix_expr() -> None:
    # prefix/suffix may come from columns; a null pattern yields null
    df = pl.DataFrame(
        {
            "s": ["foo-bar", "foobarbar", "barfoo", "", "anything", None],
            "prefix": ["foo", "foobar", "foo", "", None, "bar"],
            "suffix": ["bar", "barbar", "bar", "", None, "foo"],
        }
    )
    out = df.select(
        pl.col("s").str.strip_prefix(pl.col("prefix")).alias("strip_prefix"),
        pl.col("s").str.strip_suffix(pl.col("suffix")).alias("strip_suffix"),
    )
    assert out.to_dict(as_series=False) == {
        "strip_prefix": ["-bar", "bar", "barfoo", "", None, None],
        "strip_suffix": ["foo-", "foo", "barfoo", "", None, None],
    }


def test_str_strip_prefix_wrong_length() -> None:
    # prefix Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_prefix(pl.Series(["a", "b"])))


def test_str_strip_suffix() -> None:
    # a single trailing occurrence of the suffix is removed
    s = pl.Series(["foo:bar", "foo:barbar", "foo:foo", "bar", "", None])
    expected = pl.Series(["foo:", "foo:bar", "foo:foo", "", "", None])
    assert_series_equal(s.str.strip_suffix("bar"), expected)
    # test null literal
    expected = pl.Series([None, None, None, None, None, None], dtype=pl.String)
    assert_series_equal(s.str.strip_suffix(pl.lit(None, dtype=pl.String)), expected)


def test_str_strip_suffix_wrong_length() -> None:
    # suffix Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_suffix(pl.Series(["a", "b"])))


def test_str_split() -> None:
    """Splitting, with and without keeping the separator (inclusive=True)."""
    a = pl.Series("a", ["a, b", "a", "ab,c,de"])
    # exercise both the Series API and the expression API
    for out in [a.str.split(","), pl.select(pl.lit(a).str.split(",")).to_series()]:
        assert out[0].to_list() == ["a", " b"]
        assert out[1].to_list() == ["a"]
        assert out[2].to_list() == ["ab", "c", "de"]

    for out in [
        a.str.split(",", inclusive=True),
        pl.select(pl.lit(a).str.split(",", inclusive=True)).to_series(),
    ]:
        assert out[0].to_list() == ["a,", " b"]
        assert out[1].to_list() == ["a"]
        assert out[2].to_list() == ["ab,", "c,", "de"]


def test_json_decode_series() -> None:
    """json_decode with inferred and explicit dtypes, including an empty Series."""
    s = pl.Series(["[1, 2, 3]", None, "[4, 5, 6]"])
    expected = pl.Series([[1, 2, 3], None, [4, 5, 6]])
    dtype = pl.List(pl.Int64)
    assert_series_equal(s.str.json_decode(None), expected)
    assert_series_equal(s.str.json_decode(dtype), expected)

    s = pl.Series(['{"a": 1, "b": true}', None, '{"a": 2, "b": false}'])
    expected = pl.Series([{"a": 1, "b": True}, None, {"a": 2, "b": False}])
    dtype2 = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])
    assert_series_equal(s.str.json_decode(None), expected)
    assert_series_equal(s.str.json_decode(dtype2), expected)

    # a narrower struct dtype drops the fields it does not declare
    expected = pl.Series([{"a": 1}, None, {"a": 2}])
    dtype2 = pl.Struct([pl.Field("a", pl.Int64)])
    assert_series_equal(s.str.json_decode(dtype2), expected)

    s = pl.Series([], dtype=pl.String)
    expected = pl.Series([], dtype=pl.List(pl.Int64))
    dtype = pl.List(pl.Int64)
    assert_series_equal(s.str.json_decode(dtype), expected)


def test_json_decode_lazy_expr() -> None:
    # lazy schema resolution must match the decoded dtype
    dtype = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])
    ldf = (
        pl.DataFrame({"json": ['{"a": 1, "b": true}', None, '{"a": 2, "b": false}']})
        .lazy()
        .select(pl.col("json").str.json_decode(dtype))
    )
    expected = pl.DataFrame(
        {"json": [{"a": 1, "b": True}, None, {"a": 2, "b": False}]}
    ).lazy()
    assert ldf.collect_schema() == {"json": dtype}
    assert_frame_equal(ldf, expected)


def test_json_decode_nested_struct() -> None:
    """Inference unifies struct fields across rows into one superset schema."""
    json = [
        '[{"key_1": "a"}]',
        '[{"key_1": "a2", "key_2": 2}]',
        '[{"key_1": "a3", "key_2": 3, "key_3": "c"}]',
    ]
    s = pl.Series("json_str", json)
    s_parsed = s.str.json_decode().rename("parsed_list_json")

    expected_dtype = pl.List(
        pl.Struct(
            [
                pl.Field("key_1", pl.String),
                pl.Field("key_2", pl.Int64),
                pl.Field("key_3", pl.String),
            ]
        )
    )
    assert s_parsed.dtype == expected_dtype

    key_1_values = s_parsed.to_frame().select(
        pl.col("parsed_list_json")
        .list.get(0)
        .struct.field("key_1")
        .alias("key_1_values")
    )
    expected_values = pl.Series("key_1_values", ["a", "a2", "a3"])
    assert_series_equal(key_1_values.get_column("key_1_values"), expected_values)


def test_json_decode_primitive_to_list_11053() -> None:
    # regression test (GH #11053): a scalar may decode into a declared List field
    df = pl.DataFrame(
        {
            "json": [
                '{"col1": ["123"], "col2": "123"}',
                '{"col1": ["xyz"], "col2": null}',
            ]
        }
    )
    schema = pl.Struct(
        {
            "col1": pl.List(pl.String),
            "col2": pl.List(pl.String),
        }
    )

    output = df.select(
        pl.col("json").str.json_decode(schema).alias("decoded_json")
    ).unnest("decoded_json")
    expected = pl.DataFrame({"col1": [["123"], ["xyz"]], "col2": [["123"], None]})
    assert_frame_equal(output, expected)


def test_jsonpath_single() -> None:
    # matched values come back as their string representation
    s = pl.Series(['{"a":"1"}', None, '{"a":2}', '{"a":2.1}', '{"a":true}'])
    expected = pl.Series(["1", None, "2", "2.1", "true"])
    assert_series_equal(s.str.json_path_match("$.a"), expected)


def test_json_path_match() -> None:
    """JSONPath may be a per-row column expression or a literal."""
    df = pl.DataFrame(
        {
            "str": [
                '{"a":"1"}',
                None,
                '{"b":2}',
                '{"a":2.1, "b": "hello"}',
                '{"a":true}',
            ],
            "pat": ["$.a", "$.a", "$.b", "$.b", None],
        }
    )
    out = df.select(
        all_expr=pl.col("str").str.json_path_match(pl.col("pat")),
        str_expr=pl.col("str").str.json_path_match("$.a"),
        pat_expr=pl.lit('{"a": 1.1, "b": 10}').str.json_path_match(pl.col("pat")),
    )
    expected = pl.DataFrame(
        {
            "all_expr": ["1", None, "2", "hello", None],
            "str_expr": ["1", None, None, "2.1", "true"],
            "pat_expr": ["1.1", "1.1", "10", "10", None],
        }
    )
    assert_frame_equal(out, expected)


def test_str_json_path_match_wrong_length() -> None:
    # path Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises((ShapeError, ComputeError)):
        df.select(pl.col("num").str.json_path_match(pl.Series(["a", "b"])))


def test_extract_regex() -> None:
    # extract group 1; non-matching rows yield null
    s = pl.Series(
        [
            "http://vote.com/ballon_dor?candidate=messi&ref=polars",
            "http://vote.com/ballon_dor?candidat=jorginho&ref=polars",
            "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars",
        ]
    )
    expected = pl.Series(["messi", None, "ronaldo"])
    assert_series_equal(s.str.extract(r"candidate=(\w+)", 1), expected)


def test_extract() -> None:
    """extract with per-row patterns; null patterns and non-matches yield null."""
    df = pl.DataFrame(
        {
            "s": ["aron123", "12butler", "charly*", "~david", None],
            "pat": [r"^([a-zA-Z]+)", r"^(\d+)", None, "^(da)", r"(.*)"],
        }
    )

    out = df.select(
        all_expr=pl.col("s").str.extract(pl.col("pat"), 1),
        str_expr=pl.col("s").str.extract("^([a-zA-Z]+)", 1),
        pat_expr=pl.lit("aron123").str.extract(pl.col("pat")),
    )
    expected = pl.DataFrame(
        {
            "all_expr": ["aron", "12", None, None, None],
            "str_expr": ["aron", None, "charly", None, None],
            "pat_expr": ["aron", None, None, None, "aron123"],
        }
    )
    assert_frame_equal(out, expected)


def test_extract_binary() -> None:
    # extract usable inside a filter predicate
    df = pl.DataFrame({"foo": ["aron", "butler", "charly", "david"]})
    out = df.filter(pl.col("foo").str.extract("^(a)", 1) == "a").to_series()
    assert out[0] == "aron"


def test_str_join_returns_scalar() -> None:
    # str.join inside an agg produces a String scalar per group, not a list
    df = pl.DataFrame(
        [pl.Series("val", ["A", "B", "C", "D"]), pl.Series("id", [1, 1, 2, 2])]
    )
    grouped = (
        df.group_by("id")
        .agg(pl.col("val").str.join(delimiter=",").alias("grouped"))
        .get_column("grouped")
    )
    assert grouped.dtype == pl.String


def test_contains() -> None:
    """str.contains: strict vs non-strict regex handling, plus literal vs regex matching.

    Non-strict mode turns an invalid regex into all-null output; strict mode raises.
    The loop checks the same pattern/literal combinations across Series, select,
    and filter code paths.
    """
    # test strict/non strict
    s_txt = pl.Series(["123", "456", "789"])
    assert (
        pl.Series([None, None, None]).cast(pl.Boolean).to_list()
        == s_txt.str.contains("(not_valid_regex", literal=False, strict=False).to_list()
    )
    with pytest.raises(ComputeError):
        s_txt.str.contains("(not_valid_regex", literal=False, strict=True)
    assert (
        pl.Series([True, False, False]).cast(pl.Boolean).to_list()
        == s_txt.str.contains("1", literal=False, strict=False).to_list()
    )

    df = pl.DataFrame(
        data=[(1, "some * * text"), (2, "(with) special\n * chars"), (3, "**etc...?$")],
        schema=["idx", "text"],
        orient="row",
    )
    for pattern, as_literal, expected in (
        (r"\* \*", False, [True, False, False]),
        (r"* *", True, [True, False, False]),
        (r"^\(", False, [False, True, False]),
        (r"^\(", True, [False, False, False]),
        (r"(", True, [False, True, False]),
        (r"e", False, [True, True, True]),
        (r"e", True, [True, True, True]),
        (r"^\S+$", False, [False, False, True]),
        (r"\?\$", False, [False, False, True]),
        (r"?$", True, [False, False, True]),
    ):
        # series
        assert (
            expected == df["text"].str.contains(pattern, literal=as_literal).to_list()
        )
        # frame select
        assert (
            expected
            == df.select(pl.col("text").str.contains(pattern, literal=as_literal))[
                "text"
            ].to_list()
        )
        # frame filter
        assert sum(expected) == len(
            df.filter(pl.col("text").str.contains(pattern, literal=as_literal))
        )
1002
1003
1004
def test_contains_expr() -> None:
    """str.contains with a per-row pattern column; null text or pattern yields null.

    Non-strict mode nullifies rows with an invalid regex; strict mode raises.
    """
    df = pl.DataFrame(
        {
            "text": [
                "some text",
                "(with) special\n .* chars",
                "**etc...?$",
                None,
                "b",
                "invalid_regex",
            ],
            "pattern": [r"[me]", r".*", r"^\(", "a", None, "*"],
        }
    )

    assert df.select(
        pl.col("text")
        .str.contains(pl.col("pattern"), literal=False, strict=False)
        .alias("contains"),
        pl.col("text")
        .str.contains(pl.col("pattern"), literal=True)
        .alias("contains_lit"),
    ).to_dict(as_series=False) == {
        "contains": [True, True, False, None, None, None],
        "contains_lit": [False, True, False, None, None, False],
    }

    with pytest.raises(ComputeError):
        df.select(
            pl.col("text").str.contains(pl.col("pattern"), literal=False, strict=True)
        )
1035
1036
1037
@pytest.mark.parametrize(
    ("pattern", "case_insensitive", "expected"),
    [
        (["me"], False, True),
        (["Me"], False, False),
        (["Me"], True, True),
        (pl.Series(["me", "they"]), False, True),
        (pl.Series(["Me", "they"]), False, False),
        (pl.Series(["Me", "they"]), True, True),
        (["me", "they"], False, True),
        (["Me", "they"], False, False),
        (["Me", "they"], True, True),
    ],
)
def test_contains_any(
    pattern: pl.Series | list[str],
    case_insensitive: bool,
    expected: bool,
) -> None:
    """str.contains_any with list/Series patterns and ASCII case-insensitivity.

    Checks the Series, expression, and filter code paths with the same inputs.
    """
    df = pl.DataFrame({"text": ["Tell me what you want"]})
    # series
    assert (
        expected
        == df["text"]
        .str.contains_any(pattern, ascii_case_insensitive=case_insensitive)
        .item()
    )
    # expr
    assert (
        expected
        == df.select(
            pl.col("text").str.contains_any(
                pattern, ascii_case_insensitive=case_insensitive
            )
        )["text"].item()
    )
    # frame filter
    assert int(expected) == len(
        df.filter(
            pl.col("text").str.contains_any(
                pattern, ascii_case_insensitive=case_insensitive
            )
        )
    )
1081
1082
1083
def test_replace() -> None:
    """str.replace (first match only): regex vs literal patterns and $-group expansion.

    In literal mode, '$1' in the replacement is inserted verbatim; in regex mode
    it expands to the capture group.
    """
    df = pl.DataFrame(
        data=[(1, "* * text"), (2, "(with) special\n * chars **etc...?$")],
        schema=["idx", "text"],
        orient="row",
    )
    for pattern, replacement, as_literal, expected in (
        (r"\*", "-", False, ["- * text", "(with) special\n - chars **etc...?$"]),
        (r"*", "-", True, ["- * text", "(with) special\n - chars **etc...?$"]),
        (r"^\(", "[", False, ["* * text", "[with) special\n * chars **etc...?$"]),
        (r"^\(", "[", True, ["* * text", "(with) special\n * chars **etc...?$"]),
        (r"t$", "an", False, ["* * texan", "(with) special\n * chars **etc...?$"]),
        (r"t$", "an", True, ["* * text", "(with) special\n * chars **etc...?$"]),
        (r"(with) special", "$1", True, ["* * text", "$1\n * chars **etc...?$"]),
        (
            r"\((with)\) special",
            ":$1:",
            False,
            ["* * text", ":with:\n * chars **etc...?$"],
        ),
    ):
        # series
        assert (
            expected
            == df["text"]
            .str.replace(pattern, replacement, literal=as_literal)
            .to_list()
        )
        # expr
        assert (
            expected
            == df.select(
                pl.col("text").str.replace(pattern, replacement, literal=as_literal)
            )["text"].to_list()
        )

    assert pl.Series(["."]).str.replace(".", "$0", literal=True)[0] == "$0"
    assert pl.Series(["(.)(?)"]).str.replace(".", "$1", literal=True)[0] == "($1)(?)"
1121
1122
1123
def test_replace_all() -> None:
    """str.replace_all (every match): regex vs literal patterns and group expansion.

    Also checks that an invalid regex raises unless literal=True is used, and
    that '$0' is treated verbatim in literal mode but as a group in regex mode.
    """
    df = pl.DataFrame(
        data=[(1, "* * text"), (2, "(with) special\n * chars **etc...?$")],
        schema=["idx", "text"],
        orient="row",
    )
    for pattern, replacement, as_literal, expected in (
        (r"\*", "-", False, ["- - text", "(with) special\n - chars --etc...?$"]),
        (r"*", "-", True, ["- - text", "(with) special\n - chars --etc...?$"]),
        (r"\W", "", False, ["text", "withspecialcharsetc"]),
        (r".?$", "", True, ["* * text", "(with) special\n * chars **etc.."]),
        (
            r"(with) special",
            "$1",
            True,
            ["* * text", "$1\n * chars **etc...?$"],
        ),
        (
            r"\((with)\) special",
            ":$1:",
            False,
            ["* * text", ":with:\n * chars **etc...?$"],
        ),
        (
            r"(\b)[\w\s]{2,}(\b)",
            "$1(blah)$3",
            False,
            ["* * (blah)", "((blah)) (blah)\n * (blah) **(blah)...?$"],
        ),
    ):
        # series
        assert (
            expected
            == df["text"]
            .str.replace_all(pattern, replacement, literal=as_literal)
            .to_list()
        )
        # expr
        assert (
            expected
            == df.select(
                pl.col("text").str.replace_all(pattern, replacement, literal=as_literal)
            )["text"].to_list()
        )
    # invalid regex (but valid literal - requires "literal=True")
    with pytest.raises(ComputeError):
        df["text"].str.replace_all("*", "")

    assert (
        pl.Series([r"(.)(\?)(\?)"]).str.replace_all("\\?", "$0", literal=True)[0]
        == "(.)($0)($0)"
    )
    assert (
        pl.Series([r"(.)(\?)(\?)"]).str.replace_all("\\?", "$0", literal=False)[0]
        == "(.)(\\?)(\\?)"
    )
1179
1180
1181
def test_replace_all_literal_no_captures() -> None:
    """Literal replace_all must not expand '$1'-style group references."""
    # When using literal = True, capture groups should be disabled

    # Single row code path in Rust
    df = pl.DataFrame({"text": ["I found <amt> yesterday."], "amt": ["$1"]})
    df = df.with_columns(
        pl.col("text")
        .str.replace_all("<amt>", pl.col("amt"), literal=True)
        .alias("text2")
    )
    assert df.get_column("text2")[0] == "I found $1 yesterday."

    # Multi-row code path in Rust
    df2 = pl.DataFrame(
        {
            "text": ["I found <amt> yesterday.", "I lost <amt> yesterday."],
            "amt": ["$1", "$2"],
        }
    )
    df2 = df2.with_columns(
        pl.col("text")
        .str.replace_all("<amt>", pl.col("amt"), literal=True)
        .alias("text2")
    )
    assert df2.get_column("text2")[0] == "I found $1 yesterday."
    assert df2.get_column("text2")[1] == "I lost $2 yesterday."
1207
1208
1209
def test_replace_literal_no_captures() -> None:
    """Literal replace must not expand '$1'-style group references."""
    # When using literal = True, capture groups should be disabled

    # Single row code path in Rust
    df = pl.DataFrame({"text": ["I found <amt> yesterday."], "amt": ["$1"]})
    df = df.with_columns(
        pl.col("text").str.replace("<amt>", pl.col("amt"), literal=True).alias("text2")
    )
    assert df.get_column("text2")[0] == "I found $1 yesterday."

    # Multi-row code path in Rust
    # A string shorter than 32 chars,
    # and one longer than 32 chars to test both sub-paths
    df2 = pl.DataFrame(
        {
            "text": [
                "I found <amt> yesterday.",
                "I lost <amt> yesterday and this string is longer than 32 characters.",
            ],
            "amt": ["$1", "$2"],
        }
    )
    df2 = df2.with_columns(
        pl.col("text").str.replace("<amt>", pl.col("amt"), literal=True).alias("text2")
    )
    assert df2.get_column("text2")[0] == "I found $1 yesterday."
    assert (
        df2.get_column("text2")[1]
        == "I lost $2 yesterday and this string is longer than 32 characters."
    )
1239
1240
1241
def test_replace_expressions() -> None:
    """Pattern and replacement may both be expressions (broadcast via first/last)."""
    df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"], "value": ["A", "B"]})
    out = df.select([pl.col("foo").str.replace(pl.col("foo").first(), pl.col("value"))])
    assert out.to_dict(as_series=False) == {"foo": ["A", "xyz 678 910t"]}
    out = df.select([pl.col("foo").str.replace(pl.col("foo").last(), "value")])
    assert out.to_dict(as_series=False) == {"foo": ["123 bla 45 asd", "value"]}

    df = pl.DataFrame(
        {"foo": ["1 bla 45 asd", "xyz 6t"], "pat": [r"\d", r"\W"], "value": ["A", "B"]}
    )
    out = df.select([pl.col("foo").str.replace_all(pl.col("pat").first(), "value")])
    assert out.to_dict(as_series=False) == {
        "foo": ["value bla valuevalue asd", "xyz valuet"]
    }
1255
1256
1257
@pytest.mark.parametrize(
    ("pattern", "replacement", "case_insensitive", "leftmost", "expected"),
    [
        (["say"], "", False, False, "Tell me what you want"),
        (["me"], ["them"], False, False, "Tell them what you want"),
        (["who"], ["them"], False, False, "Tell me what you want"),
        (["me", "you"], "it", False, False, "Tell it what it want"),
        (["Me", "you"], "it", False, False, "Tell me what it want"),
        (["me", "you"], ["it"], False, False, "Tell it what it want"),
        (["me", "you"], ["you", "me"], False, False, "Tell you what me want"),
        (["me", "You", "them"], "it", False, False, "Tell it what you want"),
        (["Me", "you"], "it", True, False, "Tell it what it want"),
        (["me", "YOU"], ["you", "me"], True, False, "Tell you what me want"),
        (
            pl.Series(["me", "YOU"]),
            ["you", "me"],
            False,
            False,
            "Tell you what you want",
        ),
        (pl.Series(["me", "YOU"]), ["you", "me"], True, False, "Tell you what me want"),
        (
            ["Tell me", "Tell"],
            ["Don't tell", "Text"],
            False,
            False,
            "Text me what you want",
        ),
        (
            ["Tell me", "Tell"],
            ["Don't tell me", "Text"],
            False,
            True,
            "Don't tell me what you want",
        ),
    ],
)
def test_replace_many(
    pattern: pl.Series | list[str],
    replacement: pl.Series | list[str] | str,
    case_insensitive: bool,
    leftmost: bool,
    expected: str,
) -> None:
    """str.replace_many: multiple patterns/replacements, case folding, leftmost matching.

    Covers a scalar replacement broadcast over all patterns, paired replacement
    lists, and overlapping patterns resolved by leftmost-match order.
    """
    df = pl.DataFrame({"text": ["Tell me what you want"]})
    # series
    val = (
        df["text"]
        .str.replace_many(
            pattern,
            replacement,
            ascii_case_insensitive=case_insensitive,
            leftmost=leftmost,
        )
        .item()
    )
    assert expected == val, val
    # expr
    val = df.select(
        pl.col("text").str.replace_many(
            pattern,
            replacement,
            ascii_case_insensitive=case_insensitive,
            leftmost=leftmost,
        )
    ).item()
    assert expected == val, val
1324
1325
1326
def test_replace_many_groupby() -> None:
    """replace_many with a per-group pattern expression inside a group_by agg."""
    df = pl.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
            "g": [0, 0, 0, 1, 1, 1, 2, 2, 2],
        }
    )
    out = df.group_by("g").agg(pl.col.x.str.replace_many(pl.col.x.head(2), ""))
    expected = pl.DataFrame(
        {
            "g": [0, 1, 2],
            "x": [["", "", "c"], ["", "", "f"], ["", "", "i"]],
        }
    )
    assert_frame_equal(out, expected, check_row_order=False)
1341
1342
1343
@pytest.mark.parametrize(
    ("mapping", "case_insensitive", "expected"),
    [
        ({}, False, "Tell me what you want"),
        ({"me": "them"}, False, "Tell them what you want"),
        ({"who": "them"}, False, "Tell me what you want"),
        ({"me": "it", "you": "it"}, False, "Tell it what it want"),
        ({"Me": "it", "you": "it"}, False, "Tell me what it want"),
        ({"me": "you", "you": "me"}, False, "Tell you what me want"),
        ({}, True, "Tell me what you want"),
        ({"Me": "it", "you": "it"}, True, "Tell it what it want"),
        ({"me": "you", "YOU": "me"}, True, "Tell you what me want"),
    ],
)
def test_replace_many_mapping(
    mapping: dict[str, str],
    case_insensitive: bool,
    expected: str,
) -> None:
    """str.replace_many accepts a dict mapping pattern -> replacement."""
    df = pl.DataFrame({"text": ["Tell me what you want"]})
    # series
    assert (
        expected
        == df["text"]
        .str.replace_many(mapping, ascii_case_insensitive=case_insensitive)
        .item()
    )
    # expr
    assert (
        expected
        == df.select(
            pl.col("text").str.replace_many(
                mapping,
                ascii_case_insensitive=case_insensitive,
            )
        ).item()
    )
1380
1381
1382
def test_replace_many_invalid_inputs() -> None:
    """Invalid replace_many arguments raise precise, specific exceptions."""
    df = pl.DataFrame({"text": ["Tell me what you want"]})

    # Ensure a string as the first argument is parsed as a column name.
    with pytest.raises(ColumnNotFoundError, match="me"):
        df.select(pl.col("text").str.replace_many("me", "you"))

    with pytest.raises(InvalidOperationError):
        df.select(pl.col("text").str.replace_many(1, 2))

    with pytest.raises(InvalidOperationError):
        df.select(pl.col("text").str.replace_many([1], [2]))

    with pytest.raises(InvalidOperationError):
        df.select(pl.col("text").str.replace_many(["me"], None))

    with pytest.raises(TypeError):
        df.select(pl.col("text").str.replace_many(["me"]))

    with pytest.raises(
        InvalidOperationError,
        match="expected the same amount of patterns as replacement strings",
    ):
        df.select(pl.col("text").str.replace_many(["a"], ["b", "c"]))

    s = df.to_series()

    with pytest.raises(ColumnNotFoundError, match="me"):
        s.str.replace_many("me", "you")  # type: ignore[arg-type]

    with pytest.raises(TypeError):
        df.select(pl.col("text").str.replace_many(["me"]))

    with pytest.raises(
        InvalidOperationError,
        match="expected the same amount of patterns as replacement strings",
    ):
        s.str.replace_many(["a"], ["b", "c"])
1420
1421
1422
def test_extract_all_count() -> None:
    """extract_all returns List[String] per row; count_matches returns UInt32."""
    df = pl.DataFrame({"foo": ["123 bla 45 asd", "xaz 678 910t", "boo", None]})
    assert (
        df.select(
            pl.col("foo").str.extract_all(r"a").alias("extract"),
            pl.col("foo").str.count_matches(r"a").alias("count"),
        ).to_dict(as_series=False)
    ) == {"extract": [["a", "a"], ["a"], [], None], "count": [2, 1, 0, None]}

    assert df["foo"].str.extract_all(r"a").dtype == pl.List
    assert df["foo"].str.count_matches(r"a").dtype == pl.UInt32
1433
1434
1435
def test_count_matches_many() -> None:
    """count_matches with a per-row pattern column; null text or pattern -> null."""
    df = pl.DataFrame(
        {
            "foo": ["123 bla 45 asd", "xyz 678 910t", None, "boo"],
            "bar": [r"\d", r"[a-z]", r"\d", None],
        }
    )
    assert (
        df.select(
            pl.col("foo").str.count_matches(pl.col("bar")).alias("count")
        ).to_dict(as_series=False)
    ) == {"count": [5, 4, None, None]}

    assert df["foo"].str.count_matches(df["bar"]).dtype == pl.UInt32

    # Test broadcast.
    broad = df.select(
        pl.col("foo").str.count_matches(pl.col("bar").first()).alias("count"),
        pl.col("foo").str.count_matches(pl.col("bar").last()).alias("count_null"),
    )
    assert broad.to_dict(as_series=False) == {
        "count": [5, 6, None, 0],
        "count_null": [None, None, None, None],
    }
    assert broad.schema == {"count": pl.UInt32, "count_null": pl.UInt32}
1460
1461
1462
def test_extract_all_many() -> None:
    """extract_all with a per-row pattern column, including broadcast patterns."""
    df = pl.DataFrame(
        {
            "foo": ["ab", "abc", "abcd", "foo", None, "boo"],
            "re": ["a", "bc", "a.c", "a", "a", None],
        }
    )
    assert df["foo"].str.extract_all(df["re"]).to_list() == [
        ["a"],
        ["bc"],
        ["abc"],
        [],
        None,
        None,
    ]

    # Test broadcast.
    broad = df.select(
        pl.col("foo").str.extract_all(pl.col("re").first()).alias("a"),
        pl.col("foo").str.extract_all(pl.col("re").last()).alias("null"),
    )
    assert broad.to_dict(as_series=False) == {
        "a": [["a"], ["a"], ["a"], [], None, []],
        "null": [None] * 6,
    }
    assert broad.schema == {"a": pl.List(pl.String), "null": pl.List(pl.String)}
1488
1489
1490
@pytest.mark.may_fail_cloud  # reason: zero-field struct
def test_extract_groups_empty() -> None:
    """An empty pattern yields an empty struct per row, with a consistent lazy schema."""
    df = pl.DataFrame({"iso_code": ["ISO 80000-1:2009", "ISO/IEC/IEEE 29148:2018"]})

    assert df.select(pl.col("iso_code").str.extract_groups("")).to_dict(
        as_series=False
    ) == {"iso_code": [{}, {}]}

    q = df.lazy().select(pl.col("iso_code").str.extract_groups(""))
    assert q.collect_schema() == q.collect().schema
1500
1501
1502
def test_extract_groups() -> None:
    """extract_groups: named groups become struct fields, unnamed ones use '1', '2', ...

    Also checks that only the first matching alternative of a regex populates
    its group; the other alternative's group stays null.
    """

    def _named_groups_builder(pattern: str, groups: dict[str, str]) -> str:
        # Wrap each value in a named capture group and substitute it into the template.
        return pattern.format(
            **{name: f"(?<{name}>{value})" for name, value in groups.items()}
        )

    expected = {
        "authority": ["ISO", "ISO/IEC/IEEE"],
        "spec_num": ["80000", "29148"],
        "part_num": ["1", None],
        "revision_year": ["2009", "2018"],
    }

    pattern = _named_groups_builder(
        r"{authority}\s{spec_num}(?:-{part_num})?(?::{revision_year})",
        {
            "authority": r"^ISO(?:/[A-Z]+)*",
            "spec_num": r"\d+",
            "part_num": r"\d+",
            "revision_year": r"\d{4}",
        },
    )

    df = pl.DataFrame({"iso_code": ["ISO 80000-1:2009", "ISO/IEC/IEEE 29148:2018"]})

    assert (
        df.select(pl.col("iso_code").str.extract_groups(pattern))
        .unnest("iso_code")
        .to_dict(as_series=False)
        == expected
    )

    assert df.select(
        pl.col("iso_code").str.extract_groups(r"\A(ISO\S*).*?(\d+)")
    ).to_dict(as_series=False) == {
        "iso_code": [{"1": "ISO", "2": "80000"}, {"1": "ISO/IEC/IEEE", "2": "29148"}]
    }

    assert df.select(
        pl.col("iso_code").str.extract_groups(r"\A(ISO\S*).*?(?<year>\d+)\z")
    ).to_dict(as_series=False) == {
        "iso_code": [
            {"1": "ISO", "year": "2009"},
            {"1": "ISO/IEC/IEEE", "year": "2018"},
        ]
    }

    assert pl.select(
        pl.lit(r"foobar").str.extract_groups(r"(?<foo>.{3})|(?<bar>...)")
    ).to_dict(as_series=False) == {"literal": [{"foo": "foo", "bar": None}]}
1552
1553
1554
def test_starts_ends_with() -> None:
    """starts_with/ends_with with literals, null literals, and per-row expressions."""
    df = pl.DataFrame(
        {
            "a": ["hamburger_with_tomatoes", "nuts", "lollypop", None],
            "sub": ["ham", "ts", None, "anything"],
        }
    )

    assert df.select(
        pl.col("a").str.ends_with("pop").alias("ends_pop"),
        pl.col("a").str.ends_with(pl.lit(None)).alias("ends_None"),
        pl.col("a").str.ends_with(pl.col("sub")).alias("ends_sub"),
        pl.col("a").str.starts_with("ham").alias("starts_ham"),
        pl.col("a").str.starts_with(pl.lit(None)).alias("starts_None"),
        pl.col("a").str.starts_with(pl.col("sub")).alias("starts_sub"),
    ).to_dict(as_series=False) == {
        "ends_pop": [False, False, True, None],
        "ends_None": [None, None, None, None],
        "ends_sub": [False, True, None, None],
        "starts_ham": [True, False, False, None],
        "starts_None": [None, None, None, None],
        "starts_sub": [True, False, None, None],
    }
1577
1578
1579
def test_json_path_match_type_4905() -> None:
    """json_path_match output feeds is_in/filter without a dtype error (#4905)."""
    frame = pl.DataFrame({"json_val": ['{"a":"hello"}', None, '{"a":"world"}']})
    predicate = pl.col("json_val").str.json_path_match("$.a").is_in(["hello"])
    result = frame.filter(predicate).to_dict(as_series=False)
    assert result == {"json_val": ['{"a":"hello"}']}
1584
1585
1586
def test_decode_strict() -> None:
    """base64 decode: non-strict nullifies invalid input, strict raises."""
    df = pl.DataFrame(
        {"strings": ["0IbQvTc3", "0J%2FQldCf0JA%3D", "0J%2FRgNC%2B0YHRgtC%2B"]}
    )
    result = df.select(pl.col("strings").str.decode("base64", strict=False))
    expected = {"strings": [b"\xd0\x86\xd0\xbd77", None, None]}
    assert result.to_dict(as_series=False) == expected

    with pytest.raises(ComputeError):
        df.select(pl.col("strings").str.decode("base64", strict=True))
1596
1597
1598
def test_split() -> None:
    """str.split with a literal separator: plain, inclusive, and empty-separator cases.

    Also verifies that the lazy plan names differ between inclusive and
    non-inclusive variants (split vs split_inclusive, and the _exact forms).
    """
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c_c", ""]})
    out = df.select([pl.col("x").str.split("_")])

    expected = pl.DataFrame(
        [
            {"x": ["a", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "c", "c"]},
            {"x": [""]},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("_").to_frame(), expected)

    out = df.select([pl.col("x").str.split("_", inclusive=True)])

    expected = pl.DataFrame(
        [
            {"x": ["a_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c_", "c_", "c"]},
            {"x": []},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("_", inclusive=True).to_frame(), expected)

    out = df.select([pl.col("x").str.split("")])

    expected = pl.DataFrame(
        [
            {"x": ["a", "_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "_", "c", "_", "c"]},
            {"x": []},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("").to_frame(), expected)

    out = df.select([pl.col("x").str.split("", inclusive=True)])

    expected = pl.DataFrame(
        [
            {"x": ["a", "_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "_", "c", "_", "c"]},
            {"x": []},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("", inclusive=True).to_frame(), expected)

    plan = (
        df.lazy()
        .select(
            a=pl.col("x").str.split(" ", inclusive=False),
            b=pl.col("x").str.split_exact(" ", 1, inclusive=False),
        )
        .explain()
    )

    assert "str.split(" in plan
    assert "str.split_exact(" in plan

    plan = (
        df.lazy()
        .select(
            a=pl.col("x").str.split(" ", inclusive=True),
            b=pl.col("x").str.split_exact(" ", 1, inclusive=True),
        )
        .explain()
    )

    assert "str.split_inclusive(" in plan
    assert "str.split_exact_inclusive(" in plan
1683
1684
1685
def test_split_expr() -> None:
    """str.split with a per-row separator column, plain and inclusive."""
    df = pl.DataFrame(
        {
            "x": ["a_a", None, "b", "c*c*c", "dddd", ""],
            "by": ["_", "#", "^", "*", "", ""],
        }
    )
    out = df.select([pl.col("x").str.split(pl.col("by"))])
    expected = pl.DataFrame(
        [
            {"x": ["a", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "c", "c"]},
            {"x": ["d", "d", "d", "d"]},
            {"x": []},
        ]
    )
    assert_frame_equal(out, expected)

    out = df.select([pl.col("x").str.split(pl.col("by"), inclusive=True)])
    expected = pl.DataFrame(
        [
            {"x": ["a_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c*", "c*", "c"]},
            {"x": ["d", "d", "d", "d"]},
            {"x": []},
        ]
    )
    assert_frame_equal(out, expected)
1717
1718
1719
def test_split_exact() -> None:
    """str.split_exact: fixed number of struct fields, padded with nulls when short."""
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c", ""]})
    out = df.select([pl.col("x").str.split_exact("_", 2, inclusive=False)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", ""],
            "field_1": ["a", None, None, "c", None],
            "field_2": pl.Series([None, None, None, None, None], dtype=pl.String),
        }
    )

    assert_frame_equal(out, expected)
    out2 = df["x"].str.split_exact("_", 2, inclusive=False).to_frame().unnest("x")
    assert_frame_equal(out2, expected)

    out = df.select([pl.col("x").str.split_exact("_", 1, inclusive=True)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a_", None, "b", "c_", None],
            "field_1": ["a", None, None, "c", None],
        }
    )
    assert_frame_equal(out, expected)
    assert df["x"].str.split_exact("_", 1).dtype == pl.Struct
    assert df["x"].str.split_exact("_", 1, inclusive=False).dtype == pl.Struct

    out = df.select([pl.col("x").str.split_exact("", 1)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None],
            "field_1": ["_", None, None, "_", None],
        }
    )
    assert_frame_equal(out, expected)

    out = df.select([pl.col("x").str.split_exact("", 1, inclusive=True)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None],
            "field_1": ["_", None, None, "_", None],
        }
    )
    assert_frame_equal(out, expected)
1766
1767
1768
def test_split_exact_expr() -> None:
    """str.split_exact with a per-row separator column (including null and empty)."""
    df = pl.DataFrame(
        {
            "x": ["a_a", None, "b", "c^c^c", "d#d", "eeee", ""],
            "by": ["_", "&", "$", "^", None, "", ""],
        }
    )

    out = df.select(
        pl.col("x").str.split_exact(pl.col("by"), 2, inclusive=False)
    ).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None, "e", None],
            "field_1": ["a", None, None, "c", None, "e", None],
            "field_2": pl.Series(
                [None, None, None, "c", None, "e", None], dtype=pl.String
            ),
        }
    )

    assert_frame_equal(out, expected)

    out2 = df.select(
        pl.col("x").str.split_exact(pl.col("by"), 2, inclusive=True)
    ).unnest("x")

    expected2 = pl.DataFrame(
        {
            "field_0": ["a_", None, "b", "c^", None, "e", None],
            "field_1": ["a", None, None, "c^", None, "e", None],
            "field_2": pl.Series(
                [None, None, None, "c", None, "e", None], dtype=pl.String
            ),
        }
    )
    assert_frame_equal(out2, expected2)
1806
1807
1808
def test_splitn() -> None:
    """str.splitn: at most n fields, remainder kept in the last field."""
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c_c", ""]})
    out = df.select([pl.col("x").str.splitn("_", 2)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", ""],
            "field_1": ["a", None, None, "c_c", None],
        }
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.splitn("_", 2).to_frame().unnest("x"), expected)

    out = df.select([pl.col("x").str.splitn("", 2)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None],
            "field_1": ["_a", None, None, "_c_c", None],
        }
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.splitn("", 2).to_frame().unnest("x"), expected)
1833
1834
1835
def test_splitn_expr() -> None:
    """str.splitn with a per-row separator column (including null and empty)."""
    df = pl.DataFrame(
        {
            "x": ["a_a", None, "b", "c^c^c", "d#d", "eeee", ""],
            "by": ["_", "&", "$", "^", None, "", ""],
        }
    )

    out = df.select(pl.col("x").str.splitn(pl.col("by"), 2)).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None, "e", None],
            "field_1": ["a", None, None, "c^c", None, "eee", None],
        }
    )

    assert_frame_equal(out, expected)
1853
1854
1855
def test_titlecase() -> None:
    """str.to_titlecase agrees with Python's str.title on varied punctuation/whitespace."""
    df = pl.DataFrame(
        {
            "quotes": [
                "'e.t. phone home'",
                "you talkin' to me?",
                "i feel the need--the need for speed",
                "to infinity,and BEYOND!",
                "say 'what' again!i dare you - I\u00a0double-dare you!",
                "What.we.got.here... is#failure#to#communicate",
                "welcome to my world",
                "double  space",
                "and\ta\t tab",
                "by jean-paul sartre, 'esq'",
                "SOMETIMES/life/gives/you/a/2nd/chance",
            ]
        }
    )

    # map_elements with a simple str method triggers the inefficiency warning
    with pytest.warns(PolarsInefficientMapWarning):
        assert_frame_equal(
            df.select(pl.col("quotes").str.to_titlecase()),
            df.select(pl.col("quotes").map_elements(lambda s: s.title())),
        )
1879
1880
1881
def test_string_replace_with_nulls_10124() -> None:
    """replace with n>1 must preserve nulls and replace across null gaps (#10124)."""
    df = pl.DataFrame({"col1": ["S", "S", "S", None, "S", "S", "S", "S"]})

    assert df.select(
        pl.col("col1"),
        pl.col("col1").str.replace("S", "O", n=1).alias("n_1"),
        pl.col("col1").str.replace("S", "O", n=3).alias("n_3"),
    ).to_dict(as_series=False) == {
        "col1": ["S", "S", "S", None, "S", "S", "S", "S"],
        "n_1": ["O", "O", "O", None, "O", "O", "O", "O"],
        "n_3": ["O", "O", "O", None, "O", "O", "O", "O"],
    }
1893
1894
1895
def test_string_extract_groups_lazy_schema_10305() -> None:
    """Lazy schema of extract_groups + unnest resolves named groups (#10305)."""
    df = pl.LazyFrame(
        data={
            "url": [
                "http://vote.com/ballon_dor?candidate=messi&ref=python",
                "http://vote.com/ballon_dor?candidate=weghorst&ref=polars",
                "http://vote.com/ballon_dor?error=404&ref=rust",
            ]
        }
    )
    pattern = r"candidate=(?<candidate>\w+)&ref=(?<ref>\w+)"
    df = df.select(captures=pl.col("url").str.extract_groups(pattern)).unnest(
        "captures"
    )

    assert df.collect_schema() == {"candidate": pl.String, "ref": pl.String}
1911
1912
1913
def test_string_reverse() -> None:
    """str.reverse reverses by grapheme cluster, keeping combining marks attached."""
    df = pl.DataFrame(
        {
            "text": [None, "foo", "bar", "i like pizza&#", None, "man\u0303ana"],
        }
    )
    expected = pl.DataFrame(
        [
            pl.Series(
                "text",
                # note: the combining tilde (\u0303) stays with its base 'n'
                [None, "oof", "rab", "#&azzip ekil i", None, "anan\u0303am"],
                dtype=pl.String,
            ),
        ]
    )

    result = df.select(pl.col("text").str.reverse())
    assert_frame_equal(result, expected)
1931
1932
1933
@pytest.mark.parametrize(
    ("data", "expected_data"),
    [
        (["", None, "a"], ["", None, "b"]),
        ([None, None, "a"], [None, None, "b"]),
        (["", "", ""], ["", "", ""]),
        ([None, None, None], [None, None, None]),
        (["a", "", None], ["b", "", None]),
    ],
)
def test_replace_lit_n_char_13385(
    data: list[str | None], expected_data: list[str | None]
) -> None:
    """Literal replace handles empty strings and nulls in any position (#13385)."""
    s = pl.Series(data, dtype=pl.String)
    res = s.str.replace("a", "b", literal=True)
    expected_s = pl.Series(expected_data, dtype=pl.String)
    assert_series_equal(res, expected_s)
1950
1951
1952
def test_find_many_raises() -> None:
    """find_many rejects leftmost=True combined with overlapping=True."""
    frame = pl.DataFrame({"values": ["discontent", "foobar"]})
    needles = ["winter", "disco", "onte", "discontent"]
    expr = pl.col("values").str.find_many(needles, leftmost=True, overlapping=True)
    with pytest.raises(
        ValueError, match="can not match overlapping patterns when leftmost == True"
    ):
        frame.select(expr)
1961
1962
1963
def test_extract_many_raises() -> None:
    """extract_many rejects leftmost=True combined with overlapping=True."""
    frame = pl.DataFrame({"values": ["discontent", "foobar"]})
    needles = ["winter", "disco", "onte", "discontent"]
    expr = pl.col("values").str.extract_many(needles, leftmost=True, overlapping=True)
    with pytest.raises(
        ValueError, match="can not match overlapping patterns when leftmost == True"
    ):
        frame.select(expr)
1972
1973
1974
def test_extract_many() -> None:
    """extract_many/find_many: overlapping vs non-overlapping, list and column patterns."""
    df = pl.DataFrame({"values": ["discontent", "foobar"]})
    patterns = ["winter", "disco", "onte", "discontent"]
    assert df.with_columns(
        pl.col("values").str.extract_many(patterns, overlapping=False).alias("matches"),
        pl.col("values")
        .str.extract_many(patterns, overlapping=True)
        .alias("matches_overlapping"),
    ).to_dict(as_series=False) == {
        "values": ["discontent", "foobar"],
        "matches": [["disco"], []],
        "matches_overlapping": [["disco", "onte", "discontent"], []],
    }

    # many patterns
    df = pl.DataFrame(
        {
            "values": ["discontent", "rhapsody"],
            "patterns": [
                ["winter", "disco", "onte", "discontent"],
                ["rhap", "ody", "coalesce"],
            ],
        }
    )

    # extract_many
    assert df.select(pl.col("values").str.extract_many("patterns")).to_dict(
        as_series=False
    ) == {"values": [["disco"], ["rhap", "ody"]]}

    # find_many
    f1 = df.select(pl.col("values").str.find_many("patterns"))
    f2 = df["values"].str.find_many(df["patterns"])

    assert_series_equal(f1["values"], f2)
    assert f2.to_list() == [[0], [0, 5]]
2010
2011
2012
def test_json_decode_raise_on_data_type_mismatch_13061() -> None:
    """Schema inferred from too few rows must raise rather than silently coerce."""
    all_null = pl.Series(["null", "null"])
    assert_series_equal(
        all_null.str.json_decode(infer_schema_length=1),
        pl.Series([None, None]),
    )

    mixed = pl.Series(["null", "1"])
    # inferring from row 0 alone yields Null, which conflicts with "1"
    with pytest.raises(ComputeError):
        mixed.str.json_decode(infer_schema_length=1)

    # seeing both rows resolves the dtype correctly
    assert_series_equal(
        mixed.str.json_decode(infer_schema_length=2),
        pl.Series([None, 1]),
    )
2025
2026
2027
def test_json_decode_struct_schema() -> None:
    """Struct fields unseen during inference raise; explicit dtypes ignore extras."""
    payload = pl.Series([r'{"a": 1}', r'{"a": 2, "b": 2}'])

    # schema inferred from the first row only: field "b" is unexpected
    with pytest.raises(ComputeError, match="extra field in struct data: b"):
        payload.str.json_decode(infer_schema_length=1)

    # inferring over both rows admits "b" (null where absent)
    assert_series_equal(
        payload.str.json_decode(infer_schema_length=2),
        pl.Series([{"a": 1, "b": None}, {"a": 2, "b": 2}]),
    )

    # If the schema was explicitly given, then we ignore extra fields.
    # TODO: There should be a `columns=` parameter to this.
    assert_series_equal(
        payload.str.json_decode(dtype=pl.Struct({"a": pl.Int64})),
        pl.Series([{"a": 1}, {"a": 2}]),
    )
2048
2049
2050
def test_escape_regex() -> None:
    """escape_regex escapes metacharacters and passes nulls through."""
    frame = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})
    result_df = frame.with_columns(pl.col("text").str.escape_regex().alias("escaped"))

    expected_df = pl.DataFrame(
        {
            "text": ["abc", "def", None, "abc(\\w+)"],
            "escaped": ["abc", "def", None, "abc\\(\\\\w\\+\\)"],
        }
    )
    assert_frame_equal(result_df, expected_df)
    assert_series_equal(result_df["escaped"], expected_df["escaped"])
2062
2063
2064
@pytest.mark.parametrize(
    ("form", "expected_data"),
    [
        ("NFC", ["01²", "KADOKAWA"]),  # noqa: RUF001
        ("NFD", ["01²", "KADOKAWA"]),  # noqa: RUF001
        ("NFKC", ["012", "KADOKAWA"]),
        ("NFKD", ["012", "KADOKAWA"]),
    ],
)
def test_string_normalize(form: Any, expected_data: list[str | None]) -> None:
    """Each Unicode normalization form produces the expected canonical text."""
    source = pl.Series(["01²", "KADOKAWA"], dtype=pl.String)  # noqa: RUF001
    normalized = source.str.normalize(form)
    assert_series_equal(normalized, pl.Series(expected_data, dtype=pl.String))
2078
2079
2080
def test_string_normalize_wrong_input() -> None:
    """An unrecognized normalization form is rejected with a ValueError."""
    source = pl.Series(["01²"], dtype=pl.String)
    with pytest.raises(ValueError, match="`form` must be one of"):
        source.str.normalize("foobar")  # type: ignore[arg-type]
2083
2084
2085
def test_to_integer_unequal_lengths_22034() -> None:
    """A base Series of mismatched length (3 values vs 4 bases) must raise."""
    values = pl.Series("a", ["1", "2", "3"], pl.String)
    bases = pl.Series([4, 5, 5, 4])
    with pytest.raises(pl.exceptions.ShapeError):
        values.str.to_integer(base=bases)
2089
2090
2091
def test_broadcast_self() -> None:
    """A length-1 Series broadcasts over the bases; "3" is invalid in base 2."""
    single = pl.Series("a", ["3"], pl.String)
    with pytest.raises(
        pl.exceptions.ComputeError, match="strict integer parsing failed"
    ):
        single.str.to_integer(base=pl.Series([2, 2, 3, 4]))
2097
2098
2099
def test_strptime_unequal_length_22018() -> None:
    """`ambiguous` of length 3 against 2 timestamps must raise a ShapeError."""
    stamps = pl.Series(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
    ambiguous = pl.Series(["a", "b", "d"])
    with pytest.raises(pl.exceptions.ShapeError):
        stamps.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M%#z", ambiguous=ambiguous)
2105
2106
2107
@pytest.mark.parametrize("inclusive", [False, True])
def test_str_split_unequal_length_22018(inclusive: bool) -> None:
    """Splitting 2 strings by 3 separators must raise a ShapeError."""
    strings = pl.Series(["a-c", "x-y"])
    separators = pl.Series(["-", "/", "+"])
    with pytest.raises(pl.exceptions.ShapeError):
        strings.str.split(separators, inclusive=inclusive)
2113
2114
2115
def test_str_split_self_broadcast() -> None:
    """A single string broadcasts across multiple separators, one result per sep."""
    result = pl.Series(["a-/c"]).str.split(pl.Series(["-", "/", "+"]))
    expected = pl.Series([["a", "/c"], ["a-", "c"], ["a-/c"]])
    assert_series_equal(result, expected)
2120
2121
2122
def test_replace_many_mapping_in_list() -> None:
    """replace_strict with a mapping works inside a list.eval context."""
    mapped = pl.Series([["a", "b"]]).list.eval(
        pl.element().replace_strict({"a": 1, "b": 2})
    )
    assert_series_equal(mapped, pl.Series([[1, 2]]))
2129
2130
2131
def test_str_replace_n_zero_23570() -> None:
    """replace with n=0 is a no-op for both literal and expression replacements."""
    # more than 32 bytes
    abc_long = "abc " * 20 + "abc"
    df = pl.DataFrame(
        {"a": [abc_long, "abc abc abc", "abc ghi"], "b": ["jkl", "pqr", "xyz"]}
    )

    # literal replacement value
    out = df.with_columns(pl.col("a").str.replace("abc", "XYZ", n=0))
    assert_frame_equal(out, df)

    # expression replacement value
    out = df.with_columns(pl.col("a").str.replace("abc", pl.col("b"), n=0))
    assert_frame_equal(out, df)
2144
2145
2146
def test_str_replace_null_19601() -> None:
    """A null replacement value leaves the original string untouched."""
    df = pl.DataFrame({"key": ["1", "2"], "1": ["---", None]})
    out = df.select(result=pl.col("key").str.replace("1", pl.col("1")))
    assert_frame_equal(out, pl.DataFrame({"result": ["---", "2"]}))
2153
2154
2155
def test_str_json_decode_25237() -> None:
    """Repeated decoding of the same payload must always infer the same dtype."""
    s = pl.Series(['[{"a": 0, "b": 1}, {"b": 2}]'])

    observed = {s.str.json_decode().dtype for _ in range(20)}

    assert len(observed) == 1
2161
2162
2163
def test_json_decode_decimal_25789() -> None:
    """Decimal targets round values that fit and error on values that don't."""
    s = pl.Series(
        ['{"a": 1.23}', '{"a": 4.56}', '{"a": null}', '{"a": "30.1271239481230948"}']
    )
    decoded = s.str.json_decode(dtype=pl.Struct({"a": pl.Decimal(4, 2)}))
    target = pl.Series(
        [{"a": 1.23}, {"a": 4.56}, {"a": None}, {"a": 30.13}],
        dtype=pl.Struct({"a": pl.Decimal(4, 2)}),
    )
    assert_series_equal(decoded, target)

    # Decimal(3, 2) cannot hold 30.12...: decoding must fail loudly
    with pytest.raises(
        ComputeError, match=r"error deserializing value.*30.127.* as Decimal\(3, 2\)"
    ):
        s.str.json_decode(dtype=pl.Struct({"a": pl.Decimal(3, 2)}))
2178
2179
2180
def test_json_decode_i128() -> None:
    """Int128 payloads near both ends of the range decode exactly, nulls included."""
    payload = pl.Series(
        [
            '{"a":170141183460469231731687303715884105723}',
            '{"a":null}',
            '{"a":-170141183460469231731687303715759193239}',
        ]
    )
    schema = pl.Struct({"a": pl.Int128})
    expected = pl.Series(
        [{"a": 2**127 - 5}, {"a": None}, {"a": -(2**127) + 124912489}],
        dtype=schema,
    )
    assert_series_equal(payload.str.json_decode(dtype=schema), expected)
2194
2195
2196
def test_json_decode_u128() -> None:
    """UInt128 payloads near the top of the range decode exactly, nulls included."""
    payload = pl.Series(['{"a":340282366920938463463374607431768211451}', '{"a":null}'])
    schema = pl.Struct({"a": pl.UInt128})
    expected = pl.Series(
        [{"a": 2**128 - 5}, {"a": None}],
        dtype=schema,
    )
    assert_series_equal(payload.str.json_decode(dtype=schema), expected)
2204
2205
2206
@pytest.mark.parametrize("dtype", [pl.Enum(["bar", "foo"]), pl.Categorical])
def test_json_decode_categorical_enum(dtype: pl.DataType) -> None:
    """JSON strings decode into Enum/Categorical struct fields, nulls included."""
    payload = pl.Series(['{"a":"foo"}', '{"a":"bar"}', '{"a":null}', '{"a":"foo"}'])
    struct_dtype = pl.Struct({"a": dtype})
    expected = pl.Series(
        [{"a": "foo"}, {"a": "bar"}, {"a": None}, {"a": "foo"}],
        dtype=struct_dtype,
    )
    assert_series_equal(payload.str.json_decode(dtype=struct_dtype), expected)
2215
2216
2217
def test_str_split_regex() -> None:
    """Splitting by a regex pattern (literal=False) drops the digit separators."""
    frame = pl.DataFrame({"s": ["foo1bar", "foo99bar", "foo1bar2baz"]})

    result = frame.select(split=pl.col("s").str.split(by=r"\d+", literal=False))

    expected = pl.DataFrame(
        {"split": [["foo", "bar"], ["foo", "bar"], ["foo", "bar", "baz"]]}
    )
    assert_frame_equal(result, expected)
2226
2227
2228
def test_str_split_regex_inclusive() -> None:
    """inclusive=True keeps the matched digits attached to the preceding part."""
    frame = pl.DataFrame({"s": ["foo1bar", "foo99bar", "foo1bar2baz"]})

    result = frame.select(
        split=pl.col("s").str.split(by=r"\d+", literal=False, inclusive=True)
    )

    expected = pl.DataFrame(
        {"split": [["foo1", "bar"], ["foo99", "bar"], ["foo1", "bar2", "baz"]]}
    )
    assert_frame_equal(result, expected)
2239
2240
2241
def test_str_split_regex_expr() -> None:
    """Each row is split by its own regex pattern taken from the "by" column."""
    frame = pl.DataFrame(
        {
            "s": ["foo1bar", "foo bar", "foo-bar baz"],
            "by": [r"\d", r"\s", r"-"],
        }
    )

    result = frame.select(split=pl.col("s").str.split(by=pl.col("by"), literal=False))

    expected = pl.DataFrame(
        {"split": [["foo", "bar"], ["foo", "bar"], ["foo", "bar baz"]]}
    )
    assert_frame_equal(result, expected)
2255
2256
2257
def test_str_split_regex_expr_inclusive() -> None:
    """Per-row regex split with inclusive=True keeps each separator in its part."""
    frame = pl.DataFrame(
        {
            "s": ["foo1bar", "foo bar", "foo-bar baz"],
            "by": [r"\d", r"\s", r"-"],
        }
    )

    result = frame.select(
        split=pl.col("s").str.split(by=pl.col("by"), literal=False, inclusive=True)
    )

    expected = pl.DataFrame(
        {"split": [["foo1", "bar"], ["foo ", "bar"], ["foo-", "bar baz"]]}
    )
    assert_frame_equal(result, expected)
2273
2274
2275
def test_str_split_regex_invalid_pattern_strict_true() -> None:
    """strict=True surfaces a regex compile error for the bad pattern "("."""
    frame = pl.DataFrame({"s": ["foo1bar", "abc", "123xyz"]})

    with pytest.raises(ComputeError):
        frame.select(split=pl.col("s").str.split(by="(", literal=False, strict=True))
2280
2281
2282
def test_str_split_regex_invalid_pattern_strict_false() -> None:
    """strict=False maps an uncompilable pattern to all-null output instead."""
    frame = pl.DataFrame({"s": ["foo1bar", "abc", "123xyz"]})

    result = frame.select(
        split=pl.col("s").str.split(by="(", literal=False, strict=False)
    )

    expected = pl.DataFrame(
        {
            "split": pl.Series(
                "split",
                [None, None, None],
                dtype=pl.List(pl.String),
            )
        }
    )
    assert_frame_equal(result, expected)
2298
2299
2300
def test_str_split_regex_scalar_string_expr() -> None:
    """A scalar literal string broadcasts over a column of regex patterns."""
    frame = pl.DataFrame({"by": [r"\d", r"\d+", r"bar"]})

    result = frame.select(
        split=pl.lit("foo1bar2baz").str.split(by=pl.col("by"), literal=False)
    )

    expected = pl.DataFrame(
        {
            "split": [
                ["foo", "bar", "baz"],  # split by \d
                ["foo", "bar", "baz"],  # split by \d+
                ["foo1", "2baz"],  # split by "bar"
            ]
        }
    )
    assert_frame_equal(result, expected)
2318
2319