# Source: pola-rs/polars — py-polars/tests/unit/operations/namespaces/string/test_string.py
1
from __future__ import annotations

from typing import Any

import pytest

import polars as pl
import polars.selectors as cs
from polars.exceptions import (
    ColumnNotFoundError,
    ComputeError,
    InvalidOperationError,
    ShapeError,
)
from polars.testing import assert_frame_equal, assert_series_equal
17
18
def test_str_slice() -> None:
    """Slice with a negative offset and with an offset + length pair."""
    df = pl.DataFrame({"a": ["foobar", "barfoo"]})
    assert df["a"].str.slice(-3).to_list() == ["bar", "foo"]
    assert df.select([pl.col("a").str.slice(2, 4)])["a"].to_list() == ["obar", "rfoo"]
22
23
24
def test_str_slice_expr() -> None:
    """`str.slice` accepts column expressions as well as literals for both
    the offset and the length argument; a negative length must raise."""
    df = pl.DataFrame(
        {
            "a": ["foobar", None, "barfoo", "abcd", ""],
            "offset": [1, 3, None, -3, 2],
            "length": [3, 4, 2, None, 2],
        }
    )
    out = df.select(
        all_expr=pl.col("a").str.slice("offset", "length"),
        offset_expr=pl.col("a").str.slice("offset", 2),
        length_expr=pl.col("a").str.slice(0, "length"),
        length_none=pl.col("a").str.slice("offset", None),
        offset_length_lit=pl.col("a").str.slice(-3, 3),
        str_lit=pl.lit("qwert").str.slice("offset", "length"),
    )
    expected = pl.DataFrame(
        {
            "all_expr": ["oob", None, None, "bcd", ""],
            "offset_expr": ["oo", None, None, "bc", ""],
            "length_expr": ["foo", None, "ba", "abcd", ""],
            "length_none": ["oobar", None, None, "bcd", ""],
            "offset_length_lit": ["bar", None, "foo", "bcd", ""],
            "str_lit": ["wer", "rt", None, "ert", "er"],
        }
    )
    assert_frame_equal(out, expected)

    # negative length is not allowed
    with pytest.raises(InvalidOperationError):
        df.select(pl.col("a").str.slice(0, -1))
55
56
57
def test_str_slice_wrong_length() -> None:
    """An offset Series shorter than the column raises a ShapeError."""
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.slice(pl.Series([1, 2])))
61
62
63
@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        (["012345", "", None], 0, ["", "", None]),
        (["012345", "", None], 2, ["01", "", None]),
        (["012345", "", None], -2, ["0123", "", None]),
        (["012345", "", None], 100, ["012345", "", None]),
        (["012345", "", None], -100, ["", "", None]),
    ],
)
def test_str_head(input: list[str], n: int, output: list[str]) -> None:
    """`str.head(n)` keeps the first n chars; negative n drops from the end."""
    assert pl.Series(input).str.head(n).to_list() == output
75
76
77
@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        ("你好世界", 0, ""),
        ("你好世界", 2, "你好"),
        ("你好世界", 999, "你好世界"),
        ("你好世界", -1, "你好世"),
        ("你好世界", -2, "你好"),
        ("你好世界", -999, ""),
    ],
)
def test_str_head_codepoints(input: str, n: int, output: str) -> None:
    """`str.head` counts code points, not bytes, for multibyte strings."""
    assert pl.Series([input]).str.head(n).to_list() == [output]
90
91
92
def test_str_head_expr() -> None:
    """`str.head` accepts literal ints, expressions, and column names for n."""
    s = "012345"
    df = pl.DataFrame(
        {"a": [s, s, s, s, s, s, "", None], "n": [0, 2, -2, 100, -100, None, 3, -2]}
    )
    out = df.select(
        n_expr=pl.col("a").str.head("n"),
        n_pos2=pl.col("a").str.head(2),
        n_neg2=pl.col("a").str.head(-2),
        n_pos100=pl.col("a").str.head(100),
        n_pos_neg100=pl.col("a").str.head(-100),
        n_pos_0=pl.col("a").str.head(0),
        str_lit=pl.col("a").str.head(pl.lit(2)),
        lit_expr=pl.lit(s).str.head("n"),
        lit_n=pl.lit(s).str.head(2),
    )
    expected = pl.DataFrame(
        {
            "n_expr": ["", "01", "0123", "012345", "", None, "", None],
            "n_pos2": ["01", "01", "01", "01", "01", "01", "", None],
            "n_neg2": ["0123", "0123", "0123", "0123", "0123", "0123", "", None],
            "n_pos100": [s, s, s, s, s, s, "", None],
            "n_pos_neg100": ["", "", "", "", "", "", "", None],
            "n_pos_0": ["", "", "", "", "", "", "", None],
            "str_lit": ["01", "01", "01", "01", "01", "01", "", None],
            "lit_expr": ["", "01", "0123", "012345", "", None, "012", "0123"],
            "lit_n": ["01", "01", "01", "01", "01", "01", "01", "01"],
        }
    )
    assert_frame_equal(out, expected)
122
123
124
def test_str_head_wrong_length() -> None:
    """An n Series shorter than the column raises a ShapeError."""
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.head(pl.Series([1, 2])))
128
129
130
@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        (["012345", "", None], 0, ["", "", None]),
        (["012345", "", None], 2, ["45", "", None]),
        (["012345", "", None], -2, ["2345", "", None]),
        (["012345", "", None], 100, ["012345", "", None]),
        (["012345", "", None], -100, ["", "", None]),
    ],
)
def test_str_tail(input: list[str], n: int, output: list[str]) -> None:
    """`str.tail(n)` keeps the last n chars; negative n drops from the start."""
    assert pl.Series(input).str.tail(n).to_list() == output
142
143
144
@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        ("你好世界", 0, ""),
        ("你好世界", 2, "世界"),
        ("你好世界", 999, "你好世界"),
        ("你好世界", -1, "好世界"),
        ("你好世界", -2, "世界"),
        ("你好世界", -999, ""),
    ],
)
def test_str_tail_codepoints(input: str, n: int, output: str) -> None:
    """`str.tail` counts code points, not bytes, for multibyte strings."""
    assert pl.Series([input]).str.tail(n).to_list() == [output]
157
158
159
def test_str_tail_expr() -> None:
    """`str.tail` accepts literal ints, expressions, and column names for n."""
    s = "012345"
    df = pl.DataFrame(
        {"a": [s, s, s, s, s, s, "", None], "n": [0, 2, -2, 100, -100, None, 3, -2]}
    )
    out = df.select(
        n_expr=pl.col("a").str.tail("n"),
        n_pos2=pl.col("a").str.tail(2),
        n_neg2=pl.col("a").str.tail(-2),
        n_pos100=pl.col("a").str.tail(100),
        n_pos_neg100=pl.col("a").str.tail(-100),
        n_pos_0=pl.col("a").str.tail(0),
        str_lit=pl.col("a").str.tail(pl.lit(2)),
        lit_expr=pl.lit(s).str.tail("n"),
        lit_n=pl.lit(s).str.tail(2),
    )
    expected = pl.DataFrame(
        {
            "n_expr": ["", "45", "2345", "012345", "", None, "", None],
            "n_pos2": ["45", "45", "45", "45", "45", "45", "", None],
            "n_neg2": ["2345", "2345", "2345", "2345", "2345", "2345", "", None],
            "n_pos100": [s, s, s, s, s, s, "", None],
            "n_pos_neg100": ["", "", "", "", "", "", "", None],
            "n_pos_0": ["", "", "", "", "", "", "", None],
            "str_lit": ["45", "45", "45", "45", "45", "45", "", None],
            "lit_expr": ["", "45", "2345", "012345", "", None, "345", "2345"],
            "lit_n": ["45", "45", "45", "45", "45", "45", "45", "45"],
        }
    )
    assert_frame_equal(out, expected)
189
190
191
def test_str_tail_wrong_length() -> None:
    """An n Series shorter than the column raises a ShapeError."""
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.tail(pl.Series([1, 2])))
195
196
197
def test_str_slice_multibyte() -> None:
    """Exhaustively compare `str.slice` against Python slicing on a
    multibyte string, for a grid of offsets and lengths."""
    ref = "你好世界"
    s = pl.Series([ref])

    # Pad the string to simplify (negative) offsets starting before/after the string.
    npad = 20
    padref = "_" * npad + ref + "_" * npad
    for start in range(-5, 6):
        for length in range(6):
            # Translate the polars offset into an index on the padded string.
            offset = npad + start if start >= 0 else npad + start + len(ref)
            correct = padref[offset : offset + length].strip("_")
            result = s.str.slice(start, length)
            expected = pl.Series([correct])
            assert_series_equal(result, expected)
211
212
213
def test_str_len_bytes() -> None:
    """`len_bytes` returns the UTF-8 byte length as UInt32 (null preserved)."""
    s = pl.Series(["Café", None, "345", "東京"])
    result = s.str.len_bytes()
    expected = pl.Series([5, None, 3, 6], dtype=pl.UInt32)
    assert_series_equal(result, expected)
218
219
220
def test_str_len_chars() -> None:
    """`len_chars` returns the code-point count as UInt32 (null preserved)."""
    s = pl.Series(["Café", None, "345", "東京"])
    result = s.str.len_chars()
    expected = pl.Series([4, None, 3, 2], dtype=pl.UInt32)
    assert_series_equal(result, expected)
225
226
227
def test_str_contains() -> None:
    """Basic substring-pattern containment check."""
    s = pl.Series(["messi", "ronaldo", "ibrahimovic"])
    expected = pl.Series([True, False, False])
    assert_series_equal(s.str.contains("mes"), expected)
231
232
233
def test_str_contains_wrong_length() -> None:
    """A pattern Series shorter than the column raises a ShapeError."""
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.contains(pl.Series(["a", "b"])))  # type: ignore [arg-type]
237
238
239
def test_count_match_literal() -> None:
    """With literal=True the pattern is not treated as a regex, for both a
    scalar pattern and a per-row pattern Series."""
    s = pl.Series(["12 dbc 3xy", "cat\\w", "1zy3\\d\\d", None])
    out = s.str.count_matches(r"\d", literal=True)
    expected = pl.Series([0, 0, 2, None], dtype=pl.UInt32)
    assert_series_equal(out, expected)

    out = s.str.count_matches(pl.Series([r"\w", r"\w", r"\d", r"\d"]), literal=True)
    expected = pl.Series([0, 1, 2, None], dtype=pl.UInt32)
    assert_series_equal(out, expected)
248
249
250
def test_str_encode() -> None:
    """hex/base64 encoding round-trips; unsupported encodings raise ValueError."""
    s = pl.Series(["foo", "bar", None])
    hex_encoded = pl.Series(["666f6f", "626172", None])
    base64_encoded = pl.Series(["Zm9v", "YmFy", None])

    assert_series_equal(s.str.encode("hex"), hex_encoded)
    assert_series_equal(s.str.encode("base64"), base64_encoded)
    with pytest.raises(ValueError):
        s.str.encode("utf8")  # type: ignore[arg-type]
259
260
261
def test_str_decode() -> None:
    """hex/base64 decoding produces the original bytes (null preserved)."""
    hex_encoded = pl.Series(["666f6f", "626172", None])
    base64_encoded = pl.Series(["Zm9v", "YmFy", None])
    expected = pl.Series([b"foo", b"bar", None])

    assert_series_equal(hex_encoded.str.decode("hex"), expected)
    assert_series_equal(base64_encoded.str.decode("base64"), expected)
268
269
270
def test_str_decode_exception() -> None:
    """Invalid payloads raise ComputeError; unsupported encodings ValueError."""
    s = pl.Series(["not a valid", "626172", None])
    with pytest.raises(ComputeError):
        s.str.decode(encoding="hex")
    with pytest.raises(ComputeError):
        s.str.decode(encoding="base64")
    with pytest.raises(ValueError):
        s.str.decode("utf8")  # type: ignore[arg-type]
278
279
280
@pytest.mark.parametrize("strict", [True, False])
def test_str_find(strict: bool) -> None:
    """`str.find` with regex and literal patterns, scalar and per-row,
    across both values of `strict` (valid patterns only here)."""
    df = pl.DataFrame(
        data=[
            ("Dubai", 3564931, "b[ai]", "ai"),
            ("Abu Dhabi", 1807000, "b[ai]", " "),
            ("Sharjah", 1405000, "[ai]n", "s"),
            ("Al Ain", 846747, "[ai]n", ""),
            ("Ajman", 490035, "[ai]n", "ma"),
            ("Ras Al Khaimah", 191753, "a.+a", "Kha"),
            ("Fujairah", 118933, "a.+a", None),
            ("Umm Al Quwain", 59098, "a.+a", "wa"),
            (None, None, None, "n/a"),
        ],
        schema={
            "city": pl.String,
            "population": pl.Int32,
            "pat": pl.String,
            "lit": pl.String,
        },
        orient="row",
    )
    city, pop, pat, lit = (pl.col(c) for c in ("city", "population", "pat", "lit"))

    for match_lit in (True, False):
        res = df.select(
            find_a_regex=city.str.find("(?i)a", strict=strict),
            find_a_lit=city.str.find("a", literal=match_lit),
            find_00_lit=pop.cast(pl.String).str.find("00", literal=match_lit),
            find_col_lit=city.str.find(lit, strict=strict, literal=match_lit),
            find_col_pat=city.str.find(pat, strict=strict),
        )
        assert res.to_dict(as_series=False) == {
            "find_a_regex": [3, 0, 2, 0, 0, 1, 3, 4, None],
            "find_a_lit": [3, 6, 2, None, 3, 1, 3, 10, None],
            "find_00_lit": [None, 4, 4, None, 2, None, None, None, None],
            "find_col_lit": [3, 3, None, 0, 2, 7, None, 9, None],
            "find_col_pat": [2, 7, None, 4, 3, 1, 3, None, None],
        }
319
320
321
def test_str_find_invalid_regex() -> None:
    """'strict' controls whether an invalid regex raises or yields null."""
    df = pl.DataFrame({"txt": ["AbCdEfG"]})
    rx_invalid = "(?i)AB.))"

    with pytest.raises(ComputeError):
        df.with_columns(pl.col("txt").str.find(rx_invalid, strict=True))

    res = df.with_columns(pl.col("txt").str.find(rx_invalid, strict=False))
    assert res.item() is None
331
332
333
def test_str_find_escaped_chars() -> None:
    """With literal=True, regex metacharacters are matched verbatim."""
    df = pl.DataFrame({"txt": ["123.*465", "x(x?)x"]})

    res = df.with_columns(
        x1=pl.col("txt").str.find("(x?)", literal=True),
        x2=pl.col("txt").str.find(".*4", literal=True),
        x3=pl.col("txt").str.find("(x?)"),
        x4=pl.col("txt").str.find(".*4"),
    )
    # ┌──────────┬──────┬──────┬─────┬──────┐
    # │ txt      ┆ x1   ┆ x2   ┆ x3  ┆ x4   │
    # │ ---      ┆ ---  ┆ ---  ┆ --- ┆ ---  │
    # │ str      ┆ u32  ┆ u32  ┆ u32 ┆ u32  │
    # ╞══════════╪══════╪══════╪═════╪══════╡
    # │ 123.*465 ┆ null ┆ 3    ┆ 0   ┆ 0    │
    # │ x(x?)x   ┆ 1    ┆ null ┆ 0   ┆ null │
    # └──────────┴──────┴──────┴─────┴──────┘
    assert_frame_equal(
        pl.DataFrame(
            {
                "txt": ["123.*465", "x(x?)x"],
                "x1": [None, 1],
                "x2": [3, None],
                "x3": [0, 0],
                "x4": [0, None],
            }
        ).cast({cs.signed_integer(): pl.UInt32}),
        res,
    )
363
364
365
def test_str_find_wrong_length() -> None:
    """A pattern Series shorter than the column raises a ShapeError."""
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.find(pl.Series(["a", "b"])))  # type: ignore [arg-type]
369
370
371
def test_hex_decode_return_dtype() -> None:
    """hex decoding yields Binary dtype, eagerly and in the lazy schema."""
    data = {"a": ["68656c6c6f", "776f726c64"]}
    expr = pl.col("a").str.decode("hex")

    df = pl.DataFrame(data).select(expr)
    assert df.schema == {"a": pl.Binary}

    ldf = pl.LazyFrame(data).select(expr)
    assert ldf.collect_schema() == {"a": pl.Binary}
380
381
382
def test_base64_decode_return_dtype() -> None:
    """base64 decoding yields Binary dtype, eagerly and in the lazy schema."""
    data = {"a": ["Zm9v", "YmFy"]}
    expr = pl.col("a").str.decode("base64")

    df = pl.DataFrame(data).select(expr)
    assert df.schema == {"a": pl.Binary}

    ldf = pl.LazyFrame(data).select(expr)
    assert ldf.collect_schema() == {"a": pl.Binary}
391
392
393
def test_str_replace_str_replace_all() -> None:
    """`replace` substitutes the first match; `replace_all` every match."""
    s = pl.Series(["hello", "world", "test", "rooted"])
    expected = pl.Series(["hell0", "w0rld", "test", "r0oted"])
    assert_series_equal(s.str.replace("o", "0"), expected)

    expected = pl.Series(["hell0", "w0rld", "test", "r00ted"])
    assert_series_equal(s.str.replace_all("o", "0"), expected)
400
401
402
def test_str_replace_n_single() -> None:
    """`replace(..., n=k)` substitutes at most k occurrences per string."""
    s = pl.Series(["aba", "abaa"])

    assert s.str.replace("a", "b", n=1).to_list() == ["bba", "bbaa"]
    assert s.str.replace("a", "b", n=2).to_list() == ["bbb", "bbba"]
    assert s.str.replace("a", "b", n=3).to_list() == ["bbb", "bbbb"]
408
409
410
def test_str_replace_n_same_length() -> None:
    # pat and val have the same length
    # this triggers a fast path
    s = pl.Series(["abfeab", "foobarabfooabab"])
    assert s.str.replace("ab", "AB", n=1).to_list() == ["ABfeab", "foobarABfooabab"]
    assert s.str.replace("ab", "AB", n=2).to_list() == ["ABfeAB", "foobarABfooABab"]
    assert s.str.replace("ab", "AB", n=3).to_list() == ["ABfeAB", "foobarABfooABAB"]
417
418
419
def test_str_to_lowercase() -> None:
    """ASCII lowercasing."""
    s = pl.Series(["Hello", "WORLD"])
    expected = pl.Series(["hello", "world"])
    assert_series_equal(s.str.to_lowercase(), expected)
423
424
425
def test_str_to_uppercase() -> None:
    """ASCII uppercasing."""
    s = pl.Series(["Hello", "WORLD"])
    expected = pl.Series(["HELLO", "WORLD"])
    assert_series_equal(s.str.to_uppercase(), expected)
429
430
431
def test_str_case_cyrillic() -> None:
    """Case conversion agrees with Python's str.lower/upper beyond ASCII."""
    vals = ["Biтpyк", "Iвaн"]
    s = pl.Series(vals)
    assert s.str.to_lowercase().to_list() == [a.lower() for a in vals]
    assert s.str.to_uppercase().to_list() == [a.upper() for a in vals]
436
437
438
def test_str_to_integer() -> None:
    """Parse strings as integers in base 2 and 16; strict mode raises on
    unparseable values, non-strict yields null."""
    bin = pl.Series(["110", "101", "010"])
    assert_series_equal(bin.str.to_integer(base=2), pl.Series([6, 5, 2]).cast(pl.Int64))

    hex = pl.Series(["fa1e", "ff00", "cafe", "invalid", None])
    assert_series_equal(
        hex.str.to_integer(base=16, strict=False),
        pl.Series([64030, 65280, 51966, None, None]).cast(pl.Int64),
        check_exact=True,
    )

    with pytest.raises(ComputeError):
        hex.str.to_integer(base=16)
451
452
453
@pytest.mark.parametrize("strict", [False, True])
def test_str_to_integer_invalid_base(strict: bool) -> None:
    """An out-of-range base raises ComputeError regardless of `strict`,
    whether given as a literal or as a column."""
    numbers = pl.Series(["1", "ZZZ", "-ABCZZZ", None])
    with pytest.raises(ComputeError):
        numbers.str.to_integer(base=100, strict=strict)

    df = pl.DataFrame({"str": numbers, "base": [0, 1, 100, None]})
    with pytest.raises(ComputeError):
        df.select(pl.col("str").str.to_integer(base=pl.col("base"), strict=strict))
462
463
464
def test_str_to_integer_base_expr() -> None:
    """The base may come from another column; strict mode raises when a
    value is invalid for its row's base."""
    df = pl.DataFrame(
        {"str": ["110", "ff00", "234", None, "130"], "base": [2, 16, 10, 8, None]}
    )
    out = df.select(base_expr=pl.col("str").str.to_integer(base="base"))
    expected = pl.DataFrame({"base_expr": [6, 65280, 234, None, None]})
    assert_frame_equal(out, expected)

    # test strict raise
    df = pl.DataFrame({"str": ["110", "ff00", "cafe", None], "base": [2, 10, 10, 8]})

    with pytest.raises(ComputeError):
        df.select(pl.col("str").str.to_integer(base="base"))
477
478
479
def test_str_to_integer_base_literal() -> None:
    """Literal bases per column; non-strict nulls invalid rows, strict raises."""
    df = pl.DataFrame(
        {
            "bin": ["110", "101", "-010", "invalid", None],
            "hex": ["fa1e", "ff00", "cafe", "invalid", None],
        }
    )
    result = df.with_columns(
        pl.col("bin").str.to_integer(base=2, strict=False),
        pl.col("hex").str.to_integer(base=16, strict=False),
    )

    expected = pl.DataFrame(
        {
            "bin": [6, 5, -2, None, None],
            "hex": [64030, 65280, 51966, None, None],
        }
    )
    assert_frame_equal(result, expected)

    with pytest.raises(ComputeError):
        df.with_columns(
            pl.col("bin").str.to_integer(base=2),
            pl.col("hex").str.to_integer(base=16),
        )
504
505
506
def test_str_to_integer_dtype() -> None:
    """Every requested integer dtype is honoured (default is Int64) and the
    lazy schema matches the collected schema."""
    lf = pl.LazyFrame(
        {
            "str": ["1111111", "7f", "127", None, "42"],
            "base": [2, 16, 10, 8, None],
        }
    )
    out = lf.select(
        i8=pl.col("str").str.to_integer(base="base", dtype=pl.Int8),
        i16=pl.col("str").str.to_integer(base="base", dtype=pl.Int16),
        i32=pl.col("str").str.to_integer(base="base", dtype=pl.Int32),
        i64=pl.col("str").str.to_integer(base="base", dtype=pl.Int64),
        u8=pl.col("str").str.to_integer(base="base", dtype=pl.UInt8),
        u16=pl.col("str").str.to_integer(base="base", dtype=pl.UInt16),
        u32=pl.col("str").str.to_integer(base="base", dtype=pl.UInt32),
        u64=pl.col("str").str.to_integer(base="base", dtype=pl.UInt64),
        default=pl.col("str").str.to_integer(base="base"),
    ).collect()

    expected = pl.DataFrame(
        {
            "i8": [127, 127, 127, None, None],
            "i16": [127, 127, 127, None, None],
            "i32": [127, 127, 127, None, None],
            "i64": [127, 127, 127, None, None],
            "u8": [127, 127, 127, None, None],
            "u16": [127, 127, 127, None, None],
            "u32": [127, 127, 127, None, None],
            "u64": [127, 127, 127, None, None],
            "default": [127, 127, 127, None, None],
        },
        schema={
            "i8": pl.Int8,
            "i16": pl.Int16,
            "i32": pl.Int32,
            "i64": pl.Int64,
            "u8": pl.UInt8,
            "u16": pl.UInt16,
            "u32": pl.UInt32,
            "u64": pl.UInt64,
            "default": pl.Int64,
        },
    )
    assert lf.collect_schema() == lf.collect().schema
    assert_frame_equal(out, expected)
551
552
553
def test_str_to_integer_large() -> None:
    """Parsing into Int128 handles values beyond 64 bits; overflow or bad
    digits raise ComputeError in strict mode."""
    df = pl.DataFrame(
        {
            "str": [
                "-6129899454972456276923959272",
                "1A44E53BFEBA967E6682FBB0",
                "10100110111110110101110100000100110010101111000100011000000100010101010101101011111111101000",
                None,
                "7798994549724957734429272",
            ],
            "base": [10, 16, 2, 8, None],
        }
    )
    out = df.select(i128=pl.col("str").str.to_integer(base="base", dtype=pl.Int128))
    expected = pl.DataFrame(
        {
            "i128": [
                -6129899454972456276923959272,
                8129899739726392769273592752,
                3229899454972495776923959272,
                None,
                None,
            ]
        },
        schema={"i128": pl.Int128},
    )
    assert_frame_equal(out, expected)

    # test strict raise
    df = pl.DataFrame(
        {
            "i128": [
                "612989945497245627692395927261298994549724562769239592726129899454972456276923959272",
                "1A44E53BFEBA967E6682FBB0",
                "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
                "7798994549724957734429272",
                None,
                "7798994549724957734429272",
            ],
            "base": [10, 2, 16, 10, 8, None],
        }
    )

    with pytest.raises(ComputeError):
        df.select(pl.col("i128").str.to_integer(base="base", dtype=pl.Int128))
598
599
600
def test_str_strip_chars_expr() -> None:
    """strip_chars / _start / _end accept per-row character sets from a
    column; a null character set strips whitespace only."""
    df = pl.DataFrame(
        {
            "s": [" hello ", "^^world^^", "&&hi&&", " polars ", None],
            "pat": [" ", "^", "&", None, "anything"],
        }
    )

    all_expr = df.select(
        pl.col("s").str.strip_chars(pl.col("pat")).alias("strip_chars"),
        pl.col("s").str.strip_chars_start(pl.col("pat")).alias("strip_chars_start"),
        pl.col("s").str.strip_chars_end(pl.col("pat")).alias("strip_chars_end"),
    )

    expected = pl.DataFrame(
        {
            "strip_chars": ["hello", "world", "hi", "polars", None],
            "strip_chars_start": ["hello ", "world^^", "hi&&", "polars ", None],
            "strip_chars_end": [" hello", "^^world", "&&hi", " polars", None],
        }
    )

    assert_frame_equal(all_expr, expected)

    strip_by_null = df.select(
        pl.col("s").str.strip_chars(None).alias("strip_chars"),
        pl.col("s").str.strip_chars_start(None).alias("strip_chars_start"),
        pl.col("s").str.strip_chars_end(None).alias("strip_chars_end"),
    )

    # only whitespace are striped.
    expected = pl.DataFrame(
        {
            "strip_chars": ["hello", "^^world^^", "&&hi&&", "polars", None],
            "strip_chars_start": ["hello ", "^^world^^", "&&hi&&", "polars ", None],
            "strip_chars_end": [" hello", "^^world^^", "&&hi&&", " polars", None],
        }
    )
    assert_frame_equal(strip_by_null, expected)
639
640
641
def test_str_strip_chars() -> None:
    """Default strip removes whitespace; a char-set argument strips any of
    the given characters from both ends."""
    s = pl.Series([" hello ", "world\t "])
    expected = pl.Series(["hello", "world"])
    assert_series_equal(s.str.strip_chars(), expected)

    expected = pl.Series(["hell", "world"])
    assert_series_equal(s.str.strip_chars().str.strip_chars("o"), expected)

    expected = pl.Series(["ell", "rld\t"])
    assert_series_equal(s.str.strip_chars(" hwo"), expected)
651
652
653
def test_str_strip_chars_wrong_length() -> None:
    """A char-set Series shorter than the column raises a ShapeError."""
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_chars(pl.Series(["a", "b"])))
657
658
659
def test_str_strip_chars_start() -> None:
    """Left-strip: default whitespace, then explicit character sets."""
    s = pl.Series([" hello ", "\t world"])
    expected = pl.Series(["hello ", "world"])
    assert_series_equal(s.str.strip_chars_start(), expected)

    expected = pl.Series(["ello ", "world"])
    assert_series_equal(s.str.strip_chars_start().str.strip_chars_start("h"), expected)

    expected = pl.Series(["ello ", "\t world"])
    assert_series_equal(s.str.strip_chars_start("hw "), expected)
669
670
671
def test_str_strip_chars_start_wrong_length() -> None:
    """A char-set Series shorter than the column raises a ShapeError."""
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_chars_start(pl.Series(["a", "b"])))
675
676
677
def test_str_strip_chars_end() -> None:
    """Right-strip: default whitespace, then explicit character sets."""
    s = pl.Series([" hello ", "world\t "])
    expected = pl.Series([" hello", "world"])
    assert_series_equal(s.str.strip_chars_end(), expected)

    expected = pl.Series([" hell", "world"])
    assert_series_equal(s.str.strip_chars_end().str.strip_chars_end("o"), expected)

    expected = pl.Series([" he", "wor"])
    assert_series_equal(s.str.strip_chars_end("odl \t"), expected)
687
688
689
def test_str_strip_chars_end_wrong_length() -> None:
    """A char-set Series shorter than the column raises a ShapeError."""
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_chars_end(pl.Series(["a", "b"])))
693
694
695
def test_str_strip_whitespace() -> None:
    """Default (no argument) stripping removes surrounding whitespace only
    on the requested side(s)."""
    s = pl.Series("a", ["trailing  ", "  leading", "  both  "])

    expected = pl.Series("a", ["trailing", "  leading", "  both"])
    assert_series_equal(s.str.strip_chars_end(), expected)

    expected = pl.Series("a", ["trailing  ", "leading", "both  "])
    assert_series_equal(s.str.strip_chars_start(), expected)

    expected = pl.Series("a", ["trailing", "leading", "both"])
    assert_series_equal(s.str.strip_chars(), expected)
706
707
708
def test_str_strip_prefix_literal() -> None:
    """strip_prefix removes one leading occurrence; a null prefix yields null."""
    s = pl.Series(["foo:bar", "foofoo:bar", "bar:bar", "foo", "", None])
    expected = pl.Series([":bar", "foo:bar", "bar:bar", "", "", None])
    assert_series_equal(s.str.strip_prefix("foo"), expected)
    # test null literal
    expected = pl.Series([None, None, None, None, None, None], dtype=pl.String)
    assert_series_equal(s.str.strip_prefix(pl.lit(None, dtype=pl.String)), expected)
715
716
717
def test_str_strip_prefix_suffix_expr() -> None:
    """Prefix/suffix may come from columns; null prefix/suffix yields null."""
    df = pl.DataFrame(
        {
            "s": ["foo-bar", "foobarbar", "barfoo", "", "anything", None],
            "prefix": ["foo", "foobar", "foo", "", None, "bar"],
            "suffix": ["bar", "barbar", "bar", "", None, "foo"],
        }
    )
    out = df.select(
        pl.col("s").str.strip_prefix(pl.col("prefix")).alias("strip_prefix"),
        pl.col("s").str.strip_suffix(pl.col("suffix")).alias("strip_suffix"),
    )
    assert out.to_dict(as_series=False) == {
        "strip_prefix": ["-bar", "bar", "barfoo", "", None, None],
        "strip_suffix": ["foo-", "foo", "barfoo", "", None, None],
    }
733
734
735
def test_str_strip_prefix_wrong_length() -> None:
    """A prefix Series shorter than the column raises a ShapeError."""
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_prefix(pl.Series(["a", "b"])))
739
740
741
def test_str_strip_suffix() -> None:
    """strip_suffix removes one trailing occurrence; a null suffix yields null."""
    s = pl.Series(["foo:bar", "foo:barbar", "foo:foo", "bar", "", None])
    expected = pl.Series(["foo:", "foo:bar", "foo:foo", "", "", None])
    assert_series_equal(s.str.strip_suffix("bar"), expected)
    # test null literal
    expected = pl.Series([None, None, None, None, None, None], dtype=pl.String)
    assert_series_equal(s.str.strip_suffix(pl.lit(None, dtype=pl.String)), expected)
748
749
750
def test_str_strip_suffix_wrong_length() -> None:
    """A suffix Series shorter than the column raises a ShapeError."""
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_suffix(pl.Series(["a", "b"])))
754
755
756
def test_str_split() -> None:
    """Splitting on a separator, exclusive and inclusive of the separator,
    via both the Series API and an expression."""
    a = pl.Series("a", ["a, b", "a", "ab,c,de"])
    for out in [a.str.split(","), pl.select(pl.lit(a).str.split(",")).to_series()]:
        assert out[0].to_list() == ["a", " b"]
        assert out[1].to_list() == ["a"]
        assert out[2].to_list() == ["ab", "c", "de"]

    for out in [
        a.str.split(",", inclusive=True),
        pl.select(pl.lit(a).str.split(",", inclusive=True)).to_series(),
    ]:
        assert out[0].to_list() == ["a,", " b"]
        assert out[1].to_list() == ["a"]
        assert out[2].to_list() == ["ab,", "c,", "de"]
770
771
772
def test_json_decode_series() -> None:
    """json_decode with inferred and explicit dtypes, including struct
    subsetting and the empty-Series case."""
    s = pl.Series(["[1, 2, 3]", None, "[4, 5, 6]"])
    expected = pl.Series([[1, 2, 3], None, [4, 5, 6]])
    dtype = pl.List(pl.Int64)
    assert_series_equal(s.str.json_decode(None), expected)
    assert_series_equal(s.str.json_decode(dtype), expected)

    s = pl.Series(['{"a": 1, "b": true}', None, '{"a": 2, "b": false}'])
    expected = pl.Series([{"a": 1, "b": True}, None, {"a": 2, "b": False}])
    dtype2 = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])
    assert_series_equal(s.str.json_decode(None), expected)
    assert_series_equal(s.str.json_decode(dtype2), expected)

    expected = pl.Series([{"a": 1}, None, {"a": 2}])
    dtype2 = pl.Struct([pl.Field("a", pl.Int64)])
    assert_series_equal(s.str.json_decode(dtype2), expected)

    s = pl.Series([], dtype=pl.String)
    expected = pl.Series([], dtype=pl.List(pl.Int64))
    dtype = pl.List(pl.Int64)
    assert_series_equal(s.str.json_decode(dtype), expected)
793
794
795
def test_json_decode_lazy_expr() -> None:
    """json_decode with an explicit dtype propagates through the lazy schema."""
    dtype = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])
    ldf = (
        pl.DataFrame({"json": ['{"a": 1, "b": true}', None, '{"a": 2, "b": false}']})
        .lazy()
        .select(pl.col("json").str.json_decode(dtype))
    )
    expected = pl.DataFrame(
        {"json": [{"a": 1, "b": True}, None, {"a": 2, "b": False}]}
    ).lazy()
    assert ldf.collect_schema() == {"json": dtype}
    assert_frame_equal(ldf, expected)
807
808
809
def test_json_decode_nested_struct() -> None:
    """Schema inference unifies heterogeneous struct keys across rows and
    the decoded values remain accessible via list/struct accessors."""
    json = [
        '[{"key_1": "a"}]',
        '[{"key_1": "a2", "key_2": 2}]',
        '[{"key_1": "a3", "key_2": 3, "key_3": "c"}]',
    ]
    s = pl.Series("json_str", json)
    s_parsed = s.str.json_decode().rename("parsed_list_json")

    expected_dtype = pl.List(
        pl.Struct(
            [
                pl.Field("key_1", pl.String),
                pl.Field("key_2", pl.Int64),
                pl.Field("key_3", pl.String),
            ]
        )
    )
    assert s_parsed.dtype == expected_dtype

    key_1_values = s_parsed.to_frame().select(
        pl.col("parsed_list_json")
        .list.get(0)
        .struct.field("key_1")
        .alias("key_1_values")
    )
    expected_values = pl.Series("key_1_values", ["a", "a2", "a3"])
    assert_series_equal(key_1_values.get_column("key_1_values"), expected_values)
837
838
839
def test_json_decode_primitive_to_list_11053() -> None:
    """Regression test (#11053): a scalar JSON value decodes into a
    single-element list when the target dtype is List."""
    df = pl.DataFrame(
        {
            "json": [
                '{"col1": ["123"], "col2": "123"}',
                '{"col1": ["xyz"], "col2": null}',
            ]
        }
    )
    schema = pl.Struct(
        {
            "col1": pl.List(pl.String),
            "col2": pl.List(pl.String),
        }
    )

    output = df.select(
        pl.col("json").str.json_decode(schema).alias("decoded_json")
    ).unnest("decoded_json")
    expected = pl.DataFrame({"col1": [["123"], ["xyz"]], "col2": [["123"], None]})
    assert_frame_equal(output, expected)
860
861
862
def test_jsonpath_single() -> None:
    """json_path_match stringifies scalar values of any JSON type."""
    s = pl.Series(['{"a":"1"}', None, '{"a":2}', '{"a":2.1}', '{"a":true}'])
    expected = pl.Series(["1", None, "2", "2.1", "true"])
    assert_series_equal(s.str.json_path_match("$.a"), expected)
866
867
868
def test_json_path_match() -> None:
    """json_path_match with per-row paths, a scalar path, and a scalar
    JSON document matched against a path column."""
    df = pl.DataFrame(
        {
            "str": [
                '{"a":"1"}',
                None,
                '{"b":2}',
                '{"a":2.1, "b": "hello"}',
                '{"a":true}',
            ],
            "pat": ["$.a", "$.a", "$.b", "$.b", None],
        }
    )
    out = df.select(
        all_expr=pl.col("str").str.json_path_match(pl.col("pat")),
        str_expr=pl.col("str").str.json_path_match("$.a"),
        pat_expr=pl.lit('{"a": 1.1, "b": 10}').str.json_path_match(pl.col("pat")),
    )
    expected = pl.DataFrame(
        {
            "all_expr": ["1", None, "2", "hello", None],
            "str_expr": ["1", None, None, "2.1", "true"],
            "pat_expr": ["1.1", "1.1", "10", "10", None],
        }
    )
    assert_frame_equal(out, expected)
894
895
896
def test_str_json_path_match_wrong_length() -> None:
    """A path Series shorter than the column raises (Shape or Compute error)."""
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises((ShapeError, ComputeError)):
        df.select(pl.col("num").str.json_path_match(pl.Series(["a", "b"])))
900
901
902
def test_extract_regex() -> None:
    """`str.extract` returns the requested capture group or null when the
    pattern does not match."""
    s = pl.Series(
        [
            "http://vote.com/ballon_dor?candidate=messi&ref=polars",
            "http://vote.com/ballon_dor?candidat=jorginho&ref=polars",
            "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars",
        ]
    )
    expected = pl.Series(["messi", None, "ronaldo"])
    assert_series_equal(s.str.extract(r"candidate=(\w+)", 1), expected)
912
913
914
def test_extract() -> None:
    """`str.extract` with per-row patterns, a scalar pattern, and a scalar
    string matched against a pattern column (null pattern yields null)."""
    df = pl.DataFrame(
        {
            "s": ["aron123", "12butler", "charly*", "~david", None],
            "pat": [r"^([a-zA-Z]+)", r"^(\d+)", None, "^(da)", r"(.*)"],
        }
    )

    out = df.select(
        all_expr=pl.col("s").str.extract(pl.col("pat"), 1),
        str_expr=pl.col("s").str.extract("^([a-zA-Z]+)", 1),
        pat_expr=pl.lit("aron123").str.extract(pl.col("pat")),
    )
    expected = pl.DataFrame(
        {
            "all_expr": ["aron", "12", None, None, None],
            "str_expr": ["aron", None, "charly", None, None],
            "pat_expr": ["aron", None, None, None, "aron123"],
        }
    )
    assert_frame_equal(out, expected)
935
936
937
def test_extract_binary() -> None:
    """An extract expression can be used directly inside a filter predicate."""
    df = pl.DataFrame({"foo": ["aron", "butler", "charly", "david"]})
    out = df.filter(pl.col("foo").str.extract("^(a)", 1) == "a").to_series()
    assert out[0] == "aron"
941
942
943
def test_str_join_returns_scalar() -> None:
    """`str.join` inside a group_by aggregation yields one String per group,
    not a list column."""
    df = pl.DataFrame(
        [pl.Series("val", ["A", "B", "C", "D"]), pl.Series("id", [1, 1, 2, 2])]
    )
    grouped = (
        df.group_by("id")
        .agg(pl.col("val").str.join(delimiter=",").alias("grouped"))
        .get_column("grouped")
    )
    assert grouped.dtype == pl.String
953
954
955
def test_contains() -> None:
    """`str.contains` honours `literal` and `strict` flags on Series,
    frame select, and frame filter paths."""
    # test strict/non strict: an invalid regex yields all-null when
    # strict=False and raises when strict=True
    s_txt = pl.Series(["123", "456", "789"])
    assert (
        pl.Series([None, None, None]).cast(pl.Boolean).to_list()
        == s_txt.str.contains("(not_valid_regex", literal=False, strict=False).to_list()
    )
    with pytest.raises(ComputeError):
        s_txt.str.contains("(not_valid_regex", literal=False, strict=True)
    assert (
        pl.Series([True, False, False]).cast(pl.Boolean).to_list()
        == s_txt.str.contains("1", literal=False, strict=False).to_list()
    )

    df = pl.DataFrame(
        data=[(1, "some * * text"), (2, "(with) special\n * chars"), (3, "**etc...?$")],
        schema=["idx", "text"],
        orient="row",
    )
    # (pattern, literal flag, expected boolean mask) — each regex/literal pair
    # is exercised on all three call paths below
    for pattern, as_literal, expected in (
        (r"\* \*", False, [True, False, False]),
        (r"* *", True, [True, False, False]),
        (r"^\(", False, [False, True, False]),
        (r"^\(", True, [False, False, False]),
        (r"(", True, [False, True, False]),
        (r"e", False, [True, True, True]),
        (r"e", True, [True, True, True]),
        (r"^\S+$", False, [False, False, True]),
        (r"\?\$", False, [False, False, True]),
        (r"?$", True, [False, False, True]),
    ):
        # series
        assert (
            expected == df["text"].str.contains(pattern, literal=as_literal).to_list()
        )
        # frame select
        assert (
            expected
            == df.select(pl.col("text").str.contains(pattern, literal=as_literal))[
                "text"
            ].to_list()
        )
        # frame filter
        assert sum(expected) == len(
            df.filter(pl.col("text").str.contains(pattern, literal=as_literal))
        )
def test_contains_expr() -> None:
    """`str.contains` with the pattern supplied as a column: null pattern or
    null input yields null; invalid regex is null when strict=False and
    raises when strict=True."""
    df = pl.DataFrame(
        {
            "text": [
                "some text",
                "(with) special\n .* chars",
                "**etc...?$",
                None,
                "b",
                "invalid_regex",
            ],
            "pattern": [r"[me]", r".*", r"^\(", "a", None, "*"],
        }
    )

    assert df.select(
        pl.col("text")
        .str.contains(pl.col("pattern"), literal=False, strict=False)
        .alias("contains"),
        pl.col("text")
        .str.contains(pl.col("pattern"), literal=True)
        .alias("contains_lit"),
    ).to_dict(as_series=False) == {
        "contains": [True, True, False, None, None, None],
        "contains_lit": [False, True, False, None, None, False],
    }

    # the "*" row is an invalid regex, so strict=True must raise
    with pytest.raises(ComputeError):
        df.select(
            pl.col("text").str.contains(pl.col("pattern"), literal=False, strict=True)
        )
@pytest.mark.parametrize(
    ("pattern", "case_insensitive", "expected"),
    [
        (["me"], False, True),
        (["Me"], False, False),
        (["Me"], True, True),
        (pl.Series(["me", "they"]), False, True),
        (pl.Series(["Me", "they"]), False, False),
        (pl.Series(["Me", "they"]), True, True),
        (["me", "they"], False, True),
        (["Me", "they"], False, False),
        (["Me", "they"], True, True),
    ],
)
def test_contains_any(
    pattern: pl.Series | list[str],
    case_insensitive: bool,
    expected: bool,
) -> None:
    """`str.contains_any` accepts list or Series patterns and honours
    `ascii_case_insensitive` on Series, expression, and filter paths."""
    df = pl.DataFrame({"text": ["Tell me what you want"]})
    # series
    assert (
        expected
        == df["text"]
        .str.contains_any(pattern, ascii_case_insensitive=case_insensitive)
        .item()
    )
    # expr
    assert (
        expected
        == df.select(
            pl.col("text").str.contains_any(
                pattern, ascii_case_insensitive=case_insensitive
            )
        )["text"].item()
    )
    # frame filter
    assert int(expected) == len(
        df.filter(
            pl.col("text").str.contains_any(
                pattern, ascii_case_insensitive=case_insensitive
            )
        )
    )
def test_replace() -> None:
    """`str.replace` (first match only) with regex vs. literal patterns;
    `$n` capture-group expansion applies only in regex mode."""
    df = pl.DataFrame(
        data=[(1, "* * text"), (2, "(with) special\n * chars **etc...?$")],
        schema=["idx", "text"],
        orient="row",
    )
    # (pattern, replacement, literal flag, expected rows)
    for pattern, replacement, as_literal, expected in (
        (r"\*", "-", False, ["- * text", "(with) special\n - chars **etc...?$"]),
        (r"*", "-", True, ["- * text", "(with) special\n - chars **etc...?$"]),
        (r"^\(", "[", False, ["* * text", "[with) special\n * chars **etc...?$"]),
        (r"^\(", "[", True, ["* * text", "(with) special\n * chars **etc...?$"]),
        (r"t$", "an", False, ["* * texan", "(with) special\n * chars **etc...?$"]),
        (r"t$", "an", True, ["* * text", "(with) special\n * chars **etc...?$"]),
        (r"(with) special", "$1", True, ["* * text", "$1\n * chars **etc...?$"]),
        (
            r"\((with)\) special",
            ":$1:",
            False,
            ["* * text", ":with:\n * chars **etc...?$"],
        ),
    ):
        # series
        assert (
            expected
            == df["text"]
            .str.replace(pattern, replacement, literal=as_literal)
            .to_list()
        )
        # expr
        assert (
            expected
            == df.select(
                pl.col("text").str.replace(pattern, replacement, literal=as_literal)
            )["text"].to_list()
        )

    # literal mode must not expand $0/$1 in the replacement
    assert pl.Series(["."]).str.replace(".", "$0", literal=True)[0] == "$0"
    assert pl.Series(["(.)(?)"]).str.replace(".", "$1", literal=True)[0] == "($1)(?)"
def test_replace_all() -> None:
    """`str.replace_all` (every match) with regex vs. literal patterns,
    including capture-group expansion and out-of-range group references."""
    df = pl.DataFrame(
        data=[(1, "* * text"), (2, "(with) special\n * chars **etc...?$")],
        schema=["idx", "text"],
        orient="row",
    )
    # (pattern, replacement, literal flag, expected rows)
    for pattern, replacement, as_literal, expected in (
        (r"\*", "-", False, ["- - text", "(with) special\n - chars --etc...?$"]),
        (r"*", "-", True, ["- - text", "(with) special\n - chars --etc...?$"]),
        (r"\W", "", False, ["text", "withspecialcharsetc"]),
        (r".?$", "", True, ["* * text", "(with) special\n * chars **etc.."]),
        (
            r"(with) special",
            "$1",
            True,
            ["* * text", "$1\n * chars **etc...?$"],
        ),
        (
            r"\((with)\) special",
            ":$1:",
            False,
            ["* * text", ":with:\n * chars **etc...?$"],
        ),
        (
            # $3 does not exist; it expands to the empty string
            r"(\b)[\w\s]{2,}(\b)",
            "$1(blah)$3",
            False,
            ["* * (blah)", "((blah)) (blah)\n * (blah) **(blah)...?$"],
        ),
    ):
        # series
        assert (
            expected
            == df["text"]
            .str.replace_all(pattern, replacement, literal=as_literal)
            .to_list()
        )
        # expr
        assert (
            expected
            == df.select(
                pl.col("text").str.replace_all(pattern, replacement, literal=as_literal)
            )["text"].to_list()
        )
        # invalid regex (but valid literal - requires "literal=True")
        with pytest.raises(ComputeError):
            df["text"].str.replace_all("*", "")

    # literal mode keeps "$0" verbatim; regex mode expands it to the match
    assert (
        pl.Series([r"(.)(\?)(\?)"]).str.replace_all("\\?", "$0", literal=True)[0]
        == "(.)($0)($0)"
    )
    assert (
        pl.Series([r"(.)(\?)(\?)"]).str.replace_all("\\?", "$0", literal=False)[0]
        == "(.)(\\?)(\\?)"
    )
# NOTE(review): "caputures" in the name is a typo for "captures"; left as-is
# to keep the test identifier stable.
def test_replace_all_literal_no_caputures() -> None:
    """With literal=True, `$1`-style capture references in the replacement
    column must be inserted verbatim, not expanded."""
    # When using literal = True, capture groups should be disabled

    # Single row code path in Rust
    df = pl.DataFrame({"text": ["I found <amt> yesterday."], "amt": ["$1"]})
    df = df.with_columns(
        pl.col("text")
        .str.replace_all("<amt>", pl.col("amt"), literal=True)
        .alias("text2")
    )
    assert df.get_column("text2")[0] == "I found $1 yesterday."

    # Multi-row code path in Rust
    df2 = pl.DataFrame(
        {
            "text": ["I found <amt> yesterday.", "I lost <amt> yesterday."],
            "amt": ["$1", "$2"],
        }
    )
    df2 = df2.with_columns(
        pl.col("text")
        .str.replace_all("<amt>", pl.col("amt"), literal=True)
        .alias("text2")
    )
    assert df2.get_column("text2")[0] == "I found $1 yesterday."
    assert df2.get_column("text2")[1] == "I lost $2 yesterday."
# NOTE(review): "caputures" in the name is a typo for "captures"; left as-is
# to keep the test identifier stable.
def test_replace_literal_no_caputures() -> None:
    """With literal=True, `str.replace` must insert `$1`-style replacement
    text verbatim, on both the single-row and multi-row code paths."""
    # When using literal = True, capture groups should be disabled

    # Single row code path in Rust
    df = pl.DataFrame({"text": ["I found <amt> yesterday."], "amt": ["$1"]})
    df = df.with_columns(
        pl.col("text").str.replace("<amt>", pl.col("amt"), literal=True).alias("text2")
    )
    assert df.get_column("text2")[0] == "I found $1 yesterday."

    # Multi-row code path in Rust
    # A string shorter than 32 chars,
    # and one longer than 32 chars to test both sub-paths
    df2 = pl.DataFrame(
        {
            "text": [
                "I found <amt> yesterday.",
                "I lost <amt> yesterday and this string is longer than 32 characters.",
            ],
            "amt": ["$1", "$2"],
        }
    )
    df2 = df2.with_columns(
        pl.col("text").str.replace("<amt>", pl.col("amt"), literal=True).alias("text2")
    )
    assert df2.get_column("text2")[0] == "I found $1 yesterday."
    assert (
        df2.get_column("text2")[1]
        == "I lost $2 yesterday and this string is longer than 32 characters."
    )
def test_replace_expressions() -> None:
    """Pattern and replacement may both be expressions, not just literals."""
    frame = pl.DataFrame(
        {"foo": ["123 bla 45 asd", "xyz 678 910t"], "value": ["A", "B"]}
    )

    # Replace the first row's full value, substituting the per-row `value` column.
    result = frame.select(
        pl.col("foo").str.replace(pl.col("foo").first(), pl.col("value"))
    )
    assert result.to_dict(as_series=False) == {"foo": ["A", "xyz 678 910t"]}

    # Replace the last row's full value with a plain string.
    result = frame.select(pl.col("foo").str.replace(pl.col("foo").last(), "value"))
    assert result.to_dict(as_series=False) == {"foo": ["123 bla 45 asd", "value"]}

    # Pattern taken from a column via `first()`, broadcast over all rows.
    frame = pl.DataFrame(
        {"foo": ["1 bla 45 asd", "xyz 6t"], "pat": [r"\d", r"\W"], "value": ["A", "B"]}
    )
    result = frame.select(
        pl.col("foo").str.replace_all(pl.col("pat").first(), "value")
    )
    assert result.to_dict(as_series=False) == {
        "foo": ["value bla valuevalue asd", "xyz valuet"]
    }
@pytest.mark.parametrize(
    ("pattern", "replacement", "case_insensitive", "expected"),
    [
        (["say"], "", False, "Tell me what you want"),
        (["me"], ["them"], False, "Tell them what you want"),
        (["who"], ["them"], False, "Tell me what you want"),
        (["me", "you"], "it", False, "Tell it what it want"),
        (["Me", "you"], "it", False, "Tell me what it want"),
        (["me", "you"], ["it"], False, "Tell it what it want"),
        (["me", "you"], ["you", "me"], False, "Tell you what me want"),
        (["me", "You", "them"], "it", False, "Tell it what you want"),
        (["Me", "you"], "it", True, "Tell it what it want"),
        (["me", "YOU"], ["you", "me"], True, "Tell you what me want"),
        (pl.Series(["me", "YOU"]), ["you", "me"], False, "Tell you what you want"),
        (pl.Series(["me", "YOU"]), ["you", "me"], True, "Tell you what me want"),
    ],
)
def test_replace_many(
    pattern: pl.Series | list[str],
    replacement: pl.Series | list[str] | str,
    case_insensitive: bool,
    expected: str,
) -> None:
    """`str.replace_many` with list/Series patterns, scalar or per-pattern
    replacements, and `ascii_case_insensitive`, via Series and expr APIs."""
    df = pl.DataFrame({"text": ["Tell me what you want"]})
    # series
    assert (
        expected
        == df["text"]
        .str.replace_many(pattern, replacement, ascii_case_insensitive=case_insensitive)
        .item()
    )
    # expr
    assert (
        expected
        == df.select(
            pl.col("text").str.replace_many(
                pattern,
                replacement,
                ascii_case_insensitive=case_insensitive,
            )
        ).item()
    )
def test_replace_many_groupby() -> None:
    """`replace_many` with a per-group pattern expression inside `agg`."""
    df = pl.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
            "g": [0, 0, 0, 1, 1, 1, 2, 2, 2],
        }
    )
    # Within each group, blank out the group's first two values.
    result = df.group_by("g").agg(
        pl.col("x").str.replace_many(pl.col("x").head(2), "")
    )
    expected = pl.DataFrame(
        {
            "g": [0, 1, 2],
            "x": [["", "", "c"], ["", "", "f"], ["", "", "i"]],
        }
    )
    assert_frame_equal(result, expected, check_row_order=False)
@pytest.mark.parametrize(
    ("mapping", "case_insensitive", "expected"),
    [
        ({}, False, "Tell me what you want"),
        ({"me": "them"}, False, "Tell them what you want"),
        ({"who": "them"}, False, "Tell me what you want"),
        ({"me": "it", "you": "it"}, False, "Tell it what it want"),
        ({"Me": "it", "you": "it"}, False, "Tell me what it want"),
        ({"me": "you", "you": "me"}, False, "Tell you what me want"),
        ({}, True, "Tell me what you want"),
        ({"Me": "it", "you": "it"}, True, "Tell it what it want"),
        ({"me": "you", "YOU": "me"}, True, "Tell you what me want"),
    ],
)
def test_replace_many_mapping(
    mapping: dict[str, str],
    case_insensitive: bool,
    expected: str,
) -> None:
    """`str.replace_many` accepts a pattern→replacement dict (including an
    empty one), via both the Series and expression APIs."""
    df = pl.DataFrame({"text": ["Tell me what you want"]})
    # series
    assert (
        expected
        == df["text"]
        .str.replace_many(mapping, ascii_case_insensitive=case_insensitive)
        .item()
    )
    # expr
    assert (
        expected
        == df.select(
            pl.col("text").str.replace_many(
                mapping,
                ascii_case_insensitive=case_insensitive,
            )
        ).item()
    )
def test_replace_many_invalid_inputs() -> None:
    """Invalid argument combinations for `str.replace_many` raise the
    appropriate errors on both the expression and Series APIs."""
    df = pl.DataFrame({"text": ["Tell me what you want"]})

    # Ensure a string as the first argument is parsed as a column name.
    with pytest.raises(ColumnNotFoundError, match="me"):
        df.select(pl.col("text").str.replace_many("me", "you"))

    with pytest.raises(InvalidOperationError):
        df.select(pl.col("text").str.replace_many(1, 2))

    with pytest.raises(InvalidOperationError):
        df.select(pl.col("text").str.replace_many([1], [2]))

    with pytest.raises(InvalidOperationError):
        df.select(pl.col("text").str.replace_many(["me"], None))

    # Missing the replacement argument entirely.
    with pytest.raises(TypeError):
        df.select(pl.col("text").str.replace_many(["me"]))

    with pytest.raises(
        InvalidOperationError,
        match="expected the same amount of patterns as replacement strings",
    ):
        df.select(pl.col("text").str.replace_many(["a"], ["b", "c"]))

    s = df.to_series()

    with pytest.raises(ColumnNotFoundError, match="me"):
        s.str.replace_many("me", "you")  # type: ignore[arg-type]

    # BUGFIX: previously this repeated the DataFrame-expression call above;
    # exercise the *Series* API for the missing-replacement case instead.
    with pytest.raises(TypeError):
        s.str.replace_many(["me"])

    with pytest.raises(
        InvalidOperationError,
        match="expected the same amount of patterns as replacement strings",
    ):
        s.str.replace_many(["a"], ["b", "c"])
def test_extract_all_count() -> None:
    """`extract_all` and `count_matches` agree on matches and null handling."""
    df = pl.DataFrame({"foo": ["123 bla 45 asd", "xaz 678 910t", "boo", None]})

    result = df.select(
        pl.col("foo").str.extract_all(r"a").alias("extract"),
        pl.col("foo").str.count_matches(r"a").alias("count"),
    )
    assert result.to_dict(as_series=False) == {
        "extract": [["a", "a"], ["a"], [], None],
        "count": [2, 1, 0, None],
    }

    # Output dtypes on the Series API.
    assert df["foo"].str.extract_all(r"a").dtype == pl.List
    assert df["foo"].str.count_matches(r"a").dtype == pl.UInt32
def test_count_matches_many() -> None:
    """`count_matches` with a per-row pattern column; null input or null
    pattern yields a null count, and scalar patterns broadcast."""
    df = pl.DataFrame(
        {
            "foo": ["123 bla 45 asd", "xyz 678 910t", None, "boo"],
            "bar": [r"\d", r"[a-z]", r"\d", None],
        }
    )
    assert (
        df.select(
            pl.col("foo").str.count_matches(pl.col("bar")).alias("count")
        ).to_dict(as_series=False)
    ) == {"count": [5, 4, None, None]}

    assert df["foo"].str.count_matches(df["bar"]).dtype == pl.UInt32

    # Test broadcast.
    broad = df.select(
        pl.col("foo").str.count_matches(pl.col("bar").first()).alias("count"),
        pl.col("foo").str.count_matches(pl.col("bar").last()).alias("count_null"),
    )
    assert broad.to_dict(as_series=False) == {
        "count": [5, 6, None, 0],
        "count_null": [None, None, None, None],
    }
    assert broad.schema == {"count": pl.UInt32, "count_null": pl.UInt32}
def test_extract_all_many() -> None:
    """`extract_all` with a per-row pattern column, plus scalar-pattern
    broadcasting; null inputs/patterns propagate as nulls."""
    df = pl.DataFrame(
        {
            "foo": ["ab", "abc", "abcd", "foo", None, "boo"],
            "re": ["a", "bc", "a.c", "a", "a", None],
        }
    )
    assert df["foo"].str.extract_all(df["re"]).to_list() == [
        ["a"],
        ["bc"],
        ["abc"],
        [],
        None,
        None,
    ]

    # Test broadcast.
    broad = df.select(
        pl.col("foo").str.extract_all(pl.col("re").first()).alias("a"),
        pl.col("foo").str.extract_all(pl.col("re").last()).alias("null"),
    )
    assert broad.to_dict(as_series=False) == {
        "a": [["a"], ["a"], ["a"], [], None, []],
        "null": [None] * 6,
    }
    assert broad.schema == {"a": pl.List(pl.String), "null": pl.List(pl.String)}
@pytest.mark.may_fail_cloud  # reason: zero-field struct
def test_extract_groups_empty() -> None:
    """An empty pattern yields zero-field structs, eagerly and lazily."""
    codes = pl.DataFrame({"iso_code": ["ISO 80000-1:2009", "ISO/IEC/IEEE 29148:2018"]})

    empty_groups = pl.col("iso_code").str.extract_groups("")
    assert codes.select(empty_groups).to_dict(as_series=False) == {
        "iso_code": [{}, {}]
    }

    # The lazily-computed schema must match the materialized one.
    lazy = codes.lazy().select(empty_groups)
    assert lazy.collect_schema() == lazy.collect().schema
def test_extract_groups() -> None:
    """`extract_groups` produces a struct: named groups become named fields,
    unnamed groups become "1", "2", …, and non-matching groups are null."""

    def _named_groups_builder(pattern: str, groups: dict[str, str]) -> str:
        # Wrap each sub-pattern in a named capture group and splice it into
        # the template via str.format.
        return pattern.format(
            **{name: f"(?<{name}>{value})" for name, value in groups.items()}
        )

    expected = {
        "authority": ["ISO", "ISO/IEC/IEEE"],
        "spec_num": ["80000", "29148"],
        "part_num": ["1", None],
        "revision_year": ["2009", "2018"],
    }

    pattern = _named_groups_builder(
        r"{authority}\s{spec_num}(?:-{part_num})?(?::{revision_year})",
        {
            "authority": r"^ISO(?:/[A-Z]+)*",
            "spec_num": r"\d+",
            "part_num": r"\d+",
            "revision_year": r"\d{4}",
        },
    )

    df = pl.DataFrame({"iso_code": ["ISO 80000-1:2009", "ISO/IEC/IEEE 29148:2018"]})

    assert (
        df.select(pl.col("iso_code").str.extract_groups(pattern))
        .unnest("iso_code")
        .to_dict(as_series=False)
        == expected
    )

    # Unnamed groups are keyed by their 1-based index.
    assert df.select(
        pl.col("iso_code").str.extract_groups(r"\A(ISO\S*).*?(\d+)")
    ).to_dict(as_series=False) == {
        "iso_code": [{"1": "ISO", "2": "80000"}, {"1": "ISO/IEC/IEEE", "2": "29148"}]
    }

    # Mixed unnamed + named groups.
    assert df.select(
        pl.col("iso_code").str.extract_groups(r"\A(ISO\S*).*?(?<year>\d+)\z")
    ).to_dict(as_series=False) == {
        "iso_code": [
            {"1": "ISO", "year": "2009"},
            {"1": "ISO/IEC/IEEE", "year": "2018"},
        ]
    }

    # In an alternation, only the matching branch's group is populated.
    assert pl.select(
        pl.lit(r"foobar").str.extract_groups(r"(?<foo>.{3})|(?<bar>...)")
    ).to_dict(as_series=False) == {"literal": [{"foo": "foo", "bar": None}]}
def test_starts_ends_with() -> None:
    """`starts_with`/`ends_with` with literal, null-literal, and per-row
    column prefixes/suffixes; nulls on either side yield null."""
    df = pl.DataFrame(
        {
            "a": ["hamburger_with_tomatoes", "nuts", "lollypop", None],
            "sub": ["ham", "ts", None, "anything"],
        }
    )

    assert df.select(
        pl.col("a").str.ends_with("pop").alias("ends_pop"),
        pl.col("a").str.ends_with(pl.lit(None)).alias("ends_None"),
        pl.col("a").str.ends_with(pl.col("sub")).alias("ends_sub"),
        pl.col("a").str.starts_with("ham").alias("starts_ham"),
        pl.col("a").str.starts_with(pl.lit(None)).alias("starts_None"),
        pl.col("a").str.starts_with(pl.col("sub")).alias("starts_sub"),
    ).to_dict(as_series=False) == {
        "ends_pop": [False, False, True, None],
        "ends_None": [None, None, None, None],
        "ends_sub": [False, True, None, None],
        "starts_ham": [True, False, False, None],
        "starts_None": [None, None, None, None],
        "starts_sub": [True, False, None, None],
    }
def test_json_path_match_type_4905() -> None:
    """`json_path_match` output is a String usable with `is_in` (issue #4905)."""
    df = pl.DataFrame({"json_val": ['{"a":"hello"}', None, '{"a":"world"}']})
    matches_hello = pl.col("json_val").str.json_path_match("$.a").is_in(["hello"])
    assert df.filter(matches_hello).to_dict(as_series=False) == {
        "json_val": ['{"a":"hello"}']
    }
def test_decode_strict() -> None:
    """base64 decoding nulls invalid input unless `strict=True`, which raises."""
    df = pl.DataFrame(
        {"strings": ["0IbQvTc3", "0J%2FQldCf0JA%3D", "0J%2FRgNC%2B0YHRgtC%2B"]}
    )

    # Non-strict: undecodable rows become null.
    lenient = df.select(pl.col("strings").str.decode("base64", strict=False))
    assert lenient.to_dict(as_series=False) == {
        "strings": [b"\xd0\x86\xd0\xbd77", None, None]
    }

    # Strict: undecodable rows raise.
    with pytest.raises(ComputeError):
        df.select(pl.col("strings").str.decode("base64", strict=True))
def test_split() -> None:
    """`str.split` with a literal separator, an empty separator, and
    `inclusive=True`; also verifies the operation names in the query plan."""
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c_c", ""]})
    out = df.select([pl.col("x").str.split("_")])

    expected = pl.DataFrame(
        [
            {"x": ["a", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "c", "c"]},
            {"x": [""]},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("_").to_frame(), expected)

    # inclusive=True keeps the separator attached to the preceding piece
    out = df.select([pl.col("x").str.split("_", inclusive=True)])

    expected = pl.DataFrame(
        [
            {"x": ["a_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c_", "c_", "c"]},
            {"x": []},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("_", inclusive=True).to_frame(), expected)

    # empty separator splits into individual characters
    out = df.select([pl.col("x").str.split("")])

    expected = pl.DataFrame(
        [
            {"x": ["a", "_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "_", "c", "_", "c"]},
            {"x": []},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("").to_frame(), expected)

    out = df.select([pl.col("x").str.split("", inclusive=True)])

    expected = pl.DataFrame(
        [
            {"x": ["a", "_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "_", "c", "_", "c"]},
            {"x": []},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("", inclusive=True).to_frame(), expected)

    # non-inclusive variants appear under their plain names in the plan
    plan = (
        df.lazy()
        .select(
            a=pl.col("x").str.split(" ", inclusive=False),
            b=pl.col("x").str.split_exact(" ", 1, inclusive=False),
        )
        .explain()
    )

    assert "str.split(" in plan
    assert "str.split_exact(" in plan

    # inclusive variants are distinct operations in the plan
    plan = (
        df.lazy()
        .select(
            a=pl.col("x").str.split(" ", inclusive=True),
            b=pl.col("x").str.split_exact(" ", 1, inclusive=True),
        )
        .explain()
    )

    assert "str.split_inclusive(" in plan
    assert "str.split_exact_inclusive(" in plan
def test_split_expr() -> None:
    """`str.split` with the separator supplied as a column, with and
    without `inclusive=True`."""
    df = pl.DataFrame(
        {
            "x": ["a_a", None, "b", "c*c*c", "dddd", ""],
            "by": ["_", "#", "^", "*", "", ""],
        }
    )
    out = df.select([pl.col("x").str.split(pl.col("by"))])
    expected = pl.DataFrame(
        [
            {"x": ["a", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "c", "c"]},
            {"x": ["d", "d", "d", "d"]},
            {"x": []},
        ]
    )
    assert_frame_equal(out, expected)

    out = df.select([pl.col("x").str.split(pl.col("by"), inclusive=True)])
    expected = pl.DataFrame(
        [
            {"x": ["a_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c*", "c*", "c"]},
            {"x": ["d", "d", "d", "d"]},
            {"x": []},
        ]
    )
    assert_frame_equal(out, expected)
def test_split_exact() -> None:
    """`str.split_exact` always yields a struct with n+1 fields, padding
    missing splits with nulls; covers empty separator and inclusive mode."""
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c", ""]})
    out = df.select([pl.col("x").str.split_exact("_", 2, inclusive=False)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", ""],
            "field_1": ["a", None, None, "c", None],
            "field_2": pl.Series([None, None, None, None, None], dtype=pl.String),
        }
    )

    assert_frame_equal(out, expected)
    out2 = df["x"].str.split_exact("_", 2, inclusive=False).to_frame().unnest("x")
    assert_frame_equal(out2, expected)

    out = df.select([pl.col("x").str.split_exact("_", 1, inclusive=True)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a_", None, "b", "c_", None],
            "field_1": ["a", None, None, "c", None],
        }
    )
    assert_frame_equal(out, expected)
    assert df["x"].str.split_exact("_", 1).dtype == pl.Struct
    assert df["x"].str.split_exact("_", 1, inclusive=False).dtype == pl.Struct

    # empty separator splits character-by-character
    out = df.select([pl.col("x").str.split_exact("", 1)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None],
            "field_1": ["_", None, None, "_", None],
        }
    )
    assert_frame_equal(out, expected)

    out = df.select([pl.col("x").str.split_exact("", 1, inclusive=True)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None],
            "field_1": ["_", None, None, "_", None],
        }
    )
    assert_frame_equal(out, expected)
def test_split_exact_expr() -> None:
    """`str.split_exact` with the separator supplied as a column; null or
    empty separators propagate/char-split as in the literal case."""
    df = pl.DataFrame(
        {
            "x": ["a_a", None, "b", "c^c^c", "d#d", "eeee", ""],
            "by": ["_", "&", "$", "^", None, "", ""],
        }
    )

    out = df.select(
        pl.col("x").str.split_exact(pl.col("by"), 2, inclusive=False)
    ).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None, "e", None],
            "field_1": ["a", None, None, "c", None, "e", None],
            "field_2": pl.Series(
                [None, None, None, "c", None, "e", None], dtype=pl.String
            ),
        }
    )

    assert_frame_equal(out, expected)

    out2 = df.select(
        pl.col("x").str.split_exact(pl.col("by"), 2, inclusive=True)
    ).unnest("x")

    expected2 = pl.DataFrame(
        {
            "field_0": ["a_", None, "b", "c^", None, "e", None],
            "field_1": ["a", None, None, "c^", None, "e", None],
            "field_2": pl.Series(
                [None, None, None, "c", None, "e", None], dtype=pl.String
            ),
        }
    )
    assert_frame_equal(out2, expected2)
def test_splitn() -> None:
    """`str.splitn` splits into at most n pieces, returning a struct; the
    final field keeps any remaining separators."""
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c_c", ""]})
    out = df.select([pl.col("x").str.splitn("_", 2)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", ""],
            "field_1": ["a", None, None, "c_c", None],
        }
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.splitn("_", 2).to_frame().unnest("x"), expected)

    # empty separator: first char, then the rest of the string
    out = df.select([pl.col("x").str.splitn("", 2)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None],
            "field_1": ["_a", None, None, "_c_c", None],
        }
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.splitn("", 2).to_frame().unnest("x"), expected)
def test_splitn_expr() -> None:
    """`str.splitn` with the separator supplied as a column."""
    df = pl.DataFrame(
        {
            "x": ["a_a", None, "b", "c^c^c", "d#d", "eeee", ""],
            "by": ["_", "&", "$", "^", None, "", ""],
        }
    )

    out = df.select(pl.col("x").str.splitn(pl.col("by"), 2)).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None, "e", None],
            "field_1": ["a", None, None, "c^c", None, "eee", None],
        }
    )

    assert_frame_equal(out, expected)
def test_titlecase() -> None:
    """`str.to_titlecase` capitalizes after any non-letter boundary
    (whitespace, tabs, punctuation), matching Python's `str.title`."""
    df = pl.DataFrame(
        {
            "misc": [
                "welcome to my world",
                "double space",
                "and\ta\t tab",
                "by jean-paul sartre, 'esq'",
                "SOMETIMES/life/gives/you/a/2nd/chance",
            ],
        }
    )
    expected = [
        "Welcome To My World",
        "Double Space",
        "And\tA\t Tab",
        "By Jean-Paul Sartre, 'Esq'",
        "Sometimes/Life/Gives/You/A/2nd/Chance",
    ]
    actual = df.select(pl.col("misc").str.to_titlecase()).to_series()
    for ex, act in zip(expected, actual):
        assert ex == act, f"{ex} != {act}"

    # punctuation-heavy cases, cross-checked against Python's str.title()
    df = pl.DataFrame(
        {
            "quotes": [
                "'e.t. phone home'",
                "you talkin' to me?",
                "i feel the need--the need for speed",
                "to infinity,and BEYOND!",
                "say 'what' again!i dare you - I\u00a0double-dare you!",
                "What.we.got.here... is#failure#to#communicate",
            ]
        }
    )
    expected_str = [
        "'E.T. Phone Home'",
        "You Talkin' To Me?",
        "I Feel The Need--The Need For Speed",
        "To Infinity,And Beyond!",
        "Say 'What' Again!I Dare You - I\u00a0Double-Dare You!",
        "What.We.Got.Here... Is#Failure#To#Communicate",
    ]
    expected_py = [s.title() for s in df["quotes"].to_list()]
    for ex_str, ex_py, act in zip(
        expected_str, expected_py, df["quotes"].str.to_titlecase()
    ):
        assert ex_str == act, f"{ex_str} != {act}"
        assert ex_py == act, f"{ex_py} != {act}"
def test_string_replace_with_nulls_10124() -> None:
    """`str.replace` with `n=` must propagate nulls unchanged (issue #10124)."""
    values = ["S", "S", "S", None, "S", "S", "S", "S"]
    df = pl.DataFrame({"col1": values})

    result = df.select(
        pl.col("col1"),
        pl.col("col1").str.replace("S", "O", n=1).alias("n_1"),
        pl.col("col1").str.replace("S", "O", n=3).alias("n_3"),
    )
    replaced = ["O", "O", "O", None, "O", "O", "O", "O"]
    assert result.to_dict(as_series=False) == {
        "col1": values,
        "n_1": replaced,
        "n_3": replaced,
    }
def test_string_extract_groups_lazy_schema_10305() -> None:
    """Lazy schema of unnested `extract_groups` exposes the named groups
    as String columns (issue #10305)."""
    lf = pl.LazyFrame(
        data={
            "url": [
                "http://vote.com/ballon_dor?candidate=messi&ref=python",
                "http://vote.com/ballon_dor?candidate=weghorst&ref=polars",
                "http://vote.com/ballon_dor?error=404&ref=rust",
            ]
        }
    )
    pattern = r"candidate=(?<candidate>\w+)&ref=(?<ref>\w+)"
    unnested = lf.select(
        captures=pl.col("url").str.extract_groups(pattern)
    ).unnest("captures")

    assert unnested.collect_schema() == {"candidate": pl.String, "ref": pl.String}
def test_string_reverse() -> None:
    """`str.reverse` flips each string; nulls pass through unchanged."""
    data = [None, "foo", "bar", "i like pizza&#", None, "man\u0303ana"]
    reversed_data = [None, "oof", "rab", "#&azzip ekil i", None, "anan\u0303am"]

    df = pl.DataFrame({"text": data})
    expected = pl.DataFrame(
        [pl.Series("text", reversed_data, dtype=pl.String)]
    )
    assert_frame_equal(df.select(pl.col("text").str.reverse()), expected)
@pytest.mark.parametrize(
    ("data", "expected_data"),
    [
        (["", None, "a"], ["", None, "b"]),
        ([None, None, "a"], [None, None, "b"]),
        (["", "", ""], ["", "", ""]),
        ([None, None, None], [None, None, None]),
        (["a", "", None], ["b", "", None]),
    ],
)
def test_replace_lit_n_char_13385(
    data: list[str | None], expected_data: list[str | None]
) -> None:
    """Literal replace handles empty strings and nulls correctly (issue #13385)."""
    source = pl.Series(data, dtype=pl.String)
    replaced = source.str.replace("a", "b", literal=True)
    assert_series_equal(replaced, pl.Series(expected_data, dtype=pl.String))
def test_extract_many() -> None:
    """`extract_many`/`find_many` with a fixed pattern list (overlapping or
    not) and with per-row pattern lists from a column."""
    df = pl.DataFrame({"values": ["discontent", "foobar"]})
    patterns = ["winter", "disco", "onte", "discontent"]
    assert df.with_columns(
        pl.col("values").str.extract_many(patterns, overlapping=False).alias("matches"),
        pl.col("values")
        .str.extract_many(patterns, overlapping=True)
        .alias("matches_overlapping"),
    ).to_dict(as_series=False) == {
        "values": ["discontent", "foobar"],
        "matches": [["disco"], []],
        "matches_overlapping": [["disco", "onte", "discontent"], []],
    }

    # many patterns
    df = pl.DataFrame(
        {
            "values": ["discontent", "rhapsody"],
            "patterns": [
                ["winter", "disco", "onte", "discontent"],
                ["rhap", "ody", "coalesce"],
            ],
        }
    )

    # extract_many
    assert df.select(pl.col("values").str.extract_many("patterns")).to_dict(
        as_series=False
    ) == {"values": [["disco"], ["rhap", "ody"]]}

    # find_many: byte offsets of each (non-overlapping) match
    f1 = df.select(pl.col("values").str.find_many("patterns"))
    f2 = df["values"].str.find_many(df["patterns"])

    assert_series_equal(f1["values"], f2)
    assert f2.to_list() == [[0], [0, 5]]
def test_json_decode_raise_on_data_type_mismatch_13061() -> None:
    """Dtype inferred from a prefix must either fit all rows or raise (#13061)."""
    all_null = pl.Series(["null", "null"]).str.json_decode(infer_schema_length=1)
    assert_series_equal(all_null, pl.Series([None, None]))

    # Inferring from only the first row (null) conflicts with the integer
    # in the second row.
    with pytest.raises(ComputeError):
        pl.Series(["null", "1"]).str.json_decode(infer_schema_length=1)

    # Inferring over both rows yields an integer column with a null.
    mixed = pl.Series(["null", "1"]).str.json_decode(infer_schema_length=2)
    assert_series_equal(mixed, pl.Series([None, 1]))
def test_json_decode_struct_schema() -> None:
    """Struct field handling in json_decode: inferred vs. explicit schemas."""
    # Fields absent from the inferred schema are an error...
    with pytest.raises(ComputeError, match="extra field in struct data: b"):
        pl.Series([r'{"a": 1}', r'{"a": 2, "b": 2}']).str.json_decode(
            infer_schema_length=1
        )

    # ...unless inference saw them, in which case missing values become null.
    assert_series_equal(
        pl.Series([r'{"a": 1}', r'{"a": 2, "b": 2}']).str.json_decode(
            infer_schema_length=2
        ),
        pl.Series([{"a": 1, "b": None}, {"a": 2, "b": 2}]),
    )

    # If the schema was explicitly given, then we ignore extra fields.
    # TODO: There should be a `columns=` parameter to this.
    assert_series_equal(
        pl.Series([r'{"a": 1}', r'{"a": 2, "b": 2}']).str.json_decode(
            dtype=pl.Struct({"a": pl.Int64})
        ),
        pl.Series([{"a": 1}, {"a": 2}]),
    )


def test_escape_regex() -> None:
    """str.escape_regex escapes regex metacharacters and passes nulls through."""
    df = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})
    result_df = df.with_columns(pl.col("text").str.escape_regex().alias("escaped"))
    expected_df = pl.DataFrame(
        {
            "text": ["abc", "def", None, "abc(\\w+)"],
            # `(`, `\` and `+` are each prefixed with a backslash
            "escaped": ["abc", "def", None, "abc\\(\\\\w\\+\\)"],
        }
    )

    assert_frame_equal(result_df, expected_df)
    assert_series_equal(result_df["escaped"], expected_df["escaped"])


@pytest.mark.parametrize(
    ("form", "expected_data"),
    [
        # NFC/NFD preserve compatibility characters (superscript two, fullwidth)
        ("NFC", ["01²", "KADOKAWA"]),  # noqa: RUF001
        ("NFD", ["01²", "KADOKAWA"]),  # noqa: RUF001
        # NFKC/NFKD fold them to their compatibility equivalents
        ("NFKC", ["012", "KADOKAWA"]),
        ("NFKD", ["012", "KADOKAWA"]),
    ],
)
def test_string_normalize(form: Any, expected_data: list[str | None]) -> None:
    """str.normalize applies the requested Unicode normalization form."""
    s = pl.Series(["01²", "KADOKAWA"], dtype=pl.String)  # noqa: RUF001
    res = s.str.normalize(form)
    expected_s = pl.Series(expected_data, dtype=pl.String)
    assert_series_equal(res, expected_s)


def test_string_normalize_wrong_input() -> None:
    """An unknown normalization form is rejected with a ValueError."""
    with pytest.raises(ValueError, match="`form` must be one of"):
        pl.Series(["01²"], dtype=pl.String).str.normalize("foobar")  # type: ignore[arg-type]


def test_to_integer_unequal_lengths_22034() -> None:
    """A `base` Series of mismatched length raises ShapeError (#22034)."""
    s = pl.Series("a", ["1", "2", "3"], pl.String)
    with pytest.raises(pl.exceptions.ShapeError):
        s.str.to_integer(base=pl.Series([4, 5, 5, 4]))


def test_broadcast_self() -> None:
    """A unit-length self broadcasts against a longer `base` Series."""
    s = pl.Series("a", ["3"], pl.String)
    # "3" is not a valid digit in base 2, so strict parsing fails
    with pytest.raises(
        pl.exceptions.ComputeError, match="strict integer parsing failed"
    ):
        s.str.to_integer(base=pl.Series([2, 2, 3, 4]))


def test_strptime_unequal_length_22018() -> None:
    """An `ambiguous` Series of mismatched length raises ShapeError (#22018)."""
    s = pl.Series(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
    with pytest.raises(pl.exceptions.ShapeError):
        s.str.strptime(
            pl.Datetime, "%Y-%m-%d %H:%M%#z", ambiguous=pl.Series(["a", "b", "d"])
        )


@pytest.mark.parametrize("inclusive", [False, True])
def test_str_split_unequal_length_22018(inclusive: bool) -> None:
    """A `by` Series of mismatched length raises ShapeError (#22018)."""
    with pytest.raises(pl.exceptions.ShapeError):
        pl.Series(["a-c", "x-y"]).str.split(
            pl.Series(["-", "/", "+"]), inclusive=inclusive
        )


def test_str_split_self_broadcast() -> None:
    """A unit-length self broadcasts against each separator in `by`."""
    assert_series_equal(
        pl.Series(["a-/c"]).str.split(pl.Series(["-", "/", "+"])),
        pl.Series([["a", "/c"], ["a-", "c"], ["a-/c"]]),
    )


def test_replace_many_mapping_in_list() -> None:
    """replace_strict with a mapping works on elements inside a list column."""
    assert_series_equal(
        pl.Series([["a", "b"]]).list.eval(
            pl.element().replace_strict({"a": 1, "b": 2})
        ),
        pl.Series([[1, 2]]),
    )


def test_str_replace_n_zero_23570() -> None:
    """str.replace with n=0 is a no-op for literal and expression values (#23570)."""
    # more than 32 bytes, to also exercise the long-string path
    abc_long = "abc " * 20 + "abc"
    df = pl.DataFrame(
        {"a": [abc_long, "abc abc abc", "abc ghi"], "b": ["jkl", "pqr", "xyz"]}
    )
    expected = df

    out = df.with_columns(pl.col("a").str.replace("abc", "XYZ", n=0))
    assert_frame_equal(out, expected)

    out = df.with_columns(pl.col("a").str.replace("abc", pl.col("b"), n=0))
    assert_frame_equal(out, expected)
