# Source: pola-rs/polars — py-polars/tests/unit/operations/namespaces/string/test_string.py
1
from __future__ import annotations

from typing import Any

import pytest

import polars as pl
import polars.selectors as cs
from polars.exceptions import (
    ColumnNotFoundError,
    ComputeError,
    InvalidOperationError,
    PolarsInefficientMapWarning,
    ShapeError,
)
from polars.testing import assert_frame_equal, assert_series_equal


def test_str_slice() -> None:
    """Slicing with a negative offset and with an (offset, length) pair."""
    df = pl.DataFrame({"a": ["foobar", "barfoo"]})
    assert df["a"].str.slice(-3).to_list() == ["bar", "foo"]
    assert df.select([pl.col("a").str.slice(2, 4)])["a"].to_list() == ["obar", "rfoo"]


def test_str_slice_expr() -> None:
    """Offset/length may each be a column expression, a literal, or None."""
    df = pl.DataFrame(
        {
            "a": ["foobar", None, "barfoo", "abcd", ""],
            "offset": [1, 3, None, -3, 2],
            "length": [3, 4, 2, None, 2],
        }
    )
    out = df.select(
        all_expr=pl.col("a").str.slice("offset", "length"),
        offset_expr=pl.col("a").str.slice("offset", 2),
        length_expr=pl.col("a").str.slice(0, "length"),
        length_none=pl.col("a").str.slice("offset", None),
        offset_length_lit=pl.col("a").str.slice(-3, 3),
        str_lit=pl.lit("qwert").str.slice("offset", "length"),
    )
    expected = pl.DataFrame(
        {
            "all_expr": ["oob", None, None, "bcd", ""],
            "offset_expr": ["oo", None, None, "bc", ""],
            "length_expr": ["foo", None, "ba", "abcd", ""],
            "length_none": ["oobar", None, None, "bcd", ""],
            "offset_length_lit": ["bar", None, "foo", "bcd", ""],
            "str_lit": ["wer", "rt", None, "ert", "er"],
        }
    )
    assert_frame_equal(out, expected)

    # negative length is not allowed
    with pytest.raises(InvalidOperationError):
        df.select(pl.col("a").str.slice(0, -1))


def test_str_slice_wrong_length() -> None:
    # offset Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.slice(pl.Series([1, 2])))


@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        (["012345", "", None], 0, ["", "", None]),
        (["012345", "", None], 2, ["01", "", None]),
        (["012345", "", None], -2, ["0123", "", None]),
        (["012345", "", None], 100, ["012345", "", None]),
        (["012345", "", None], -100, ["", "", None]),
    ],
)
def test_str_head(input: list[str], n: int, output: list[str]) -> None:
    """head(n): positive keeps first n chars, negative drops last |n|."""
    assert pl.Series(input).str.head(n).to_list() == output


@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        ("你好世界", 0, ""),
        ("你好世界", 2, "你好"),
        ("你好世界", 999, "你好世界"),
        ("你好世界", -1, "你好世"),
        ("你好世界", -2, "你好"),
        ("你好世界", -999, ""),
    ],
)
def test_str_head_codepoints(input: str, n: int, output: str) -> None:
    # head counts Unicode codepoints, not bytes
    assert pl.Series([input]).str.head(n).to_list() == [output]


def test_str_head_expr() -> None:
    """n may be a column, literal expression, or plain int; nulls propagate."""
    s = "012345"
    df = pl.DataFrame(
        {"a": [s, s, s, s, s, s, "", None], "n": [0, 2, -2, 100, -100, None, 3, -2]}
    )
    out = df.select(
        n_expr=pl.col("a").str.head("n"),
        n_pos2=pl.col("a").str.head(2),
        n_neg2=pl.col("a").str.head(-2),
        n_pos100=pl.col("a").str.head(100),
        n_pos_neg100=pl.col("a").str.head(-100),
        n_pos_0=pl.col("a").str.head(0),
        str_lit=pl.col("a").str.head(pl.lit(2)),
        lit_expr=pl.lit(s).str.head("n"),
        lit_n=pl.lit(s).str.head(2),
    )
    expected = pl.DataFrame(
        {
            "n_expr": ["", "01", "0123", "012345", "", None, "", None],
            "n_pos2": ["01", "01", "01", "01", "01", "01", "", None],
            "n_neg2": ["0123", "0123", "0123", "0123", "0123", "0123", "", None],
            "n_pos100": [s, s, s, s, s, s, "", None],
            "n_pos_neg100": ["", "", "", "", "", "", "", None],
            "n_pos_0": ["", "", "", "", "", "", "", None],
            "str_lit": ["01", "01", "01", "01", "01", "01", "", None],
            "lit_expr": ["", "01", "0123", "012345", "", None, "012", "0123"],
            "lit_n": ["01", "01", "01", "01", "01", "01", "01", "01"],
        }
    )
    assert_frame_equal(out, expected)


def test_str_head_wrong_length() -> None:
    # n Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.head(pl.Series([1, 2])))


@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        (["012345", "", None], 0, ["", "", None]),
        (["012345", "", None], 2, ["45", "", None]),
        (["012345", "", None], -2, ["2345", "", None]),
        (["012345", "", None], 100, ["012345", "", None]),
        (["012345", "", None], -100, ["", "", None]),
    ],
)
def test_str_tail(input: list[str], n: int, output: list[str]) -> None:
    """tail(n): positive keeps last n chars, negative drops first |n|."""
    assert pl.Series(input).str.tail(n).to_list() == output


@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        ("你好世界", 0, ""),
        ("你好世界", 2, "世界"),
        ("你好世界", 999, "你好世界"),
        ("你好世界", -1, "好世界"),
        ("你好世界", -2, "世界"),
        ("你好世界", -999, ""),
    ],
)
def test_str_tail_codepoints(input: str, n: int, output: str) -> None:
    # tail counts Unicode codepoints, not bytes
    assert pl.Series([input]).str.tail(n).to_list() == [output]


def test_str_tail_expr() -> None:
    """n may be a column, literal expression, or plain int; nulls propagate."""
    s = "012345"
    df = pl.DataFrame(
        {"a": [s, s, s, s, s, s, "", None], "n": [0, 2, -2, 100, -100, None, 3, -2]}
    )
    out = df.select(
        n_expr=pl.col("a").str.tail("n"),
        n_pos2=pl.col("a").str.tail(2),
        n_neg2=pl.col("a").str.tail(-2),
        n_pos100=pl.col("a").str.tail(100),
        n_pos_neg100=pl.col("a").str.tail(-100),
        n_pos_0=pl.col("a").str.tail(0),
        str_lit=pl.col("a").str.tail(pl.lit(2)),
        lit_expr=pl.lit(s).str.tail("n"),
        lit_n=pl.lit(s).str.tail(2),
    )
    expected = pl.DataFrame(
        {
            "n_expr": ["", "45", "2345", "012345", "", None, "", None],
            "n_pos2": ["45", "45", "45", "45", "45", "45", "", None],
            "n_neg2": ["2345", "2345", "2345", "2345", "2345", "2345", "", None],
            "n_pos100": [s, s, s, s, s, s, "", None],
            "n_pos_neg100": ["", "", "", "", "", "", "", None],
            "n_pos_0": ["", "", "", "", "", "", "", None],
            "str_lit": ["45", "45", "45", "45", "45", "45", "", None],
            "lit_expr": ["", "45", "2345", "012345", "", None, "345", "2345"],
            "lit_n": ["45", "45", "45", "45", "45", "45", "45", "45"],
        }
    )
    assert_frame_equal(out, expected)


def test_str_tail_wrong_length() -> None:
    # n Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.tail(pl.Series([1, 2])))


def test_str_slice_multibyte() -> None:
    """Exhaustively check slice offsets/lengths against Python slicing semantics."""
    ref = "你好世界"
    s = pl.Series([ref])

    # Pad the string to simplify (negative) offsets starting before/after the string.
    npad = 20
    padref = "_" * npad + ref + "_" * npad
    for start in range(-5, 6):
        for length in range(6):
            offset = npad + start if start >= 0 else npad + start + len(ref)
            correct = padref[offset : offset + length].strip("_")
            result = s.str.slice(start, length)
            expected = pl.Series([correct])
            assert_series_equal(result, expected)


def test_str_len_bytes() -> None:
    # length in UTF-8 bytes; nulls propagate; dtype is UInt32
    s = pl.Series(["Café", None, "345", "東京"])
    result = s.str.len_bytes()
    expected = pl.Series([5, None, 3, 6], dtype=pl.UInt32)
    assert_series_equal(result, expected)


def test_str_len_chars() -> None:
    # length in Unicode codepoints; nulls propagate; dtype is UInt32
    s = pl.Series(["Café", None, "345", "東京"])
    result = s.str.len_chars()
    expected = pl.Series([4, None, 3, 2], dtype=pl.UInt32)
    assert_series_equal(result, expected)


def test_str_contains() -> None:
    s = pl.Series(["messi", "ronaldo", "ibrahimovic"])
    expected = pl.Series([True, False, False])
    assert_series_equal(s.str.contains("mes"), expected)


def test_str_contains_wrong_length() -> None:
    # pattern Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.contains(pl.Series(["a", "b"])))  # type: ignore[arg-type]


def test_count_match_literal() -> None:
    """literal=True treats the pattern as plain text, not a regex."""
    s = pl.Series(["12 dbc 3xy", "cat\\w", "1zy3\\d\\d", None])
    out = s.str.count_matches(r"\d", literal=True)
    expected = pl.Series([0, 0, 2, None], dtype=pl.UInt32)
    assert_series_equal(out, expected)

    # per-row patterns via a Series argument
    out = s.str.count_matches(pl.Series([r"\w", r"\w", r"\d", r"\d"]), literal=True)
    expected = pl.Series([0, 1, 2, None], dtype=pl.UInt32)
    assert_series_equal(out, expected)


def test_str_encode() -> None:
    """hex/base64 encoding; unsupported encodings raise ValueError."""
    s = pl.Series(["foo", "bar", None])
    hex_encoded = pl.Series(["666f6f", "626172", None])
    base64_encoded = pl.Series(["Zm9v", "YmFy", None])

    assert_series_equal(s.str.encode("hex"), hex_encoded)
    assert_series_equal(s.str.encode("base64"), base64_encoded)
    with pytest.raises(ValueError):
        s.str.encode("utf8")  # type: ignore[arg-type]


def test_str_decode() -> None:
    # decoding yields Binary values
    hex_encoded = pl.Series(["666f6f", "626172", None])
    base64_encoded = pl.Series(["Zm9v", "YmFy", None])
    expected = pl.Series([b"foo", b"bar", None])

    assert_series_equal(hex_encoded.str.decode("hex"), expected)
    assert_series_equal(base64_encoded.str.decode("base64"), expected)


def test_str_decode_exception() -> None:
    # invalid payloads raise ComputeError; unsupported encodings raise ValueError
    s = pl.Series(["not a valid", "626172", None])
    with pytest.raises(ComputeError):
        s.str.decode(encoding="hex")
    with pytest.raises(ComputeError):
        s.str.decode(encoding="base64")
    with pytest.raises(ValueError):
        s.str.decode("utf8")  # type: ignore[arg-type]


@pytest.mark.parametrize("strict", [True, False])
def test_str_find(strict: bool) -> None:
    """str.find with regex/literal patterns, column patterns, and strict mode."""
    df = pl.DataFrame(
        data=[
            ("Dubai", 3564931, "b[ai]", "ai"),
            ("Abu Dhabi", 1807000, "b[ai]", " "),
            ("Sharjah", 1405000, "[ai]n", "s"),
            ("Al Ain", 846747, "[ai]n", ""),
            ("Ajman", 490035, "[ai]n", "ma"),
            ("Ras Al Khaimah", 191753, "a.+a", "Kha"),
            ("Fujairah", 118933, "a.+a", None),
            ("Umm Al Quwain", 59098, "a.+a", "wa"),
            (None, None, None, "n/a"),
        ],
        schema={
            "city": pl.String,
            "population": pl.Int32,
            "pat": pl.String,
            "lit": pl.String,
        },
        orient="row",
    )
    city, pop, pat, lit = (pl.col(c) for c in ("city", "population", "pat", "lit"))

    # the 'literal' patterns below contain no regex metachars, so the result
    # must be the same whether or not they are matched literally
    for match_lit in (True, False):
        res = df.select(
            find_a_regex=city.str.find("(?i)a", strict=strict),
            find_a_lit=city.str.find("a", literal=match_lit),
            find_00_lit=pop.cast(pl.String).str.find("00", literal=match_lit),
            find_col_lit=city.str.find(lit, strict=strict, literal=match_lit),
            find_col_pat=city.str.find(pat, strict=strict),
        )
        assert res.to_dict(as_series=False) == {
            "find_a_regex": [3, 0, 2, 0, 0, 1, 3, 4, None],
            "find_a_lit": [3, 6, 2, None, 3, 1, 3, 10, None],
            "find_00_lit": [None, 4, 4, None, 2, None, None, None, None],
            "find_col_lit": [3, 3, None, 0, 2, 7, None, 9, None],
            "find_col_pat": [2, 7, None, 4, 3, 1, 3, None, None],
        }


def test_str_find_invalid_regex() -> None:
    # test behaviour of 'strict' with invalid regular expressions
    df = pl.DataFrame({"txt": ["AbCdEfG"]})
    rx_invalid = "(?i)AB.))"

    with pytest.raises(ComputeError):
        df.with_columns(pl.col("txt").str.find(rx_invalid, strict=True))

    # non-strict: invalid pattern yields null instead of raising
    res = df.with_columns(pl.col("txt").str.find(rx_invalid, strict=False))
    assert res.item() is None


def test_str_find_escaped_chars() -> None:
    # test behaviour of 'literal=True' with special chars
    df = pl.DataFrame({"txt": ["123.*465", "x(x?)x"]})

    res = df.with_columns(
        x1=pl.col("txt").str.find("(x?)", literal=True),
        x2=pl.col("txt").str.find(".*4", literal=True),
        x3=pl.col("txt").str.find("(x?)"),
        x4=pl.col("txt").str.find(".*4"),
    )
    # ┌──────────┬──────┬──────┬─────┬──────┐
    # │ txt      ┆ x1   ┆ x2   ┆ x3  ┆ x4   │
    # │ ---      ┆ ---  ┆ ---  ┆ --- ┆ ---  │
    # │ str      ┆ u32  ┆ u32  ┆ u32 ┆ u32  │
    # ╞══════════╪══════╪══════╪═════╪══════╡
    # │ 123.*465 ┆ null ┆ 3    ┆ 0   ┆ 0    │
    # │ x(x?)x   ┆ 1    ┆ null ┆ 0   ┆ null │
    # └──────────┴──────┴──────┴─────┴──────┘
    assert_frame_equal(
        pl.DataFrame(
            {
                "txt": ["123.*465", "x(x?)x"],
                "x1": [None, 1],
                "x2": [3, None],
                "x3": [0, 0],
                "x4": [0, None],
            }
        ).cast({cs.signed_integer(): pl.UInt32}),
        res,
    )


def test_str_find_wrong_length() -> None:
    # pattern Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.find(pl.Series(["a", "b"])))  # type: ignore[arg-type]


def test_hex_decode_return_dtype() -> None:
    # eager and lazy schemas agree: decode("hex") yields Binary
    data = {"a": ["68656c6c6f", "776f726c64"]}
    expr = pl.col("a").str.decode("hex")

    df = pl.DataFrame(data).select(expr)
    assert df.schema == {"a": pl.Binary}

    ldf = pl.LazyFrame(data).select(expr)
    assert ldf.collect_schema() == {"a": pl.Binary}


def test_base64_decode_return_dtype() -> None:
    # eager and lazy schemas agree: decode("base64") yields Binary
    data = {"a": ["Zm9v", "YmFy"]}
    expr = pl.col("a").str.decode("base64")

    df = pl.DataFrame(data).select(expr)
    assert df.schema == {"a": pl.Binary}

    ldf = pl.LazyFrame(data).select(expr)
    assert ldf.collect_schema() == {"a": pl.Binary}


def test_str_replace_str_replace_all() -> None:
    # replace: first occurrence only; replace_all: every occurrence
    s = pl.Series(["hello", "world", "test", "rooted"])
    expected = pl.Series(["hell0", "w0rld", "test", "r0oted"])
    assert_series_equal(s.str.replace("o", "0"), expected)

    expected = pl.Series(["hell0", "w0rld", "test", "r00ted"])
    assert_series_equal(s.str.replace_all("o", "0"), expected)


def test_str_replace_n_single() -> None:
    # n bounds the number of replacements per string
    s = pl.Series(["aba", "abaa"])

    assert s.str.replace("a", "b", n=1).to_list() == ["bba", "bbaa"]
    assert s.str.replace("a", "b", n=2).to_list() == ["bbb", "bbba"]
    assert s.str.replace("a", "b", n=3).to_list() == ["bbb", "bbbb"]


def test_str_replace_n_same_length() -> None:
    # pat and val have the same length
    # this triggers a fast path
    s = pl.Series(["abfeab", "foobarabfooabab"])
    assert s.str.replace("ab", "AB", n=1).to_list() == ["ABfeab", "foobarABfooabab"]
    assert s.str.replace("ab", "AB", n=2).to_list() == ["ABfeAB", "foobarABfooABab"]
    assert s.str.replace("ab", "AB", n=3).to_list() == ["ABfeAB", "foobarABfooABAB"]


def test_str_to_lowercase() -> None:
    s = pl.Series(["Hello", "WORLD"])
    expected = pl.Series(["hello", "world"])
    assert_series_equal(s.str.to_lowercase(), expected)


def test_str_to_uppercase() -> None:
    s = pl.Series(["Hello", "WORLD"])
    expected = pl.Series(["HELLO", "WORLD"])
    assert_series_equal(s.str.to_uppercase(), expected)


def test_str_case_cyrillic() -> None:
    # case mapping must agree with Python's str.lower/str.upper for non-ASCII
    vals = ["Biтpyк", "Iвaн"]
    s = pl.Series(vals)
    assert s.str.to_lowercase().to_list() == [a.lower() for a in vals]
    assert s.str.to_uppercase().to_list() == [a.upper() for a in vals]


def test_str_to_integer() -> None:
    """Parse strings as integers in a given base; strict mode raises on failure."""
    # renamed locals to avoid shadowing the `bin`/`hex` builtins
    binary = pl.Series(["110", "101", "010"])
    assert_series_equal(
        binary.str.to_integer(base=2), pl.Series([6, 5, 2]).cast(pl.Int64)
    )

    hexadecimal = pl.Series(["fa1e", "ff00", "cafe", "invalid", None])
    assert_series_equal(
        hexadecimal.str.to_integer(base=16, strict=False),
        pl.Series([64030, 65280, 51966, None, None]).cast(pl.Int64),
        check_exact=True,
    )

    with pytest.raises(ComputeError):
        hexadecimal.str.to_integer(base=16)


@pytest.mark.parametrize("strict", [False, True])
def test_str_to_integer_invalid_base(strict: bool) -> None:
    # an out-of-range base raises regardless of 'strict'
    numbers = pl.Series(["1", "ZZZ", "-ABCZZZ", None])
    with pytest.raises(ComputeError):
        numbers.str.to_integer(base=100, strict=strict)

    df = pl.DataFrame({"str": numbers, "base": [0, 1, 100, None]})
    with pytest.raises(ComputeError):
        df.select(pl.col("str").str.to_integer(base=pl.col("base"), strict=strict))


def test_str_to_integer_base_expr() -> None:
    """The base may be given per-row via a column expression."""
    df = pl.DataFrame(
        {"str": ["110", "ff00", "234", None, "130"], "base": [2, 16, 10, 8, None]}
    )
    out = df.select(base_expr=pl.col("str").str.to_integer(base="base"))
    expected = pl.DataFrame({"base_expr": [6, 65280, 234, None, None]})
    assert_frame_equal(out, expected)

    # test strict raise
    df = pl.DataFrame({"str": ["110", "ff00", "cafe", None], "base": [2, 10, 10, 8]})

    with pytest.raises(ComputeError):
        df.select(pl.col("str").str.to_integer(base="base"))


def test_str_to_integer_base_literal() -> None:
    """Literal base with strict=False yields nulls for unparseable values."""
    df = pl.DataFrame(
        {
            "bin": ["110", "101", "-010", "invalid", None],
            "hex": ["fa1e", "ff00", "cafe", "invalid", None],
        }
    )
    result = df.with_columns(
        pl.col("bin").str.to_integer(base=2, strict=False),
        pl.col("hex").str.to_integer(base=16, strict=False),
    )

    expected = pl.DataFrame(
        {
            "bin": [6, 5, -2, None, None],
            "hex": [64030, 65280, 51966, None, None],
        }
    )
    assert_frame_equal(result, expected)

    with pytest.raises(ComputeError):
        df.with_columns(
            pl.col("bin").str.to_integer(base=2),
            pl.col("hex").str.to_integer(base=16),
        )


def test_str_to_integer_dtype() -> None:
    """The 'dtype' parameter controls the output integer type (default Int64)."""
    lf = pl.LazyFrame(
        {
            "str": ["1111111", "7f", "127", None, "42"],
            "base": [2, 16, 10, 8, None],
        }
    )
    out = lf.select(
        i8=pl.col("str").str.to_integer(base="base", dtype=pl.Int8),
        i16=pl.col("str").str.to_integer(base="base", dtype=pl.Int16),
        i32=pl.col("str").str.to_integer(base="base", dtype=pl.Int32),
        i64=pl.col("str").str.to_integer(base="base", dtype=pl.Int64),
        u8=pl.col("str").str.to_integer(base="base", dtype=pl.UInt8),
        u16=pl.col("str").str.to_integer(base="base", dtype=pl.UInt16),
        u32=pl.col("str").str.to_integer(base="base", dtype=pl.UInt32),
        u64=pl.col("str").str.to_integer(base="base", dtype=pl.UInt64),
        default=pl.col("str").str.to_integer(base="base"),
    ).collect()

    expected = pl.DataFrame(
        {
            "i8": [127, 127, 127, None, None],
            "i16": [127, 127, 127, None, None],
            "i32": [127, 127, 127, None, None],
            "i64": [127, 127, 127, None, None],
            "u8": [127, 127, 127, None, None],
            "u16": [127, 127, 127, None, None],
            "u32": [127, 127, 127, None, None],
            "u64": [127, 127, 127, None, None],
            "default": [127, 127, 127, None, None],
        },
        schema={
            "i8": pl.Int8,
            "i16": pl.Int16,
            "i32": pl.Int32,
            "i64": pl.Int64,
            "u8": pl.UInt8,
            "u16": pl.UInt16,
            "u32": pl.UInt32,
            "u64": pl.UInt64,
            "default": pl.Int64,
        },
    )
    # the lazily-resolved schema must match the materialized one
    assert lf.collect_schema() == lf.collect().schema
    assert_frame_equal(out, expected)


def test_str_to_integer_large() -> None:
    """Int128 parsing of values beyond the Int64 range."""
    df = pl.DataFrame(
        {
            "str": [
                "-6129899454972456276923959272",
                "1A44E53BFEBA967E6682FBB0",
                "10100110111110110101110100000100110010101111000100011000000100010101010101101011111111101000",
                None,
                "7798994549724957734429272",
            ],
            "base": [10, 16, 2, 8, None],
        }
    )
    out = df.select(i128=pl.col("str").str.to_integer(base="base", dtype=pl.Int128))
    expected = pl.DataFrame(
        {
            "i128": [
                -6129899454972456276923959272,
                8129899739726392769273592752,
                3229899454972495776923959272,
                None,
                None,
            ]
        },
        schema={"i128": pl.Int128},
    )
    assert_frame_equal(out, expected)

    # test strict raise
    df = pl.DataFrame(
        {
            "i128": [
                "612989945497245627692395927261298994549724562769239592726129899454972456276923959272",
                "1A44E53BFEBA967E6682FBB0",
                "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
                "7798994549724957734429272",
                None,
                "7798994549724957734429272",
            ],
            "base": [10, 2, 16, 10, 8, None],
        }
    )

    with pytest.raises(ComputeError):
        df.select(pl.col("i128").str.to_integer(base="base", dtype=pl.Int128))


def test_str_strip_chars_expr() -> None:
    """strip_chars/_start/_end with per-row character sets and null patterns."""
    df = pl.DataFrame(
        {
            "s": [" hello ", "^^world^^", "&&hi&&", "  polars  ", None],
            "pat": [" ", "^", "&", None, "anything"],
        }
    )

    all_expr = df.select(
        pl.col("s").str.strip_chars(pl.col("pat")).alias("strip_chars"),
        pl.col("s").str.strip_chars_start(pl.col("pat")).alias("strip_chars_start"),
        pl.col("s").str.strip_chars_end(pl.col("pat")).alias("strip_chars_end"),
    )

    expected = pl.DataFrame(
        {
            "strip_chars": ["hello", "world", "hi", "polars", None],
            "strip_chars_start": ["hello ", "world^^", "hi&&", "polars  ", None],
            "strip_chars_end": [" hello", "^^world", "&&hi", "  polars", None],
        }
    )

    assert_frame_equal(all_expr, expected)

    strip_by_null = df.select(
        pl.col("s").str.strip_chars(None).alias("strip_chars"),
        pl.col("s").str.strip_chars_start(None).alias("strip_chars_start"),
        pl.col("s").str.strip_chars_end(None).alias("strip_chars_end"),
    )

    # with a null pattern, only whitespace is stripped
    expected = pl.DataFrame(
        {
            "strip_chars": ["hello", "^^world^^", "&&hi&&", "polars", None],
            "strip_chars_start": ["hello ", "^^world^^", "&&hi&&", "polars  ", None],
            "strip_chars_end": [" hello", "^^world^^", "&&hi&&", "  polars", None],
        }
    )
    assert_frame_equal(strip_by_null, expected)


def test_str_strip_chars() -> None:
    s = pl.Series([" hello ", "world\t "])
    expected = pl.Series(["hello", "world"])
    assert_series_equal(s.str.strip_chars(), expected)

    expected = pl.Series(["hell", "world"])
    assert_series_equal(s.str.strip_chars().str.strip_chars("o"), expected)

    expected = pl.Series(["ell", "rld\t"])
    assert_series_equal(s.str.strip_chars(" hwo"), expected)


def test_str_strip_chars_wrong_length() -> None:
    # pattern Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_chars(pl.Series(["a", "b"])))


def test_str_strip_chars_start() -> None:
    s = pl.Series([" hello ", "\t world"])
    expected = pl.Series(["hello ", "world"])
    assert_series_equal(s.str.strip_chars_start(), expected)

    expected = pl.Series(["ello ", "world"])
    assert_series_equal(s.str.strip_chars_start().str.strip_chars_start("h"), expected)

    expected = pl.Series(["ello ", "\t world"])
    assert_series_equal(s.str.strip_chars_start("hw "), expected)


def test_str_strip_chars_start_wrong_length() -> None:
    # pattern Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_chars_start(pl.Series(["a", "b"])))


def test_str_strip_chars_end() -> None:
    s = pl.Series([" hello ", "world\t "])
    expected = pl.Series([" hello", "world"])
    assert_series_equal(s.str.strip_chars_end(), expected)

    expected = pl.Series([" hell", "world"])
    assert_series_equal(s.str.strip_chars_end().str.strip_chars_end("o"), expected)

    expected = pl.Series([" he", "wor"])
    assert_series_equal(s.str.strip_chars_end("odl \t"), expected)


def test_str_strip_chars_end_wrong_length() -> None:
    # pattern Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_chars_end(pl.Series(["a", "b"])))


def test_str_strip_whitespace() -> None:
    s = pl.Series("a", ["trailing  ", "  leading", "  both  "])

    expected = pl.Series("a", ["trailing", "  leading", "  both"])
    assert_series_equal(s.str.strip_chars_end(), expected)

    expected = pl.Series("a", ["trailing  ", "leading", "both  "])
    assert_series_equal(s.str.strip_chars_start(), expected)

    expected = pl.Series("a", ["trailing", "leading", "both"])
    assert_series_equal(s.str.strip_chars(), expected)


def test_str_strip_prefix_literal() -> None:
    # a single leading occurrence of the prefix is removed
    s = pl.Series(["foo:bar", "foofoo:bar", "bar:bar", "foo", "", None])
    expected = pl.Series([":bar", "foo:bar", "bar:bar", "", "", None])
    assert_series_equal(s.str.strip_prefix("foo"), expected)
    # test null literal
    expected = pl.Series([None, None, None, None, None, None], dtype=pl.String)
    assert_series_equal(s.str.strip_prefix(pl.lit(None, dtype=pl.String)), expected)


def test_str_strip_prefix_suffix_expr() -> None:
    # prefix/suffix may come from columns; a null pattern yields null
    df = pl.DataFrame(
        {
            "s": ["foo-bar", "foobarbar", "barfoo", "", "anything", None],
            "prefix": ["foo", "foobar", "foo", "", None, "bar"],
            "suffix": ["bar", "barbar", "bar", "", None, "foo"],
        }
    )
    out = df.select(
        pl.col("s").str.strip_prefix(pl.col("prefix")).alias("strip_prefix"),
        pl.col("s").str.strip_suffix(pl.col("suffix")).alias("strip_suffix"),
    )
    assert out.to_dict(as_series=False) == {
        "strip_prefix": ["-bar", "bar", "barfoo", "", None, None],
        "strip_suffix": ["foo-", "foo", "barfoo", "", None, None],
    }


def test_str_strip_prefix_wrong_length() -> None:
    # prefix Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_prefix(pl.Series(["a", "b"])))


def test_str_strip_suffix() -> None:
    # a single trailing occurrence of the suffix is removed
    s = pl.Series(["foo:bar", "foo:barbar", "foo:foo", "bar", "", None])
    expected = pl.Series(["foo:", "foo:bar", "foo:foo", "", "", None])
    assert_series_equal(s.str.strip_suffix("bar"), expected)
    # test null literal
    expected = pl.Series([None, None, None, None, None, None], dtype=pl.String)
    assert_series_equal(s.str.strip_suffix(pl.lit(None, dtype=pl.String)), expected)


def test_str_strip_suffix_wrong_length() -> None:
    # suffix Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_suffix(pl.Series(["a", "b"])))


def test_str_split() -> None:
    """Splitting, with and without keeping the separator (inclusive=True)."""
    a = pl.Series("a", ["a, b", "a", "ab,c,de"])
    # exercise both the Series API and the expression API
    for out in [a.str.split(","), pl.select(pl.lit(a).str.split(",")).to_series()]:
        assert out[0].to_list() == ["a", " b"]
        assert out[1].to_list() == ["a"]
        assert out[2].to_list() == ["ab", "c", "de"]

    for out in [
        a.str.split(",", inclusive=True),
        pl.select(pl.lit(a).str.split(",", inclusive=True)).to_series(),
    ]:
        assert out[0].to_list() == ["a,", " b"]
        assert out[1].to_list() == ["a"]
        assert out[2].to_list() == ["ab,", "c,", "de"]


def test_json_decode_series() -> None:
    """json_decode with inferred and explicit dtypes, including an empty Series."""
    s = pl.Series(["[1, 2, 3]", None, "[4, 5, 6]"])
    expected = pl.Series([[1, 2, 3], None, [4, 5, 6]])
    dtype = pl.List(pl.Int64)
    assert_series_equal(s.str.json_decode(None), expected)
    assert_series_equal(s.str.json_decode(dtype), expected)

    s = pl.Series(['{"a": 1, "b": true}', None, '{"a": 2, "b": false}'])
    expected = pl.Series([{"a": 1, "b": True}, None, {"a": 2, "b": False}])
    dtype2 = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])
    assert_series_equal(s.str.json_decode(None), expected)
    assert_series_equal(s.str.json_decode(dtype2), expected)

    # a narrower struct dtype drops the fields it does not declare
    expected = pl.Series([{"a": 1}, None, {"a": 2}])
    dtype2 = pl.Struct([pl.Field("a", pl.Int64)])
    assert_series_equal(s.str.json_decode(dtype2), expected)

    s = pl.Series([], dtype=pl.String)
    expected = pl.Series([], dtype=pl.List(pl.Int64))
    dtype = pl.List(pl.Int64)
    assert_series_equal(s.str.json_decode(dtype), expected)


def test_json_decode_lazy_expr() -> None:
    # lazy schema resolution must match the decoded dtype
    dtype = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])
    ldf = (
        pl.DataFrame({"json": ['{"a": 1, "b": true}', None, '{"a": 2, "b": false}']})
        .lazy()
        .select(pl.col("json").str.json_decode(dtype))
    )
    expected = pl.DataFrame(
        {"json": [{"a": 1, "b": True}, None, {"a": 2, "b": False}]}
    ).lazy()
    assert ldf.collect_schema() == {"json": dtype}
    assert_frame_equal(ldf, expected)


def test_json_decode_nested_struct() -> None:
    """Inference unifies struct fields across rows into one superset schema."""
    json = [
        '[{"key_1": "a"}]',
        '[{"key_1": "a2", "key_2": 2}]',
        '[{"key_1": "a3", "key_2": 3, "key_3": "c"}]',
    ]
    s = pl.Series("json_str", json)
    s_parsed = s.str.json_decode().rename("parsed_list_json")

    expected_dtype = pl.List(
        pl.Struct(
            [
                pl.Field("key_1", pl.String),
                pl.Field("key_2", pl.Int64),
                pl.Field("key_3", pl.String),
            ]
        )
    )
    assert s_parsed.dtype == expected_dtype

    key_1_values = s_parsed.to_frame().select(
        pl.col("parsed_list_json")
        .list.get(0)
        .struct.field("key_1")
        .alias("key_1_values")
    )
    expected_values = pl.Series("key_1_values", ["a", "a2", "a3"])
    assert_series_equal(key_1_values.get_column("key_1_values"), expected_values)


def test_json_decode_primitive_to_list_11053() -> None:
    # regression test (GH #11053): a scalar may decode into a declared List field
    df = pl.DataFrame(
        {
            "json": [
                '{"col1": ["123"], "col2": "123"}',
                '{"col1": ["xyz"], "col2": null}',
            ]
        }
    )
    schema = pl.Struct(
        {
            "col1": pl.List(pl.String),
            "col2": pl.List(pl.String),
        }
    )

    output = df.select(
        pl.col("json").str.json_decode(schema).alias("decoded_json")
    ).unnest("decoded_json")
    expected = pl.DataFrame({"col1": [["123"], ["xyz"]], "col2": [["123"], None]})
    assert_frame_equal(output, expected)


def test_jsonpath_single() -> None:
    # matched values come back as their string representation
    s = pl.Series(['{"a":"1"}', None, '{"a":2}', '{"a":2.1}', '{"a":true}'])
    expected = pl.Series(["1", None, "2", "2.1", "true"])
    assert_series_equal(s.str.json_path_match("$.a"), expected)


def test_json_path_match() -> None:
    """JSONPath may be a per-row column expression or a literal."""
    df = pl.DataFrame(
        {
            "str": [
                '{"a":"1"}',
                None,
                '{"b":2}',
                '{"a":2.1, "b": "hello"}',
                '{"a":true}',
            ],
            "pat": ["$.a", "$.a", "$.b", "$.b", None],
        }
    )
    out = df.select(
        all_expr=pl.col("str").str.json_path_match(pl.col("pat")),
        str_expr=pl.col("str").str.json_path_match("$.a"),
        pat_expr=pl.lit('{"a": 1.1, "b": 10}').str.json_path_match(pl.col("pat")),
    )
    expected = pl.DataFrame(
        {
            "all_expr": ["1", None, "2", "hello", None],
            "str_expr": ["1", None, None, "2.1", "true"],
            "pat_expr": ["1.1", "1.1", "10", "10", None],
        }
    )
    assert_frame_equal(out, expected)


def test_str_json_path_match_wrong_length() -> None:
    # path Series shorter than the column must raise
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises((ShapeError, ComputeError)):
        df.select(pl.col("num").str.json_path_match(pl.Series(["a", "b"])))


def test_extract_regex() -> None:
    # extract group 1; non-matching rows yield null
    s = pl.Series(
        [
            "http://vote.com/ballon_dor?candidate=messi&ref=polars",
            "http://vote.com/ballon_dor?candidat=jorginho&ref=polars",
            "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars",
        ]
    )
    expected = pl.Series(["messi", None, "ronaldo"])
    assert_series_equal(s.str.extract(r"candidate=(\w+)", 1), expected)


def test_extract() -> None:
    """extract with per-row patterns; null patterns and non-matches yield null."""
    df = pl.DataFrame(
        {
            "s": ["aron123", "12butler", "charly*", "~david", None],
            "pat": [r"^([a-zA-Z]+)", r"^(\d+)", None, "^(da)", r"(.*)"],
        }
    )

    out = df.select(
        all_expr=pl.col("s").str.extract(pl.col("pat"), 1),
        str_expr=pl.col("s").str.extract("^([a-zA-Z]+)", 1),
        pat_expr=pl.lit("aron123").str.extract(pl.col("pat")),
    )
    expected = pl.DataFrame(
        {
            "all_expr": ["aron", "12", None, None, None],
            "str_expr": ["aron", None, "charly", None, None],
            "pat_expr": ["aron", None, None, None, "aron123"],
        }
    )
    assert_frame_equal(out, expected)


def test_extract_binary() -> None:
    # extract usable inside a filter predicate
    df = pl.DataFrame({"foo": ["aron", "butler", "charly", "david"]})
    out = df.filter(pl.col("foo").str.extract("^(a)", 1) == "a").to_series()
    assert out[0] == "aron"


def test_str_join_returns_scalar() -> None:
    # str.join inside an agg produces a String scalar per group, not a list
    df = pl.DataFrame(
        [pl.Series("val", ["A", "B", "C", "D"]), pl.Series("id", [1, 1, 2, 2])]
    )
    grouped = (
        df.group_by("id")
        .agg(pl.col("val").str.join(delimiter=",").alias("grouped"))
        .get_column("grouped")
    )
    assert grouped.dtype == pl.String


def test_contains() -> None:
    """str.contains: strict vs non-strict regex handling, plus literal vs regex matching.

    Non-strict mode turns an invalid regex into all-null output; strict mode raises.
    The loop checks the same pattern/literal combinations across Series, select,
    and filter code paths.
    """
    # test strict/non strict
    s_txt = pl.Series(["123", "456", "789"])
    assert (
        pl.Series([None, None, None]).cast(pl.Boolean).to_list()
        == s_txt.str.contains("(not_valid_regex", literal=False, strict=False).to_list()
    )
    with pytest.raises(ComputeError):
        s_txt.str.contains("(not_valid_regex", literal=False, strict=True)
    assert (
        pl.Series([True, False, False]).cast(pl.Boolean).to_list()
        == s_txt.str.contains("1", literal=False, strict=False).to_list()
    )

    df = pl.DataFrame(
        data=[(1, "some * * text"), (2, "(with) special\n * chars"), (3, "**etc...?$")],
        schema=["idx", "text"],
        orient="row",
    )
    for pattern, as_literal, expected in (
        (r"\* \*", False, [True, False, False]),
        (r"* *", True, [True, False, False]),
        (r"^\(", False, [False, True, False]),
        (r"^\(", True, [False, False, False]),
        (r"(", True, [False, True, False]),
        (r"e", False, [True, True, True]),
        (r"e", True, [True, True, True]),
        (r"^\S+$", False, [False, False, True]),
        (r"\?\$", False, [False, False, True]),
        (r"?$", True, [False, False, True]),
    ):
        # series
        assert (
            expected == df["text"].str.contains(pattern, literal=as_literal).to_list()
        )
        # frame select
        assert (
            expected
            == df.select(pl.col("text").str.contains(pattern, literal=as_literal))[
                "text"
            ].to_list()
        )
        # frame filter
        assert sum(expected) == len(
            df.filter(pl.col("text").str.contains(pattern, literal=as_literal))
        )
1002
1003
1004
def test_contains_expr() -> None:
    """str.contains with a per-row pattern column; null text or pattern yields null.

    Non-strict mode nullifies rows with an invalid regex; strict mode raises.
    """
    df = pl.DataFrame(
        {
            "text": [
                "some text",
                "(with) special\n .* chars",
                "**etc...?$",
                None,
                "b",
                "invalid_regex",
            ],
            "pattern": [r"[me]", r".*", r"^\(", "a", None, "*"],
        }
    )

    assert df.select(
        pl.col("text")
        .str.contains(pl.col("pattern"), literal=False, strict=False)
        .alias("contains"),
        pl.col("text")
        .str.contains(pl.col("pattern"), literal=True)
        .alias("contains_lit"),
    ).to_dict(as_series=False) == {
        "contains": [True, True, False, None, None, None],
        "contains_lit": [False, True, False, None, None, False],
    }

    with pytest.raises(ComputeError):
        df.select(
            pl.col("text").str.contains(pl.col("pattern"), literal=False, strict=True)
        )
1035
1036
1037
@pytest.mark.parametrize(
    ("pattern", "case_insensitive", "expected"),
    [
        (["me"], False, True),
        (["Me"], False, False),
        (["Me"], True, True),
        (pl.Series(["me", "they"]), False, True),
        (pl.Series(["Me", "they"]), False, False),
        (pl.Series(["Me", "they"]), True, True),
        (["me", "they"], False, True),
        (["Me", "they"], False, False),
        (["Me", "they"], True, True),
    ],
)
def test_contains_any(
    pattern: pl.Series | list[str],
    case_insensitive: bool,
    expected: bool,
) -> None:
    """str.contains_any with list/Series patterns and ASCII case-insensitivity.

    Checks the Series, expression, and filter code paths with the same inputs.
    """
    df = pl.DataFrame({"text": ["Tell me what you want"]})
    # series
    assert (
        expected
        == df["text"]
        .str.contains_any(pattern, ascii_case_insensitive=case_insensitive)
        .item()
    )
    # expr
    assert (
        expected
        == df.select(
            pl.col("text").str.contains_any(
                pattern, ascii_case_insensitive=case_insensitive
            )
        )["text"].item()
    )
    # frame filter
    assert int(expected) == len(
        df.filter(
            pl.col("text").str.contains_any(
                pattern, ascii_case_insensitive=case_insensitive
            )
        )
    )
1081
1082
1083
def test_replace() -> None:
    """str.replace (first match only): regex vs literal patterns and $-group expansion.

    In literal mode, '$1' in the replacement is inserted verbatim; in regex mode
    it expands to the capture group.
    """
    df = pl.DataFrame(
        data=[(1, "* * text"), (2, "(with) special\n * chars **etc...?$")],
        schema=["idx", "text"],
        orient="row",
    )
    for pattern, replacement, as_literal, expected in (
        (r"\*", "-", False, ["- * text", "(with) special\n - chars **etc...?$"]),
        (r"*", "-", True, ["- * text", "(with) special\n - chars **etc...?$"]),
        (r"^\(", "[", False, ["* * text", "[with) special\n * chars **etc...?$"]),
        (r"^\(", "[", True, ["* * text", "(with) special\n * chars **etc...?$"]),
        (r"t$", "an", False, ["* * texan", "(with) special\n * chars **etc...?$"]),
        (r"t$", "an", True, ["* * text", "(with) special\n * chars **etc...?$"]),
        (r"(with) special", "$1", True, ["* * text", "$1\n * chars **etc...?$"]),
        (
            r"\((with)\) special",
            ":$1:",
            False,
            ["* * text", ":with:\n * chars **etc...?$"],
        ),
    ):
        # series
        assert (
            expected
            == df["text"]
            .str.replace(pattern, replacement, literal=as_literal)
            .to_list()
        )
        # expr
        assert (
            expected
            == df.select(
                pl.col("text").str.replace(pattern, replacement, literal=as_literal)
            )["text"].to_list()
        )

    assert pl.Series(["."]).str.replace(".", "$0", literal=True)[0] == "$0"
    assert pl.Series(["(.)(?)"]).str.replace(".", "$1", literal=True)[0] == "($1)(?)"
1121
1122
1123
def test_replace_all() -> None:
    """str.replace_all (every match): regex vs literal patterns and group expansion.

    Also checks that an invalid regex raises unless literal=True is used, and
    that '$0' is treated verbatim in literal mode but as a group in regex mode.
    """
    df = pl.DataFrame(
        data=[(1, "* * text"), (2, "(with) special\n * chars **etc...?$")],
        schema=["idx", "text"],
        orient="row",
    )
    for pattern, replacement, as_literal, expected in (
        (r"\*", "-", False, ["- - text", "(with) special\n - chars --etc...?$"]),
        (r"*", "-", True, ["- - text", "(with) special\n - chars --etc...?$"]),
        (r"\W", "", False, ["text", "withspecialcharsetc"]),
        (r".?$", "", True, ["* * text", "(with) special\n * chars **etc.."]),
        (
            r"(with) special",
            "$1",
            True,
            ["* * text", "$1\n * chars **etc...?$"],
        ),
        (
            r"\((with)\) special",
            ":$1:",
            False,
            ["* * text", ":with:\n * chars **etc...?$"],
        ),
        (
            r"(\b)[\w\s]{2,}(\b)",
            "$1(blah)$3",
            False,
            ["* * (blah)", "((blah)) (blah)\n * (blah) **(blah)...?$"],
        ),
    ):
        # series
        assert (
            expected
            == df["text"]
            .str.replace_all(pattern, replacement, literal=as_literal)
            .to_list()
        )
        # expr
        assert (
            expected
            == df.select(
                pl.col("text").str.replace_all(pattern, replacement, literal=as_literal)
            )["text"].to_list()
        )
    # invalid regex (but valid literal - requires "literal=True")
    with pytest.raises(ComputeError):
        df["text"].str.replace_all("*", "")

    assert (
        pl.Series([r"(.)(\?)(\?)"]).str.replace_all("\\?", "$0", literal=True)[0]
        == "(.)($0)($0)"
    )
    assert (
        pl.Series([r"(.)(\?)(\?)"]).str.replace_all("\\?", "$0", literal=False)[0]
        == "(.)(\\?)(\\?)"
    )
1179
1180
1181
def test_replace_all_literal_no_captures() -> None:
    """Literal replace_all must not expand '$1'-style group references."""
    # When using literal = True, capture groups should be disabled

    # Single row code path in Rust
    df = pl.DataFrame({"text": ["I found <amt> yesterday."], "amt": ["$1"]})
    df = df.with_columns(
        pl.col("text")
        .str.replace_all("<amt>", pl.col("amt"), literal=True)
        .alias("text2")
    )
    assert df.get_column("text2")[0] == "I found $1 yesterday."

    # Multi-row code path in Rust
    df2 = pl.DataFrame(
        {
            "text": ["I found <amt> yesterday.", "I lost <amt> yesterday."],
            "amt": ["$1", "$2"],
        }
    )
    df2 = df2.with_columns(
        pl.col("text")
        .str.replace_all("<amt>", pl.col("amt"), literal=True)
        .alias("text2")
    )
    assert df2.get_column("text2")[0] == "I found $1 yesterday."
    assert df2.get_column("text2")[1] == "I lost $2 yesterday."
1207
1208
1209
def test_replace_literal_no_captures() -> None:
    """Literal replace must not expand '$1'-style group references."""
    # When using literal = True, capture groups should be disabled

    # Single row code path in Rust
    df = pl.DataFrame({"text": ["I found <amt> yesterday."], "amt": ["$1"]})
    df = df.with_columns(
        pl.col("text").str.replace("<amt>", pl.col("amt"), literal=True).alias("text2")
    )
    assert df.get_column("text2")[0] == "I found $1 yesterday."

    # Multi-row code path in Rust
    # A string shorter than 32 chars,
    # and one longer than 32 chars to test both sub-paths
    df2 = pl.DataFrame(
        {
            "text": [
                "I found <amt> yesterday.",
                "I lost <amt> yesterday and this string is longer than 32 characters.",
            ],
            "amt": ["$1", "$2"],
        }
    )
    df2 = df2.with_columns(
        pl.col("text").str.replace("<amt>", pl.col("amt"), literal=True).alias("text2")
    )
    assert df2.get_column("text2")[0] == "I found $1 yesterday."
    assert (
        df2.get_column("text2")[1]
        == "I lost $2 yesterday and this string is longer than 32 characters."
    )
1239
1240
1241
def test_replace_expressions() -> None:
    """Pattern and replacement may both be expressions (broadcast via first/last)."""
    df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"], "value": ["A", "B"]})
    out = df.select([pl.col("foo").str.replace(pl.col("foo").first(), pl.col("value"))])
    assert out.to_dict(as_series=False) == {"foo": ["A", "xyz 678 910t"]}
    out = df.select([pl.col("foo").str.replace(pl.col("foo").last(), "value")])
    assert out.to_dict(as_series=False) == {"foo": ["123 bla 45 asd", "value"]}

    df = pl.DataFrame(
        {"foo": ["1 bla 45 asd", "xyz 6t"], "pat": [r"\d", r"\W"], "value": ["A", "B"]}
    )
    out = df.select([pl.col("foo").str.replace_all(pl.col("pat").first(), "value")])
    assert out.to_dict(as_series=False) == {
        "foo": ["value bla valuevalue asd", "xyz valuet"]
    }
1255
1256
1257
@pytest.mark.parametrize(
    ("pattern", "replacement", "case_insensitive", "leftmost", "expected"),
    [
        (["say"], "", False, False, "Tell me what you want"),
        (["me"], ["them"], False, False, "Tell them what you want"),
        (["who"], ["them"], False, False, "Tell me what you want"),
        (["me", "you"], "it", False, False, "Tell it what it want"),
        (["Me", "you"], "it", False, False, "Tell me what it want"),
        (["me", "you"], ["it"], False, False, "Tell it what it want"),
        (["me", "you"], ["you", "me"], False, False, "Tell you what me want"),
        (["me", "You", "them"], "it", False, False, "Tell it what you want"),
        (["Me", "you"], "it", True, False, "Tell it what it want"),
        (["me", "YOU"], ["you", "me"], True, False, "Tell you what me want"),
        (
            pl.Series(["me", "YOU"]),
            ["you", "me"],
            False,
            False,
            "Tell you what you want",
        ),
        (pl.Series(["me", "YOU"]), ["you", "me"], True, False, "Tell you what me want"),
        (
            ["Tell me", "Tell"],
            ["Don't tell", "Text"],
            False,
            False,
            "Text me what you want",
        ),
        (
            ["Tell me", "Tell"],
            ["Don't tell me", "Text"],
            False,
            True,
            "Don't tell me what you want",
        ),
    ],
)
def test_replace_many(
    pattern: pl.Series | list[str],
    replacement: pl.Series | list[str] | str,
    case_insensitive: bool,
    leftmost: bool,
    expected: str,
) -> None:
    """str.replace_many: multiple patterns/replacements, case folding, leftmost matching.

    Covers a scalar replacement broadcast over all patterns, paired replacement
    lists, and overlapping patterns resolved by leftmost-match order.
    """
    df = pl.DataFrame({"text": ["Tell me what you want"]})
    # series
    val = (
        df["text"]
        .str.replace_many(
            pattern,
            replacement,
            ascii_case_insensitive=case_insensitive,
            leftmost=leftmost,
        )
        .item()
    )
    assert expected == val, val
    # expr
    val = df.select(
        pl.col("text").str.replace_many(
            pattern,
            replacement,
            ascii_case_insensitive=case_insensitive,
            leftmost=leftmost,
        )
    ).item()
    assert expected == val, val
1324
1325
1326
def test_replace_many_groupby() -> None:
    """replace_many with a per-group pattern expression inside a group_by agg."""
    df = pl.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
            "g": [0, 0, 0, 1, 1, 1, 2, 2, 2],
        }
    )
    out = df.group_by("g").agg(pl.col.x.str.replace_many(pl.col.x.head(2), ""))
    expected = pl.DataFrame(
        {
            "g": [0, 1, 2],
            "x": [["", "", "c"], ["", "", "f"], ["", "", "i"]],
        }
    )
    assert_frame_equal(out, expected, check_row_order=False)
1341
1342
1343
@pytest.mark.parametrize(
    ("mapping", "case_insensitive", "expected"),
    [
        ({}, False, "Tell me what you want"),
        ({"me": "them"}, False, "Tell them what you want"),
        ({"who": "them"}, False, "Tell me what you want"),
        ({"me": "it", "you": "it"}, False, "Tell it what it want"),
        ({"Me": "it", "you": "it"}, False, "Tell me what it want"),
        ({"me": "you", "you": "me"}, False, "Tell you what me want"),
        ({}, True, "Tell me what you want"),
        ({"Me": "it", "you": "it"}, True, "Tell it what it want"),
        ({"me": "you", "YOU": "me"}, True, "Tell you what me want"),
    ],
)
def test_replace_many_mapping(
    mapping: dict[str, str],
    case_insensitive: bool,
    expected: str,
) -> None:
    """str.replace_many accepts a dict mapping pattern -> replacement."""
    df = pl.DataFrame({"text": ["Tell me what you want"]})
    # series
    assert (
        expected
        == df["text"]
        .str.replace_many(mapping, ascii_case_insensitive=case_insensitive)
        .item()
    )
    # expr
    assert (
        expected
        == df.select(
            pl.col("text").str.replace_many(
                mapping,
                ascii_case_insensitive=case_insensitive,
            )
        ).item()
    )
1380
1381
1382
def test_replace_many_invalid_inputs() -> None:
    """Invalid replace_many arguments raise precise, specific exceptions."""
    df = pl.DataFrame({"text": ["Tell me what you want"]})

    # Ensure a string as the first argument is parsed as a column name.
    with pytest.raises(ColumnNotFoundError, match="me"):
        df.select(pl.col("text").str.replace_many("me", "you"))

    with pytest.raises(InvalidOperationError):
        df.select(pl.col("text").str.replace_many(1, 2))

    with pytest.raises(InvalidOperationError):
        df.select(pl.col("text").str.replace_many([1], [2]))

    with pytest.raises(InvalidOperationError):
        df.select(pl.col("text").str.replace_many(["me"], None))

    with pytest.raises(TypeError):
        df.select(pl.col("text").str.replace_many(["me"]))

    with pytest.raises(
        InvalidOperationError,
        match="expected the same amount of patterns as replacement strings",
    ):
        df.select(pl.col("text").str.replace_many(["a"], ["b", "c"]))

    s = df.to_series()

    with pytest.raises(ColumnNotFoundError, match="me"):
        s.str.replace_many("me", "you")  # type: ignore[arg-type]

    with pytest.raises(TypeError):
        df.select(pl.col("text").str.replace_many(["me"]))

    with pytest.raises(
        InvalidOperationError,
        match="expected the same amount of patterns as replacement strings",
    ):
        s.str.replace_many(["a"], ["b", "c"])
1420
1421
1422
def test_extract_all_count() -> None:
    """extract_all returns List[String] per row; count_matches returns UInt32."""
    df = pl.DataFrame({"foo": ["123 bla 45 asd", "xaz 678 910t", "boo", None]})
    assert (
        df.select(
            pl.col("foo").str.extract_all(r"a").alias("extract"),
            pl.col("foo").str.count_matches(r"a").alias("count"),
        ).to_dict(as_series=False)
    ) == {"extract": [["a", "a"], ["a"], [], None], "count": [2, 1, 0, None]}

    assert df["foo"].str.extract_all(r"a").dtype == pl.List
    assert df["foo"].str.count_matches(r"a").dtype == pl.UInt32
1433
1434
1435
def test_count_matches_many() -> None:
    """count_matches with a per-row pattern column; null text or pattern -> null."""
    df = pl.DataFrame(
        {
            "foo": ["123 bla 45 asd", "xyz 678 910t", None, "boo"],
            "bar": [r"\d", r"[a-z]", r"\d", None],
        }
    )
    assert (
        df.select(
            pl.col("foo").str.count_matches(pl.col("bar")).alias("count")
        ).to_dict(as_series=False)
    ) == {"count": [5, 4, None, None]}

    assert df["foo"].str.count_matches(df["bar"]).dtype == pl.UInt32

    # Test broadcast.
    broad = df.select(
        pl.col("foo").str.count_matches(pl.col("bar").first()).alias("count"),
        pl.col("foo").str.count_matches(pl.col("bar").last()).alias("count_null"),
    )
    assert broad.to_dict(as_series=False) == {
        "count": [5, 6, None, 0],
        "count_null": [None, None, None, None],
    }
    assert broad.schema == {"count": pl.UInt32, "count_null": pl.UInt32}
1460
1461
1462
def test_extract_all_many() -> None:
    """extract_all with a per-row pattern column, including broadcast patterns."""
    df = pl.DataFrame(
        {
            "foo": ["ab", "abc", "abcd", "foo", None, "boo"],
            "re": ["a", "bc", "a.c", "a", "a", None],
        }
    )
    assert df["foo"].str.extract_all(df["re"]).to_list() == [
        ["a"],
        ["bc"],
        ["abc"],
        [],
        None,
        None,
    ]

    # Test broadcast.
    broad = df.select(
        pl.col("foo").str.extract_all(pl.col("re").first()).alias("a"),
        pl.col("foo").str.extract_all(pl.col("re").last()).alias("null"),
    )
    assert broad.to_dict(as_series=False) == {
        "a": [["a"], ["a"], ["a"], [], None, []],
        "null": [None] * 6,
    }
    assert broad.schema == {"a": pl.List(pl.String), "null": pl.List(pl.String)}
1488
1489
1490
@pytest.mark.may_fail_cloud  # reason: zero-field struct
def test_extract_groups_empty() -> None:
    """An empty pattern yields an empty struct per row, with a consistent lazy schema."""
    df = pl.DataFrame({"iso_code": ["ISO 80000-1:2009", "ISO/IEC/IEEE 29148:2018"]})

    assert df.select(pl.col("iso_code").str.extract_groups("")).to_dict(
        as_series=False
    ) == {"iso_code": [{}, {}]}

    q = df.lazy().select(pl.col("iso_code").str.extract_groups(""))
    assert q.collect_schema() == q.collect().schema
1500
1501
1502
def test_extract_groups() -> None:
    """extract_groups: named groups become struct fields, unnamed ones use '1', '2', ...

    Also checks that only the first matching alternative of a regex populates
    its group; the other alternative's group stays null.
    """

    def _named_groups_builder(pattern: str, groups: dict[str, str]) -> str:
        # Wrap each value in a named capture group and substitute it into the template.
        return pattern.format(
            **{name: f"(?<{name}>{value})" for name, value in groups.items()}
        )

    expected = {
        "authority": ["ISO", "ISO/IEC/IEEE"],
        "spec_num": ["80000", "29148"],
        "part_num": ["1", None],
        "revision_year": ["2009", "2018"],
    }

    pattern = _named_groups_builder(
        r"{authority}\s{spec_num}(?:-{part_num})?(?::{revision_year})",
        {
            "authority": r"^ISO(?:/[A-Z]+)*",
            "spec_num": r"\d+",
            "part_num": r"\d+",
            "revision_year": r"\d{4}",
        },
    )

    df = pl.DataFrame({"iso_code": ["ISO 80000-1:2009", "ISO/IEC/IEEE 29148:2018"]})

    assert (
        df.select(pl.col("iso_code").str.extract_groups(pattern))
        .unnest("iso_code")
        .to_dict(as_series=False)
        == expected
    )

    assert df.select(
        pl.col("iso_code").str.extract_groups(r"\A(ISO\S*).*?(\d+)")
    ).to_dict(as_series=False) == {
        "iso_code": [{"1": "ISO", "2": "80000"}, {"1": "ISO/IEC/IEEE", "2": "29148"}]
    }

    assert df.select(
        pl.col("iso_code").str.extract_groups(r"\A(ISO\S*).*?(?<year>\d+)\z")
    ).to_dict(as_series=False) == {
        "iso_code": [
            {"1": "ISO", "year": "2009"},
            {"1": "ISO/IEC/IEEE", "year": "2018"},
        ]
    }

    assert pl.select(
        pl.lit(r"foobar").str.extract_groups(r"(?<foo>.{3})|(?<bar>...)")
    ).to_dict(as_series=False) == {"literal": [{"foo": "foo", "bar": None}]}
1552
1553
1554
def test_starts_ends_with() -> None:
    """starts_with/ends_with with literals, null literals, and per-row expressions."""
    df = pl.DataFrame(
        {
            "a": ["hamburger_with_tomatoes", "nuts", "lollypop", None],
            "sub": ["ham", "ts", None, "anything"],
        }
    )

    assert df.select(
        pl.col("a").str.ends_with("pop").alias("ends_pop"),
        pl.col("a").str.ends_with(pl.lit(None)).alias("ends_None"),
        pl.col("a").str.ends_with(pl.col("sub")).alias("ends_sub"),
        pl.col("a").str.starts_with("ham").alias("starts_ham"),
        pl.col("a").str.starts_with(pl.lit(None)).alias("starts_None"),
        pl.col("a").str.starts_with(pl.col("sub")).alias("starts_sub"),
    ).to_dict(as_series=False) == {
        "ends_pop": [False, False, True, None],
        "ends_None": [None, None, None, None],
        "ends_sub": [False, True, None, None],
        "starts_ham": [True, False, False, None],
        "starts_None": [None, None, None, None],
        "starts_sub": [True, False, None, None],
    }
1577
1578
1579
def test_json_path_match_type_4905() -> None:
    """json_path_match output feeds is_in/filter without a dtype error (#4905)."""
    frame = pl.DataFrame({"json_val": ['{"a":"hello"}', None, '{"a":"world"}']})
    predicate = pl.col("json_val").str.json_path_match("$.a").is_in(["hello"])
    result = frame.filter(predicate).to_dict(as_series=False)
    assert result == {"json_val": ['{"a":"hello"}']}
1584
1585
1586
def test_decode_strict() -> None:
    """base64 decode: non-strict nullifies invalid input, strict raises."""
    df = pl.DataFrame(
        {"strings": ["0IbQvTc3", "0J%2FQldCf0JA%3D", "0J%2FRgNC%2B0YHRgtC%2B"]}
    )
    result = df.select(pl.col("strings").str.decode("base64", strict=False))
    expected = {"strings": [b"\xd0\x86\xd0\xbd77", None, None]}
    assert result.to_dict(as_series=False) == expected

    with pytest.raises(ComputeError):
        df.select(pl.col("strings").str.decode("base64", strict=True))
1596
1597
1598
def test_split() -> None:
    """str.split with a literal separator: plain, inclusive, and empty-separator cases.

    Also verifies that the lazy plan names differ between inclusive and
    non-inclusive variants (split vs split_inclusive, and the _exact forms).
    """
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c_c", ""]})
    out = df.select([pl.col("x").str.split("_")])

    expected = pl.DataFrame(
        [
            {"x": ["a", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "c", "c"]},
            {"x": [""]},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("_").to_frame(), expected)

    out = df.select([pl.col("x").str.split("_", inclusive=True)])

    expected = pl.DataFrame(
        [
            {"x": ["a_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c_", "c_", "c"]},
            {"x": []},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("_", inclusive=True).to_frame(), expected)

    out = df.select([pl.col("x").str.split("")])

    expected = pl.DataFrame(
        [
            {"x": ["a", "_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "_", "c", "_", "c"]},
            {"x": []},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("").to_frame(), expected)

    out = df.select([pl.col("x").str.split("", inclusive=True)])

    expected = pl.DataFrame(
        [
            {"x": ["a", "_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "_", "c", "_", "c"]},
            {"x": []},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("", inclusive=True).to_frame(), expected)

    plan = (
        df.lazy()
        .select(
            a=pl.col("x").str.split(" ", inclusive=False),
            b=pl.col("x").str.split_exact(" ", 1, inclusive=False),
        )
        .explain()
    )

    assert "str.split(" in plan
    assert "str.split_exact(" in plan

    plan = (
        df.lazy()
        .select(
            a=pl.col("x").str.split(" ", inclusive=True),
            b=pl.col("x").str.split_exact(" ", 1, inclusive=True),
        )
        .explain()
    )

    assert "str.split_inclusive(" in plan
    assert "str.split_exact_inclusive(" in plan
1683
1684
1685
def test_split_expr() -> None:
    """str.split with a per-row separator column, plain and inclusive."""
    df = pl.DataFrame(
        {
            "x": ["a_a", None, "b", "c*c*c", "dddd", ""],
            "by": ["_", "#", "^", "*", "", ""],
        }
    )
    out = df.select([pl.col("x").str.split(pl.col("by"))])
    expected = pl.DataFrame(
        [
            {"x": ["a", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "c", "c"]},
            {"x": ["d", "d", "d", "d"]},
            {"x": []},
        ]
    )
    assert_frame_equal(out, expected)

    out = df.select([pl.col("x").str.split(pl.col("by"), inclusive=True)])
    expected = pl.DataFrame(
        [
            {"x": ["a_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c*", "c*", "c"]},
            {"x": ["d", "d", "d", "d"]},
            {"x": []},
        ]
    )
    assert_frame_equal(out, expected)
1717
1718
1719
def test_split_exact() -> None:
    """str.split_exact: fixed number of struct fields, padded with nulls when short."""
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c", ""]})
    out = df.select([pl.col("x").str.split_exact("_", 2, inclusive=False)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", ""],
            "field_1": ["a", None, None, "c", None],
            "field_2": pl.Series([None, None, None, None, None], dtype=pl.String),
        }
    )

    assert_frame_equal(out, expected)
    out2 = df["x"].str.split_exact("_", 2, inclusive=False).to_frame().unnest("x")
    assert_frame_equal(out2, expected)

    out = df.select([pl.col("x").str.split_exact("_", 1, inclusive=True)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a_", None, "b", "c_", None],
            "field_1": ["a", None, None, "c", None],
        }
    )
    assert_frame_equal(out, expected)
    assert df["x"].str.split_exact("_", 1).dtype == pl.Struct
    assert df["x"].str.split_exact("_", 1, inclusive=False).dtype == pl.Struct

    out = df.select([pl.col("x").str.split_exact("", 1)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None],
            "field_1": ["_", None, None, "_", None],
        }
    )
    assert_frame_equal(out, expected)

    out = df.select([pl.col("x").str.split_exact("", 1, inclusive=True)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None],
            "field_1": ["_", None, None, "_", None],
        }
    )
    assert_frame_equal(out, expected)
1766
1767
1768
def test_split_exact_expr() -> None:
    """str.split_exact with a per-row separator column (including null and empty)."""
    df = pl.DataFrame(
        {
            "x": ["a_a", None, "b", "c^c^c", "d#d", "eeee", ""],
            "by": ["_", "&", "$", "^", None, "", ""],
        }
    )

    out = df.select(
        pl.col("x").str.split_exact(pl.col("by"), 2, inclusive=False)
    ).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None, "e", None],
            "field_1": ["a", None, None, "c", None, "e", None],
            "field_2": pl.Series(
                [None, None, None, "c", None, "e", None], dtype=pl.String
            ),
        }
    )

    assert_frame_equal(out, expected)

    out2 = df.select(
        pl.col("x").str.split_exact(pl.col("by"), 2, inclusive=True)
    ).unnest("x")

    expected2 = pl.DataFrame(
        {
            "field_0": ["a_", None, "b", "c^", None, "e", None],
            "field_1": ["a", None, None, "c^", None, "e", None],
            "field_2": pl.Series(
                [None, None, None, "c", None, "e", None], dtype=pl.String
            ),
        }
    )
    assert_frame_equal(out2, expected2)
1806
1807
1808
def test_splitn() -> None:
    """str.splitn: at most n fields, remainder kept in the last field."""
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c_c", ""]})
    out = df.select([pl.col("x").str.splitn("_", 2)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", ""],
            "field_1": ["a", None, None, "c_c", None],
        }
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.splitn("_", 2).to_frame().unnest("x"), expected)

    out = df.select([pl.col("x").str.splitn("", 2)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None],
            "field_1": ["_a", None, None, "_c_c", None],
        }
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.splitn("", 2).to_frame().unnest("x"), expected)
1833
1834
1835
def test_splitn_expr() -> None:
    """str.splitn with a per-row separator column (including null and empty)."""
    df = pl.DataFrame(
        {
            "x": ["a_a", None, "b", "c^c^c", "d#d", "eeee", ""],
            "by": ["_", "&", "$", "^", None, "", ""],
        }
    )

    out = df.select(pl.col("x").str.splitn(pl.col("by"), 2)).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None, "e", None],
            "field_1": ["a", None, None, "c^c", None, "eee", None],
        }
    )

    assert_frame_equal(out, expected)
1853
1854
1855
def test_titlecase() -> None:
    """str.to_titlecase agrees with Python's str.title on varied punctuation/whitespace."""
    df = pl.DataFrame(
        {
            "quotes": [
                "'e.t. phone home'",
                "you talkin' to me?",
                "i feel the need--the need for speed",
                "to infinity,and BEYOND!",
                "say 'what' again!i dare you - I\u00a0double-dare you!",
                "What.we.got.here... is#failure#to#communicate",
                "welcome to my world",
                "double  space",
                "and\ta\t tab",
                "by jean-paul sartre, 'esq'",
                "SOMETIMES/life/gives/you/a/2nd/chance",
            ]
        }
    )

    # map_elements with a simple str method triggers the inefficiency warning
    with pytest.warns(PolarsInefficientMapWarning):
        assert_frame_equal(
            df.select(pl.col("quotes").str.to_titlecase()),
            df.select(pl.col("quotes").map_elements(lambda s: s.title())),
        )
1879
1880
1881
def test_string_replace_with_nulls_10124() -> None:
    """replace with n>1 must preserve nulls and replace across null gaps (#10124)."""
    df = pl.DataFrame({"col1": ["S", "S", "S", None, "S", "S", "S", "S"]})

    assert df.select(
        pl.col("col1"),
        pl.col("col1").str.replace("S", "O", n=1).alias("n_1"),
        pl.col("col1").str.replace("S", "O", n=3).alias("n_3"),
    ).to_dict(as_series=False) == {
        "col1": ["S", "S", "S", None, "S", "S", "S", "S"],
        "n_1": ["O", "O", "O", None, "O", "O", "O", "O"],
        "n_3": ["O", "O", "O", None, "O", "O", "O", "O"],
    }
1893
1894
1895
def test_string_extract_groups_lazy_schema_10305() -> None:
    """Lazy schema of extract_groups + unnest resolves named groups (#10305)."""
    df = pl.LazyFrame(
        data={
            "url": [
                "http://vote.com/ballon_dor?candidate=messi&ref=python",
                "http://vote.com/ballon_dor?candidate=weghorst&ref=polars",
                "http://vote.com/ballon_dor?error=404&ref=rust",
            ]
        }
    )
    pattern = r"candidate=(?<candidate>\w+)&ref=(?<ref>\w+)"
    df = df.select(captures=pl.col("url").str.extract_groups(pattern)).unnest(
        "captures"
    )

    assert df.collect_schema() == {"candidate": pl.String, "ref": pl.String}
1911
1912
1913
def test_string_reverse() -> None:
    """str.reverse reverses by grapheme cluster, keeping combining marks attached."""
    df = pl.DataFrame(
        {
            "text": [None, "foo", "bar", "i like pizza&#", None, "man\u0303ana"],
        }
    )
    expected = pl.DataFrame(
        [
            pl.Series(
                "text",
                # note: the combining tilde (\u0303) stays with its base 'n'
                [None, "oof", "rab", "#&azzip ekil i", None, "anan\u0303am"],
                dtype=pl.String,
            ),
        ]
    )

    result = df.select(pl.col("text").str.reverse())
    assert_frame_equal(result, expected)
1931
1932
1933
@pytest.mark.parametrize(
    ("data", "expected_data"),
    [
        (["", None, "a"], ["", None, "b"]),
        ([None, None, "a"], [None, None, "b"]),
        (["", "", ""], ["", "", ""]),
        ([None, None, None], [None, None, None]),
        (["a", "", None], ["b", "", None]),
    ],
)
def test_replace_lit_n_char_13385(
    data: list[str | None], expected_data: list[str | None]
) -> None:
    """Literal replace handles empty strings and nulls in any position (#13385)."""
    s = pl.Series(data, dtype=pl.String)
    res = s.str.replace("a", "b", literal=True)
    expected_s = pl.Series(expected_data, dtype=pl.String)
    assert_series_equal(res, expected_s)
1950
1951
1952
def test_find_many_raises() -> None:
    """find_many rejects leftmost=True combined with overlapping=True."""
    frame = pl.DataFrame({"values": ["discontent", "foobar"]})
    needles = ["winter", "disco", "onte", "discontent"]
    expr = pl.col("values").str.find_many(needles, leftmost=True, overlapping=True)
    with pytest.raises(
        ValueError, match="can not match overlapping patterns when leftmost == True"
    ):
        frame.select(expr)
1961
1962
1963
def test_extract_many_raises() -> None:
    """extract_many rejects leftmost=True combined with overlapping=True."""
    frame = pl.DataFrame({"values": ["discontent", "foobar"]})
    needles = ["winter", "disco", "onte", "discontent"]
    expr = pl.col("values").str.extract_many(needles, leftmost=True, overlapping=True)
    with pytest.raises(
        ValueError, match="can not match overlapping patterns when leftmost == True"
    ):
        frame.select(expr)
1972
1973
1974
def test_extract_many() -> None:
    """extract_many/find_many: overlapping vs non-overlapping, list and column patterns."""
    df = pl.DataFrame({"values": ["discontent", "foobar"]})
    patterns = ["winter", "disco", "onte", "discontent"]
    assert df.with_columns(
        pl.col("values").str.extract_many(patterns, overlapping=False).alias("matches"),
        pl.col("values")
        .str.extract_many(patterns, overlapping=True)
        .alias("matches_overlapping"),
    ).to_dict(as_series=False) == {
        "values": ["discontent", "foobar"],
        "matches": [["disco"], []],
        "matches_overlapping": [["disco", "onte", "discontent"], []],
    }

    # many patterns
    df = pl.DataFrame(
        {
            "values": ["discontent", "rhapsody"],
            "patterns": [
                ["winter", "disco", "onte", "discontent"],
                ["rhap", "ody", "coalesce"],
            ],
        }
    )

    # extract_many
    assert df.select(pl.col("values").str.extract_many("patterns")).to_dict(
        as_series=False
    ) == {"values": [["disco"], ["rhap", "ody"]]}

    # find_many
    f1 = df.select(pl.col("values").str.find_many("patterns"))
    f2 = df["values"].str.find_many(df["patterns"])

    assert_series_equal(f1["values"], f2)
    assert f2.to_list() == [[0], [0, 5]]
2010
2011
2012
def test_json_decode_raise_on_data_type_mismatch_13061() -> None:
    """Schema inferred from too few rows must raise rather than silently coerce."""
    all_null = pl.Series(["null", "null"])
    assert_series_equal(
        all_null.str.json_decode(infer_schema_length=1),
        pl.Series([None, None]),
    )

    mixed = pl.Series(["null", "1"])
    # inferring from row 0 alone yields Null, which conflicts with "1"
    with pytest.raises(ComputeError):
        mixed.str.json_decode(infer_schema_length=1)

    # seeing both rows resolves the dtype correctly
    assert_series_equal(
        mixed.str.json_decode(infer_schema_length=2),
        pl.Series([None, 1]),
    )
2025
2026
2027
def test_json_decode_struct_schema() -> None:
    """Struct fields unseen during inference raise; explicit dtypes ignore extras."""
    payload = pl.Series([r'{"a": 1}', r'{"a": 2, "b": 2}'])

    # schema inferred from the first row only: field "b" is unexpected
    with pytest.raises(ComputeError, match="extra field in struct data: b"):
        payload.str.json_decode(infer_schema_length=1)

    # inferring over both rows admits "b" (null where absent)
    assert_series_equal(
        payload.str.json_decode(infer_schema_length=2),
        pl.Series([{"a": 1, "b": None}, {"a": 2, "b": 2}]),
    )

    # If the schema was explicitly given, then we ignore extra fields.
    # TODO: There should be a `columns=` parameter to this.
    assert_series_equal(
        payload.str.json_decode(dtype=pl.Struct({"a": pl.Int64})),
        pl.Series([{"a": 1}, {"a": 2}]),
    )
2048
2049
2050
def test_escape_regex() -> None:
    """escape_regex escapes metacharacters and passes nulls through."""
    frame = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})
    result_df = frame.with_columns(pl.col("text").str.escape_regex().alias("escaped"))

    expected_df = pl.DataFrame(
        {
            "text": ["abc", "def", None, "abc(\\w+)"],
            "escaped": ["abc", "def", None, "abc\\(\\\\w\\+\\)"],
        }
    )
    assert_frame_equal(result_df, expected_df)
    assert_series_equal(result_df["escaped"], expected_df["escaped"])
2062
2063
2064
@pytest.mark.parametrize(
    ("form", "expected_data"),
    [
        ("NFC", ["01²", "KADOKAWA"]),  # noqa: RUF001
        ("NFD", ["01²", "KADOKAWA"]),  # noqa: RUF001
        ("NFKC", ["012", "KADOKAWA"]),
        ("NFKD", ["012", "KADOKAWA"]),
    ],
)
def test_string_normalize(form: Any, expected_data: list[str | None]) -> None:
    """Each Unicode normalization form produces the expected canonical text."""
    source = pl.Series(["01²", "KADOKAWA"], dtype=pl.String)  # noqa: RUF001
    normalized = source.str.normalize(form)
    assert_series_equal(normalized, pl.Series(expected_data, dtype=pl.String))
2078
2079
2080
def test_string_normalize_wrong_input() -> None:
    """An unrecognized normalization form is rejected with a ValueError."""
    source = pl.Series(["01²"], dtype=pl.String)
    with pytest.raises(ValueError, match="`form` must be one of"):
        source.str.normalize("foobar")  # type: ignore[arg-type]
2083
2084
2085
def test_to_integer_unequal_lengths_22034() -> None:
    """A base Series of mismatched length (3 values vs 4 bases) must raise."""
    values = pl.Series("a", ["1", "2", "3"], pl.String)
    bases = pl.Series([4, 5, 5, 4])
    with pytest.raises(pl.exceptions.ShapeError):
        values.str.to_integer(base=bases)
2089
2090
2091
def test_broadcast_self() -> None:
    """A length-1 Series broadcasts over the bases; "3" is invalid in base 2."""
    single = pl.Series("a", ["3"], pl.String)
    with pytest.raises(
        pl.exceptions.ComputeError, match="strict integer parsing failed"
    ):
        single.str.to_integer(base=pl.Series([2, 2, 3, 4]))
2097
2098
2099
def test_strptime_unequal_length_22018() -> None:
    """`ambiguous` of length 3 against 2 timestamps must raise a ShapeError."""
    stamps = pl.Series(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
    ambiguous = pl.Series(["a", "b", "d"])
    with pytest.raises(pl.exceptions.ShapeError):
        stamps.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M%#z", ambiguous=ambiguous)
2105
2106
2107
@pytest.mark.parametrize("inclusive", [False, True])
def test_str_split_unequal_length_22018(inclusive: bool) -> None:
    """Splitting 2 strings by 3 separators must raise a ShapeError."""
    strings = pl.Series(["a-c", "x-y"])
    separators = pl.Series(["-", "/", "+"])
    with pytest.raises(pl.exceptions.ShapeError):
        strings.str.split(separators, inclusive=inclusive)
2113
2114
2115
def test_str_split_self_broadcast() -> None:
    """A single string broadcasts across multiple separators, one result per sep."""
    result = pl.Series(["a-/c"]).str.split(pl.Series(["-", "/", "+"]))
    expected = pl.Series([["a", "/c"], ["a-", "c"], ["a-/c"]])
    assert_series_equal(result, expected)
2120
2121
2122
def test_replace_many_mapping_in_list() -> None:
    """replace_strict with a mapping works inside a list.eval context."""
    mapped = pl.Series([["a", "b"]]).list.eval(
        pl.element().replace_strict({"a": 1, "b": 2})
    )
    assert_series_equal(mapped, pl.Series([[1, 2]]))
2129
2130
2131
def test_str_replace_n_zero_23570() -> None:
    """replace with n=0 is a no-op for both literal and expression replacements."""
    # more than 32 bytes
    abc_long = "abc " * 20 + "abc"
    df = pl.DataFrame(
        {"a": [abc_long, "abc abc abc", "abc ghi"], "b": ["jkl", "pqr", "xyz"]}
    )

    # literal replacement value
    out = df.with_columns(pl.col("a").str.replace("abc", "XYZ", n=0))
    assert_frame_equal(out, df)

    # expression replacement value
    out = df.with_columns(pl.col("a").str.replace("abc", pl.col("b"), n=0))
    assert_frame_equal(out, df)
2144
2145
2146
def test_str_replace_null_19601() -> None:
    """A null replacement value leaves the original string untouched."""
    df = pl.DataFrame({"key": ["1", "2"], "1": ["---", None]})
    out = df.select(result=pl.col("key").str.replace("1", pl.col("1")))
    assert_frame_equal(out, pl.DataFrame({"result": ["---", "2"]}))
2153
2154
2155
def test_str_json_decode_25237() -> None:
    """Repeated decoding of the same payload must always infer the same dtype."""
    s = pl.Series(['[{"a": 0, "b": 1}, {"b": 2}]'])

    observed = {s.str.json_decode().dtype for _ in range(20)}

    assert len(observed) == 1
2161
2162
2163
def test_json_decode_decimal_25789() -> None:
    """Decimal targets round values that fit and error on values that don't."""
    s = pl.Series(
        ['{"a": 1.23}', '{"a": 4.56}', '{"a": null}', '{"a": "30.1271239481230948"}']
    )
    decoded = s.str.json_decode(dtype=pl.Struct({"a": pl.Decimal(4, 2)}))
    target = pl.Series(
        [{"a": 1.23}, {"a": 4.56}, {"a": None}, {"a": 30.13}],
        dtype=pl.Struct({"a": pl.Decimal(4, 2)}),
    )
    assert_series_equal(decoded, target)

    # Decimal(3, 2) cannot hold 30.12...: decoding must fail loudly
    with pytest.raises(
        ComputeError, match=r"error deserializing value.*30.127.* as Decimal\(3, 2\)"
    ):
        s.str.json_decode(dtype=pl.Struct({"a": pl.Decimal(3, 2)}))
2178
2179
2180
def test_json_decode_i128() -> None:
    """Int128 payloads near both ends of the range decode exactly, nulls included."""
    payload = pl.Series(
        [
            '{"a":170141183460469231731687303715884105723}',
            '{"a":null}',
            '{"a":-170141183460469231731687303715759193239}',
        ]
    )
    schema = pl.Struct({"a": pl.Int128})
    expected = pl.Series(
        [{"a": 2**127 - 5}, {"a": None}, {"a": -(2**127) + 124912489}],
        dtype=schema,
    )
    assert_series_equal(payload.str.json_decode(dtype=schema), expected)
2194
2195
2196
def test_json_decode_u128() -> None:
    """UInt128 payloads near the top of the range decode exactly, nulls included."""
    payload = pl.Series(['{"a":340282366920938463463374607431768211451}', '{"a":null}'])
    schema = pl.Struct({"a": pl.UInt128})
    expected = pl.Series(
        [{"a": 2**128 - 5}, {"a": None}],
        dtype=schema,
    )
    assert_series_equal(payload.str.json_decode(dtype=schema), expected)
2204
2205
2206
@pytest.mark.parametrize("dtype", [pl.Enum(["bar", "foo"]), pl.Categorical])
def test_json_decode_categorical_enum(dtype: pl.DataType) -> None:
    """JSON strings decode into Enum/Categorical struct fields, nulls included."""
    payload = pl.Series(['{"a":"foo"}', '{"a":"bar"}', '{"a":null}', '{"a":"foo"}'])
    struct_dtype = pl.Struct({"a": dtype})
    expected = pl.Series(
        [{"a": "foo"}, {"a": "bar"}, {"a": None}, {"a": "foo"}],
        dtype=struct_dtype,
    )
    assert_series_equal(payload.str.json_decode(dtype=struct_dtype), expected)
2215
2216
2217
def test_str_split_regex() -> None:
    """Splitting by a regex pattern (literal=False) drops the digit separators."""
    frame = pl.DataFrame({"s": ["foo1bar", "foo99bar", "foo1bar2baz"]})

    result = frame.select(split=pl.col("s").str.split(by=r"\d+", literal=False))

    expected = pl.DataFrame(
        {"split": [["foo", "bar"], ["foo", "bar"], ["foo", "bar", "baz"]]}
    )
    assert_frame_equal(result, expected)
2226
2227
2228
def test_str_split_regex_inclusive() -> None:
    """inclusive=True keeps the matched digits attached to the preceding part."""
    frame = pl.DataFrame({"s": ["foo1bar", "foo99bar", "foo1bar2baz"]})

    result = frame.select(
        split=pl.col("s").str.split(by=r"\d+", literal=False, inclusive=True)
    )

    expected = pl.DataFrame(
        {"split": [["foo1", "bar"], ["foo99", "bar"], ["foo1", "bar2", "baz"]]}
    )
    assert_frame_equal(result, expected)
2239
2240
2241
def test_str_split_regex_expr() -> None:
    """Each row is split by its own regex pattern taken from the "by" column."""
    frame = pl.DataFrame(
        {
            "s": ["foo1bar", "foo bar", "foo-bar baz"],
            "by": [r"\d", r"\s", r"-"],
        }
    )

    result = frame.select(split=pl.col("s").str.split(by=pl.col("by"), literal=False))

    expected = pl.DataFrame(
        {"split": [["foo", "bar"], ["foo", "bar"], ["foo", "bar baz"]]}
    )
    assert_frame_equal(result, expected)
2255
2256
2257
def test_str_split_regex_expr_inclusive() -> None:
    """Per-row regex split with inclusive=True keeps each separator in its part."""
    frame = pl.DataFrame(
        {
            "s": ["foo1bar", "foo bar", "foo-bar baz"],
            "by": [r"\d", r"\s", r"-"],
        }
    )

    result = frame.select(
        split=pl.col("s").str.split(by=pl.col("by"), literal=False, inclusive=True)
    )

    expected = pl.DataFrame(
        {"split": [["foo1", "bar"], ["foo ", "bar"], ["foo-", "bar baz"]]}
    )
    assert_frame_equal(result, expected)
2273
2274
2275
def test_str_split_regex_invalid_pattern_strict_true() -> None:
    """strict=True surfaces a regex compile error for the bad pattern "("."""
    frame = pl.DataFrame({"s": ["foo1bar", "abc", "123xyz"]})

    with pytest.raises(ComputeError):
        frame.select(split=pl.col("s").str.split(by="(", literal=False, strict=True))
2280
2281
2282
def test_str_split_regex_invalid_pattern_strict_false() -> None:
    """strict=False maps an uncompilable pattern to all-null output instead."""
    frame = pl.DataFrame({"s": ["foo1bar", "abc", "123xyz"]})

    result = frame.select(
        split=pl.col("s").str.split(by="(", literal=False, strict=False)
    )

    expected = pl.DataFrame(
        {
            "split": pl.Series(
                "split",
                [None, None, None],
                dtype=pl.List(pl.String),
            )
        }
    )
    assert_frame_equal(result, expected)
2298
2299
2300
def test_str_split_regex_scalar_string_expr() -> None:
    """A scalar literal string broadcasts over a column of regex patterns."""
    frame = pl.DataFrame({"by": [r"\d", r"\d+", r"bar"]})

    result = frame.select(
        split=pl.lit("foo1bar2baz").str.split(by=pl.col("by"), literal=False)
    )

    expected = pl.DataFrame(
        {
            "split": [
                ["foo", "bar", "baz"],  # split by \d
                ["foo", "bar", "baz"],  # split by \d+
                ["foo1", "2baz"],  # split by "bar"
            ]
        }
    )
    assert_frame_equal(result, expected)
2318
2319