Path: blob/main/py-polars/tests/unit/operations/namespaces/string/test_string.py
from __future__ import annotations

from typing import Any

import pytest

import polars as pl
import polars.selectors as cs
from polars.exceptions import (
    ColumnNotFoundError,
    ComputeError,
    InvalidOperationError,
    ShapeError,
)
from polars.testing import assert_frame_equal, assert_series_equal


def test_str_slice() -> None:
    df = pl.DataFrame({"a": ["foobar", "barfoo"]})
    assert df["a"].str.slice(-3).to_list() == ["bar", "foo"]
    assert df.select([pl.col("a").str.slice(2, 4)])["a"].to_list() == ["obar", "rfoo"]


def test_str_slice_expr() -> None:
    df = pl.DataFrame(
        {
            "a": ["foobar", None, "barfoo", "abcd", ""],
            "offset": [1, 3, None, -3, 2],
            "length": [3, 4, 2, None, 2],
        }
    )
    out = df.select(
        all_expr=pl.col("a").str.slice("offset", "length"),
        offset_expr=pl.col("a").str.slice("offset", 2),
        length_expr=pl.col("a").str.slice(0, "length"),
        length_none=pl.col("a").str.slice("offset", None),
        offset_length_lit=pl.col("a").str.slice(-3, 3),
        str_lit=pl.lit("qwert").str.slice("offset", "length"),
    )
    expected = pl.DataFrame(
        {
            "all_expr": ["oob", None, None, "bcd", ""],
            "offset_expr": ["oo", None, None, "bc", ""],
            "length_expr": ["foo", None, "ba", "abcd", ""],
            "length_none": ["oobar", None, None, "bcd", ""],
            "offset_length_lit": ["bar", None, "foo", "bcd", ""],
            "str_lit": ["wer", "rt", None, "ert", "er"],
        }
    )
    assert_frame_equal(out, expected)

    # negative length is not allowed
    with pytest.raises(InvalidOperationError):
        df.select(pl.col("a").str.slice(0, -1))


def test_str_slice_wrong_length() -> None:
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.slice(pl.Series([1, 2])))


@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        (["012345", "", None], 0, ["", "", None]),
        (["012345", "", None], 2, ["01", "", None]),
        (["012345", "", None], -2, ["0123", "", None]),
        (["012345", "", None], 100, ["012345", "", None]),
        (["012345", "", None], -100, ["", "", None]),
    ],
)
def test_str_head(input: list[str], n: int, output: list[str]) -> None:
    assert pl.Series(input).str.head(n).to_list() == output


@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        ("你好世界", 0, ""),
        ("你好世界", 2, "你好"),
        ("你好世界", 999, "你好世界"),
        ("你好世界", -1, "你好世"),
        ("你好世界", -2, "你好"),
        ("你好世界", -999, ""),
    ],
)
def test_str_head_codepoints(input: str, n: int, output: str) -> None:
    assert pl.Series([input]).str.head(n).to_list() == [output]


def test_str_head_expr() -> None:
    s = "012345"
    df = pl.DataFrame(
        {"a": [s, s, s, s, s, s, "", None], "n": [0, 2, -2, 100, -100, None, 3, -2]}
    )
    out = df.select(
        n_expr=pl.col("a").str.head("n"),
        n_pos2=pl.col("a").str.head(2),
        n_neg2=pl.col("a").str.head(-2),
        n_pos100=pl.col("a").str.head(100),
        n_pos_neg100=pl.col("a").str.head(-100),
        n_pos_0=pl.col("a").str.head(0),
        str_lit=pl.col("a").str.head(pl.lit(2)),
        lit_expr=pl.lit(s).str.head("n"),
        lit_n=pl.lit(s).str.head(2),
    )
    expected = pl.DataFrame(
        {
            "n_expr": ["", "01", "0123", "012345", "", None, "", None],
            "n_pos2": ["01", "01", "01", "01", "01", "01", "", None],
            "n_neg2": ["0123", "0123", "0123", "0123", "0123", "0123", "", None],
            "n_pos100": [s, s, s, s, s, s, "", None],
            "n_pos_neg100": ["", "", "", "", "", "", "", None],
            "n_pos_0": ["", "", "", "", "", "", "", None],
            "str_lit": ["01", "01", "01", "01", "01", "01", "", None],
            "lit_expr": ["", "01", "0123", "012345", "", None, "012", "0123"],
            "lit_n": ["01", "01", "01", "01", "01", "01", "01", "01"],
        }
    )
    assert_frame_equal(out, expected)


def test_str_head_wrong_length() -> None:
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.head(pl.Series([1, 2])))


@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        (["012345", "", None], 0, ["", "", None]),
        (["012345", "", None], 2, ["45", "", None]),
        (["012345", "", None], -2, ["2345", "", None]),
        (["012345", "", None], 100, ["012345", "", None]),
        (["012345", "", None], -100, ["", "", None]),
    ],
)
def test_str_tail(input: list[str], n: int, output: list[str]) -> None:
    assert pl.Series(input).str.tail(n).to_list() == output


@pytest.mark.parametrize(
    ("input", "n", "output"),
    [
        ("你好世界", 0, ""),
        ("你好世界", 2, "世界"),
        ("你好世界", 999, "你好世界"),
        ("你好世界", -1, "好世界"),
        ("你好世界", -2, "世界"),
        ("你好世界", -999, ""),
    ],
)
def test_str_tail_codepoints(input: str, n: int, output: str) -> None:
    assert pl.Series([input]).str.tail(n).to_list() == [output]


def test_str_tail_expr() -> None:
    s = "012345"
    df = pl.DataFrame(
        {"a": [s, s, s, s, s, s, "", None], "n": [0, 2, -2, 100, -100, None, 3, -2]}
    )
    out = df.select(
        n_expr=pl.col("a").str.tail("n"),
        n_pos2=pl.col("a").str.tail(2),
        n_neg2=pl.col("a").str.tail(-2),
        n_pos100=pl.col("a").str.tail(100),
        n_pos_neg100=pl.col("a").str.tail(-100),
        n_pos_0=pl.col("a").str.tail(0),
        str_lit=pl.col("a").str.tail(pl.lit(2)),
        lit_expr=pl.lit(s).str.tail("n"),
        lit_n=pl.lit(s).str.tail(2),
    )
    expected = pl.DataFrame(
        {
            "n_expr": ["", "45", "2345", "012345", "", None, "", None],
            "n_pos2": ["45", "45", "45", "45", "45", "45", "", None],
            "n_neg2": ["2345", "2345", "2345", "2345", "2345", "2345", "", None],
            "n_pos100": [s, s, s, s, s, s, "", None],
            "n_pos_neg100": ["", "", "", "", "", "", "", None],
            "n_pos_0": ["", "", "", "", "", "", "", None],
            "str_lit": ["45", "45", "45", "45", "45", "45", "", None],
            "lit_expr": ["", "45", "2345", "012345", "", None, "345", "2345"],
            "lit_n": ["45", "45", "45", "45", "45", "45", "45", "45"],
        }
    )
    assert_frame_equal(out, expected)


def test_str_tail_wrong_length() -> None:
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.tail(pl.Series([1, 2])))


def test_str_slice_multibyte() -> None:
    ref = "你好世界"
    s = pl.Series([ref])

    # Pad the string to simplify (negative) offsets starting before/after the string.
    npad = 20
    padref = "_" * npad + ref + "_" * npad
    for start in range(-5, 6):
        for length in range(6):
            offset = npad + start if start >= 0 else npad + start + len(ref)
            correct = padref[offset : offset + length].strip("_")
            result = s.str.slice(start, length)
            expected = pl.Series([correct])
            assert_series_equal(result, expected)


def test_str_len_bytes() -> None:
    s = pl.Series(["Café", None, "345", "東京"])
    result = s.str.len_bytes()
    expected = pl.Series([5, None, 3, 6], dtype=pl.UInt32)
    assert_series_equal(result, expected)


def test_str_len_chars() -> None:
    s = pl.Series(["Café", None, "345", "東京"])
    result = s.str.len_chars()
    expected = pl.Series([4, None, 3, 2], dtype=pl.UInt32)
    assert_series_equal(result, expected)


def test_str_contains() -> None:
    s = pl.Series(["messi", "ronaldo", "ibrahimovic"])
    expected = pl.Series([True, False, False])
    assert_series_equal(s.str.contains("mes"), expected)


def test_str_contains_wrong_length() -> None:
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.contains(pl.Series(["a", "b"])))  # type: ignore [arg-type]


def test_count_match_literal() -> None:
    s = pl.Series(["12 dbc 3xy", "cat\\w", "1zy3\\d\\d", None])
    out = s.str.count_matches(r"\d", literal=True)
    expected = pl.Series([0, 0, 2, None], dtype=pl.UInt32)
    assert_series_equal(out, expected)

    out = s.str.count_matches(pl.Series([r"\w", r"\w", r"\d", r"\d"]), literal=True)
    expected = pl.Series([0, 1, 2, None], dtype=pl.UInt32)
    assert_series_equal(out, expected)


def test_str_encode() -> None:
    s = pl.Series(["foo", "bar", None])
    hex_encoded = pl.Series(["666f6f", "626172", None])
    base64_encoded = pl.Series(["Zm9v", "YmFy", None])

    assert_series_equal(s.str.encode("hex"), hex_encoded)
    assert_series_equal(s.str.encode("base64"), base64_encoded)
    with pytest.raises(ValueError):
        s.str.encode("utf8")  # type: ignore[arg-type]


def test_str_decode() -> None:
    hex_encoded = pl.Series(["666f6f", "626172", None])
    base64_encoded = pl.Series(["Zm9v", "YmFy", None])
    expected = pl.Series([b"foo", b"bar", None])

    assert_series_equal(hex_encoded.str.decode("hex"), expected)
    assert_series_equal(base64_encoded.str.decode("base64"), expected)


def test_str_decode_exception() -> None:
    s = pl.Series(["not a valid", "626172", None])
    with pytest.raises(ComputeError):
        s.str.decode(encoding="hex")
    with pytest.raises(ComputeError):
        s.str.decode(encoding="base64")
    with pytest.raises(ValueError):
        s.str.decode("utf8")  # type: ignore[arg-type]


@pytest.mark.parametrize("strict", [True, False])
def test_str_find(strict: bool) -> None:
    df = pl.DataFrame(
        data=[
            ("Dubai", 3564931, "b[ai]", "ai"),
            ("Abu Dhabi", 1807000, "b[ai]", " "),
            ("Sharjah", 1405000, "[ai]n", "s"),
            ("Al Ain", 846747, "[ai]n", ""),
            ("Ajman", 490035, "[ai]n", "ma"),
            ("Ras Al Khaimah", 191753, "a.+a", "Kha"),
            ("Fujairah", 118933, "a.+a", None),
            ("Umm Al Quwain", 59098, "a.+a", "wa"),
            (None, None, None, "n/a"),
        ],
        schema={
            "city": pl.String,
            "population": pl.Int32,
            "pat": pl.String,
            "lit": pl.String,
        },
        orient="row",
    )
    city, pop, pat, lit = (pl.col(c) for c in ("city", "population", "pat", "lit"))

    for match_lit in (True, False):
        res = df.select(
            find_a_regex=city.str.find("(?i)a", strict=strict),
            find_a_lit=city.str.find("a", literal=match_lit),
            find_00_lit=pop.cast(pl.String).str.find("00", literal=match_lit),
            find_col_lit=city.str.find(lit, strict=strict, literal=match_lit),
            find_col_pat=city.str.find(pat, strict=strict),
        )
        assert res.to_dict(as_series=False) == {
            "find_a_regex": [3, 0, 2, 0, 0, 1, 3, 4, None],
            "find_a_lit": [3, 6, 2, None, 3, 1, 3, 10, None],
            "find_00_lit": [None, 4, 4, None, 2, None, None, None, None],
            "find_col_lit": [3, 3, None, 0, 2, 7, None, 9, None],
            "find_col_pat": [2, 7, None, 4, 3, 1, 3, None, None],
        }


def test_str_find_invalid_regex() -> None:
    # test behaviour of 'strict' with invalid regular expressions
    df = pl.DataFrame({"txt": ["AbCdEfG"]})
    rx_invalid = "(?i)AB.))"

    with pytest.raises(ComputeError):
        df.with_columns(pl.col("txt").str.find(rx_invalid, strict=True))

    res = df.with_columns(pl.col("txt").str.find(rx_invalid, strict=False))
    assert res.item() is None


def test_str_find_escaped_chars() -> None:
    # test behaviour of 'literal=True' with special chars
    df = pl.DataFrame({"txt": ["123.*465", "x(x?)x"]})

    res = df.with_columns(
        x1=pl.col("txt").str.find("(x?)", literal=True),
        x2=pl.col("txt").str.find(".*4", literal=True),
        x3=pl.col("txt").str.find("(x?)"),
        x4=pl.col("txt").str.find(".*4"),
    )
    # ┌──────────┬──────┬──────┬─────┬──────┐
    # │ txt      ┆ x1   ┆ x2   ┆ x3  ┆ x4   │
    # │ ---      ┆ ---  ┆ ---  ┆ --- ┆ ---  │
    # │ str      ┆ u32  ┆ u32  ┆ u32 ┆ u32  │
    # ╞══════════╪══════╪══════╪═════╪══════╡
    # │ 123.*465 ┆ null ┆ 3    ┆ 0   ┆ 0    │
    # │ x(x?)x   ┆ 1    ┆ null ┆ 0   ┆ null │
    # └──────────┴──────┴──────┴─────┴──────┘
    assert_frame_equal(
        pl.DataFrame(
            {
                "txt": ["123.*465", "x(x?)x"],
                "x1": [None, 1],
                "x2": [3, None],
                "x3": [0, 0],
                "x4": [0, None],
            }
        ).cast({cs.signed_integer(): pl.UInt32}),
        res,
    )


def test_str_find_wrong_length() -> None:
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.find(pl.Series(["a", "b"])))  # type: ignore [arg-type]


def test_hex_decode_return_dtype() -> None:
    data = {"a": ["68656c6c6f", "776f726c64"]}
    expr = pl.col("a").str.decode("hex")

    df = pl.DataFrame(data).select(expr)
    assert df.schema == {"a": pl.Binary}

    ldf = pl.LazyFrame(data).select(expr)
    assert ldf.collect_schema() == {"a": pl.Binary}


def test_base64_decode_return_dtype() -> None:
    data = {"a": ["Zm9v", "YmFy"]}
    expr = pl.col("a").str.decode("base64")

    df = pl.DataFrame(data).select(expr)
    assert df.schema == {"a": pl.Binary}

    ldf = pl.LazyFrame(data).select(expr)
    assert ldf.collect_schema() == {"a": pl.Binary}


def test_str_replace_str_replace_all() -> None:
    s = pl.Series(["hello", "world", "test", "rooted"])
    expected = pl.Series(["hell0", "w0rld", "test", "r0oted"])
    assert_series_equal(s.str.replace("o", "0"), expected)

    expected = pl.Series(["hell0", "w0rld", "test", "r00ted"])
    assert_series_equal(s.str.replace_all("o", "0"), expected)


def test_str_replace_n_single() -> None:
    s = pl.Series(["aba", "abaa"])

    assert s.str.replace("a", "b", n=1).to_list() == ["bba", "bbaa"]
    assert s.str.replace("a", "b", n=2).to_list() == ["bbb", "bbba"]
    assert s.str.replace("a", "b", n=3).to_list() == ["bbb", "bbbb"]


def test_str_replace_n_same_length() -> None:
    # pat and val have the same length
    # this triggers a fast path
    s = pl.Series(["abfeab", "foobarabfooabab"])
    assert s.str.replace("ab", "AB", n=1).to_list() == ["ABfeab", "foobarABfooabab"]
    assert s.str.replace("ab", "AB", n=2).to_list() == ["ABfeAB", "foobarABfooABab"]
    assert s.str.replace("ab", "AB", n=3).to_list() == ["ABfeAB", "foobarABfooABAB"]


def test_str_to_lowercase() -> None:
    s = pl.Series(["Hello", "WORLD"])
    expected = pl.Series(["hello", "world"])
    assert_series_equal(s.str.to_lowercase(), expected)


def test_str_to_uppercase() -> None:
    s = pl.Series(["Hello", "WORLD"])
    expected = pl.Series(["HELLO", "WORLD"])
    assert_series_equal(s.str.to_uppercase(), expected)


def test_str_case_cyrillic() -> None:
    vals = ["Biтpyк", "Iвaн"]
    s = pl.Series(vals)
    assert s.str.to_lowercase().to_list() == [a.lower() for a in vals]
    assert s.str.to_uppercase().to_list() == [a.upper() for a in vals]


def test_str_to_integer() -> None:
    bin = pl.Series(["110", "101", "010"])
    assert_series_equal(bin.str.to_integer(base=2), pl.Series([6, 5, 2]).cast(pl.Int64))

    hex = pl.Series(["fa1e", "ff00", "cafe", "invalid", None])
    assert_series_equal(
        hex.str.to_integer(base=16, strict=False),
        pl.Series([64030, 65280, 51966, None, None]).cast(pl.Int64),
        check_exact=True,
    )

    with pytest.raises(ComputeError):
        hex.str.to_integer(base=16)


@pytest.mark.parametrize("strict", [False, True])
def test_str_to_integer_invalid_base(strict: bool) -> None:
    numbers = pl.Series(["1", "ZZZ", "-ABCZZZ", None])
    with pytest.raises(ComputeError):
        numbers.str.to_integer(base=100, strict=strict)

    df = pl.DataFrame({"str": numbers, "base": [0, 1, 100, None]})
    with pytest.raises(ComputeError):
        df.select(pl.col("str").str.to_integer(base=pl.col("base"), strict=strict))


def test_str_to_integer_base_expr() -> None:
    df = pl.DataFrame(
        {"str": ["110", "ff00", "234", None, "130"], "base": [2, 16, 10, 8, None]}
    )
    out = df.select(base_expr=pl.col("str").str.to_integer(base="base"))
    expected = pl.DataFrame({"base_expr": [6, 65280, 234, None, None]})
    assert_frame_equal(out, expected)

    # test strict raise
    df = pl.DataFrame({"str": ["110", "ff00", "cafe", None], "base": [2, 10, 10, 8]})

    with pytest.raises(ComputeError):
        df.select(pl.col("str").str.to_integer(base="base"))


def test_str_to_integer_base_literal() -> None:
    df = pl.DataFrame(
        {
            "bin": ["110", "101", "-010", "invalid", None],
            "hex": ["fa1e", "ff00", "cafe", "invalid", None],
        }
    )
    result = df.with_columns(
        pl.col("bin").str.to_integer(base=2, strict=False),
        pl.col("hex").str.to_integer(base=16, strict=False),
    )

    expected = pl.DataFrame(
        {
            "bin": [6, 5, -2, None, None],
            "hex": [64030, 65280, 51966, None, None],
        }
    )
    assert_frame_equal(result, expected)

    with pytest.raises(ComputeError):
        df.with_columns(
            pl.col("bin").str.to_integer(base=2),
            pl.col("hex").str.to_integer(base=16),
        )


def test_str_to_integer_dtype() -> None:
    lf = pl.LazyFrame(
        {
            "str": ["1111111", "7f", "127", None, "42"],
            "base": [2, 16, 10, 8, None],
        }
    )
    out = lf.select(
        i8=pl.col("str").str.to_integer(base="base", dtype=pl.Int8),
        i16=pl.col("str").str.to_integer(base="base", dtype=pl.Int16),
        i32=pl.col("str").str.to_integer(base="base", dtype=pl.Int32),
        i64=pl.col("str").str.to_integer(base="base", dtype=pl.Int64),
        u8=pl.col("str").str.to_integer(base="base", dtype=pl.UInt8),
        u16=pl.col("str").str.to_integer(base="base", dtype=pl.UInt16),
        u32=pl.col("str").str.to_integer(base="base", dtype=pl.UInt32),
        u64=pl.col("str").str.to_integer(base="base", dtype=pl.UInt64),
        default=pl.col("str").str.to_integer(base="base"),
    ).collect()

    expected = pl.DataFrame(
        {
            "i8": [127, 127, 127, None, None],
            "i16": [127, 127, 127, None, None],
            "i32": [127, 127, 127, None, None],
            "i64": [127, 127, 127, None, None],
            "u8": [127, 127, 127, None, None],
            "u16": [127, 127, 127, None, None],
            "u32": [127, 127, 127, None, None],
            "u64": [127, 127, 127, None, None],
            "default": [127, 127, 127, None, None],
        },
        schema={
            "i8": pl.Int8,
            "i16": pl.Int16,
            "i32": pl.Int32,
            "i64": pl.Int64,
            "u8": pl.UInt8,
            "u16": pl.UInt16,
            "u32": pl.UInt32,
            "u64": pl.UInt64,
            "default": pl.Int64,
        },
    )
    assert lf.collect_schema() == lf.collect().schema
    assert_frame_equal(out, expected)


def test_str_to_integer_large() -> None:
    df = pl.DataFrame(
        {
            "str": [
                "-6129899454972456276923959272",
                "1A44E53BFEBA967E6682FBB0",
                "10100110111110110101110100000100110010101111000100011000000100010101010101101011111111101000",
                None,
                "7798994549724957734429272",
            ],
            "base": [10, 16, 2, 8, None],
        }
    )
    out = df.select(i128=pl.col("str").str.to_integer(base="base", dtype=pl.Int128))
    expected = pl.DataFrame(
        {
            "i128": [
                -6129899454972456276923959272,
                8129899739726392769273592752,
                3229899454972495776923959272,
                None,
                None,
            ]
        },
        schema={"i128": pl.Int128},
    )
    assert_frame_equal(out, expected)

    # test strict raise
    df = pl.DataFrame(
        {
            "i128": [
                "612989945497245627692395927261298994549724562769239592726129899454972456276923959272",
                "1A44E53BFEBA967E6682FBB0",
                "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
                "7798994549724957734429272",
                None,
                "7798994549724957734429272",
            ],
            "base": [10, 2, 16, 10, 8, None],
        }
    )

    with pytest.raises(ComputeError):
        df.select(pl.col("i128").str.to_integer(base="base", dtype=pl.Int128))


def test_str_strip_chars_expr() -> None:
    df = pl.DataFrame(
        {
            "s": [" hello ", "^^world^^", "&&hi&&", " polars ", None],
            "pat": [" ", "^", "&", None, "anything"],
        }
    )

    all_expr = df.select(
        pl.col("s").str.strip_chars(pl.col("pat")).alias("strip_chars"),
        pl.col("s").str.strip_chars_start(pl.col("pat")).alias("strip_chars_start"),
        pl.col("s").str.strip_chars_end(pl.col("pat")).alias("strip_chars_end"),
    )

    expected = pl.DataFrame(
        {
            "strip_chars": ["hello", "world", "hi", "polars", None],
            "strip_chars_start": ["hello ", "world^^", "hi&&", "polars ", None],
            "strip_chars_end": [" hello", "^^world", "&&hi", " polars", None],
        }
    )

    assert_frame_equal(all_expr, expected)

    strip_by_null = df.select(
        pl.col("s").str.strip_chars(None).alias("strip_chars"),
        pl.col("s").str.strip_chars_start(None).alias("strip_chars_start"),
        pl.col("s").str.strip_chars_end(None).alias("strip_chars_end"),
    )

    # only whitespace are striped.
    expected = pl.DataFrame(
        {
            "strip_chars": ["hello", "^^world^^", "&&hi&&", "polars", None],
            "strip_chars_start": ["hello ", "^^world^^", "&&hi&&", "polars ", None],
            "strip_chars_end": [" hello", "^^world^^", "&&hi&&", " polars", None],
        }
    )
    assert_frame_equal(strip_by_null, expected)


def test_str_strip_chars() -> None:
    s = pl.Series([" hello ", "world\t "])
    expected = pl.Series(["hello", "world"])
    assert_series_equal(s.str.strip_chars(), expected)

    expected = pl.Series(["hell", "world"])
    assert_series_equal(s.str.strip_chars().str.strip_chars("o"), expected)

    expected = pl.Series(["ell", "rld\t"])
    assert_series_equal(s.str.strip_chars(" hwo"), expected)


def test_str_strip_chars_wrong_length() -> None:
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_chars(pl.Series(["a", "b"])))


def test_str_strip_chars_start() -> None:
    s = pl.Series([" hello ", "\t world"])
    expected = pl.Series(["hello ", "world"])
    assert_series_equal(s.str.strip_chars_start(), expected)

    expected = pl.Series(["ello ", "world"])
    assert_series_equal(s.str.strip_chars_start().str.strip_chars_start("h"), expected)

    expected = pl.Series(["ello ", "\t world"])
    assert_series_equal(s.str.strip_chars_start("hw "), expected)


def test_str_strip_chars_start_wrong_length() -> None:
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_chars_start(pl.Series(["a", "b"])))


def test_str_strip_chars_end() -> None:
    s = pl.Series([" hello ", "world\t "])
    expected = pl.Series([" hello", "world"])
    assert_series_equal(s.str.strip_chars_end(), expected)

    expected = pl.Series([" hell", "world"])
    assert_series_equal(s.str.strip_chars_end().str.strip_chars_end("o"), expected)

    expected = pl.Series([" he", "wor"])
    assert_series_equal(s.str.strip_chars_end("odl \t"), expected)


def test_str_strip_chars_end_wrong_length() -> None:
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_chars_end(pl.Series(["a", "b"])))


def test_str_strip_whitespace() -> None:
    s = pl.Series("a", ["trailing ", " leading", " both "])

    expected = pl.Series("a", ["trailing", " leading", " both"])
    assert_series_equal(s.str.strip_chars_end(), expected)

    expected = pl.Series("a", ["trailing ", "leading", "both "])
    assert_series_equal(s.str.strip_chars_start(), expected)

    expected = pl.Series("a", ["trailing", "leading", "both"])
    assert_series_equal(s.str.strip_chars(), expected)


def test_str_strip_prefix_literal() -> None:
    s = pl.Series(["foo:bar", "foofoo:bar", "bar:bar", "foo", "", None])
    expected = pl.Series([":bar", "foo:bar", "bar:bar", "", "", None])
    assert_series_equal(s.str.strip_prefix("foo"), expected)
    # test null literal
    expected = pl.Series([None, None, None, None, None, None], dtype=pl.String)
    assert_series_equal(s.str.strip_prefix(pl.lit(None, dtype=pl.String)), expected)


def test_str_strip_prefix_suffix_expr() -> None:
    df = pl.DataFrame(
        {
            "s": ["foo-bar", "foobarbar", "barfoo", "", "anything", None],
            "prefix": ["foo", "foobar", "foo", "", None, "bar"],
            "suffix": ["bar", "barbar", "bar", "", None, "foo"],
        }
    )
    out = df.select(
        pl.col("s").str.strip_prefix(pl.col("prefix")).alias("strip_prefix"),
        pl.col("s").str.strip_suffix(pl.col("suffix")).alias("strip_suffix"),
    )
    assert out.to_dict(as_series=False) == {
        "strip_prefix": ["-bar", "bar", "barfoo", "", None, None],
        "strip_suffix": ["foo-", "foo", "barfoo", "", None, None],
    }


def test_str_strip_prefix_wrong_length() -> None:
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_prefix(pl.Series(["a", "b"])))


def test_str_strip_suffix() -> None:
    s = pl.Series(["foo:bar", "foo:barbar", "foo:foo", "bar", "", None])
    expected = pl.Series(["foo:", "foo:bar", "foo:foo", "", "", None])
    assert_series_equal(s.str.strip_suffix("bar"), expected)
    # test null literal
    expected = pl.Series([None, None, None, None, None, None], dtype=pl.String)
    assert_series_equal(s.str.strip_suffix(pl.lit(None, dtype=pl.String)), expected)


def test_str_strip_suffix_wrong_length() -> None:
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises(ShapeError):
        df.select(pl.col("num").str.strip_suffix(pl.Series(["a", "b"])))


def test_str_split() -> None:
    a = pl.Series("a", ["a, b", "a", "ab,c,de"])
    for out in [a.str.split(","), pl.select(pl.lit(a).str.split(",")).to_series()]:
        assert out[0].to_list() == ["a", " b"]
        assert out[1].to_list() == ["a"]
        assert out[2].to_list() == ["ab", "c", "de"]

    for out in [
        a.str.split(",", inclusive=True),
        pl.select(pl.lit(a).str.split(",", inclusive=True)).to_series(),
    ]:
        assert out[0].to_list() == ["a,", " b"]
        assert out[1].to_list() == ["a"]
        assert out[2].to_list() == ["ab,", "c,", "de"]


def test_json_decode_series() -> None:
    s = pl.Series(["[1, 2, 3]", None, "[4, 5, 6]"])
    expected = pl.Series([[1, 2, 3], None, [4, 5, 6]])
    dtype = pl.List(pl.Int64)
    assert_series_equal(s.str.json_decode(None), expected)
    assert_series_equal(s.str.json_decode(dtype), expected)

    s = pl.Series(['{"a": 1, "b": true}', None, '{"a": 2, "b": false}'])
    expected = pl.Series([{"a": 1, "b": True}, None, {"a": 2, "b": False}])
    dtype2 = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])
    assert_series_equal(s.str.json_decode(None), expected)
    assert_series_equal(s.str.json_decode(dtype2), expected)

    expected = pl.Series([{"a": 1}, None, {"a": 2}])
    dtype2 = pl.Struct([pl.Field("a", pl.Int64)])
    assert_series_equal(s.str.json_decode(dtype2), expected)

    s = pl.Series([], dtype=pl.String)
    expected = pl.Series([], dtype=pl.List(pl.Int64))
    dtype = pl.List(pl.Int64)
    assert_series_equal(s.str.json_decode(dtype), expected)


def test_json_decode_lazy_expr() -> None:
    dtype = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])
    ldf = (
        pl.DataFrame({"json": ['{"a": 1, "b": true}', None, '{"a": 2, "b": false}']})
        .lazy()
        .select(pl.col("json").str.json_decode(dtype))
    )
    expected = pl.DataFrame(
        {"json": [{"a": 1, "b": True}, None, {"a": 2, "b": False}]}
    ).lazy()
    assert ldf.collect_schema() == {"json": dtype}
    assert_frame_equal(ldf, expected)


def test_json_decode_nested_struct() -> None:
    json = [
        '[{"key_1": "a"}]',
        '[{"key_1": "a2", "key_2": 2}]',
        '[{"key_1": "a3", "key_2": 3, "key_3": "c"}]',
    ]
    s = pl.Series("json_str", json)
    s_parsed = s.str.json_decode().rename("parsed_list_json")

    expected_dtype = pl.List(
        pl.Struct(
            [
                pl.Field("key_1", pl.String),
                pl.Field("key_2", pl.Int64),
                pl.Field("key_3", pl.String),
            ]
        )
    )
    assert s_parsed.dtype == expected_dtype

    key_1_values = s_parsed.to_frame().select(
        pl.col("parsed_list_json")
        .list.get(0)
        .struct.field("key_1")
        .alias("key_1_values")
    )
    expected_values = pl.Series("key_1_values", ["a", "a2", "a3"])
    assert_series_equal(key_1_values.get_column("key_1_values"), expected_values)


def test_json_decode_primitive_to_list_11053() -> None:
    df = pl.DataFrame(
        {
            "json": [
                '{"col1": ["123"], "col2": "123"}',
                '{"col1": ["xyz"], "col2": null}',
            ]
        }
    )
    schema = pl.Struct(
        {
            "col1": pl.List(pl.String),
            "col2": pl.List(pl.String),
        }
    )

    output = df.select(
        pl.col("json").str.json_decode(schema).alias("decoded_json")
    ).unnest("decoded_json")
    expected = pl.DataFrame({"col1": [["123"], ["xyz"]], "col2": [["123"], None]})
    assert_frame_equal(output, expected)


def test_jsonpath_single() -> None:
    s = pl.Series(['{"a":"1"}', None, '{"a":2}', '{"a":2.1}', '{"a":true}'])
    expected = pl.Series(["1", None, "2", "2.1", "true"])
    assert_series_equal(s.str.json_path_match("$.a"), expected)


def test_json_path_match() -> None:
    df = pl.DataFrame(
        {
            "str": [
                '{"a":"1"}',
                None,
                '{"b":2}',
                '{"a":2.1, "b": "hello"}',
                '{"a":true}',
            ],
            "pat": ["$.a", "$.a", "$.b", "$.b", None],
        }
    )
    out = df.select(
        all_expr=pl.col("str").str.json_path_match(pl.col("pat")),
        str_expr=pl.col("str").str.json_path_match("$.a"),
        pat_expr=pl.lit('{"a": 1.1, "b": 10}').str.json_path_match(pl.col("pat")),
    )
    expected = pl.DataFrame(
        {
            "all_expr": ["1", None, "2", "hello", None],
            "str_expr": ["1", None, None, "2.1", "true"],
            "pat_expr": ["1.1", "1.1", "10", "10", None],
        }
    )
    assert_frame_equal(out, expected)


def test_str_json_path_match_wrong_length() -> None:
    df = pl.DataFrame({"num": ["-10", "-1", "0"]})
    with pytest.raises((ShapeError, ComputeError)):
        df.select(pl.col("num").str.json_path_match(pl.Series(["a", "b"])))


def test_extract_regex() -> None:
    s = pl.Series(
        [
            "http://vote.com/ballon_dor?candidate=messi&ref=polars",
            "http://vote.com/ballon_dor?candidat=jorginho&ref=polars",
            "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars",
        ]
    )
    expected = pl.Series(["messi", None, "ronaldo"])
    assert_series_equal(s.str.extract(r"candidate=(\w+)", 1), expected)


def test_extract() -> None:
    df = pl.DataFrame(
        {
            "s": ["aron123", "12butler", "charly*", "~david", None],
            "pat": [r"^([a-zA-Z]+)", r"^(\d+)", None, "^(da)", r"(.*)"],
        }
    )

    out = df.select(
        all_expr=pl.col("s").str.extract(pl.col("pat"), 1),
        str_expr=pl.col("s").str.extract("^([a-zA-Z]+)", 1),
        pat_expr=pl.lit("aron123").str.extract(pl.col("pat")),
    )
    expected = pl.DataFrame(
        {
            "all_expr": ["aron", "12", None, None, None],
            "str_expr": ["aron", None, "charly", None, None],
            "pat_expr": ["aron", None, None, None, "aron123"],
        }
    )
    assert_frame_equal(out, expected)


def test_extract_binary() -> None:
    df = pl.DataFrame({"foo": ["aron", "butler", "charly", "david"]})
    out = df.filter(pl.col("foo").str.extract("^(a)", 1) == "a").to_series()
    assert out[0] == "aron"


def test_str_join_returns_scalar() -> None:
    df = pl.DataFrame(
        [pl.Series("val", ["A", "B", "C", "D"]), pl.Series("id", [1, 1, 2, 2])]
    )
    grouped = (
        df.group_by("id")
        .agg(pl.col("val").str.join(delimiter=",").alias("grouped"))
        .get_column("grouped")
    )
    assert grouped.dtype == pl.String


def test_contains() -> None:
    # test strict/non strict
    s_txt = pl.Series(["123", "456", "789"])
    assert (
        pl.Series([None, None, None]).cast(pl.Boolean).to_list()
        == s_txt.str.contains("(not_valid_regex", literal=False, strict=False).to_list()
    )
    with pytest.raises(ComputeError):
        s_txt.str.contains("(not_valid_regex", literal=False, strict=True)
    assert (
        pl.Series([True, False, False]).cast(pl.Boolean).to_list()
        == s_txt.str.contains("1", literal=False, strict=False).to_list()
    )

    df = pl.DataFrame(
        data=[(1, "some * * text"), (2, "(with) special\n * chars"), (3, "**etc...?$")],
        schema=["idx", "text"],
        orient="row",
    )
    for pattern, as_literal, expected in (
        (r"\* \*", False, [True, False, False]),
        (r"* *", True, [True, False, False]),
        (r"^\(", False, [False, True, False]),
        (r"^\(", True, [False, False, False]),
        (r"(", True, [False, True, False]),
        (r"e", False, [True, True, True]),
        (r"e", True, [True, True, True]),
        (r"^\S+$", False, [False, False, True]),
        (r"\?\$", False, [False, False, True]),
        (r"?$", True, [False, False, True]),
    ):
        # series
        assert (
            expected == df["text"].str.contains(pattern, literal=as_literal).to_list()
        )
        # frame select
        assert (
            expected
            == df.select(pl.col("text").str.contains(pattern, literal=as_literal))[
                "text"
            ].to_list()
        )
        # frame filter
        assert sum(expected) == len(
            df.filter(pl.col("text").str.contains(pattern, literal=as_literal))
        )


def test_contains_expr() -> None:
    df = pl.DataFrame(
        {
            "text": [
                "some text",
                "(with) special\n .* chars",
                "**etc...?$",
                None,
                "b",
                "invalid_regex",
            ],
            "pattern": [r"[me]", r".*", r"^\(", "a", None, "*"],
        }
    )

    assert df.select(
        pl.col("text")
        .str.contains(pl.col("pattern"), literal=False, strict=False)
        .alias("contains"),
        pl.col("text")
        .str.contains(pl.col("pattern"), literal=True)
        .alias("contains_lit"),
    ).to_dict(as_series=False) == {
        "contains": [True, True, False, None, None, None],
        "contains_lit": [False, True, False, None, None, False],
    }

    with pytest.raises(ComputeError):
        df.select(
            pl.col("text").str.contains(pl.col("pattern"), literal=False, strict=True)
        )


@pytest.mark.parametrize(
    ("pattern", "case_insensitive", "expected"),
    [
        (["me"], False, True),
        (["Me"], False, False),
        (["Me"], True, True),
        (pl.Series(["me", "they"]), False, True),
        (pl.Series(["Me", "they"]), False, False),
        (pl.Series(["Me", "they"]), True, True),
        (["me", "they"], False, True),
        (["Me", "they"], False, False),
        (["Me", "they"], True, True),
    ],
)
def test_contains_any(
    pattern: pl.Series | list[str],
    case_insensitive: bool,
    expected: bool,
) -> None:
    df = pl.DataFrame({"text": ["Tell me what you want"]})
    # series
    assert (
        expected
        == df["text"]
        .str.contains_any(pattern, ascii_case_insensitive=case_insensitive)
        .item()
    )
    # expr
    assert (
        expected
        == df.select(
            pl.col("text").str.contains_any(
                pattern, ascii_case_insensitive=case_insensitive
            )
        )["text"].item()
    )
    # frame filter
    assert int(expected) == len(
        df.filter(
            pl.col("text").str.contains_any(
                pattern, ascii_case_insensitive=case_insensitive
            )
        )
    )


def test_replace() -> None:
    df = pl.DataFrame(
        data=[(1, "* * text"), (2, "(with) special\n * chars **etc...?$")],
        schema=["idx", "text"],
        orient="row",
    )
    for pattern, replacement, as_literal, expected in (
        (r"\*", "-", False, ["- * text", "(with) special\n - chars **etc...?$"]),
        (r"*", "-", True, ["- * text", "(with) special\n - chars **etc...?$"]),
        (r"^\(", "[", False, ["* * text", "[with) special\n * chars **etc...?$"]),
        (r"^\(", "[", True, ["* * text", "(with) special\n * chars **etc...?$"]),
        (r"t$", "an", False, ["* * texan", "(with) special\n * chars **etc...?$"]),
        (r"t$", "an", True, ["* * text", "(with) special\n * chars **etc...?$"]),
        (r"(with) special", "$1", True, ["* * text", "$1\n * chars **etc...?$"]),
        (
            r"\((with)\) special",
            ":$1:",
            False,
            ["* * text", ":with:\n * chars **etc...?$"],
        ),
    ):
        # series
        assert (
            expected
            == df["text"]
            .str.replace(pattern, replacement, literal=as_literal)
            .to_list()
        )
        # expr
        assert (
            expected
            == df.select(
                pl.col("text").str.replace(pattern, replacement, literal=as_literal)
            )["text"].to_list()
        )

    assert pl.Series(["."]).str.replace(".", "$0", literal=True)[0] == "$0"
    assert pl.Series(["(.)(?)"]).str.replace(".", "$1", literal=True)[0] == "($1)(?)"


def test_replace_all() -> None:
    df = pl.DataFrame(
        data=[(1, "* * text"), (2, "(with) special\n * chars **etc...?$")],
        schema=["idx", "text"],
        orient="row",
    )
    for pattern, replacement, as_literal, expected in (
        (r"\*", "-", False, ["- - text", "(with) special\n - chars --etc...?$"]),
        (r"*", "-", True, ["- - text", "(with) special\n - chars --etc...?$"]),
        (r"\W", "", False, ["text", "withspecialcharsetc"]),
        (r".?$", "", True, ["* * text", "(with) special\n * chars **etc.."]),
        (
            r"(with) special",
            "$1",
            True,
            ["* * text", "$1\n * chars **etc...?$"],
        ),
        (
            r"\((with)\) special",
            ":$1:",
            False,
            ["* * text", ":with:\n * chars **etc...?$"],
        ),
        (
            r"(\b)[\w\s]{2,}(\b)",
            "$1(blah)$3",
            False,
            ["* * (blah)", "((blah)) (blah)\n * (blah) **(blah)...?$"],
        ),
    ):
        # series
        assert (
            expected
            == df["text"]
            .str.replace_all(pattern, replacement, literal=as_literal)
            .to_list()
        )
        # expr
        assert (
            expected
            == df.select(
                pl.col("text").str.replace_all(pattern, replacement, literal=as_literal)
            )["text"].to_list()
        )
        # invalid regex (but valid literal - requires "literal=True")
        with pytest.raises(ComputeError):
            df["text"].str.replace_all("*", "")

    assert (
        pl.Series([r"(.)(\?)(\?)"]).str.replace_all("\\?", "$0", literal=True)[0]
        == "(.)($0)($0)"
    )
    assert (
        pl.Series([r"(.)(\?)(\?)"]).str.replace_all("\\?", "$0", literal=False)[0]
        == "(.)(\\?)(\\?)"
    )


def test_replace_all_literal_no_caputures() -> None:
    # When using literal = True, capture groups should be disabled

    # Single row code path in Rust
    df = pl.DataFrame({"text": ["I found <amt> yesterday."], "amt": ["$1"]})
    df = df.with_columns(
        pl.col("text")
        .str.replace_all("<amt>", pl.col("amt"), literal=True)
        .alias("text2")
    )
    assert df.get_column("text2")[0] == "I found $1 yesterday."

    # Multi-row code path in Rust
    df2 = pl.DataFrame(
        {
            "text": ["I found <amt> yesterday.", "I lost <amt> yesterday."],
            "amt": ["$1", "$2"],
        }
    )
    df2 = df2.with_columns(
        pl.col("text")
        .str.replace_all("<amt>", pl.col("amt"), literal=True)
        .alias("text2")
    )
    assert df2.get_column("text2")[0] == "I found $1 yesterday."
    assert df2.get_column("text2")[1] == "I lost $2 yesterday."


def test_replace_literal_no_caputures() -> None:
    # When using literal = True, capture groups should be disabled

    # Single row code path in Rust
    df = pl.DataFrame({"text": ["I found <amt> yesterday."], "amt": ["$1"]})
    df = df.with_columns(
        pl.col("text").str.replace("<amt>", pl.col("amt"), literal=True).alias("text2")
    )
    assert df.get_column("text2")[0] == "I found $1 yesterday."

    # Multi-row code path in Rust
    # A string shorter than 32 chars,
    # and one longer than 32 chars to test both sub-paths
    df2 = pl.DataFrame(
        {
            "text": [
                "I found <amt> yesterday.",
                "I lost <amt> yesterday and this string is longer than 32 characters.",
            ],
            "amt": ["$1", "$2"],
        }
    )
    df2 = df2.with_columns(
        pl.col("text").str.replace("<amt>", pl.col("amt"), literal=True).alias("text2")
    )
    assert df2.get_column("text2")[0] == "I found $1 yesterday."
    assert (
        df2.get_column("text2")[1]
        == "I lost $2 yesterday and this string is longer than 32 characters."
    )


def test_replace_expressions() -> None:
    df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"], "value": ["A", "B"]})
    out = df.select([pl.col("foo").str.replace(pl.col("foo").first(), pl.col("value"))])
    assert out.to_dict(as_series=False) == {"foo": ["A", "xyz 678 910t"]}
    out = df.select([pl.col("foo").str.replace(pl.col("foo").last(), "value")])
    assert out.to_dict(as_series=False) == {"foo": ["123 bla 45 asd", "value"]}

    df = pl.DataFrame(
        {"foo": ["1 bla 45 asd", "xyz 6t"], "pat": [r"\d", r"\W"], "value": ["A", "B"]}
    )
    out = df.select([pl.col("foo").str.replace_all(pl.col("pat").first(), "value")])
    assert out.to_dict(as_series=False) == {
        "foo": ["value bla valuevalue asd", "xyz valuet"]
    }


@pytest.mark.parametrize(
    ("pattern", "replacement", "case_insensitive", "expected"),
    [
        (["say"], "", False, "Tell me what you want"),
        (["me"], ["them"], False, "Tell them what you want"),
        (["who"], ["them"], False, "Tell me what you want"),
        (["me", "you"], "it", False, "Tell it what it want"),
        (["Me", "you"], "it", False, "Tell me what it want"),
        (["me", "you"], ["it"], False, "Tell it what it want"),
        (["me", "you"], ["you", "me"], False, "Tell you what me want"),
        (["me", "You", "them"], "it", False, "Tell it what you want"),
        (["Me", "you"], "it", True, "Tell it what it want"),
        (["me", "YOU"], ["you", "me"], True, "Tell you what me want"),
        (pl.Series(["me", "YOU"]), ["you", "me"], False, "Tell you what you want"),
        (pl.Series(["me", "YOU"]), ["you", "me"], True, "Tell you what me want"),
    ],
)
def test_replace_many(
    pattern: pl.Series | list[str],
    replacement: pl.Series | list[str] | str,
    case_insensitive: bool,
    expected: str,
) -> None:
    df = pl.DataFrame({"text": ["Tell me what you want"]})
    # series
    assert (
        expected
        == df["text"]
        .str.replace_many(pattern, replacement, ascii_case_insensitive=case_insensitive)
        .item()
    )
    # expr
    assert (
        expected
        == df.select(
            pl.col("text").str.replace_many(
                pattern,
                replacement,
                ascii_case_insensitive=case_insensitive,
            )
        ).item()
    )


def test_replace_many_groupby() -> None:
    df = pl.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
            "g": [0, 0, 0, 1, 1, 1, 2, 2, 2],
        }
    )
    out = df.group_by("g").agg(pl.col.x.str.replace_many(pl.col.x.head(2), ""))
    expected = pl.DataFrame(
        {
            "g": [0, 1, 2],
            "x": [["", "", "c"], ["", "", "f"], ["", "", "i"]],
        }
    )
    assert_frame_equal(out, expected, check_row_order=False)


@pytest.mark.parametrize(
    ("mapping", "case_insensitive", "expected"),
    [
        ({}, False, "Tell me what you want"),
        ({"me": "them"}, False, "Tell them what you want"),
        ({"who": "them"}, False, "Tell me what you want"),
        ({"me": "it", "you": "it"}, False, "Tell it what it want"),
        ({"Me": "it", "you": "it"}, False, "Tell me what it want"),
        ({"me": "you", "you": "me"}, False, "Tell you what me want"),
        ({}, True, "Tell me what you want"),
        ({"Me": "it", "you": "it"}, True, "Tell it what it want"),
        ({"me": "you", "YOU": "me"}, True, "Tell you what me want"),
    ],
)
def test_replace_many_mapping(
    mapping: dict[str, str],
    case_insensitive: bool,
    expected: str,
) -> None:
    df = pl.DataFrame({"text": ["Tell me what you want"]})
    # series
    assert (
        expected
        == df["text"]
        .str.replace_many(mapping, ascii_case_insensitive=case_insensitive)
        .item()
    )
    # expr
    assert (
        expected
        == df.select(
            pl.col("text").str.replace_many(
                mapping,
                ascii_case_insensitive=case_insensitive,
            )
        ).item()
    )


def test_replace_many_invalid_inputs() -> None:
    df = pl.DataFrame({"text": ["Tell me what you want"]})

    # Ensure a string as the first argument is parsed as a column name.
    with pytest.raises(ColumnNotFoundError, match="me"):
        df.select(pl.col("text").str.replace_many("me", "you"))

    with pytest.raises(InvalidOperationError):
        df.select(pl.col("text").str.replace_many(1, 2))

    with pytest.raises(InvalidOperationError):
        df.select(pl.col("text").str.replace_many([1], [2]))

    with pytest.raises(InvalidOperationError):
        df.select(pl.col("text").str.replace_many(["me"], None))

    with pytest.raises(TypeError):
        df.select(pl.col("text").str.replace_many(["me"]))

    with pytest.raises(
        InvalidOperationError,
        match="expected the same amount of patterns as replacement strings",
    ):
        df.select(pl.col("text").str.replace_many(["a"], ["b", "c"]))

    s = df.to_series()

    with pytest.raises(ColumnNotFoundError, match="me"):
        s.str.replace_many("me", "you")  # type: ignore[arg-type]

    with pytest.raises(TypeError):
        df.select(pl.col("text").str.replace_many(["me"]))

    with pytest.raises(
        InvalidOperationError,
        match="expected the same amount of patterns as replacement strings",
    ):
        s.str.replace_many(["a"], ["b", "c"])


def test_extract_all_count() -> None:
    df = pl.DataFrame({"foo": ["123 bla 45 asd", "xaz 678 910t", "boo", None]})
    assert (
        df.select(
            pl.col("foo").str.extract_all(r"a").alias("extract"),
            pl.col("foo").str.count_matches(r"a").alias("count"),
        ).to_dict(as_series=False)
    ) == {"extract": [["a", "a"], ["a"], [], None], "count": [2, 1, 0, None]}

    assert df["foo"].str.extract_all(r"a").dtype == pl.List
    assert df["foo"].str.count_matches(r"a").dtype == pl.UInt32


def test_count_matches_many() -> None:
    df = pl.DataFrame(
        {
            "foo": ["123 bla 45 asd", "xyz 678 910t", None, "boo"],
            "bar": [r"\d", r"[a-z]", r"\d", None],
        }
    )
    assert (
        df.select(
            pl.col("foo").str.count_matches(pl.col("bar")).alias("count")
        ).to_dict(as_series=False)
    ) == {"count": [5, 4, None, None]}

    assert df["foo"].str.count_matches(df["bar"]).dtype == pl.UInt32

    # Test broadcast.
    broad = df.select(
        pl.col("foo").str.count_matches(pl.col("bar").first()).alias("count"),
        pl.col("foo").str.count_matches(pl.col("bar").last()).alias("count_null"),
    )
    assert broad.to_dict(as_series=False) == {
        "count": [5, 6, None, 0],
        "count_null": [None, None, None, None],
    }
    assert broad.schema == {"count": pl.UInt32, "count_null": pl.UInt32}


def test_extract_all_many() -> None:
    df = pl.DataFrame(
        {
            "foo": ["ab", "abc", "abcd", "foo", None, "boo"],
            "re": ["a", "bc", "a.c", "a", "a", None],
        }
    )
    assert df["foo"].str.extract_all(df["re"]).to_list() == [
        ["a"],
        ["bc"],
        ["abc"],
        [],
        None,
        None,
    ]

    # Test broadcast.
    broad = df.select(
        pl.col("foo").str.extract_all(pl.col("re").first()).alias("a"),
        pl.col("foo").str.extract_all(pl.col("re").last()).alias("null"),
    )
    assert broad.to_dict(as_series=False) == {
        "a": [["a"], ["a"], ["a"], [], None, []],
        "null": [None] * 6,
    }
    assert broad.schema == {"a": pl.List(pl.String), "null": pl.List(pl.String)}


@pytest.mark.may_fail_cloud  # reason: zero-field struct
def test_extract_groups_empty() -> None:
    df = pl.DataFrame({"iso_code": ["ISO 80000-1:2009", "ISO/IEC/IEEE 29148:2018"]})

    assert df.select(pl.col("iso_code").str.extract_groups("")).to_dict(
        as_series=False
    ) == {"iso_code": [{}, {}]}

    q = df.lazy().select(pl.col("iso_code").str.extract_groups(""))
    assert q.collect_schema() == q.collect().schema


def test_extract_groups() -> None:
    def _named_groups_builder(pattern: str, groups: dict[str, str]) -> str:
        return pattern.format(
            **{name: f"(?<{name}>{value})" for name, value in groups.items()}
        )

    expected = {
        "authority": ["ISO", "ISO/IEC/IEEE"],
        "spec_num": ["80000", "29148"],
        "part_num": ["1", None],
        "revision_year": ["2009", "2018"],
    }

    pattern = _named_groups_builder(
        r"{authority}\s{spec_num}(?:-{part_num})?(?::{revision_year})",
        {
            "authority": r"^ISO(?:/[A-Z]+)*",
            "spec_num": r"\d+",
            "part_num": r"\d+",
            "revision_year": r"\d{4}",
        },
    )

    df = pl.DataFrame({"iso_code": ["ISO 80000-1:2009", "ISO/IEC/IEEE 29148:2018"]})

    assert (
        df.select(pl.col("iso_code").str.extract_groups(pattern))
        .unnest("iso_code")
        .to_dict(as_series=False)
        == expected
    )

    assert df.select(
        pl.col("iso_code").str.extract_groups(r"\A(ISO\S*).*?(\d+)")
    ).to_dict(as_series=False) == {
        "iso_code": [{"1": "ISO", "2": "80000"}, {"1": "ISO/IEC/IEEE", "2": "29148"}]
    }

    assert df.select(
        pl.col("iso_code").str.extract_groups(r"\A(ISO\S*).*?(?<year>\d+)\z")
    ).to_dict(as_series=False) == {
        "iso_code": [
            {"1": "ISO", "year": "2009"},
            {"1": "ISO/IEC/IEEE", "year": "2018"},
        ]
    }

    assert pl.select(
        pl.lit(r"foobar").str.extract_groups(r"(?<foo>.{3})|(?<bar>...)")
    ).to_dict(as_series=False) == {"literal": [{"foo": "foo", "bar": None}]}


def test_starts_ends_with() -> None:
    df = pl.DataFrame(
        {
            "a": ["hamburger_with_tomatoes", "nuts", "lollypop", None],
            "sub": ["ham", "ts", None, "anything"],
        }
    )

    assert df.select(
        pl.col("a").str.ends_with("pop").alias("ends_pop"),
        pl.col("a").str.ends_with(pl.lit(None)).alias("ends_None"),
        pl.col("a").str.ends_with(pl.col("sub")).alias("ends_sub"),
        pl.col("a").str.starts_with("ham").alias("starts_ham"),
        pl.col("a").str.starts_with(pl.lit(None)).alias("starts_None"),
        pl.col("a").str.starts_with(pl.col("sub")).alias("starts_sub"),
    ).to_dict(as_series=False) == {
        "ends_pop": [False, False, True, None],
        "ends_None": [None, None, None, None],
        "ends_sub": [False, True, None, None],
        "starts_ham": [True, False, False, None],
        "starts_None": [None, None, None, None],
        "starts_sub": [True, False, None, None],
    }


def test_json_path_match_type_4905() -> None:
    df = pl.DataFrame({"json_val": ['{"a":"hello"}', None, '{"a":"world"}']})
    assert df.filter(
        pl.col("json_val").str.json_path_match("$.a").is_in(["hello"])
    ).to_dict(as_series=False) == {"json_val": ['{"a":"hello"}']}


def test_decode_strict() -> None:
    df = pl.DataFrame(
        {"strings": ["0IbQvTc3", "0J%2FQldCf0JA%3D", "0J%2FRgNC%2B0YHRgtC%2B"]}
    )
    result = df.select(pl.col("strings").str.decode("base64", strict=False))
    expected = {"strings": [b"\xd0\x86\xd0\xbd77", None, None]}
    assert result.to_dict(as_series=False) == expected

    with pytest.raises(ComputeError):
        df.select(pl.col("strings").str.decode("base64", strict=True))


def test_split() -> None:
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c_c", ""]})
    out = df.select([pl.col("x").str.split("_")])

    expected = pl.DataFrame(
        [
            {"x": ["a", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "c", "c"]},
            {"x": [""]},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("_").to_frame(), expected)

    out = df.select([pl.col("x").str.split("_", inclusive=True)])

    expected = pl.DataFrame(
        [
            {"x": ["a_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c_", "c_", "c"]},
            {"x": []},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("_", inclusive=True).to_frame(), expected)

    out = df.select([pl.col("x").str.split("")])

    expected = pl.DataFrame(
        [
            {"x": ["a", "_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "_", "c", "_", "c"]},
            {"x": []},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("").to_frame(), expected)

    out = df.select([pl.col("x").str.split("", inclusive=True)])

    expected = pl.DataFrame(
        [
            {"x": ["a", "_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "_", "c", "_", "c"]},
            {"x": []},
        ]
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.split("", inclusive=True).to_frame(), expected)

    plan = (
        df.lazy()
        .select(
            a=pl.col("x").str.split(" ", inclusive=False),
            b=pl.col("x").str.split_exact(" ", 1, inclusive=False),
        )
        .explain()
    )

    assert "str.split(" in plan
    assert "str.split_exact(" in plan

    plan = (
        df.lazy()
        .select(
            a=pl.col("x").str.split(" ", inclusive=True),
            b=pl.col("x").str.split_exact(" ", 1, inclusive=True),
        )
        .explain()
    )

    assert "str.split_inclusive(" in plan
    assert "str.split_exact_inclusive(" in plan


def test_split_expr() -> None:
    df = pl.DataFrame(
        {
            "x": ["a_a", None, "b", "c*c*c", "dddd", ""],
            "by": ["_", "#", "^", "*", "", ""],
        }
    )
    out = df.select([pl.col("x").str.split(pl.col("by"))])
    expected = pl.DataFrame(
        [
            {"x": ["a", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c", "c", "c"]},
            {"x": ["d", "d", "d", "d"]},
            {"x": []},
        ]
    )
    assert_frame_equal(out, expected)

    out = df.select([pl.col("x").str.split(pl.col("by"), inclusive=True)])
    expected = pl.DataFrame(
        [
            {"x": ["a_", "a"]},
            {"x": None},
            {"x": ["b"]},
            {"x": ["c*", "c*", "c"]},
            {"x": ["d", "d", "d", "d"]},
            {"x": []},
        ]
    )
    assert_frame_equal(out, expected)


def test_split_exact() -> None:
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c", ""]})
    out = df.select([pl.col("x").str.split_exact("_", 2, inclusive=False)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", ""],
            "field_1": ["a", None, None, "c", None],
            "field_2": pl.Series([None, None, None, None, None], dtype=pl.String),
        }
    )

    assert_frame_equal(out, expected)
    out2 = df["x"].str.split_exact("_", 2, inclusive=False).to_frame().unnest("x")
    assert_frame_equal(out2, expected)

    out = df.select([pl.col("x").str.split_exact("_", 1, inclusive=True)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a_", None, "b", "c_", None],
            "field_1": ["a", None, None, "c", None],
        }
    )
    assert_frame_equal(out, expected)
    assert df["x"].str.split_exact("_", 1).dtype == pl.Struct
    assert df["x"].str.split_exact("_", 1, inclusive=False).dtype == pl.Struct

    out = df.select([pl.col("x").str.split_exact("", 1)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None],
            "field_1": ["_", None, None, "_", None],
        }
    )
    assert_frame_equal(out, expected)

    out = df.select([pl.col("x").str.split_exact("", 1, inclusive=True)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None],
            "field_1": ["_", None, None, "_", None],
        }
    )
    assert_frame_equal(out, expected)


def test_split_exact_expr() -> None:
    df = pl.DataFrame(
        {
            "x": ["a_a", None, "b", "c^c^c", "d#d", "eeee", ""],
            "by": ["_", "&", "$", "^", None, "", ""],
        }
    )

    out = df.select(
        pl.col("x").str.split_exact(pl.col("by"), 2, inclusive=False)
    ).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None, "e", None],
            "field_1": ["a", None, None, "c", None, "e", None],
            "field_2": pl.Series(
                [None, None, None, "c", None, "e", None], dtype=pl.String
            ),
        }
    )

    assert_frame_equal(out, expected)

    out2 = df.select(
        pl.col("x").str.split_exact(pl.col("by"), 2, inclusive=True)
    ).unnest("x")

    expected2 = pl.DataFrame(
        {
            "field_0": ["a_", None, "b", "c^", None, "e", None],
            "field_1": ["a", None, None, "c^", None, "e", None],
            "field_2": pl.Series(
                [None, None, None, "c", None, "e", None], dtype=pl.String
            ),
        }
    )
    assert_frame_equal(out2, expected2)


def test_splitn() -> None:
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c_c", ""]})
    out = df.select([pl.col("x").str.splitn("_", 2)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", ""],
            "field_1": ["a", None, None, "c_c", None],
        }
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.splitn("_", 2).to_frame().unnest("x"), expected)

    out = df.select([pl.col("x").str.splitn("", 2)]).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None],
            "field_1": ["_a", None, None, "_c_c", None],
        }
    )

    assert_frame_equal(out, expected)
    assert_frame_equal(df["x"].str.splitn("", 2).to_frame().unnest("x"), expected)


def test_splitn_expr() -> None:
    df = pl.DataFrame(
        {
            "x": ["a_a", None, "b", "c^c^c", "d#d", "eeee", ""],
            "by": ["_", "&", "$", "^", None, "", ""],
        }
    )

    out = df.select(pl.col("x").str.splitn(pl.col("by"), 2)).unnest("x")

    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c", None, "e", None],
            "field_1": ["a", None, None, "c^c", None, "eee", None],
        }
    )

    assert_frame_equal(out, expected)


def test_titlecase() -> None:
    df = pl.DataFrame(
        {
            "misc": [
                "welcome to my world",
                "double  space",
                "and\ta\t tab",
                "by jean-paul sartre, 'esq'",
                "SOMETIMES/life/gives/you/a/2nd/chance",
            ],
        }
    )
    expected = [
        "Welcome To My World",
        "Double  Space",
        "And\tA\t Tab",
        "By Jean-Paul Sartre, 'Esq'",
        "Sometimes/Life/Gives/You/A/2nd/Chance",
    ]
    actual = df.select(pl.col("misc").str.to_titlecase()).to_series()
    for ex, act in zip(expected, actual):
        assert ex == act, f"{ex} != {act}"

    df = pl.DataFrame(
        {
            "quotes": [
                "'e.t. phone home'",
                "you talkin' to me?",
                "i feel the need--the need for speed",
                "to infinity,and BEYOND!",
                "say 'what' again!i dare you - I\u00a0double-dare you!",
                "What.we.got.here... is#failure#to#communicate",
            ]
        }
    )
    expected_str = [
        "'E.T. Phone Home'",
        "You Talkin' To Me?",
        "I Feel The Need--The Need For Speed",
        "To Infinity,And Beyond!",
        "Say 'What' Again!I Dare You - I\u00a0Double-Dare You!",
        "What.We.Got.Here... Is#Failure#To#Communicate",
    ]
    expected_py = [s.title() for s in df["quotes"].to_list()]
    for ex_str, ex_py, act in zip(
        expected_str, expected_py, df["quotes"].str.to_titlecase()
    ):
        assert ex_str == act, f"{ex_str} != {act}"
        assert ex_py == act, f"{ex_py} != {act}"


def test_string_replace_with_nulls_10124() -> None:
    df = pl.DataFrame({"col1": ["S", "S", "S", None, "S", "S", "S", "S"]})

    assert df.select(
        pl.col("col1"),
        pl.col("col1").str.replace("S", "O", n=1).alias("n_1"),
        pl.col("col1").str.replace("S", "O", n=3).alias("n_3"),
    ).to_dict(as_series=False) == {
        "col1": ["S", "S", "S", None, "S", "S", "S", "S"],
        "n_1": ["O", "O", "O", None, "O", "O", "O", "O"],
        "n_3": ["O", "O", "O", None, "O", "O", "O", "O"],
    }


def test_string_extract_groups_lazy_schema_10305() -> None:
    df = pl.LazyFrame(
        data={
            "url": [
                "http://vote.com/ballon_dor?candidate=messi&ref=python",
                "http://vote.com/ballon_dor?candidate=weghorst&ref=polars",
                "http://vote.com/ballon_dor?error=404&ref=rust",
            ]
        }
    )
    pattern = r"candidate=(?<candidate>\w+)&ref=(?<ref>\w+)"
    df = df.select(captures=pl.col("url").str.extract_groups(pattern)).unnest(
        "captures"
    )

    assert df.collect_schema() == {"candidate": pl.String, "ref": pl.String}


def test_string_reverse() -> None:
    df = pl.DataFrame(
        {
            "text": [None, "foo", "bar", "i like pizza&#", None, "man\u0303ana"],
        }
    )
    expected = pl.DataFrame(
        [
            pl.Series(
                "text",
                [None, "oof", "rab", "#&azzip ekil i", None, "anan\u0303am"],
                dtype=pl.String,
            ),
        ]
    )

    result = df.select(pl.col("text").str.reverse())
    assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    ("data", "expected_data"),
    [
        (["", None, "a"], ["", None, "b"]),
        ([None, None, "a"], [None, None, "b"]),
        (["", "", ""], ["", "", ""]),
        ([None, None, None], [None, None, None]),
        (["a", "", None], ["b", "", None]),
    ],
)
def test_replace_lit_n_char_13385(
    data: list[str | None], expected_data: list[str | None]
) -> None:
    s = pl.Series(data, dtype=pl.String)
    res = s.str.replace("a", "b", literal=True)
    expected_s = pl.Series(expected_data, dtype=pl.String)
    assert_series_equal(res, expected_s)


def test_extract_many() -> None:
    df = pl.DataFrame({"values": ["discontent", "foobar"]})
    patterns = ["winter", "disco", "onte", "discontent"]
    assert df.with_columns(
        pl.col("values").str.extract_many(patterns, overlapping=False).alias("matches"),
        pl.col("values")
        .str.extract_many(patterns, overlapping=True)
        .alias("matches_overlapping"),
    ).to_dict(as_series=False) == {
        "values": ["discontent", "foobar"],
        "matches": [["disco"], []],
        "matches_overlapping": [["disco", "onte", "discontent"], []],
    }

    # many patterns
    df = pl.DataFrame(
        {
            "values": ["discontent", "rhapsody"],
            "patterns": [
                ["winter", "disco", "onte", "discontent"],
                ["rhap", "ody", "coalesce"],
            ],
        }
    )

    # extract_many
    assert df.select(pl.col("values").str.extract_many("patterns")).to_dict(
        as_series=False
    ) == {"values": [["disco"], ["rhap", "ody"]]}

    # find_many
    f1 = df.select(pl.col("values").str.find_many("patterns"))
    f2 = df["values"].str.find_many(df["patterns"])

    assert_series_equal(f1["values"], f2)
    assert f2.to_list() == [[0], [0, 5]]


def test_json_decode_raise_on_data_type_mismatch_13061() -> None:
    assert_series_equal(
        pl.Series(["null", "null"]).str.json_decode(infer_schema_length=1),
        pl.Series([None, None]),
    )

    with pytest.raises(ComputeError):
        pl.Series(["null", "1"]).str.json_decode(infer_schema_length=1)

    assert_series_equal(
        pl.Series(["null", "1"]).str.json_decode(infer_schema_length=2),
        pl.Series([None, 1]),
    )


def test_json_decode_struct_schema() -> None:
    with pytest.raises(ComputeError, match="extra field in struct data: b"):
        pl.Series([r'{"a": 1}', r'{"a": 2, "b": 2}']).str.json_decode(
            infer_schema_length=1
        )

    assert_series_equal(
        pl.Series([r'{"a": 1}', r'{"a": 2, "b": 2}']).str.json_decode(
            infer_schema_length=2
        ),
        pl.Series([{"a": 1, "b": None}, {"a": 2, "b": 2}]),
    )

    # If the schema was explicitly given, then we ignore extra fields.
    # TODO: There should be a `columns=` parameter to this.
    assert_series_equal(
        pl.Series([r'{"a": 1}', r'{"a": 2, "b": 2}']).str.json_decode(
            dtype=pl.Struct({"a": pl.Int64})
        ),
        pl.Series([{"a": 1}, {"a": 2}]),
    )


def test_escape_regex() -> None:
    df = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})
    result_df = df.with_columns(pl.col("text").str.escape_regex().alias("escaped"))
    expected_df = pl.DataFrame(
        {
            "text": ["abc", "def", None, "abc(\\w+)"],
            "escaped": ["abc", "def", None, "abc\\(\\\\w\\+\\)"],
        }
    )

    assert_frame_equal(result_df, expected_df)
    assert_series_equal(result_df["escaped"], expected_df["escaped"])


@pytest.mark.parametrize(
    ("form", "expected_data"),
    [
        ("NFC", ["01²", "ＫＡＤＯＫＡＷＡ"]),  # noqa: RUF001
        ("NFD", ["01²", "ＫＡＤＯＫＡＷＡ"]),  # noqa: RUF001
        ("NFKC", ["012", "KADOKAWA"]),
        ("NFKD", ["012", "KADOKAWA"]),
    ],
)
def test_string_normalize(form: Any, expected_data: list[str | None]) -> None:
    s = pl.Series(["01²", "ＫＡＤＯＫＡＷＡ"], dtype=pl.String)  # noqa: RUF001
    res = s.str.normalize(form)
    expected_s = pl.Series(expected_data, dtype=pl.String)
    assert_series_equal(res, expected_s)


def test_string_normalize_wrong_input() -> None:
    with pytest.raises(ValueError, match="`form` must be one of"):
        pl.Series(["01²"], dtype=pl.String).str.normalize("foobar")  # type: ignore[arg-type]


def test_to_integer_unequal_lengths_22034() -> None:
    s = pl.Series("a", ["1", "2", "3"], pl.String)
    with pytest.raises(pl.exceptions.ShapeError):
        s.str.to_integer(base=pl.Series([4, 5, 5, 4]))


def test_broadcast_self() -> None:
    s = pl.Series("a", ["3"], pl.String)
    with pytest.raises(
        pl.exceptions.ComputeError, match="strict integer parsing failed"
    ):
        s.str.to_integer(base=pl.Series([2, 2, 3, 4]))


def test_strptime_unequal_length_22018() -> None:
    s = pl.Series(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
    with pytest.raises(pl.exceptions.ShapeError):
        s.str.strptime(
            pl.Datetime, "%Y-%m-%d %H:%M%#z", ambiguous=pl.Series(["a", "b", "d"])
        )


@pytest.mark.parametrize("inclusive", [False, True])
def test_str_split_unequal_length_22018(inclusive: bool) -> None:
    with pytest.raises(pl.exceptions.ShapeError):
        pl.Series(["a-c", "x-y"]).str.split(
            pl.Series(["-", "/", "+"]), inclusive=inclusive
        )


def test_str_split_self_broadcast() -> None:
    assert_series_equal(
        pl.Series(["a-/c"]).str.split(pl.Series(["-", "/", "+"])),
        pl.Series([["a", "/c"], ["a-", "c"], ["a-/c"]]),
    )


def test_replace_many_mapping_in_list() -> None:
    assert_series_equal(
        pl.Series([["a", "b"]]).list.eval(
            pl.element().replace_strict({"a": 1, "b": 2})
        ),
        pl.Series([[1, 2]]),
    )


def test_str_replace_n_zero_23570() -> None:
    # more than 32 bytes
    abc_long = "abc " * 20 + "abc"
    df = pl.DataFrame(
        {"a": [abc_long, "abc abc abc", "abc ghi"], "b": ["jkl", "pqr", "xyz"]}
    )
    expected = df

    out = df.with_columns(pl.col("a").str.replace("abc", "XYZ", n=0))
    assert_frame_equal(out, expected)

    out = df.with_columns(pl.col("a").str.replace("abc", pl.col("b"), n=0))
    assert_frame_equal(out, expected)